Adds AArch64 Native Code Generator

In which we add a new code generator to the Glasgow Haskell Compiler. This codegen supports ELF and Mach-O targets, thus covering Linux, macOS, and BSDs in principle. It was tested only on macOS and Linux. The NCG follows a similar structure as the other native code generators we already have, and should therfore be realtively easy to follow. It supports most of the features required for a proper native code generator, but does not claim to be perfect or fully optimised. There are still opportunities for optimisations. Metric Decrease: ManyAlternatives ManyConstructors MultiLayerModules PmSeriesG PmSeriesS PmSeriesT PmSeriesV T10421 T10421a T10858 T11195 T11276 T11303b T11374 T11822 T12227 T12545 T12707 T13035 T13253 T13253-spj T13379 T13701 T13719 T14683 T14697 T15164 T15630 T16577 T17096 T17516 T17836 T17836b T17977 T17977b T18140 T18282 T18304 T18478 T18698a T18698b T18923 T1969 T3064 T5030 T5321FD T5321Fun T5631 T5642 T5837 T783 T9198 T9233 T9630 T9872d T9961 WWRec Metric Increase: T4801
author: Moritz Angermann <moritz.angermann@gmail.com> 2020-10-22 12:08:34 +0800
committer: Marge Bot <ben+marge-bot@smart-cactus.org> 2021-06-05 10:29:57 -0400
commit: 3b1aa7dba7c006b473855a6b199b15fa3a3d77a0 (patch)
tree: 631816f4aef8d28ee596b51b70a0a0ba2df12230
parent: 8c90e6c758769b068aea2891b26cc17577b6d36a (diff)
download: haskell-3b1aa7dba7c006b473855a6b199b15fa3a3d77a0.tar.gz
48 files changed, 3594 insertions, 132 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3e0db17aa4..2d33eaef13 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -740,14 +740,6 @@ validate-aarch64-darwin:
   tags:
     - aarch64-linux
 
-.build-aarch64-linux-deb10-llvm:
-  extends: .build-aarch64-linux-deb10
-  stage: full-build
-  variables:
-    BUILD_FLAVOUR: perf-llvm
-  tags:
-    - aarch64-linux
-
 validate-aarch64-linux-deb10:
   extends: .build-aarch64-linux-deb10
   artifacts:
@@ -760,6 +752,20 @@ nightly-aarch64-linux-deb10:
   variables:
     TEST_TYPE: slowtest
 
+.build-aarch64-linux-deb10-llvm:
+  extends: .build-aarch64-linux-deb10
+  stage: full-build
+  variables:
+    BUILD_FLAVOUR: perf-llvm
+  tags:
+    - aarch64-linux
+
+validate-aarch64-linux-deb10-llvm:
+  extends: .build-aarch64-linux-deb10-llvm
+  artifacts:
+    when: always
+    expire_in: 2 week
+
 #################################
 # armv7-linux-deb10
 #################################
diff --git a/compiler/GHC/Cmm/CLabel.hs b/compiler/GHC/Cmm/CLabel.hs
index cda35a4943..b6ad2b3431 100644
--- a/compiler/GHC/Cmm/CLabel.hs
+++ b/compiler/GHC/Cmm/CLabel.hs
@@ -287,6 +287,12 @@ data CLabel
 
   deriving Eq
 
+instance Show CLabel where
+  show = showPprUnsafe . pprDebugCLabel genericPlatform
+
+instance Outputable CLabel where
+  ppr = text . show
+
 isIdLabel :: CLabel -> Bool
 isIdLabel IdLabel{} = True
 isIdLabel _ = False
@@ -1544,6 +1550,7 @@ pprDynamicLinkerAsmLabel !platform dllInfo ppLbl =
             SymbolPtr       -> char 'L' <> ppLbl <> text "$non_lazy_ptr"
             GotSymbolPtr    -> ppLbl <> text "@GOTPCREL"
             GotSymbolOffset -> ppLbl
+        | platformArch platform == ArchAArch64 -> ppLbl
         | otherwise ->
           case dllInfo of
             CodeStub  -> char 'L' <> ppLbl <> text "$stub"
@@ -1572,6 +1579,10 @@ pprDynamicLinkerAsmLabel !platform dllInfo ppLbl =
           SymbolPtr -> text ".LC_" <> ppLbl
           _         -> panic "pprDynamicLinkerAsmLabel"
 
+      | platformArch platform == ArchAArch64
+      = ppLbl
+
+
       | platformArch platform == ArchX86_64
       = case dllInfo of
           CodeStub        -> ppLbl <> text "@plt"
diff --git a/compiler/GHC/Cmm/Expr.hs b/compiler/GHC/Cmm/Expr.hs
index 86b06271d1..52cb63c901 100644
--- a/compiler/GHC/Cmm/Expr.hs
+++ b/compiler/GHC/Cmm/Expr.hs
@@ -65,6 +65,7 @@ data CmmExpr
         --        ** is shorthand only, meaning **
         -- CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
         --      where rep = typeWidth (cmmRegType reg)
+  deriving Show
 
 instance Eq CmmExpr where       -- Equality ignores the types
   CmmLit l1          == CmmLit l2          = l1==l2
@@ -78,7 +79,7 @@ instance Eq CmmExpr where       -- Equality ignores the types
 data CmmReg
   = CmmLocal  {-# UNPACK #-} !LocalReg
   | CmmGlobal GlobalReg
-  deriving( Eq, Ord )
+  deriving( Eq, Ord, Show )
 
 -- | A stack area is either the stack slot where a variable is spilled
 -- or the stack space where function arguments and results are passed.
@@ -86,7 +87,7 @@ data Area
   = Old            -- See Note [Old Area]
   | Young {-# UNPACK #-} !BlockId  -- Invariant: must be a continuation BlockId
                    -- See Note [Continuation BlockId] in GHC.Cmm.Node.
-  deriving (Eq, Ord)
+  deriving (Eq, Ord, Show)
 
 {- Note [Old Area]
 ~~~~~~~~~~~~~~~~~~
@@ -209,7 +210,7 @@ data CmmLit
                      -- During the stack-layout pass, CmmHighStackMark
                      -- is replaced by a CmmInt for the actual number
                      -- of bytes used
-  deriving Eq
+  deriving (Eq, Show)
 
 instance Outputable CmmLit where
   ppr (CmmInt n w) = text "CmmInt" <+> ppr n <+> ppr w
@@ -279,6 +280,7 @@ data LocalReg
     -- ^ Parameters:
     --   1. Identifier
     --   2. Type
+  deriving Show
 
 instance Eq LocalReg where
   (LocalReg u1 _) == (LocalReg u2 _) = u1 == u2
diff --git a/compiler/GHC/Cmm/Parser.y b/compiler/GHC/Cmm/Parser.y
index 490a3c4976..b8a6f7de7c 100644
--- a/compiler/GHC/Cmm/Parser.y
+++ b/compiler/GHC/Cmm/Parser.y
@@ -1021,6 +1021,45 @@ machOps = listToUFM $
 callishMachOps :: Platform -> UniqFM FastString ([CmmExpr] -> (CallishMachOp, [CmmExpr]))
 callishMachOps platform = listToUFM $
         map (\(x, y) -> (mkFastString x, y)) [
+
+        ( "pow64f", (MO_F64_Pwr,) ),
+        ( "sin64f", (MO_F64_Sin,) ),
+        ( "cos64f", (MO_F64_Cos,) ),
+        ( "tan64f", (MO_F64_Tan,) ),
+        ( "sinh64f", (MO_F64_Sinh,) ),
+        ( "cosh64f", (MO_F64_Cosh,) ),
+        ( "tanh64f", (MO_F64_Tanh,) ),
+        ( "asin64f", (MO_F64_Asin,) ),
+        ( "acos64f", (MO_F64_Acos,) ),
+        ( "atan64f", (MO_F64_Atan,) ),
+        ( "asinh64f", (MO_F64_Asinh,) ),
+        ( "acosh64f", (MO_F64_Acosh,) ),
+        ( "log64f", (MO_F64_Log,) ),
+        ( "log1p64f", (MO_F64_Log1P,) ),
+        ( "exp64f", (MO_F64_Exp,) ),
+        ( "expM164f", (MO_F64_ExpM1,) ),
+        ( "fabs64f", (MO_F64_Fabs,) ),
+        ( "sqrt64f", (MO_F64_Sqrt,) ),
+
+        ( "pow32f", (MO_F32_Pwr,) ),
+        ( "sin32f", (MO_F32_Sin,) ),
+        ( "cos32f", (MO_F32_Cos,) ),
+        ( "tan32f", (MO_F32_Tan,) ),
+        ( "sinh32f", (MO_F32_Sinh,) ),
+        ( "cosh32f", (MO_F32_Cosh,) ),
+        ( "tanh32f", (MO_F32_Tanh,) ),
+        ( "asin32f", (MO_F32_Asin,) ),
+        ( "acos32f", (MO_F32_Acos,) ),
+        ( "atan32f", (MO_F32_Atan,) ),
+        ( "asinh32f", (MO_F32_Asinh,) ),
+        ( "acosh32f", (MO_F32_Acosh,) ),
+        ( "log32f", (MO_F32_Log,) ),
+        ( "log1p32f", (MO_F32_Log1P,) ),
+        ( "exp32f", (MO_F32_Exp,) ),
+        ( "expM132f", (MO_F32_ExpM1,) ),
+        ( "fabs32f", (MO_F32_Fabs,) ),
+        ( "sqrt32f", (MO_F32_Sqrt,) ),
+
         ( "read_barrier", (MO_ReadBarrier,)),
         ( "write_barrier", (MO_WriteBarrier,)),
         ( "memcpy", memcpyLikeTweakArgs MO_Memcpy ),
@@ -1060,10 +1099,6 @@ callishMachOps platform = listToUFM $
         ( "xchg16", (MO_Xchg W16,)),
         ( "xchg32", (MO_Xchg W32,)),
         ( "xchg64", (MO_Xchg W64,))
-
-        -- ToDo: the rest, maybe
-        -- edit: which rest?
-        -- also: how do we tell CMM Lint how to type check callish macops?
     ]
   where
     memcpyLikeTweakArgs :: (Int -> CallishMachOp) -> [CmmExpr] -> (CallishMachOp, [CmmExpr])
diff --git a/compiler/GHC/Cmm/Type.hs b/compiler/GHC/Cmm/Type.hs
index c7e2a4069b..1227b37ced 100644
--- a/compiler/GHC/Cmm/Type.hs
+++ b/compiler/GHC/Cmm/Type.hs
@@ -51,13 +51,14 @@ import Data.Int
 
 data CmmType    -- The important one!
   = CmmType CmmCat !Width
+  deriving Show
 
 data CmmCat                -- "Category" (not exported)
    = GcPtrCat              -- GC pointer
    | BitsCat               -- Non-pointer
    | FloatCat              -- Float
    | VecCat Length CmmCat  -- Vector
-   deriving( Eq )
+   deriving( Eq, Show )
         -- See Note [Signed vs unsigned] at the end
 
 instance Outputable CmmType where
@@ -434,4 +435,3 @@ C calling convention rather early on in the compiler).  However, given
 this, the cons outweigh the pros.
 
 -}
-
diff --git a/compiler/GHC/Cmm/Utils.hs b/compiler/GHC/Cmm/Utils.hs
index c1419cdd12..596b8d050f 100644
--- a/compiler/GHC/Cmm/Utils.hs
+++ b/compiler/GHC/Cmm/Utils.hs
@@ -14,7 +14,7 @@
 
 module GHC.Cmm.Utils(
         -- CmmType
-        primRepCmmType, slotCmmType, slotForeignHint,
+        primRepCmmType, slotCmmType,
         typeCmmType, typeForeignHint, primRepForeignHint,
 
         -- CmmLit
@@ -159,14 +159,6 @@ primRepForeignHint FloatRep     = NoHint
 primRepForeignHint DoubleRep    = NoHint
 primRepForeignHint (VecRep {})  = NoHint
 
-slotForeignHint :: SlotTy -> ForeignHint
-slotForeignHint PtrLiftedSlot   = AddrHint
-slotForeignHint PtrUnliftedSlot = AddrHint
-slotForeignHint WordSlot        = NoHint
-slotForeignHint Word64Slot      = NoHint
-slotForeignHint FloatSlot       = NoHint
-slotForeignHint DoubleSlot      = NoHint
-
 typeForeignHint :: UnaryType -> ForeignHint
 typeForeignHint = primRepForeignHint . typePrimRep1
 
diff --git a/compiler/GHC/CmmToAsm.hs b/compiler/GHC/CmmToAsm.hs
index 00ef59660f..82122911b6 100644
--- a/compiler/GHC/CmmToAsm.hs
+++ b/compiler/GHC/CmmToAsm.hs
@@ -84,6 +84,7 @@ import GHC.Prelude
 import qualified GHC.CmmToAsm.X86   as X86
 import qualified GHC.CmmToAsm.PPC   as PPC
 import qualified GHC.CmmToAsm.SPARC as SPARC
+import qualified GHC.CmmToAsm.AArch64 as AArch64
 
 import GHC.CmmToAsm.Reg.Liveness
 import qualified GHC.CmmToAsm.Reg.Linear                as Linear
@@ -166,7 +167,7 @@ nativeCodeGen logger dflags this_mod modLoc h us cmms
       ArchSPARC64   -> panic "nativeCodeGen: No NCG for SPARC64"
       ArchS390X     -> panic "nativeCodeGen: No NCG for S390X"
       ArchARM {}    -> panic "nativeCodeGen: No NCG for ARM"
-      ArchAArch64   -> panic "nativeCodeGen: No NCG for AArch64"
+      ArchAArch64   -> nCG' (AArch64.ncgAArch64 config)
       ArchAlpha     -> panic "nativeCodeGen: No NCG for Alpha"
       ArchMipseb    -> panic "nativeCodeGen: No NCG for mipseb"
       ArchMipsel    -> panic "nativeCodeGen: No NCG for mipsel"
@@ -174,7 +175,6 @@ nativeCodeGen logger dflags this_mod modLoc h us cmms
       ArchUnknown   -> panic "nativeCodeGen: No NCG for unknown arch"
       ArchJavaScript-> panic "nativeCodeGen: No NCG for JavaScript"
 
-
 -- | Data accumulated during code generation. Mostly about statistics,
 -- but also collects debug data for DWARF generation.
 data NativeGenAcc statics instr
@@ -1191,9 +1191,9 @@ initNCGConfig dflags this_mod = NCGConfig
             ArchX86    -> v
             _          -> Nothing
 
-   , ncgDwarfEnabled        = debugLevel dflags >  0
-   , ncgDwarfUnwindings     = debugLevel dflags >= 1
+   , ncgDwarfEnabled        = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags > 0 && platformArch (targetPlatform dflags) /= ArchAArch64
+   , ncgDwarfUnwindings     = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags > 0
+   , ncgDwarfStripBlockInfo = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags < 2 -- We strip out block information when running with -g0 or -g1.
+   , ncgDwarfSourceNotes    = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags > 2 -- We produce GHC-specific source-note DIEs only with -g3
    , ncgExposeInternalSymbols = gopt Opt_ExposeInternalSymbols dflags
-   , ncgDwarfStripBlockInfo = debugLevel dflags <  2 -- We strip out block information when running with -g0 or -g1.
-   , ncgDwarfSourceNotes    = debugLevel dflags >= 3 -- We produce GHC-specific source-note DIEs only with -g3
    }
diff --git a/compiler/GHC/CmmToAsm/AArch64.hs b/compiler/GHC/CmmToAsm/AArch64.hs
new file mode 100644
index 0000000000..14f4b5d0bd
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64.hs
@@ -0,0 +1,60 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+-- | Native code generator for x86 and x86-64 architectures
+module GHC.CmmToAsm.AArch64
+   ( ncgAArch64 )
+where
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Config
+import GHC.CmmToAsm.Types
+
+import qualified GHC.CmmToAsm.AArch64.Instr   as AArch64
+import qualified GHC.CmmToAsm.AArch64.Ppr     as AArch64
+import qualified GHC.CmmToAsm.AArch64.CodeGen as AArch64
+import qualified GHC.CmmToAsm.AArch64.Regs    as AArch64
+import qualified GHC.CmmToAsm.AArch64.RegInfo as AArch64
+
+ncgAArch64 :: NCGConfig -> NcgImpl RawCmmStatics AArch64.Instr AArch64.JumpDest
+ncgAArch64 config
+ = NcgImpl {
+        ncgConfig                 = config
+       ,cmmTopCodeGen             = AArch64.cmmTopCodeGen
+       ,generateJumpTableForInstr = AArch64.generateJumpTableForInstr config
+       ,getJumpDestBlockId        = AArch64.getJumpDestBlockId
+       ,canShortcut               = AArch64.canShortcut
+       ,shortcutStatics           = AArch64.shortcutStatics
+       ,shortcutJump              = AArch64.shortcutJump
+       ,pprNatCmmDecl             = AArch64.pprNatCmmDecl config
+       ,maxSpillSlots             = AArch64.maxSpillSlots config
+       ,allocatableRegs           = AArch64.allocatableRegs platform
+       ,ncgAllocMoreStack         = AArch64.allocMoreStack platform
+       ,ncgExpandTop              = id
+       ,ncgMakeFarBranches        = const id
+       ,extractUnwindPoints       = const []
+       ,invertCondBranches        = \_ _ -> id
+  }
+    where
+      platform = ncgPlatform config
+
+-- | Instruction instance for aarch64
+instance Instruction AArch64.Instr where
+        regUsageOfInstr         = AArch64.regUsageOfInstr
+        patchRegsOfInstr        = AArch64.patchRegsOfInstr
+        isJumpishInstr          = AArch64.isJumpishInstr
+        jumpDestsOfInstr        = AArch64.jumpDestsOfInstr
+        patchJumpInstr          = AArch64.patchJumpInstr
+        mkSpillInstr            = AArch64.mkSpillInstr
+        mkLoadInstr             = AArch64.mkLoadInstr
+        takeDeltaInstr          = AArch64.takeDeltaInstr
+        isMetaInstr             = AArch64.isMetaInstr
+        mkRegRegMoveInstr _     = AArch64.mkRegRegMoveInstr
+        takeRegRegMoveInstr     = AArch64.takeRegRegMoveInstr
+        mkJumpInstr             = AArch64.mkJumpInstr
+        mkStackAllocInstr       = AArch64.mkStackAllocInstr
+        mkStackDeallocInstr     = AArch64.mkStackDeallocInstr
+        mkComment               = pure . AArch64.COMMENT
+        pprInstr                = AArch64.pprInstr
diff --git a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
new file mode 100644
index 0000000000..b0984070fc
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
@@ -0,0 +1,1358 @@
+{-# language GADTs #-}
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE BinaryLiterals #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE NumericUnderscores #-}
+module GHC.CmmToAsm.AArch64.CodeGen (
+      cmmTopCodeGen
+    , generateJumpTableForInstr
+)
+
+where
+
+-- NCG stuff:
+import GHC.Prelude hiding (EQ)
+
+import GHC.Platform.Regs
+import GHC.CmmToAsm.AArch64.Instr
+import GHC.CmmToAsm.AArch64.Regs
+import GHC.CmmToAsm.AArch64.Cond
+
+import GHC.CmmToAsm.CPrim
+import GHC.Cmm.DebugBlock
+import GHC.CmmToAsm.Monad
+   ( NatM, getNewRegNat
+   , getPicBaseMaybeNat, getPlatform, getConfig
+   , getDebugBlock, getFileId
+   )
+-- import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.PIC
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Config
+import GHC.CmmToAsm.Types
+import GHC.Platform.Reg
+import GHC.Platform
+
+-- Our intermediate code:
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm.CLabel
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Graph
+import GHC.Types.Tickish ( GenTickish(..) )
+import GHC.Types.SrcLoc  ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
+
+-- The rest:
+import GHC.Data.OrdList
+import GHC.Utils.Outputable
+
+import Control.Monad    ( mapAndUnzipM, when, foldM )
+import Data.Word
+import Data.Maybe
+import GHC.Float
+
+import GHC.Types.Basic
+import GHC.Types.ForeignCall
+import GHC.Data.FastString
+import GHC.Utils.Misc
+import GHC.Utils.Panic
+
+-- Note [General layout of an NCG]
+-- @cmmTopCodeGen@ will be our main entry point to code gen.  Here we'll get
+-- @RawCmmDecl@; see GHC.Cmm
+--
+--   RawCmmDecl = GenCmmDecl RawCmmStatics (LabelMap RawCmmStatics) CmmGraph
+--
+--   GenCmmDecl d h g = CmmProc h CLabel [GlobalReg] g
+--                    | CmmData Section d
+--
+-- As a result we want to transform this to a list of @NatCmmDecl@, which is
+-- defined @GHC.CmmToAsm.Instr@ as
+--
+--   type NatCmmDecl statics instr
+--        = GenCmmDecl statics (LabelMap RawCmmStatics) (ListGraph instr)
+--
+-- Thus well' turn
+--   GenCmmDecl RawCmmStatics (LabelMap RawCmmStatics) CmmGraph
+-- into
+--   [GenCmmDecl RawCmmStatics (LabelMap RawCmmStatics) (ListGraph Instr)]
+--
+-- where @CmmGraph@ is
+--
+--   type CmmGraph = GenCmmGraph CmmNode
+--   data GenCmmGraph n = CmmGraph { g_entry :: BlockId, g_graph :: Graph n C C }
+--   type CmmBlock = Block CmmNode C C
+--
+-- and @ListGraph Instr@ is
+--
+--   newtype ListGraph i = ListGraph [GenBasicBlock i]
+--   data GenBasicBlock i = BasicBlock BlockId [i]
+
+cmmTopCodeGen
+    :: RawCmmDecl
+    -> NatM [NatCmmDecl RawCmmStatics Instr]
+
+-- Thus we'll have to deal with either CmmProc ...
+cmmTopCodeGen _cmm@(CmmProc info lab live graph) = do
+  -- do
+  --   traceM $ "-- -------------------------- cmmTopGen (CmmProc) -------------------------- --\n"
+  --         ++ showSDocUnsafe (ppr cmm)
+
+  let blocks = toBlockListEntryFirst graph
+  (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
+  picBaseMb <- getPicBaseMaybeNat
+
+  let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
+      tops = proc : concat statics
+
+  case picBaseMb of
+      Just _picBase -> panic "AArch64.cmmTopCodeGen: picBase not implemented"
+      Nothing -> return tops
+
+-- ... or CmmData.
+cmmTopCodeGen _cmm@(CmmData sec dat) = do
+  -- do
+  --   traceM $ "-- -------------------------- cmmTopGen (CmmData) -------------------------- --\n"
+  --         ++ showSDocUnsafe (ppr cmm)
+  return [CmmData sec dat] -- no translation, we just use CmmStatic
+
+basicBlockCodeGen
+        :: Block CmmNode C C
+        -> NatM ( [NatBasicBlock Instr]
+                , [NatCmmDecl RawCmmStatics Instr])
+
+basicBlockCodeGen block = do
+  config <- getConfig
+  -- do
+  --   traceM $ "-- --------------------------- basicBlockCodeGen --------------------------- --\n"
+  --         ++ showSDocUnsafe (ppr block)
+  let (_, nodes, tail)  = blockSplit block
+      id = entryLabel block
+      stmts = blockToList nodes
+
+      header_comment_instr = unitOL $ MULTILINE_COMMENT (
+          text "-- --------------------------- basicBlockCodeGen --------------------------- --\n"
+          $+$ pdoc (ncgPlatform config) block
+          )
+  -- Generate location directive
+  dbg <- getDebugBlock (entryLabel block)
+  loc_instrs <- case dblSourceTick =<< dbg of
+    Just (SourceNote span name)
+      -> do fileId <- getFileId (srcSpanFile span)
+            let line = srcSpanStartLine span; col = srcSpanStartCol span
+            return $ unitOL $ LOCATION fileId line col name
+    _ -> return nilOL
+  (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
+  (!tail_instrs,_) <- stmtToInstrs mid_bid tail
+  let instrs = header_comment_instr `appOL` loc_instrs `appOL` mid_instrs `appOL` tail_instrs
+  -- TODO: Then x86 backend run @verifyBasicBlock@ here and inserts
+  --      unwinding info. See Ticket 19913
+  -- code generation may introduce new basic block boundaries, which
+  -- are indicated by the NEWBLOCK instruction.  We must split up the
+  -- instruction stream into basic blocks again.  Also, we extract
+  -- LDATAs here too.
+  let
+        (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs
+
+        mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
+          = ([], BasicBlock id instrs : blocks, statics)
+        mkBlocks (LDATA sec dat) (instrs,blocks,statics)
+          = (instrs, blocks, CmmData sec dat:statics)
+        mkBlocks instr (instrs,blocks,statics)
+          = (instr:instrs, blocks, statics)
+  return (BasicBlock id top : other_blocks, statics)
+
+
+-- -----------------------------------------------------------------------------
+-- | Utilities
+ann :: SDoc -> Instr -> Instr
+ann doc instr {- | debugIsOn -} = ANN doc instr
+-- ann _ instr = instr
+{-# INLINE ann #-}
+
+-- Using pprExpr will hide the AST, @ANN@ will end up in the assembly with
+-- -dppr-debug.  The idea is that we can trivially see how a cmm expression
+-- ended up producing the assmebly we see.  By having the verbatim AST printed
+-- we can simply check the patterns that were matched to arrive at the assmebly
+-- we generated.
+--
+-- pprExpr will hide a lot of noise of the underlying data structure and print
+-- the expression into something that can be easily read by a human. However
+-- going back to the exact CmmExpr representation can be labourous and adds
+-- indirections to find the matches that lead to the assembly.
+--
+-- An improvement oculd be to have
+--
+--    (pprExpr genericPlatform e) <> parens (text. show e)
+--
+-- to have the best of both worlds.
+--
+-- Note: debugIsOn is too restrictive, it only works for debug compilers.
+-- However, we do not only want to inspect this for debug compilers. Ideally
+-- we'd have a check for -dppr-debug here already, such that we don't even
+-- generate the ANN expressions. However, as they are lazy, they shouldn't be
+-- forced until we actually force them, and without -dppr-debug they should
+-- never end up being forced.
+annExpr :: CmmExpr -> Instr -> Instr
+annExpr e instr {- | debugIsOn -} = ANN (text . show $ e) instr
+-- annExpr e instr {- | debugIsOn -} = ANN (pprExpr genericPlatform e) instr
+-- annExpr _ instr = instr
+{-# INLINE annExpr #-}
+
+-- -----------------------------------------------------------------------------
+-- Generating a table-branch
+
+-- TODO jump tables would be a lot faster, but we'll use bare bones for now.
+-- this is usually done by sticking the jump table ids into an instruction
+-- and then have the @generateJumpTableForInstr@ callback produce the jump
+-- table as a static.
+--
+-- See Ticket 19912
+--
+-- data SwitchTargets =
+--    SwitchTargets
+--        Bool                       -- Signed values
+--        (Integer, Integer)         -- Range
+--        (Maybe Label)              -- Default value
+--        (M.Map Integer Label)      -- The branches
+--
+-- Non Jumptable plan:
+-- xE <- expr
+--
+genSwitch :: CmmExpr -> SwitchTargets -> NatM InstrBlock
+genSwitch expr targets = do -- pprPanic "genSwitch" (ppr expr)
+  (reg, format, code) <- getSomeReg expr
+  let w = formatToWidth format
+  let mkbranch acc (key, bid) = do
+        (keyReg, _format, code) <- getSomeReg (CmmLit (CmmInt key w))
+        return $ code `appOL`
+                 toOL [ CMP (OpReg w reg) (OpReg w keyReg)
+                      , BCOND EQ (TBlock bid)
+                      ] `appOL` acc
+      def_code = case switchTargetsDefault targets of
+        Just bid -> unitOL (B (TBlock bid))
+        Nothing  -> nilOL
+
+  switch_code <- foldM mkbranch nilOL (switchTargetsCases targets)
+  return $ code `appOL` switch_code `appOL` def_code
+
+-- We don't do jump tables for now, see Ticket 19912
+generateJumpTableForInstr :: NCGConfig -> Instr
+  -> Maybe (NatCmmDecl RawCmmStatics Instr)
+generateJumpTableForInstr _ _ = Nothing
+
+-- -----------------------------------------------------------------------------
+-- Top-level of the instruction selector
+
+-- See Note [Keeping track of the current block] for why
+-- we pass the BlockId.
+stmtsToInstrs :: BlockId -- ^ Basic block these statement will start to be placed in.
+              -> [CmmNode O O] -- ^ Cmm Statement
+              -> NatM (InstrBlock, BlockId) -- ^ Resulting instruction
+stmtsToInstrs bid stmts =
+    go bid stmts nilOL
+  where
+    go bid  []        instrs = return (instrs,bid)
+    go bid (s:stmts)  instrs = do
+      (instrs',bid') <- stmtToInstrs bid s
+      -- If the statement introduced a new block, we use that one
+      let !newBid = fromMaybe bid bid'
+      go newBid stmts (instrs `appOL` instrs')
+
+-- | `bid` refers to the current block and is used to update the CFG
+--   if new blocks are inserted in the control flow.
+-- See Note [Keeping track of the current block] for more details.
+stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in.
+             -> CmmNode e x
+             -> NatM (InstrBlock, Maybe BlockId)
+             -- ^ Instructions, and bid of new block if successive
+             -- statements are placed in a different basic block.
+stmtToInstrs bid stmt = do
+  -- traceM $ "-- -------------------------- stmtToInstrs -------------------------- --\n"
+  --     ++ showSDocUnsafe (ppr stmt)
+  platform <- getPlatform
+  case stmt of
+    CmmUnsafeForeignCall target result_regs args
+       -> genCCall target result_regs args bid
+
+    _ -> (,Nothing) <$> case stmt of
+      CmmComment s   -> return (unitOL (COMMENT (ftext s)))
+      CmmTick {}     -> return nilOL
+
+      CmmAssign reg src
+        | isFloatType ty         -> assignReg_FltCode format reg src
+        | otherwise              -> assignReg_IntCode format reg src
+          where ty = cmmRegType platform reg
+                format = cmmTypeFormat ty
+
+      CmmStore addr src
+        | isFloatType ty         -> assignMem_FltCode format addr src
+        | otherwise              -> assignMem_IntCode format addr src
+          where ty = cmmExprType platform src
+                format = cmmTypeFormat ty
+
+      CmmBranch id          -> genBranch id
+
+      --We try to arrange blocks such that the likely branch is the fallthrough
+      --in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
+      CmmCondBranch arg true false _prediction ->
+          genCondBranch bid true false arg
+
+      CmmSwitch arg ids -> genSwitch arg ids
+
+      CmmCall { cml_target = arg } -> genJump arg
+
+      CmmUnwind _regs -> return nilOL
+
+      _ -> pprPanic "stmtToInstrs: statement should have been cps'd away" (pdoc platform stmt)
+
+--------------------------------------------------------------------------------
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+--      They are really trees of insns to facilitate fast appending, where a
+--      left-to-right traversal yields the insns in the correct order.
+--
+type InstrBlock
+        = OrdList Instr
+
+-- | Register's passed up the tree.  If the stix code forces the register
+--      to live in a pre-decided machine register, it comes out as @Fixed@;
+--      otherwise, it comes out as @Any@, and the parent can decide which
+--      register to put it in.
+--
+data Register
+        = Fixed Format Reg InstrBlock
+        | Any   Format (Reg -> InstrBlock)
+
+-- | Sometimes we need to change the Format of a register. Primarily during
+-- conversion.
+swizzleRegisterRep :: Format -> Register -> Register
+swizzleRegisterRep format (Fixed _ reg code) = Fixed format reg code
+swizzleRegisterRep format (Any _ codefn)     = Any   format codefn
+
+-- | Grab the Reg for a CmmReg
+getRegisterReg :: Platform -> CmmReg -> Reg
+
+getRegisterReg _ (CmmLocal (LocalReg u pk))
+  = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk)
+
+getRegisterReg platform (CmmGlobal mid)
+  = case globalRegMaybe platform mid of
+        Just reg -> RegReal reg
+        Nothing  -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
+        -- By this stage, the only MagicIds remaining should be the
+        -- ones which map to a real machine register on this
+        -- platform.  Hence if it's not mapped to a registers something
+        -- went wrong earlier in the pipeline.
+-- | Convert a BlockId to some CmmStatic data
+-- TODO: Add JumpTable Logic, see Ticket 19912
+-- jumpTableEntry :: NCGConfig -> Maybe BlockId -> CmmStatic
+-- jumpTableEntry config Nothing   = CmmStaticLit (CmmInt 0 (ncgWordWidth config))
+-- jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
+--     where blockLabel = blockLbl blockid
+
+-- -----------------------------------------------------------------------------
+-- General things for putting together code sequences
+
+-- | The dual to getAnyReg: compute an expression into a register, but
+--      we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, Format, InstrBlock)
+getSomeReg expr = do
+  r <- getRegister expr
+  case r of
+    Any rep code -> do
+        tmp <- getNewRegNat rep
+        return (tmp, rep, code tmp)
+    Fixed rep reg code ->
+        return (reg, rep, code)
+
+-- TODO OPT: we might be able give getRegister
+--          a hint, what kind of register we want.
+getFloatReg :: HasCallStack => CmmExpr -> NatM (Reg, Format, InstrBlock)
+getFloatReg expr = do
+  r <- getRegister expr
+  case r of
+    Any rep code | isFloatFormat rep -> do
+      tmp <- getNewRegNat rep
+      return (tmp, rep, code tmp)
+    Any II32 code -> do
+      tmp <- getNewRegNat FF32
+      return (tmp, FF32, code tmp)
+    Any II64 code -> do
+      tmp <- getNewRegNat FF64
+      return (tmp, FF64, code tmp)
+    Any _w _code -> do
+      config <- getConfig
+      pprPanic "can't do getFloatReg on" (pdoc (ncgPlatform config) expr)
+    -- can't do much for fixed.
+    Fixed rep reg code ->
+      return (reg, rep, code)
+
+-- TODO: TODO, bounds. We can't put any immediate
+-- value in. They are constrained.
+-- See Ticket 19911
+litToImm' :: CmmLit -> NatM (Operand, InstrBlock)
+litToImm' lit = return (OpImm (litToImm lit), nilOL)
+
+
+getRegister :: CmmExpr -> NatM Register
+getRegister e = do
+  config <- getConfig
+  getRegister' config (ncgPlatform config) e
+
+-- Note [Handling PIC on AArch64]
+-- AArch64 does not have a special PIC register, the general approach is to
+-- simply go through the GOT, and there is assembly support for this:
+--
+--   // Load the address of 'sym' from the GOT using ADRP and LDR (used for
+--   // position-independent code on AArch64):
+--   adrp x0, #:got:sym
+--   ldr x0, [x0, #:got_lo12:sym]
+--
+-- See also: https://developer.arm.com/documentation/dui0774/i/armclang-integrated-assembler-directives/assembly-expressions
+--
+-- CmmGlobal @PicBaseReg@'s are generated in @GHC.CmmToAsm.PIC@ in the
+-- @cmmMakePicReference@.  This is in turn called from @cmmMakeDynamicReference@
+-- also in @Cmm.CmmToAsm.PIC@ from where it is also exported.  There are two
+-- callsites for this. One is in this module to produce the @target@ in @genCCall@
+-- the other is in @GHC.CmmToAsm@ in @cmmExprNative@.
+--
+-- Conceptually we do not want any special PicBaseReg to be used on AArch64. If
+-- we want to distinguish between symbol loading, we need to address this through
+-- the way we load it, not through a register.
+--
+
+getRegister' :: NCGConfig -> Platform -> CmmExpr -> NatM Register
+-- OPTIMIZATION WARNING: CmmExpr rewrites
+-- 1. Rewrite: Reg + (-n) => Reg - n
+--    TODO: this expression souldn't even be generated to begin with.
+getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt i w1)]) | i < 0
+  = getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt (-i) w1)])
+
+getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt i w1)]) | i < 0
+  = getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt (-i) w1)])
+
+
+-- Generic case.
+getRegister' config plat expr
+  = case expr of
+    CmmReg (CmmGlobal PicBaseReg)
+      -> pprPanic "getRegisterReg-memory" (ppr $ PicBaseReg)
+    CmmLit lit
+      -> case lit of
+
+        -- TODO handle CmmInt 0 specially, use wzr or xzr.
+
+        CmmInt i W8  -> do
+          return (Any (intFormat W8) (\dst -> unitOL $ annExpr expr (MOV (OpReg W8 dst) (OpImm (ImmInteger (narrowS W8 i))))))
+        CmmInt i W16 -> do
+          return (Any (intFormat W16) (\dst -> unitOL $ annExpr expr (MOV (OpReg W16 dst) (OpImm (ImmInteger (narrowS W16 i))))))
+
+        -- We need to be careful to not shorten this for negative literals.
+        -- Those need the upper bits set. We'd either have to explicitly sign
+        -- or figure out something smarter. Lowered to
+        -- `MOV dst XZR`
+        CmmInt i w | isNbitEncodeable 16 i, i >= 0 -> do
+          return (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg W16 dst) (OpImm (ImmInteger i)))))
+        CmmInt i w | isNbitEncodeable 32 i, i >= 0 -> do
+          let  half0 = fromIntegral (fromIntegral i :: Word16)
+               half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
+          return (Any (intFormat w) (\dst -> toOL [ annExpr expr
+                                                  $ MOV (OpReg W32 dst) (OpImm (ImmInt half0))
+                                                  , MOVK (OpReg W32 dst) (OpImmShift (ImmInt half1) SLSL 16)
+                                                  ]))
+        -- fallback for W32
+        CmmInt i W32 -> do
+          let  half0 = fromIntegral (fromIntegral i :: Word16)
+               half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
+          return (Any (intFormat W32) (\dst -> toOL [ annExpr expr
+                                                    $ MOV (OpReg W32 dst) (OpImm (ImmInt half0))
+                                                    , MOVK (OpReg W32 dst) (OpImmShift (ImmInt half1) SLSL 16)
+                                                    ]))
+        -- anything else
+        CmmInt i W64 -> do
+          let  half0 = fromIntegral (fromIntegral i :: Word16)
+               half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
+               half2 = fromIntegral (fromIntegral (i `shiftR` 32) :: Word16)
+               half3 = fromIntegral (fromIntegral (i `shiftR` 48) :: Word16)
+          return (Any (intFormat W64) (\dst -> toOL [ annExpr expr
+                                                    $ MOV (OpReg W64 dst) (OpImm (ImmInt half0))
+                                                    , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half1) SLSL 16)
+                                                    , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half2) SLSL 32)
+                                                    , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half3) SLSL 48)
+                                                    ]))
+        CmmInt _i rep -> do
+          (op, imm_code) <- litToImm' lit
+          return (Any (intFormat rep) (\dst -> imm_code `snocOL` annExpr expr (MOV (OpReg rep dst) op)))
+
+        -- floatToBytes (fromRational f)
+        CmmFloat 0 w   -> do
+          (op, imm_code) <- litToImm' lit
+          return (Any (floatFormat w) (\dst -> imm_code `snocOL` annExpr expr (MOV (OpReg w dst) op)))
+
+        CmmFloat _f W8  -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for bytes" (pdoc plat expr)
+        CmmFloat _f W16 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for halfs" (pdoc plat expr)
+        CmmFloat f W32 -> do
+          let word = castFloatToWord32 (fromRational f) :: Word32
+              half0 = fromIntegral (fromIntegral word :: Word16)
+              half1 = fromIntegral (fromIntegral (word `shiftR` 16) :: Word16)
+          tmp <- getNewRegNat (intFormat W32)
+          return (Any (floatFormat W32) (\dst -> toOL [ annExpr expr
+                                                      $ MOV (OpReg W32 tmp) (OpImm (ImmInt half0))
+                                                      , MOVK (OpReg W32 tmp) (OpImmShift (ImmInt half1) SLSL 16)
+                                                      , MOV (OpReg W32 dst) (OpReg W32 tmp)
+                                                      ]))
+        CmmFloat f W64 -> do
+          let word = castDoubleToWord64 (fromRational f) :: Word64
+              half0 = fromIntegral (fromIntegral word :: Word16)
+              half1 = fromIntegral (fromIntegral (word `shiftR` 16) :: Word16)
+              half2 = fromIntegral (fromIntegral (word `shiftR` 32) :: Word16)
+              half3 = fromIntegral (fromIntegral (word `shiftR` 48) :: Word16)
+          tmp <- getNewRegNat (intFormat W64)
+          return (Any (floatFormat W64) (\dst -> toOL [ annExpr expr
+                                                      $ MOV (OpReg W64 tmp) (OpImm (ImmInt half0))
+                                                      , MOVK (OpReg W64 tmp) (OpImmShift (ImmInt half1) SLSL 16)
+                                                      , MOVK (OpReg W64 tmp) (OpImmShift (ImmInt half2) SLSL 32)
+                                                      , MOVK (OpReg W64 tmp) (OpImmShift (ImmInt half3) SLSL 48)
+                                                      , MOV (OpReg W64 dst) (OpReg W64 tmp)
+                                                      ]))
+        CmmFloat _f _w -> pprPanic "getRegister' (CmmLit:CmmFloat), unsupported float lit" (pdoc plat expr)
+        CmmVec _ -> pprPanic "getRegister' (CmmLit:CmmVec): " (pdoc plat expr)
+        CmmLabel _lbl -> do
+          (op, imm_code) <- litToImm' lit
+          let rep = cmmLitType plat lit
+              format = cmmTypeFormat rep
+          return (Any format (\dst -> imm_code `snocOL` (annExpr expr $ LDR format (OpReg (formatToWidth format) dst) op)))
+
+        CmmLabelOff _lbl off | isNbitEncodeable 12 (fromIntegral off) -> do
+          (op, imm_code) <- litToImm' lit
+          let rep = cmmLitType plat lit
+              format = cmmTypeFormat rep
+              -- width = typeWidth rep
+          return (Any format (\dst -> imm_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op))
+
+        CmmLabelOff lbl off -> do
+          (op, imm_code) <- litToImm' (CmmLabel lbl)
+          let rep = cmmLitType plat lit
+              format = cmmTypeFormat rep
+              width = typeWidth rep
+          (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width)
+          return (Any format (\dst -> imm_code `appOL` off_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op `snocOL` ADD (OpReg width dst) (OpReg width dst) (OpReg width off_r)))
+
+        CmmLabelDiffOff _ _ _ _ -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
+        CmmBlock _ -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
+        CmmHighStackMark -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
+    CmmLoad mem rep -> do
+      Amode addr addr_code <- getAmode plat mem
+      let format = cmmTypeFormat rep
+      return (Any format (\dst -> addr_code `snocOL` LDR format (OpReg (formatToWidth format) dst) (OpAddr addr)))
+    CmmStackSlot _ _
+      -> pprPanic "getRegister' (CmmStackSlot): " (pdoc plat expr)
+    CmmReg reg
+      -> return (Fixed (cmmTypeFormat (cmmRegType plat reg))
+                       (getRegisterReg plat reg)
+                       nilOL)
+    CmmRegOff reg off | isNbitEncodeable 12 (fromIntegral off) -> do
+      getRegister' config plat $
+            CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
+          where width = typeWidth (cmmRegType plat reg)
+
+    CmmRegOff reg off -> do
+      (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width)
+      (reg, _format, code) <- getSomeReg $ CmmReg reg
+      return $ Any (intFormat width) (\dst -> off_code `appOL` code `snocOL` ADD (OpReg width dst) (OpReg width reg) (OpReg width off_r))
+          where width = typeWidth (cmmRegType plat reg)
+
+
+
+    -- for MachOps, see GHC.Cmm.MachOp
+    -- For CmmMachOp, see GHC.Cmm.Expr
+    CmmMachOp op [e] -> do
+      (reg, _format, code) <- getSomeReg e
+      case op of
+        MO_Not w -> return $ Any (intFormat w) (\dst -> code `snocOL` MVN (OpReg w dst) (OpReg w reg))
+
+        MO_S_Neg w -> return $ Any (intFormat w) (\dst -> code `snocOL` NEG (OpReg w dst) (OpReg w reg))
+        MO_F_Neg w -> return $ Any (floatFormat w) (\dst -> code `snocOL` NEG (OpReg w dst) (OpReg w reg))
+
+        MO_SF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` SCVTF (OpReg to dst) (OpReg from reg))  -- (Signed ConVerT Float)
+        MO_FS_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` FCVTZS (OpReg to dst) (OpReg from reg)) -- (float convert (-> zero) signed)
+
+        -- TODO this is very hacky
+        -- Note, UBFM and SBFM expect source and target register to be of the same size, so we'll use @max from to@
+        -- UBFM will set the high bits to 0. SBFM will copy the sign (sign extend).
+        MO_UU_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` UBFM (OpReg (max from to) dst) (OpReg (max from to) reg) (OpImm (ImmInt 0)) (toImm (min from to)))
+        MO_SS_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` SBFM (OpReg (max from to) dst) (OpReg (max from to) reg) (OpImm (ImmInt 0)) (toImm (min from to)))
+        MO_FF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` FCVT (OpReg to dst) (OpReg from reg))
+
+        -- Conversions
+        MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e
+
+        _ -> pprPanic "getRegister' (monadic CmmMachOp):" (pdoc plat expr)
+      where toImm W8 =  (OpImm (ImmInt 7))
+            toImm W16 = (OpImm (ImmInt 15))
+            toImm W32 = (OpImm (ImmInt 31))
+            toImm W64 = (OpImm (ImmInt 63))
+            toImm W128 = (OpImm (ImmInt 127))
+            toImm W256 = (OpImm (ImmInt 255))
+            toImm W512 = (OpImm (ImmInt 511))
+    -- Dyadic machops:
+    --
+    -- The general idea is:
+    -- compute x<i> <- x
+    -- compute x<j> <- y
+    -- OP x<r>, x<i>, x<j>
+    --
+    -- TODO: for now we'll only implement the 64bit versions. And rely on the
+    --      fallthrough to alert us if things go wrong!
+    -- OPTIMIZATION WARNING: Dyadic CmmMachOp destructuring
+    -- 0. TODO This should not exist! Rewrite: Reg +- 0 -> Reg
+    CmmMachOp (MO_Add _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
+    CmmMachOp (MO_Sub _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
+    -- 1. Compute Reg +/- n directly.
+    --    For Add/Sub we can directly encode 12bits, or 12bits lsl #12.
+    CmmMachOp (MO_Add w) [(CmmReg reg), CmmLit (CmmInt n _)]
+      | n > 0 && n < 4096 -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ADD (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
+      -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12.
+      where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
+            r' = getRegisterReg plat reg
+    CmmMachOp (MO_Sub w) [(CmmReg reg), CmmLit (CmmInt n _)]
+      | n > 0 && n < 4096 -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (SUB (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
+      -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12.
+      where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
+            r' = getRegisterReg plat reg
+
+    -- 2. Shifts. x << n, x >> n.
+    CmmMachOp (MO_Shl w) [x, (CmmLit (CmmInt n _))] | w == W32, 0 <= n, n < 32 -> do
+      (reg_x, _format_x, code_x) <- getSomeReg x
+      return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
+    CmmMachOp (MO_Shl w) [x, (CmmLit (CmmInt n _))] | w == W64, 0 <= n, n < 64 -> do
+      (reg_x, _format_x, code_x) <- getSomeReg x
+      return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
+
+    CmmMachOp (MO_U_Shr w) [x, (CmmLit (CmmInt n _))] | w == W32, 0 <= n, n < 32 -> do
+      (reg_x, _format_x, code_x) <- getSomeReg x
+      return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
+    CmmMachOp (MO_U_Shr w) [x, (CmmLit (CmmInt n _))] | w == W64, 0 <= n, n < 64 -> do
+      (reg_x, _format_x, code_x) <- getSomeReg x
+      return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
+
+    -- 3. Logic &&, ||
+    CmmMachOp (MO_And w) [(CmmReg reg), CmmLit (CmmInt n _)] | isBitMaskImmediate (fromIntegral n) ->
+      return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (AND (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
+      where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
+            r' = getRegisterReg plat reg
+
+    CmmMachOp (MO_Or w) [(CmmReg reg), CmmLit (CmmInt n _)] | isBitMaskImmediate (fromIntegral n) ->
+      return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ORR (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
+      where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
+            r' = getRegisterReg plat reg
+
+    -- Generic case.
+    CmmMachOp op [x, y] -> do
+      -- alright, so we have an operation, and two expressions. And we want to essentially do
+      -- ensure we get float regs
+      let genOp w op = do
+            (reg_x, format_x, code_x) <- getSomeReg x
+            (reg_y, format_y, code_y) <- getSomeReg y
+            when ((isFloatFormat format_x && isIntFormat format_y) || (isIntFormat format_x && isFloatFormat format_y)) $ pprPanic "getRegister:genOp" (text "formats don't match:" <+> text (show format_x) <+> text "/=" <+> text (show format_y))
+            return $ Any format_x (\dst -> code_x `appOL` code_y `appOL` op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
+
+          withTempIntReg w op = OpReg w <$> getNewRegNat (intFormat w) >>= op
+          -- withTempFloatReg w op = OpReg w <$> getNewRegNat (floatFormat w) >>= op
+
+          intOp w op = do
+            -- compute x<m> <- x
+            -- compute x<o> <- y
+            -- <OP> x<n>, x<m>, x<o>
+            (reg_x, _format_x, code_x) <- getSomeReg x
+            (reg_y, _format_y, code_y) <- getSomeReg y
+            return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `appOL` op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
+          floatOp w op = do
+            (reg_fx, _format_x, code_fx) <- getFloatReg x
+            (reg_fy, _format_y, code_fy) <- getFloatReg y
+            return $ Any (floatFormat w) (\dst -> code_fx `appOL` code_fy `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy))
+          -- need a special one for conditionals, as they return ints
+          floatCond w op = do
+            (reg_fx, _format_x, code_fx) <- getFloatReg x
+            (reg_fy, _format_y, code_fy) <- getFloatReg y
+            return $ Any (intFormat w) (\dst -> code_fx `appOL` code_fy `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy))
+
+      case op of
+        -- Integer operations
+        -- Add/Sub should only be Interger Options.
+        -- But our Cmm parser doesn't care about types
+        -- and thus we end up with <float> + <float> => MO_Add <float> <float>
+        MO_Add w -> genOp w (\d x y -> unitOL $ annExpr expr (ADD d x y))
+        MO_Sub w -> genOp w (\d x y -> unitOL $ annExpr expr (SUB d x y))
+        --    31  30  29  28
+        --  .---+---+---+---+-- - -
+        --  | N | Z | C | V |
+        --  '---+---+---+---+-- - -
+        --  Negative
+        --  Zero
+        --  Carry
+        --  oVerflow
+        --
+        --  .------+-------------------------------------+-----------------+----------.
+        --  | Code | Meaning                             | Flags           | Encoding |
+        --  |------+-------------------------------------+-----------------+----------|
+        --  |  EQ  | Equal                               | Z = 1           | 0000     |
+        --  |  NE  | Not Equal                           | Z = 0           | 0001     |
+        --  |  HI  | Unsigned Higher                     | C = 1 && Z = 0  | 1000     |
+        --  |  HS  | Unsigned Higher or Same             | C = 1           | 0010     |
+        --  |  LS  | Unsigned Lower or Same              | C = 0 || Z = 1  | 1001     |
+        --  |  LO  | Unsigned Lower                      | C = 0           | 0011     |
+        --  |  GT  | Signed Greater Than                 | Z = 0 && N = V  | 1100     |
+        --  |  GE  | Signed Greater Than or Equal        | N = V           | 1010     |
+        --  |  LE  | Signed Less Than or Equal           | Z = 1 || N /= V | 1101     |
+        --  |  LT  | Signed Less Than                    | N /= V          | 1011     |
+        --  |  CS  | Carry Set (Unsigned Overflow)       | C = 1           | 0010     |
+        --  |  CC  | Carry Clear (No Unsigned Overflow)  | C = 0           | 0011     |
+        --  |  VS  | Signed Overflow                     | V = 1           | 0110     |
+        --  |  VC  | No Signed Overflow                  | V = 0           | 0111     |
+        --  |  MI  | Minus, Negative                     | N = 1           | 0100     |
+        --  |  PL  | Plus, Positive or Zero (!)          | N = 0           | 0101     |
+        --  |  AL  | Always                              | Any             | 1110     |
+        --  |  NV  | Never                               | Any             | 1111     |
+        --- '-------------------------------------------------------------------------'
+
+        MO_Eq w -> intOp w (\d x y -> toOL [ CMP x y, CSET d EQ ])
+        MO_Ne w -> intOp w (\d x y -> toOL [ CMP x y, CSET d NE ])
+        MO_Mul w -> intOp w (\d x y -> unitOL $ MUL d x y)
+
+        -- Signed multiply/divide
+        MO_S_MulMayOflo w -> intOp w (\d x y -> toOL [ MUL d x y, CSET d VS ])
+        MO_S_Quot w -> intOp w (\d x y -> unitOL $ SDIV d x y)
+
+        -- No native rem instruction. So we'll compute the following
+        -- Rd  <- Rx / Ry             | 2 <- 7 / 3      -- SDIV Rd Rx Ry
+        -- Rd' <- Rx - Rd * Ry        | 1 <- 7 - 2 * 3  -- MSUB Rd' Rd Ry Rx
+        --        |     '---|----------------|---'   |
+        --        |         '----------------|-------'
+        --        '--------------------------'
+        -- Note the swap in Rx and Ry.
+        MO_S_Rem w -> withTempIntReg w $ \t ->
+          intOp w (\d x y -> toOL [ SDIV t x y, MSUB d t y x ])
+
+        -- Unsigned multiply/divide
+        MO_U_MulMayOflo _w -> unsupportedP plat expr
+        MO_U_Quot w -> intOp w (\d x y -> unitOL $ UDIV d x y)
+        MO_U_Rem w  -> withTempIntReg w $ \t ->
+          intOp w (\d x y -> toOL [ UDIV t x y, MSUB d t y x ])
+
+        -- Signed comparisons -- see above for the CSET discussion
+        MO_S_Ge w -> intOp w (\d x y -> toOL [ CMP x y, CSET d SGE ])
+        MO_S_Le w -> intOp w (\d x y -> toOL [ CMP x y, CSET d SLE ])
+        MO_S_Gt w -> intOp w (\d x y -> toOL [ CMP x y, CSET d SGT ])
+        MO_S_Lt w -> intOp w (\d x y -> toOL [ CMP x y, CSET d SLT ])
+
+        -- Unsigned comparisons
+        MO_U_Ge w -> intOp w (\d x y -> toOL [ CMP x y, CSET d UGE ])
+        MO_U_Le w -> intOp w (\d x y -> toOL [ CMP x y, CSET d ULE ])
+        MO_U_Gt w -> intOp w (\d x y -> toOL [ CMP x y, CSET d UGT ])
+        MO_U_Lt w -> intOp w (\d x y -> toOL [ CMP x y, CSET d ULT ])
+
+        -- Floating point arithmetic
+        MO_F_Add w   -> floatOp w (\d x y -> unitOL $ ADD d x y)
+        MO_F_Sub w   -> floatOp w (\d x y -> unitOL $ SUB d x y)
+        MO_F_Mul w   -> floatOp w (\d x y -> unitOL $ MUL d x y)
+        MO_F_Quot w  -> floatOp w (\d x y -> unitOL $ SDIV d x y)
+
+        -- Floating point comparison
+        MO_F_Eq w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d EQ ])
+        MO_F_Ne w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d NE ])
+
+        -- careful with the floating point operations.
+        -- SLE is effectively LE or unordered (NaN)
+        -- SLT is the same. ULE, and ULT will not return true for NaN.
+        -- This is a bit counter intutive. Don't let yourself be fooled by
+        -- the S/U prefix for floats, it's only meaningful for integers.
+        MO_F_Ge w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OGE ])
+        MO_F_Le w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OLE ]) -- x <= y <=> y > x
+        MO_F_Gt w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OGT ])
+        MO_F_Lt w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OLT ]) -- x < y <=> y >= x
+
+        -- Bitwise operations
+        MO_And   w -> intOp w (\d x y -> unitOL $ AND d x y)
+        MO_Or    w -> intOp w (\d x y -> unitOL $ ORR d x y)
+        MO_Xor   w -> intOp w (\d x y -> unitOL $ EOR d x y)
+        -- MO_Not   W64 ->
+        MO_Shl   w -> intOp w (\d x y -> unitOL $ LSL d x y)
+        MO_U_Shr w -> intOp w (\d x y -> unitOL $ LSR d x y)
+        MO_S_Shr w -> intOp w (\d x y -> unitOL $ ASR d x y)
+
+        -- TODO
+
+        op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $ (pprMachOp op) <+> text "in" <+> (pdoc plat expr)
+    CmmMachOp _op _xs
+      -> pprPanic "getRegister' (variadic CmmMachOp): " (pdoc plat expr)
+
+  where
+    unsupportedP :: OutputableP env a => env -> a -> b
+    unsupportedP platform op = pprPanic "Unsupported op:" (pdoc platform op)
+
+    isNbitEncodeable :: Int -> Integer -> Bool
+    isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift)
+    -- This needs to check if n can be encoded as a bitmask immediate:
+    --
+    -- See https://stackoverflow.com/questions/30904718/range-of-immediate-values-in-armv8-a64-assembly
+    --
+    isBitMaskImmediate :: Integer -> Bool
+    isBitMaskImmediate i = i `elem` [0b0000_0001, 0b0000_0010, 0b0000_0100, 0b0000_1000, 0b0001_0000, 0b0010_0000, 0b0100_0000, 0b1000_0000
+                                    ,0b0000_0011, 0b0000_0110, 0b0000_1100, 0b0001_1000, 0b0011_0000, 0b0110_0000, 0b1100_0000
+                                    ,0b0000_0111, 0b0000_1110, 0b0001_1100, 0b0011_1000, 0b0111_0000, 0b1110_0000
+                                    ,0b0000_1111, 0b0001_1110, 0b0011_1100, 0b0111_1000, 0b1111_0000
+                                    ,0b0001_1111, 0b0011_1110, 0b0111_1100, 0b1111_1000
+                                    ,0b0011_1111, 0b0111_1110, 0b1111_1100
+                                    ,0b0111_1111, 0b1111_1110
+                                    ,0b1111_1111]
+
+
+-- -----------------------------------------------------------------------------
+--  The 'Amode' type: Memory addressing modes passed up the tree.
+data Amode = Amode AddrMode InstrBlock
+
+getAmode :: Platform -> CmmExpr -> NatM Amode
+-- TODO: Specialize stuff we can destructure here.
+
+-- OPTIMIZATION WARNING: Addressing modes.
+-- Addressing options:
+-- LDUR/STUR: imm9: -256 - 255
+getAmode platform (CmmRegOff reg off) | -256 <= off, off <= 255
+  = return $ Amode (AddrRegImm reg' off') nilOL
+    where reg' = getRegisterReg platform reg
+          off' = ImmInt off
+-- LDR/STR: imm12: if reg is 32bit: 0 -- 16380 in multiples of 4
+getAmode platform (CmmRegOff reg off)
+  | typeWidth (cmmRegType platform reg) == W32, 0 <= off, off <= 16380, off `mod` 4 == 0
+  = return $ Amode (AddrRegImm reg' off') nilOL
+    where reg' = getRegisterReg platform reg
+          off' = ImmInt off
+-- LDR/STR: imm12: if reg is 64bit: 0 -- 32760 in multiples of 8
+getAmode platform (CmmRegOff reg off)
+  | typeWidth (cmmRegType platform reg) == W64, 0 <= off, off <= 32760, off `mod` 8 == 0
+  = return $ Amode (AddrRegImm reg' off') nilOL
+    where reg' = getRegisterReg platform reg
+          off' = ImmInt off
+
+-- For Stores we often see something like this:
+-- CmmStore (CmmMachOp (MO_Add w) [CmmLoad expr, CmmLit (CmmInt n w')]) (expr2)
+-- E.g. a CmmStoreOff really. This can be translated to `str $expr2, [$expr, #n ]
+-- for `n` in range.
+getAmode _platform (CmmMachOp (MO_Add _w) [expr, CmmLit (CmmInt off _w')])
+  | -256 <= off, off <= 255
+  = do (reg, _format, code) <- getSomeReg expr
+       return $ Amode (AddrRegImm reg (ImmInteger off)) code
+
+getAmode _platform (CmmMachOp (MO_Sub _w) [expr, CmmLit (CmmInt off _w')])
+  | -256 <= -off, -off <= 255
+  = do (reg, _format, code) <- getSomeReg expr
+       return $ Amode (AddrRegImm reg (ImmInteger (-off))) code
+
+-- Generic case
+getAmode _platform expr
+  = do (reg, _format, code) <- getSomeReg expr
+       return $ Amode (AddrReg reg) code
+
+-- -----------------------------------------------------------------------------
+-- Generating assignments
+
+-- Assignments are really at the heart of the whole code generation
+-- business.  Almost all top-level nodes of any real importance are
+-- assignments, which correspond to loads, stores, or register
+-- transfers.  If we're really lucky, some of the register transfers
+-- will go away, because we can use the destination register to
+-- complete the code generation for the right hand side.  This only
+-- fails when the right hand side is forced into a fixed register
+-- (e.g. the result of a call).
+
+assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_IntCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock
+
+assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_FltCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock
+
+assignMem_IntCode rep addrE srcE
+  = do
+    (src_reg, _format, code) <- getSomeReg srcE
+    platform <- getPlatform
+    Amode addr addr_code <- getAmode platform addrE
+    return $ COMMENT (text "CmmStore" <+> parens (text (show addrE)) <+> parens (text (show srcE)))
+            `consOL` (code
+            `appOL` addr_code
+            `snocOL` STR rep (OpReg (formatToWidth rep) src_reg) (OpAddr addr))
+
+assignReg_IntCode _ reg src
+  = do
+    platform <- getPlatform
+    let dst = getRegisterReg platform reg
+    r <- getRegister src
+    return $ case r of
+      Any _ code              -> COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) `consOL` code dst
+      Fixed format freg fcode -> COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) `consOL` (fcode `snocOL` MOV (OpReg (formatToWidth format) dst) (OpReg (formatToWidth format) freg))
+
+-- Let's treat Floating point stuff
+-- as integer code for now. Opaque.
+assignMem_FltCode = assignMem_IntCode
+assignReg_FltCode = assignReg_IntCode
+
+-- -----------------------------------------------------------------------------
+-- Jumps
+genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock
+genJump expr@(CmmLit (CmmLabel lbl))
+  = return $ unitOL (annExpr expr (J (TLabel lbl)))
+
+genJump expr = do
+    (target, _format, code) <- getSomeReg expr
+    return (code `appOL` unitOL (annExpr expr (J (TReg target))))
+
+-- -----------------------------------------------------------------------------
+--  Unconditional branches
+genBranch :: BlockId -> NatM InstrBlock
+genBranch = return . toOL . mkJumpInstr
+
+-- -----------------------------------------------------------------------------
+-- Conditional branches
+genCondJump
+    :: BlockId
+    -> CmmExpr
+    -> NatM InstrBlock
+genCondJump bid expr = do
+    case expr of
+      -- Optimized == 0 case.
+      CmmMachOp (MO_Eq w) [x, CmmLit (CmmInt 0 _)] -> do
+        (reg_x, _format_x, code_x) <- getSomeReg x
+        return $ code_x `snocOL` (annExpr expr (CBZ (OpReg w reg_x) (TBlock bid)))
+
+      -- Optimized /= 0 case.
+      CmmMachOp (MO_Ne w) [x, CmmLit (CmmInt 0 _)] -> do
+        (reg_x, _format_x, code_x) <- getSomeReg x
+        return $ code_x `snocOL`  (annExpr expr (CBNZ (OpReg w reg_x) (TBlock bid)))
+
+      -- Generic case.
+      CmmMachOp mop [x, y] -> do
+
+        let bcond w cmp = do
+              -- compute both sides.
+              (reg_x, _format_x, code_x) <- getSomeReg x
+              (reg_y, _format_y, code_y) <- getSomeReg y
+              return $ code_x `appOL` code_y `snocOL` CMP (OpReg w reg_x) (OpReg w reg_y) `snocOL` (annExpr expr (BCOND cmp (TBlock bid)))
+            fbcond w cmp = do
+              -- ensure we get float regs
+              (reg_fx, _format_fx, code_fx) <- getFloatReg x
+              (reg_fy, _format_fy, code_fy) <- getFloatReg y
+              return $ code_fx `appOL` code_fy `snocOL` CMP (OpReg w reg_fx) (OpReg w reg_fy) `snocOL` (annExpr expr (BCOND cmp (TBlock bid)))
+
+        case mop of
+          MO_F_Eq w -> fbcond w EQ
+          MO_F_Ne w -> fbcond w NE
+
+          MO_F_Gt w -> fbcond w OGT
+          MO_F_Ge w -> fbcond w OGE
+          MO_F_Lt w -> fbcond w OLT
+          MO_F_Le w -> fbcond w OLE
+
+          MO_Eq w   -> bcond w EQ
+          MO_Ne w   -> bcond w NE
+
+          MO_S_Gt w -> bcond w SGT
+          MO_S_Ge w -> bcond w SGE
+          MO_S_Lt w -> bcond w SLT
+          MO_S_Le w -> bcond w SLE
+          MO_U_Gt w -> bcond w UGT
+          MO_U_Ge w -> bcond w UGE
+          MO_U_Lt w -> bcond w ULT
+          MO_U_Le w -> bcond w ULE
+          _ -> pprPanic "AArch64.genCondJump:case mop: " (text $ show expr)
+      _ -> pprPanic "AArch64.genCondJump: " (text $ show expr)
+
+
+genCondBranch
+    :: BlockId      -- the source of the jump
+    -> BlockId      -- the true branch target
+    -> BlockId      -- the false branch target
+    -> CmmExpr      -- the condition on which to branch
+    -> NatM InstrBlock -- Instructions
+
+genCondBranch _ true false expr = do
+  b1 <- genCondJump true expr
+  b2 <- genBranch false
+  return (b1 `appOL` b2)
+
+-- -----------------------------------------------------------------------------
+--  Generating C calls
+
+-- Now the biggest nightmare---calls.  Most of the nastiness is buried in
+-- @get_arg@, which moves the arguments to the correct registers/stack
+-- locations.  Apart from that, the code is easy.
+--
+-- As per *convention*:
+-- x0-x7:   (volatile) argument registers
+-- x8:      (volatile) indirect result register / Linux syscall no
+-- x9-x15:  (volatile) caller saved regs
+-- x16,x17: (volatile) intra-procedure-call registers
+-- x18:     (volatile) platform register. don't use for portability
+-- x19-x28: (non-volatile) callee save regs
+-- x29:     (non-volatile) frame pointer
+-- x30:                    link register
+-- x31:                    stack pointer / zero reg
+--
+-- Thus, this is what a c function will expect. Find the arguments in x0-x7,
+-- anything above that on the stack.  We'll ignore c functions with more than
+-- 8 arguments for now.  Sorry.
+--
+-- We need to make sure we preserve x9-x15, don't want to touch x16, x17.
+
+-- Note [PLT vs GOT relocations]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-- When linking objects together, we may need to lookup foreign references. That
+-- is symbolic references to functions or values in other objects. When
+-- compiling the object, we can not know where those elements will end up in
+-- memory (relative to the current location). Thus the use of symbols. There
+-- are two types of items we are interested, code segments we want to jump to
+-- and continue execution there (functions, ...), and data items we want to look
+-- up (strings, numbers, ...). For functions we can use the fact that we can use
+-- an intermediate jump without visibility to the programs execution.  If we
+-- want to jump to a function that is simply too far away to reach for the B/BL
+-- instruction, we can create a small piece of code that loads the full target
+-- address and jumps to that on demand. Say f wants to call g, however g is out
+-- of range for a direct jump, we can create a function h in range for f, that
+-- will load the address of g, and jump there. The area where we construct h
+-- is called the Procedure Linking Table (PLT), we have essentially replaced
+-- f -> g with f -> h -> g.  This is fine for function calls.  However if we
+-- want to lookup values, this trick doesn't work, so we need something else.
+-- We will instead reserve a slot in memory, and have a symbol pointing to that
+-- slot. Now what we essentially do is, we reference that slot, and expect that
+-- slot to hold the final resting address of the data we are interested in.
+-- Thus what that symbol really points to is the location of the final data.
+-- The block of memory where we hold all those slots is the Global Offset Table
+-- (GOT).  Instead of x <- $foo, we now do y <- $fooPtr, and x <- [$y].
+--
+-- For JUMP/CALLs we have 26bits (+/- 128MB), for conditional branches we only
+-- have 19bits (+/- 1MB).  Symbol lookups are also within +/- 1MB, thus for most
+-- of the LOAD/STOREs we'd want to use adrp, and add to compute a value within
+-- 4GB of the PC, and load that.  For anything outside of that range, we'd have
+-- to go through the GOT.
+--
+--  adrp x0, <symbol>
+--  add x0, :lo:<symbol>
+--
+-- will compute the address of <symbol> int x0 if <symbol> is within 4GB of the
+-- PC.
+--
+-- If we want to get the slot in the global offset table (GOT), we can do this:
+--
+--   adrp x0, #:got:<symbol>
+--   ldr x0, [x0, #:got_lo12:<symbol>]
+--
+-- this will compute the address anywhere in the addressable 64bit space into
+-- x0, by loading the address from the GOT slot.
+--
+-- To actually get the value of <symbol>, we'd need to ldr x0, x0 still, which
+-- for the first case can be optimized to use ldr x0, [x0, #:lo12:<symbol>]
+-- instaed of the add instruction.
+--
+-- As the memory model for AArch64 for PIC is considered to be +/- 4GB, we do
+-- not need to go through the GOT, unless we want to address the full address
+-- range within 64bit.
+
+genCCall
+    :: ForeignTarget      -- function to call
+    -> [CmmFormal]        -- where to put the result
+    -> [CmmActual]        -- arguments (of mixed type)
+    -> BlockId            -- The block we are in
+    -> NatM (InstrBlock, Maybe BlockId)
+-- TODO: Specialize where we can.
+-- Generic impl
+genCCall target dest_regs arg_regs bid = do
+  -- we want to pass arg_regs into allArgRegs
+  -- pprTraceM "genCCall target" (ppr target)
+  -- pprTraceM "genCCall formal" (ppr dest_regs)
+  -- pprTraceM "genCCall actual" (ppr arg_regs)
+
+  case target of
+    -- The target :: ForeignTarget call can either
+    -- be a foreign procedure with an address expr
+    -- and a calling convention.
+    ForeignTarget expr _cconv -> do
+      (call_target, call_target_code) <- case expr of
+        -- if this is a label, let's just directly to it.  This will produce the
+        -- correct CALL relocation for BL...
+        (CmmLit (CmmLabel lbl)) -> pure (TLabel lbl, nilOL)
+        -- ... if it's not a label--well--let's compute the expression into a
+        -- register and jump to that. See Note [PLT vs GOT relocations]
+        _ -> do (reg, _format, reg_code) <- getSomeReg expr
+                pure (TReg reg, reg_code)
+      -- compute the code and register logic for all arg_regs.
+      -- this will give us the format information to match on.
+      arg_regs' <- mapM getSomeReg arg_regs
+
+      -- Now this is stupid.  Our Cmm expressions doesn't carry the proper sizes
+      -- so while in Cmm we might get W64 incorrectly for an int, that is W32 in
+      -- STG; this thenn breaks packing of stack arguments, if we need to pack
+      -- for the pcs, e.g. darwinpcs.  Option one would be to fix the Int type
+      -- in Cmm proper. Option two, which we choose here is to use extended Hint
+      -- information to contain the size information and use that when packing
+      -- arguments, spilled onto the stack.
+      let (_res_hints, arg_hints) = foreignTargetHints target
+          arg_regs'' = zipWith (\(r, f, c) h -> (r,f,h,c)) arg_regs' arg_hints
+
+      platform <- getPlatform
+      let packStack = platformOS platform == OSDarwin
+
+      (stackSpace', passRegs, passArgumentsCode) <- passArguments packStack allGpArgRegs allFpArgRegs arg_regs'' 0 [] nilOL
+
+      -- if we pack the stack, we may need to adjust to multiple of 8byte.
+      -- if we don't pack the stack, it will always be multiple of 8.
+      let stackSpace = if stackSpace' `mod` 8 /= 0
+                       then 8 * (stackSpace' `div` 8 + 1)
+                       else stackSpace'
+
+      (returnRegs, readResultsCode)   <- readResults allGpArgRegs allFpArgRegs dest_regs [] nilOL
+
+      let moveStackDown 0 = toOL [ PUSH_STACK_FRAME
+                                 , DELTA (-16) ]
+          moveStackDown i | odd i = moveStackDown (i + 1)
+          moveStackDown i = toOL [ PUSH_STACK_FRAME
+                                 , SUB (OpReg W64 (regSingle 31)) (OpReg W64 (regSingle 31)) (OpImm (ImmInt (8 * i)))
+                                 , DELTA (-8 * i - 16) ]
+          moveStackUp 0 = toOL [ POP_STACK_FRAME
+                               , DELTA 0 ]
+          moveStackUp i | odd i = moveStackUp (i + 1)
+          moveStackUp i = toOL [ ADD (OpReg W64 (regSingle 31)) (OpReg W64 (regSingle 31)) (OpImm (ImmInt (8 * i)))
+                               , POP_STACK_FRAME
+                               , DELTA 0 ]
+
+      let code =    call_target_code          -- compute the label (possibly into a register)
+            `appOL` moveStackDown (stackSpace `div` 8)
+            `appOL` passArgumentsCode         -- put the arguments into x0, ...
+            `appOL` (unitOL $ BL call_target passRegs returnRegs) -- branch and link.
+            `appOL` readResultsCode           -- parse the results into registers
+            `appOL` moveStackUp (stackSpace `div` 8)
+      return (code, Nothing)
+
+    -- or a possibly side-effecting machine operation
+    -- mop :: CallishMachOp (see GHC.Cmm.MachOp)
+    PrimTarget mop -> do
+      -- We'll need config to construct forien targets
+      case mop of
+        -- 64 bit float ops
+        MO_F64_Pwr   -> mkCCall "pow"
+
+        MO_F64_Sin   -> mkCCall "sin"
+        MO_F64_Cos   -> mkCCall "cos"
+        MO_F64_Tan   -> mkCCall "tan"
+
+        MO_F64_Sinh  -> mkCCall "sinh"
+        MO_F64_Cosh  -> mkCCall "cosh"
+        MO_F64_Tanh  -> mkCCall "tanh"
+
+        MO_F64_Asin  -> mkCCall "asin"
+        MO_F64_Acos  -> mkCCall "acos"
+        MO_F64_Atan  -> mkCCall "atan"
+
+        MO_F64_Asinh -> mkCCall "asinh"
+        MO_F64_Acosh -> mkCCall "acosh"
+        MO_F64_Atanh -> mkCCall "atanh"
+
+        MO_F64_Log   -> mkCCall "log"
+        MO_F64_Log1P -> mkCCall "log1p"
+        MO_F64_Exp   -> mkCCall "exp"
+        MO_F64_ExpM1 -> mkCCall "expm1"
+        MO_F64_Fabs  -> mkCCall "fabs"
+        MO_F64_Sqrt  -> mkCCall "sqrt"
+
+        -- 32 bit float ops
+        MO_F32_Pwr   -> mkCCall "powf"
+
+        MO_F32_Sin   -> mkCCall "sinf"
+        MO_F32_Cos   -> mkCCall "cosf"
+        MO_F32_Tan   -> mkCCall "tanf"
+        MO_F32_Sinh  -> mkCCall "sinhf"
+        MO_F32_Cosh  -> mkCCall "coshf"
+        MO_F32_Tanh  -> mkCCall "tanhf"
+        MO_F32_Asin  -> mkCCall "asinf"
+        MO_F32_Acos  -> mkCCall "acosf"
+        MO_F32_Atan  -> mkCCall "atanf"
+        MO_F32_Asinh -> mkCCall "asinhf"
+        MO_F32_Acosh -> mkCCall "acoshf"
+        MO_F32_Atanh -> mkCCall "atanhf"
+        MO_F32_Log   -> mkCCall "logf"
+        MO_F32_Log1P -> mkCCall "log1pf"
+        MO_F32_Exp   -> mkCCall "expf"
+        MO_F32_ExpM1 -> mkCCall "expm1f"
+        MO_F32_Fabs  -> mkCCall "fasbf"
+        MO_F32_Sqrt  -> mkCCall "sqrtf"
+
+        -- Conversion
+        MO_UF_Conv w        -> mkCCall (word2FloatLabel w)
+
+        -- Arithmatic
+        -- These are not supported on X86, so I doubt they are used much.
+        MO_S_Mul2     _w -> unsupported mop
+        MO_S_QuotRem  _w -> unsupported mop
+        MO_U_QuotRem  _w -> unsupported mop
+        MO_U_QuotRem2 _w -> unsupported mop
+        MO_Add2       _w -> unsupported mop
+        MO_AddWordC   _w -> unsupported mop
+        MO_SubWordC   _w -> unsupported mop
+        MO_AddIntC    _w -> unsupported mop
+        MO_SubIntC    _w -> unsupported mop
+        MO_U_Mul2     _w -> unsupported mop
+
+        -- Memory Ordering
+        -- TODO DMBSY is probably *way* too much!
+        MO_ReadBarrier      ->  return (unitOL DMBSY, Nothing)
+        MO_WriteBarrier     ->  return (unitOL DMBSY, Nothing)
+        MO_Touch            ->  return (nilOL, Nothing) -- Keep variables live (when using interior pointers)
+        -- Prefetch
+        MO_Prefetch_Data _n -> return (nilOL, Nothing) -- Prefetch hint.
+
+        -- Memory copy/set/move/cmp, with alignment for optimization
+
+        -- TODO Optimize and use e.g. quad registers to move memory around instead
+        -- of offloading this to memcpy. For small memcpys we can utilize
+        -- the 128bit quad registers in NEON to move block of bytes around.
+        -- Might also make sense of small memsets? Use xzr? What's the function
+        -- call overhead?
+        MO_Memcpy  _align   -> mkCCall "memcpy"
+        MO_Memset  _align   -> mkCCall "memset"
+        MO_Memmove _align   -> mkCCall "memmove"
+        MO_Memcmp  _align   -> mkCCall "memcmp"
+
+        MO_SuspendThread    -> mkCCall "suspendThread"
+        MO_ResumeThread     -> mkCCall "resumeThread"
+
+        MO_PopCnt w         -> mkCCall (popCntLabel w)
+        MO_Pdep w           -> mkCCall (pdepLabel w)
+        MO_Pext w           -> mkCCall (pextLabel w)
+        MO_Clz w            -> mkCCall (clzLabel w)
+        MO_Ctz w            -> mkCCall (ctzLabel w)
+        MO_BSwap w          -> mkCCall (bSwapLabel w)
+        MO_BRev w           -> mkCCall (bRevLabel w)
+
+        -- -- Atomic read-modify-write.
+        MO_AtomicRMW w amop -> mkCCall (atomicRMWLabel w amop)
+        MO_AtomicRead w     -> mkCCall (atomicReadLabel w)
+        MO_AtomicWrite w    -> mkCCall (atomicWriteLabel w)
+        MO_Cmpxchg w        -> mkCCall (cmpxchgLabel w)
+        -- -- Should be an AtomicRMW variant eventually.
+        -- -- Sequential consistent.
+        -- TODO: this should be implemented properly!
+        MO_Xchg w           -> mkCCall (xchgLabel w)
+
+  where
+    unsupported :: Show a => a -> b
+    unsupported mop = panic ("outOfLineCmmOp: " ++ show mop
+                          ++ " not supported here")
+    mkCCall :: FastString -> NatM (InstrBlock, Maybe BlockId)
+    mkCCall name = do
+      config <- getConfig
+      target <- cmmMakeDynamicReference config CallReference $
+          mkForeignLabel name Nothing ForeignLabelInThisPackage IsFunction
+      let cconv = ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn
+      genCCall (ForeignTarget target cconv) dest_regs arg_regs bid
+
+    -- TODO: Optimize using paired stores and loads (STP, LDP). It is
+    -- automomatically done by the allocator for us. However it's not optimal,
+    -- as we'd rather want to have control over
+    --     all spill/load registers, so we can optimize with instructions like
+    --       STP xA, xB, [sp, #-16]!
+    --     and
+    --       LDP xA, xB, sp, #16
+    --
+    passArguments :: Bool -> [Reg] -> [Reg] -> [(Reg, Format, ForeignHint, InstrBlock)] -> Int -> [Reg] -> InstrBlock -> NatM (Int, [Reg], InstrBlock)
+    passArguments _packStack _ _ [] stackSpace accumRegs accumCode = return (stackSpace, accumRegs, accumCode)
+    -- passArguments _ _ [] accumCode stackSpace | isEven stackSpace = return $ SUM (OpReg W64 x31) (OpReg W64 x31) OpImm (ImmInt (-8 * stackSpace))
+    -- passArguments _ _ [] accumCode stackSpace = return $ SUM (OpReg W64 x31) (OpReg W64 x31) OpImm (ImmInt (-8 * (stackSpace + 1)))
+    -- passArguments [] fpRegs (arg0:arg1:args) stack accumCode = do
+    --   -- allocate this on the stack
+    --   (r0, format0, code_r0) <- getSomeReg arg0
+    --   (r1, format1, code_r1) <- getSomeReg arg1
+    --   let w0 = formatToWidth format0
+    --       w1 = formatToWidth format1
+    --       stackCode = unitOL $ STP (OpReg w0 r0) (OpReg w1 R1), (OpAddr (AddrRegImm x31 (ImmInt (stackSpace * 8)))
+    --   passArguments gpRegs (fpReg:fpRegs) args (stackCode `appOL` accumCode)
+
+      -- float promotion.
+      -- According to
+      --  ISO/IEC 9899:2018
+      --  Information technology — Programming languages — C
+      --
+      -- e.g.
+      -- http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1124.pdf
+      -- http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf
+      --
+      -- GHC would need to know the prototype.
+      --
+      -- > If the expression that denotes the called function has a type that does not include a
+      -- > prototype, the integer promotions are performed on each argument, and arguments that
+      -- > have type float are promoted to double.
+      --
+      -- As we have no way to get prototypes for C yet, we'll *not* promote this
+      -- which is in line with the x86_64 backend :(
+      --
+      -- See the encode_values.cmm test.
+      --
+      -- We would essentially need to insert an FCVT (OpReg W64 fpReg) (OpReg W32 fpReg)
+      -- if w == W32.  But *only* if we don't have a prototype m(
+      --
+      -- For AArch64 specificies see: https://developer.arm.com/docs/ihi0055/latest/procedure-call-standard-for-the-arm-64-bit-architecture
+      --
+    -- Still have GP regs, and we want to pass an GP argument.
+    passArguments pack (gpReg:gpRegs) fpRegs ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isIntFormat format = do
+      let w = formatToWidth format
+      passArguments pack gpRegs fpRegs args stackSpace (gpReg:accumRegs) (accumCode `appOL` code_r `snocOL` (ann (text "Pass gp argument: " <> ppr r) $ MOV (OpReg w gpReg) (OpReg w r)))
+
+    -- Still have FP regs, and we want to pass an FP argument.
+    passArguments pack gpRegs (fpReg:fpRegs) ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isFloatFormat format = do
+      let w = formatToWidth format
+      passArguments pack gpRegs fpRegs args stackSpace (fpReg:accumRegs) (accumCode `appOL` code_r `snocOL` (ann (text "Pass fp argument: " <> ppr r) $ MOV (OpReg w fpReg) (OpReg w r)))
+
+    -- No mor regs left to pass. Must pass on stack.
+    passArguments pack [] [] ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode = do
+      let w = formatToWidth format
+          bytes = widthInBits w `div` 8
+          space = if pack then bytes else 8
+          stackCode = code_r `snocOL` (ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt stackSpace))))
+      passArguments pack [] [] args (stackSpace+space) accumRegs (stackCode `appOL` accumCode)
+
+    -- Still have fpRegs left, but want to pass a GP argument. Must be passed on the stack then.
+    passArguments pack [] fpRegs ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isIntFormat format = do
+      let w = formatToWidth format
+          bytes = widthInBits w `div` 8
+          space = if pack then bytes else 8
+          stackCode = code_r `snocOL` (ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt stackSpace))))
+      passArguments pack [] fpRegs args (stackSpace+space) accumRegs (stackCode `appOL` accumCode)
+
+    -- Still have gpRegs left, but want to pass a FP argument. Must be passed on the stack then.
+    passArguments pack gpRegs [] ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isFloatFormat format = do
+      let w = formatToWidth format
+          bytes = widthInBits w `div` 8
+          space = if pack then bytes else 8
+          stackCode = code_r `snocOL` (ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt stackSpace))))
+      passArguments pack gpRegs [] args (stackSpace+space) accumRegs (stackCode `appOL` accumCode)
+
+    passArguments _ _ _ _ _ _ _ = pprPanic "passArguments" (text "invalid state")
+
+    readResults :: [Reg] -> [Reg] -> [LocalReg] -> [Reg]-> InstrBlock -> NatM ([Reg], InstrBlock)
+    readResults _ _ [] accumRegs accumCode = return (accumRegs, accumCode)
+    readResults [] _ _ _ _ = do
+      platform <- getPlatform
+      pprPanic "genCCall, out of gp registers when reading results" (pdoc platform target)
+    readResults _ [] _ _ _ = do
+      platform <- getPlatform
+      pprPanic "genCCall, out of fp registers when reading results" (pdoc platform target)
+    readResults (gpReg:gpRegs) (fpReg:fpRegs) (dst:dsts) accumRegs accumCode = do
+      -- gp/fp reg -> dst
+      platform <- getPlatform
+      let rep = cmmRegType platform (CmmLocal dst)
+          format = cmmTypeFormat rep
+          w   = cmmRegWidth platform (CmmLocal dst)
+          r_dst = getRegisterReg platform (CmmLocal dst)
+      if isFloatFormat format
+        then readResults (gpReg:gpRegs) fpRegs dsts (fpReg:accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w fpReg))
+        else readResults gpRegs (fpReg:fpRegs) dsts (gpReg:accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w gpReg))
diff --git a/compiler/GHC/CmmToAsm/AArch64/Cond.hs b/compiler/GHC/CmmToAsm/AArch64/Cond.hs
new file mode 100644
index 0000000000..687daccfda
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/Cond.hs
@@ -0,0 +1,66 @@
+module GHC.CmmToAsm.AArch64.Cond  where
+
+import GHC.Prelude
+
+-- https://developer.arm.com/documentation/den0024/a/the-a64-instruction-set/data-processing-instructions/conditional-instructions
+
+-- TODO: This appears to go a bit overboard? Maybe we should stick with what LLVM
+-- settled on for fcmp?
+-- false: always yields false, regardless of operands.
+-- oeq: yields true if both operands are not a QNAN and op1 is equal to op2.
+-- ogt: yields true if both operands are not a QNAN and op1 is greater than op2.
+-- oge: yields true if both operands are not a QNAN and op1 is greater than or equal to op2.
+-- olt: yields true if both operands are not a QNAN and op1 is less than op2.
+-- ole: yields true if both operands are not a QNAN and op1 is less than or equal to op2.
+-- one: yields true if both operands are not a QNAN and op1 is not equal to op2.
+-- ord: yields true if both operands are not a QNAN.
+-- ueq: yields true if either operand is a QNAN or op1 is equal to op2.
+-- ugt: yields true if either operand is a QNAN or op1 is greater than op2.
+-- uge: yields true if either operand is a QNAN or op1 is greater than or equal to op2.
+-- ult: yields true if either operand is a QNAN or op1 is less than op2.
+-- ule: yields true if either operand is a QNAN or op1 is less than or equal to op2.
+-- une: yields true if either operand is a QNAN or op1 is not equal to op2.
+-- uno: yields true if either operand is a QNAN.
+-- true: always yields true, regardless of operands.
+--
+-- LLVMs icmp knows about:
+-- eq: yields true if the operands are equal, false otherwise. No sign interpretation is necessary or performed.
+-- ne: yields true if the operands are unequal, false otherwise. No sign interpretation is necessary or performed.
+-- ugt: interprets the operands as unsigned values and yields true if op1 is greater than op2.
+-- uge: interprets the operands as unsigned values and yields true if op1 is greater than or equal to op2.
+-- ult: interprets the operands as unsigned values and yields true if op1 is less than op2.
+-- ule: interprets the operands as unsigned values and yields true if op1 is less than or equal to op2.
+-- sgt: interprets the operands as signed values and yields true if op1 is greater than op2.
+-- sge: interprets the operands as signed values and yields true if op1 is greater than or equal to op2.
+-- slt: interprets the operands as signed values and yields true if op1 is less than op2.
+-- sle: interprets the operands as signed values and yields true if op1 is less than or equal to op2.
+
+data Cond
+    = ALWAYS -- b.al
+    | EQ     -- b.eq
+    | NE     -- b.ne
+    -- signed
+    | SLT    -- b.lt
+    | SLE    -- b.le
+    | SGE    -- b.ge
+    | SGT    -- b.gt
+    -- unsigned
+    | ULT    -- b.lo
+    | ULE    -- b.ls
+    | UGE    -- b.hs
+    | UGT    -- b.hi
+    -- ordered
+    | OLT    -- b.mi
+    | OLE    -- b.ls
+    | OGE    -- b.ge
+    | OGT    -- b.gt
+    -- unordered
+    | UOLT   -- b.lt
+    | UOLE   -- b.le
+    | UOGE   -- b.pl
+    | UOGT   -- b.hi
+    -- others
+    | NEVER  -- b.nv
+    | VS     -- oVerflow set
+    | VC     -- oVerflow clear
+    deriving Eq
diff --git a/compiler/GHC/CmmToAsm/AArch64/Instr.hs b/compiler/GHC/CmmToAsm/AArch64/Instr.hs
new file mode 100644
index 0000000000..7d4eaa95f6
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/Instr.hs
@@ -0,0 +1,758 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+module GHC.CmmToAsm.AArch64.Instr
+
+where
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.AArch64.Cond
+import GHC.CmmToAsm.AArch64.Regs
+
+import GHC.CmmToAsm.Instr (RegUsage(..))
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Types
+import GHC.CmmToAsm.Utils
+import GHC.CmmToAsm.Config
+import GHC.Platform.Reg
+
+import GHC.Platform.Regs
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm
+import GHC.Cmm.CLabel
+import GHC.Utils.Outputable
+import GHC.Platform
+import GHC.Types.Unique.Supply
+
+import GHC.Utils.Panic
+
+import Control.Monad (replicateM)
+import Data.Maybe (fromMaybe)
+
+import GHC.Stack
+
+-- | TODO: verify this!
+stackFrameHeaderSize :: Platform -> Int
+stackFrameHeaderSize _ = 64
+
+-- | All registers are 8 byte wide.
+spillSlotSize :: Int
+spillSlotSize = 8
+
+-- | The number of bytes that the stack pointer should be aligned
+-- to.
+stackAlign :: Int
+stackAlign = 16
+
+-- | The number of spill slots available without allocating more.
+maxSpillSlots :: NCGConfig -> Int
+maxSpillSlots config
+--  = 0 -- set to zero, to see when allocMoreStack has to fire.
+    = let platform = ncgPlatform config
+      in ((ncgSpillPreallocSize config - stackFrameHeaderSize platform)
+         `div` spillSlotSize) - 1
+
+-- | Convert a spill slot number to a *byte* offset, with no sign.
+spillSlotToOffset :: NCGConfig -> Int -> Int
+spillSlotToOffset config slot
+   = stackFrameHeaderSize (ncgPlatform config) + spillSlotSize * slot
+
+-- | Get the registers that are being used by this instruction.
+-- regUsage doesn't need to do any trickery for jumps and such.
+-- Just state precisely the regs read and written by that insn.
+-- The consequences of control flow transfers, as far as register
+-- allocation goes, are taken care of by the register allocator.
+--
+-- RegUsage = RU [<read regs>] [<write regs>]
+
+instance Outputable RegUsage where
+    ppr (RU reads writes) = text "RegUsage(reads:" <+> ppr reads <> comma <+> text "writes:" <+> ppr writes <> char ')'
+
+regUsageOfInstr :: Platform -> Instr -> RegUsage
+regUsageOfInstr platform instr = case instr of
+  ANN _ i                  -> regUsageOfInstr platform i
+  -- 1. Arithmetic Instructions ------------------------------------------------
+  ADD dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  CMN l r                  -> usage (regOp l ++ regOp r, [])
+  CMP l r                  -> usage (regOp l ++ regOp r, [])
+  MSUB dst src1 src2 src3  -> usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
+  MUL dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  NEG dst src              -> usage (regOp src, regOp dst)
+  SDIV dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+  SUB dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  UDIV dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+
+  -- 2. Bit Manipulation Instructions ------------------------------------------
+  SBFM dst src _ _         -> usage (regOp src, regOp dst)
+  UBFM dst src _ _         -> usage (regOp src, regOp dst)
+
+  -- 3. Logical and Move Instructions ------------------------------------------
+  AND dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  ASR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  BIC dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  BICS dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+  EON dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  EOR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  LSL dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  LSR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  MOV dst src              -> usage (regOp src, regOp dst)
+  MOVK dst src             -> usage (regOp src, regOp dst)
+  MVN dst src              -> usage (regOp src, regOp dst)
+  ORR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  ROR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  TST src1 src2            -> usage (regOp src1 ++ regOp src2, [])
+  -- 4. Branch Instructions ----------------------------------------------------
+  J t                      -> usage (regTarget t, [])
+  B t                      -> usage (regTarget t, [])
+  BCOND _ t                -> usage (regTarget t, [])
+  BL t ps _rs              -> usage (regTarget t ++ ps, callerSavedRegisters)
+
+  -- 5. Atomic Instructions ----------------------------------------------------
+  -- 6. Conditional Instructions -----------------------------------------------
+  CSET dst _               -> usage ([], regOp dst)
+  CBZ src _                -> usage (regOp src, [])
+  CBNZ src _               -> usage (regOp src, [])
+  -- 7. Load and Store Instructions --------------------------------------------
+  STR _ src dst            -> usage (regOp src ++ regOp dst, [])
+  LDR _ dst src            -> usage (regOp src, regOp dst)
+  -- TODO is this right? see STR, which I'm only partial about being right?
+  STP _ src1 src2 dst      -> usage (regOp src1 ++ regOp src2 ++ regOp dst, [])
+  LDP _ dst1 dst2 src      -> usage (regOp src, regOp dst1 ++ regOp dst2)
+
+  -- 8. Synchronization Instructions -------------------------------------------
+  DMBSY                    -> usage ([], [])
+
+  -- 9. Floating Point Instructions --------------------------------------------
+  FCVT dst src             -> usage (regOp src, regOp dst)
+  SCVTF dst src            -> usage (regOp src, regOp dst)
+  FCVTZS dst src           -> usage (regOp src, regOp dst)
+
+  _ -> panic "regUsageOfInstr"
+
+  where
+        -- filtering the usage is necessary, otherwise the register
+        -- allocator will try to allocate pre-defined fixed stg
+        -- registers as well, as they show up.
+        usage (src, dst) = RU (filter (interesting platform) src)
+                              (filter (interesting platform) dst)
+
+        regAddr :: AddrMode -> [Reg]
+        regAddr (AddrRegReg r1 r2) = [r1, r2]
+        regAddr (AddrRegImm r1 _)  = [r1]
+        regAddr (AddrReg r1)       = [r1]
+        regOp :: Operand -> [Reg]
+        regOp (OpReg _ r1) = [r1]
+        regOp (OpRegExt _ r1 _ _) = [r1]
+        regOp (OpRegShift _ r1 _ _) = [r1]
+        regOp (OpAddr a) = regAddr a
+        regOp (OpImm _) = []
+        regOp (OpImmShift _ _ _) = []
+        regTarget :: Target -> [Reg]
+        regTarget (TBlock _) = []
+        regTarget (TLabel _) = []
+        regTarget (TReg r1)  = [r1]
+
+        -- Is this register interesting for the register allocator?
+        interesting :: Platform -> Reg -> Bool
+        interesting _        (RegVirtual _)                 = True
+        interesting _        (RegReal (RealRegSingle (-1))) = False
+        interesting platform (RegReal (RealRegSingle i))    = freeReg platform i
+        interesting _        (RegReal (RealRegPair{}))
+            = panic "AArch64.Instr.interesting: no reg pairs on this arch"
+
+-- Save caller save registers
+-- This is x0-x18
+--
+-- For SIMD/FP Registers:
+-- Registers v8-v15 must be preserved by a callee across subroutine calls;
+-- the remaining registers (v0-v7, v16-v31) do not need to be preserved (or
+-- should be preserved by the caller). Additionally, only the bottom 64 bits
+-- of each value stored in v8-v15 need to be preserved [7]; it is the
+-- responsibility of the caller to preserve larger values.
+--
+-- .---------------------------------------------------------------------------------------------------------------------------------------------------------------.
+-- |  0 |  1 |  2 |  3 |  4 |  5 |  6 |  7 |  8 |  9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
+-- | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 42 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 |
+-- |== General Purpose registers ==================================================================================================================================|
+-- | <---- argument passing -------------> | IR | <------- tmp registers --------> | IP0| IP1| PL | <------------------- callee saved ------------> | FP | LR | SP |
+-- | <------ free registers --------------------------------------------------------------------> | BR | Sp | Hp | R1 | R2 | R3 | R4 | R5 | R6 | SL | -- | -- | -- |
+-- |== SIMD/FP Registers ==========================================================================================================================================|
+-- | <---- argument passing -------------> | <-- callee saved (lower 64 bits) ---> | <--------------------------------------- caller saved ----------------------> |
+-- | <------ free registers -------------> | F1 | F2 | F3 | F4 | D1 | D2 | D3 | D4 | <------ free registers -----------------------------------------------------> |
+-- '---------------------------------------------------------------------------------------------------------------------------------------------------------------'
+-- IR: Indirect result location register, IP: Intra-procedure register, PL: Platform register, FP: Frame pointer, LR: Link register, SP: Stack pointer
+-- BR: Base, SL: SpLim
+callerSavedRegisters :: [Reg]
+callerSavedRegisters
+    = map regSingle [0..18]
+    ++ map regSingle [32..39]
+    ++ map regSingle [48..63]
+
+-- | Apply a given mapping to all the register references in this
+-- instruction.
+patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
+patchRegsOfInstr instr env = case instr of
+    -- 0. Meta Instructions
+    ANN d i        -> ANN d (patchRegsOfInstr i env)
+    -- 1. Arithmetic Instructions ----------------------------------------------
+    ADD o1 o2 o3   -> ADD (patchOp o1) (patchOp o2) (patchOp o3)
+    CMN o1 o2      -> CMN (patchOp o1) (patchOp o2)
+    CMP o1 o2      -> CMP (patchOp o1) (patchOp o2)
+    MSUB o1 o2 o3 o4 -> MSUB (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
+    MUL o1 o2 o3   -> MUL (patchOp o1) (patchOp o2) (patchOp o3)
+    NEG o1 o2      -> NEG (patchOp o1) (patchOp o2)
+    SDIV o1 o2 o3  -> SDIV (patchOp o1) (patchOp o2) (patchOp o3)
+    SUB o1 o2 o3   -> SUB  (patchOp o1) (patchOp o2) (patchOp o3)
+    UDIV o1 o2 o3  -> UDIV (patchOp o1) (patchOp o2) (patchOp o3)
+
+    -- 2. Bit Manipulation Instructions ----------------------------------------
+    SBFM o1 o2 o3 o4 -> SBFM (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
+    UBFM o1 o2 o3 o4 -> UBFM (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
+
+    -- 3. Logical and Move Instructions ----------------------------------------
+    AND o1 o2 o3   -> AND  (patchOp o1) (patchOp o2) (patchOp o3)
+    ANDS o1 o2 o3  -> ANDS (patchOp o1) (patchOp o2) (patchOp o3)
+    ASR o1 o2 o3   -> ASR  (patchOp o1) (patchOp o2) (patchOp o3)
+    BIC o1 o2 o3   -> BIC  (patchOp o1) (patchOp o2) (patchOp o3)
+    BICS o1 o2 o3  -> BICS (patchOp o1) (patchOp o2) (patchOp o3)
+    EON o1 o2 o3   -> EON  (patchOp o1) (patchOp o2) (patchOp o3)
+    EOR o1 o2 o3   -> EOR  (patchOp o1) (patchOp o2) (patchOp o3)
+    LSL o1 o2 o3   -> LSL  (patchOp o1) (patchOp o2) (patchOp o3)
+    LSR o1 o2 o3   -> LSR  (patchOp o1) (patchOp o2) (patchOp o3)
+    MOV o1 o2      -> MOV  (patchOp o1) (patchOp o2)
+    MOVK o1 o2     -> MOVK (patchOp o1) (patchOp o2)
+    MVN o1 o2      -> MVN  (patchOp o1) (patchOp o2)
+    ORR o1 o2 o3   -> ORR  (patchOp o1) (patchOp o2) (patchOp o3)
+    ROR o1 o2 o3   -> ROR  (patchOp o1) (patchOp o2) (patchOp o3)
+    TST o1 o2      -> TST  (patchOp o1) (patchOp o2)
+
+    -- 4. Branch Instructions --------------------------------------------------
+    J t            -> J (patchTarget t)
+    B t            -> B (patchTarget t)
+    BL t rs ts     -> BL (patchTarget t) rs ts
+    BCOND c t      -> BCOND c (patchTarget t)
+
+    -- 5. Atomic Instructions --------------------------------------------------
+    -- 6. Conditional Instructions ---------------------------------------------
+    CSET o c       -> CSET (patchOp o) c
+    CBZ o l        -> CBZ (patchOp o) l
+    CBNZ o l       -> CBNZ (patchOp o) l
+    -- 7. Load and Store Instructions ------------------------------------------
+    STR f o1 o2    -> STR f (patchOp o1) (patchOp o2)
+    LDR f o1 o2    -> LDR f (patchOp o1) (patchOp o2)
+    STP f o1 o2 o3 -> STP f (patchOp o1) (patchOp o2) (patchOp o3)
+    LDP f o1 o2 o3 -> LDP f (patchOp o1) (patchOp o2) (patchOp o3)
+
+    -- 8. Synchronization Instructions -----------------------------------------
+    DMBSY          -> DMBSY
+
+    -- 9. Floating Point Instructions ------------------------------------------
+    FCVT o1 o2     -> FCVT (patchOp o1) (patchOp o2)
+    SCVTF o1 o2    -> SCVTF (patchOp o1) (patchOp o2)
+    FCVTZS o1 o2   -> FCVTZS (patchOp o1) (patchOp o2)
+
+    _ -> pprPanic "patchRegsOfInstr" (text $ show instr)
+    where
+        patchOp :: Operand -> Operand
+        patchOp (OpReg w r) = OpReg w (env r)
+        patchOp (OpRegExt w r x s) = OpRegExt w (env r) x s
+        patchOp (OpRegShift w r m s) = OpRegShift w (env r) m s
+        patchOp (OpAddr a) = OpAddr (patchAddr a)
+        patchOp op = op
+        patchTarget :: Target -> Target
+        patchTarget (TReg r) = TReg (env r)
+        patchTarget t = t
+        patchAddr :: AddrMode -> AddrMode
+        patchAddr (AddrRegReg r1 r2) = AddrRegReg (env r1) (env r2)
+        patchAddr (AddrRegImm r1 i)  = AddrRegImm (env r1) i
+        patchAddr (AddrReg r) = AddrReg (env r)
+--------------------------------------------------------------------------------
+-- | Checks whether this instruction is a jump/branch instruction.
+-- One that can change the flow of control in a way that the
+-- register allocator needs to worry about.
+isJumpishInstr :: Instr -> Bool
+isJumpishInstr instr = case instr of
+    ANN _ i -> isJumpishInstr i
+    CBZ{} -> True
+    CBNZ{} -> True
+    J{} -> True
+    B{} -> True
+    BL{} -> True
+    BCOND{} -> True
+    _ -> False
+
+-- | Checks whether this instruction is a jump/branch instruction.
+-- One that can change the flow of control in a way that the
+-- register allocator needs to worry about.
+jumpDestsOfInstr :: Instr -> [BlockId]
+jumpDestsOfInstr (ANN _ i) = jumpDestsOfInstr i
+jumpDestsOfInstr (CBZ _ t) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr (CBNZ _ t) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr (J t) = [id | TBlock id <- [t]]
+jumpDestsOfInstr (B t) = [id | TBlock id <- [t]]
+jumpDestsOfInstr (BL t _ _) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr (BCOND _ t) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr _ = []
+
+-- | Change the destination of this jump instruction.
+-- Used in the linear allocator when adding fixup blocks for join
+-- points.
+patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr
+patchJumpInstr instr patchF
+    = case instr of
+        ANN d i -> ANN d (patchJumpInstr i patchF)
+        CBZ r (TBlock bid) -> CBZ r (TBlock (patchF bid))
+        CBNZ r (TBlock bid) -> CBNZ r (TBlock (patchF bid))
+        J (TBlock bid) -> J (TBlock (patchF bid))
+        B (TBlock bid) -> B (TBlock (patchF bid))
+        BL (TBlock bid) ps rs -> BL (TBlock (patchF bid)) ps rs
+        BCOND c (TBlock bid) -> BCOND c (TBlock (patchF bid))
+        _ -> pprPanic "patchJumpInstr" (text $ show instr)
+
+-- -----------------------------------------------------------------------------
+-- Note [Spills and Reloads]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~
+-- We reserve @RESERVED_C_STACK_BYTES@ on the C stack for spilling and reloading
+-- registers.  AArch64s maximum displacement for SP relative spills and reloads
+-- is essentially [-256,255], or [0, 0xFFF]*8 = [0, 32760] for 64bits.
+--
+-- The @RESERVED_C_STACK_BYTES@ is 16k, so we can't address any location in a
+-- single instruction.  The idea is to use the Inter Procedure 0 (ip0) register
+-- to perform the computations for larger offsets.
+--
+-- Using sp to compute the offset will violate assumptions about the stack pointer
+-- pointing to the top of the stack during signal handling.  As we can't force
+-- every signal to use its own stack, we have to ensure that the stack poitner
+-- always poitns to the top of the stack, and we can't use it for computation.
+--
+-- | An instruction to spill a register into a spill slot.
+mkSpillInstr
+   :: HasCallStack
+   => NCGConfig
+   -> Reg       -- register to spill
+   -> Int       -- current stack delta
+   -> Int       -- spill slot to use
+   -> [Instr]
+
+mkSpillInstr config reg delta slot =
+  case (spillSlotToOffset config slot) - delta of
+    imm | -256 <= imm && imm <= 255                               -> [ mkStrSp imm ]
+    imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff           -> [ mkStrSp imm ]
+    imm | imm > 0xfff && imm <= 0xffffff && imm .&. 0x7 == 0x0    -> [ mkIp0SpillAddr (imm .&~. 0xfff)
+                                                                     , mkStrIp0 (imm .&.  0xfff)
+                                                                     ]
+    imm -> pprPanic "mkSpillInstr" (text "Unable to spill into" <+> int imm)
+    where
+        a .&~. b = a .&. (complement b)
+
+        fmt = case reg of
+            RegReal (RealRegSingle n) | n < 32 -> II64
+            _                                  -> FF64
+        mkIp0SpillAddr imm = ANN (text "Spill: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
+        mkStrSp imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
+        mkStrIp0 imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
+
+        off = spillSlotToOffset config slot
+
+mkLoadInstr
+   :: NCGConfig
+   -> Reg       -- register to load
+   -> Int       -- current stack delta
+   -> Int       -- spill slot to use
+   -> [Instr]
+
+mkLoadInstr config reg delta slot =
+  case (spillSlotToOffset config slot) - delta of
+    imm | -256 <= imm && imm <= 255                               -> [ mkLdrSp imm ]
+    imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff           -> [ mkLdrSp imm ]
+    imm | imm > 0xfff && imm <= 0xffffff && imm .&. 0x7 == 0x0    -> [ mkIp0SpillAddr (imm .&~. 0xfff)
+                                                                     , mkLdrIp0 (imm .&.  0xfff)
+                                                                     ]
+    imm -> pprPanic "mkSpillInstr" (text "Unable to spill into" <+> int imm)
+    where
+        a .&~. b = a .&. (complement b)
+
+        fmt = case reg of
+            RegReal (RealRegSingle n) | n < 32 -> II64
+            _                                  -> FF64
+
+        mkIp0SpillAddr imm = ANN (text "Reload: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
+        mkLdrSp imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
+        mkLdrIp0 imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
+
+        off = spillSlotToOffset config slot
+
+--------------------------------------------------------------------------------
+-- | See if this instruction is telling us the current C stack delta
+takeDeltaInstr :: Instr -> Maybe Int
+takeDeltaInstr (ANN _ i) = takeDeltaInstr i
+takeDeltaInstr (DELTA i) = Just i
+takeDeltaInstr _         = Nothing
+
+-- Not real instructions.  Just meta data
+isMetaInstr :: Instr -> Bool
+isMetaInstr instr
+ = case instr of
+    ANN _ i     -> isMetaInstr i
+    COMMENT{}   -> True
+    MULTILINE_COMMENT{} -> True
+    LOCATION{}  -> True
+    LDATA{}     -> True
+    NEWBLOCK{}  -> True
+    DELTA{}     -> True
+    PUSH_STACK_FRAME -> True
+    POP_STACK_FRAME -> True
+    _           -> False
+
+-- | Copy the value in a register to another one.
+-- Must work for all register classes.
+mkRegRegMoveInstr :: Reg -> Reg -> Instr
+mkRegRegMoveInstr src dst = ANN (text "Reg->Reg Move: " <> ppr src <> text " -> " <> ppr dst) $ MOV (OpReg W64 dst) (OpReg W64 src)
+
+-- | Take the source and destination from this reg -> reg move instruction
+-- or Nothing if it's not one
+takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg)
+--takeRegRegMoveInstr (MOV (OpReg fmt dst) (OpReg fmt' src)) | fmt == fmt' = Just (src, dst)
+takeRegRegMoveInstr _ = Nothing
+
+-- | Make an unconditional jump instruction.
+mkJumpInstr :: BlockId -> [Instr]
+mkJumpInstr id = [B (TBlock id)]
+
+mkStackAllocInstr :: Platform -> Int -> [Instr]
+mkStackAllocInstr platform n
+    | n == 0 = []
+    | n > 0 && n < 4096 = [ ANN (text "Alloc More Stack") $ SUB sp sp (OpImm (ImmInt n)) ]
+    | n > 0 =  ANN (text "Alloc More Stack") (SUB sp sp (OpImm (ImmInt 4095))) : mkStackAllocInstr platform (n - 4095)
+mkStackAllocInstr _platform n = pprPanic "mkStackAllocInstr" (int n)
+
+mkStackDeallocInstr :: Platform -> Int -> [Instr]
+mkStackDeallocInstr platform n
+    | n == 0 = []
+    | n > 0 && n < 4096 = [ ANN (text "Dealloc More Stack") $ ADD sp sp (OpImm (ImmInt n)) ]
+    | n > 0 =  ANN (text "Dealloc More Stack") (ADD sp sp (OpImm (ImmInt 4095))) : mkStackDeallocInstr platform (n - 4095)
+mkStackDeallocInstr _platform n = pprPanic "mkStackDeallocInstr" (int n)
+
+--
+-- See note [extra spill slots] in X86/Instr.hs
+--
+allocMoreStack
+  :: Platform
+  -> Int
+  -> NatCmmDecl statics GHC.CmmToAsm.AArch64.Instr.Instr
+  -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.AArch64.Instr.Instr, [(BlockId,BlockId)])
+
+allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
+allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
+    let entries = entryBlocks proc
+
+    uniqs <- replicateM (length entries) getUniqueM
+
+    let
+      delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
+        where x = slots * spillSlotSize -- sp delta
+
+      alloc   = mkStackAllocInstr   platform delta
+      dealloc = mkStackDeallocInstr platform delta
+
+      retargetList = (zip entries (map mkBlockId uniqs))
+
+      new_blockmap :: LabelMap BlockId
+      new_blockmap = mapFromList retargetList
+
+      insert_stack_insn (BasicBlock id insns)
+        | Just new_blockid <- mapLookup id new_blockmap
+        = [ BasicBlock id $ alloc ++ [ B (TBlock new_blockid) ]
+          , BasicBlock new_blockid block' ]
+        | otherwise
+        = [ BasicBlock id block' ]
+        where
+          block' = foldr insert_dealloc [] insns
+
+      insert_dealloc insn r = case insn of
+        J _ -> dealloc ++ (insn : r)
+        ANN _ (J _) -> dealloc ++ (insn : r)
+        _other | jumpDestsOfInstr insn /= []
+            -> patchJumpInstr insn retarget : r
+        _other -> insn : r
+
+        where retarget b = fromMaybe b (mapLookup b new_blockmap)
+
+      new_code = concatMap insert_stack_insn code
+    -- in
+    return (CmmProc info lbl live (ListGraph new_code), retargetList)
+-- -----------------------------------------------------------------------------
+-- Machine's assembly language
+
+-- We have a few common "instructions" (nearly all the pseudo-ops) but
+-- mostly all of 'Instr' is machine-specific.
+
+-- Some additional (potential future) instructions are commented out. They are
+-- not needed yet for the backend but could be used in the future.
+data Instr
+    -- comment pseudo-op
+    = COMMENT SDoc
+    | MULTILINE_COMMENT SDoc
+
+    -- Annotated instruction. Should print <instr> # <doc>
+    | ANN SDoc Instr
+
+    -- location pseudo-op (file, line, col, name)
+    | LOCATION Int Int Int String
+
+    -- some static data spat out during code
+    -- generation.  Will be extracted before
+    -- pretty-printing.
+    | LDATA   Section RawCmmStatics
+
+    -- start a new basic block.  Useful during
+    -- codegen, removed later.  Preceding
+    -- instruction should be a jump, as per the
+    -- invariants for a BasicBlock (see Cmm).
+    | NEWBLOCK BlockId
+
+    -- specify current stack offset for
+    -- benefit of subsequent passes
+    | DELTA   Int
+
+    -- 0. Pseudo Instructions --------------------------------------------------
+    -- These are instructions not contained or only partially contained in the
+    -- official ISA, but make reading clearer. Some of them might even be
+    -- implemented in the assembler, but are not guaranteed to be portable.
+    -- | SXTB Operand Operand
+    -- | SXTH Operand Operand
+    -- | SXTW Operand Operand
+    -- | SXTX Operand Operand
+    | PUSH_STACK_FRAME
+    | POP_STACK_FRAME
+    -- 1. Arithmetic Instructions ----------------------------------------------
+    -- | ADC Operand Operand Operang -- rd = rn + rm + C
+    -- | ADCS ...
+    | ADD Operand Operand Operand -- rd = rn + rm
+    -- | ADDS Operand Operand Operand -- rd = rn + rm
+    -- | ADR ...
+    -- | ADRP ...
+    | CMN Operand Operand -- rd + op2
+    | CMP Operand Operand -- rd - op2
+    -- | MADD ...
+    -- | MNEG ...
+    | MSUB Operand Operand Operand Operand -- rd = ra - rn × rm
+    | MUL Operand Operand Operand -- rd = rn × rm
+    | NEG Operand Operand -- rd = -op2
+    -- | NEGS ...
+    -- | NGC ...
+    -- | NGCS ...
+    -- | SBC ...
+    -- | SBCS ...
+    | SDIV Operand Operand Operand -- rd = rn ÷ rm
+    -- | SMADDL ...
+    -- | SMNEGL ...
+    -- | SMSUBL ...
+    -- | SMULH ...
+    -- | SMULL ...
+    | SUB Operand Operand Operand -- rd = rn - op2
+    -- | SUBS ...
+    | UDIV Operand Operand Operand -- rd = rn ÷ rm
+    -- | UMADDL ...  -- Xd = Xa + Wn × Wm
+    -- | UMNEGL ... -- Xd = - Wn × Wm
+    -- | UMSUBL ... -- Xd = Xa - Wn × Wm
+    -- | UMULH ... -- Xd = (Xn × Xm)_127:64
+    -- | UMULL ... -- Xd = Wn × Wm
+
+    -- 2. Bit Manipulation Instructions ----------------------------------------
+    | SBFM Operand Operand Operand Operand -- rd = rn[i,j]
+    -- SXTB = SBFM <Wd>, <Wn>, #0, #7
+    -- SXTH = SBFM <Wd>, <Wn>, #0, #15
+    -- SXTW = SBFM <Wd>, <Wn>, #0, #31
+    | UBFM Operand Operand Operand Operand -- rd = rn[i,j]
+    -- UXTB = UBFM <Wd>, <Wn>, #0, #7
+    -- UXTH = UBFM <Wd>, <Wn>, #0, #15
+
+    -- 3. Logical and Move Instructions ----------------------------------------
+    | AND Operand Operand Operand -- rd = rn & op2
+    | ANDS Operand Operand Operand -- rd = rn & op2
+    | ASR Operand Operand Operand -- rd = rn ≫ rm  or  rd = rn ≫ #i, i is 6 bits
+    | BIC Operand Operand Operand -- rd = rn & ~op2
+    | BICS Operand Operand Operand -- rd = rn & ~op2
+    | EON Operand Operand Operand -- rd = rn ⊕ ~op2
+    | EOR Operand Operand Operand -- rd = rn ⊕ op2
+    | LSL Operand Operand Operand -- rd = rn ≪ rm  or rd = rn ≪ #i, i is 6 bits
+    | LSR Operand Operand Operand -- rd = rn ≫ rm  or rd = rn ≫ #i, i is 6 bits
+    | MOV Operand Operand -- rd = rn  or  rd = #i
+    | MOVK Operand Operand
+    -- | MOVN Operand Operand
+    -- | MOVZ Operand Operand
+    | MVN Operand Operand -- rd = ~rn
+    | ORN Operand Operand Operand -- rd = rn | ~op2
+    | ORR Operand Operand Operand -- rd = rn | op2
+    | ROR Operand Operand Operand -- rd = rn ≫ rm  or  rd = rn ≫ #i, i is 6 bits
+    | TST Operand Operand -- rn & op2
+    -- Load and stores.
+    -- TODO STR/LDR might want to change to STP/LDP with XZR for the second register.
+    | STR Format Operand Operand -- str Xn, address-mode // Xn -> *addr
+    | LDR Format Operand Operand -- ldr Xn, address-mode // Xn <- *addr
+    | STP Format Operand Operand Operand -- stp Xn, Xm, address-mode // Xn -> *addr, Xm -> *(addr + 8)
+    | LDP Format Operand Operand Operand -- stp Xn, Xm, address-mode // Xn <- *addr, Xm <- *(addr + 8)
+
+    -- Conditional instructions
+    | CSET Operand Cond   -- if(cond) op <- 1 else op <- 0
+
+    | CBZ Operand Target  -- if op == 0, then branch.
+    | CBNZ Operand Target -- if op /= 0, then branch.
+    -- Branching.
+    | J Target            -- like B, but only generated from genJump. Used to distinguish genJumps from others.
+    | B Target            -- unconditional branching b/br. (To a blockid, label or register)
+    | BL Target [Reg] [Reg] -- branch and link (e.g. set x30 to next pc, and branch)
+    | BCOND Cond Target   -- branch with condition. b.<cond>
+
+    -- 8. Synchronization Instructions -----------------------------------------
+    | DMBSY
+    -- 9. Floating Point Instructions
+    -- Float ConVerT
+    | FCVT Operand Operand
+    -- Signed ConVerT Float
+    | SCVTF Operand Operand
+    -- Float ConVerT to Zero Signed
+    | FCVTZS Operand Operand
+
+instance Show Instr where
+    show (LDR _f o1 o2) = "LDR " ++ show o1 ++ ", " ++ show o2
+    show (MOV o1 o2) = "MOV " ++ show o1 ++ ", " ++ show o2
+    show _ = "missing"
+
+data Target
+    = TBlock BlockId
+    | TLabel CLabel
+    | TReg   Reg
+
+
+-- Extension
+-- {Unsigned|Signed}XT{Byte|Half|Word|Doube}
+data ExtMode
+    = EUXTB | EUXTH | EUXTW | EUXTX
+    | ESXTB | ESXTH | ESXTW | ESXTX
+    deriving (Eq, Show)
+
+data ShiftMode
+    = SLSL | SLSR | SASR | SROR
+    deriving (Eq, Show)
+
+
+-- We can also add ExtShift to Extension.
+-- However at most 3bits.
+type ExtShift = Int
+-- at most 6bits
+type RegShift = Int
+
+data Operand
+        = OpReg Width Reg            -- register
+        | OpRegExt Width Reg ExtMode ExtShift -- rm, <ext>[, <shift left>]
+        | OpRegShift Width Reg ShiftMode RegShift     -- rm, <shift>, <0-64>
+        | OpImm Imm            -- immediate value
+        | OpImmShift Imm ShiftMode RegShift
+        | OpAddr AddrMode       -- memory reference
+        deriving (Eq, Show)
+
+-- Smart constructors
+opReg :: Width -> Reg -> Operand
+opReg = OpReg
+
+xzr, wzr, sp, ip0 :: Operand
+xzr = OpReg W64 (RegReal (RealRegSingle (-1)))
+wzr = OpReg W32 (RegReal (RealRegSingle (-1)))
+sp  = OpReg W64 (RegReal (RealRegSingle 31))
+ip0 = OpReg W64 (RegReal (RealRegSingle 16))
+
+_x :: Int -> Operand
+_x i = OpReg W64 (RegReal (RealRegSingle i))
+x0,  x1,  x2,  x3,  x4,  x5,  x6,  x7  :: Operand
+x8,  x9,  x10, x11, x12, x13, x14, x15 :: Operand
+x16, x17, x18, x19, x20, x21, x22, x23 :: Operand
+x24, x25, x26, x27, x28, x29, x30, x31 :: Operand
+x0  = OpReg W64 (RegReal (RealRegSingle  0))
+x1  = OpReg W64 (RegReal (RealRegSingle  1))
+x2  = OpReg W64 (RegReal (RealRegSingle  2))
+x3  = OpReg W64 (RegReal (RealRegSingle  3))
+x4  = OpReg W64 (RegReal (RealRegSingle  4))
+x5  = OpReg W64 (RegReal (RealRegSingle  5))
+x6  = OpReg W64 (RegReal (RealRegSingle  6))
+x7  = OpReg W64 (RegReal (RealRegSingle  7))
+x8  = OpReg W64 (RegReal (RealRegSingle  8))
+x9  = OpReg W64 (RegReal (RealRegSingle  9))
+x10 = OpReg W64 (RegReal (RealRegSingle 10))
+x11 = OpReg W64 (RegReal (RealRegSingle 11))
+x12 = OpReg W64 (RegReal (RealRegSingle 12))
+x13 = OpReg W64 (RegReal (RealRegSingle 13))
+x14 = OpReg W64 (RegReal (RealRegSingle 14))
+x15 = OpReg W64 (RegReal (RealRegSingle 15))
+x16 = OpReg W64 (RegReal (RealRegSingle 16))
+x17 = OpReg W64 (RegReal (RealRegSingle 17))
+x18 = OpReg W64 (RegReal (RealRegSingle 18))
+x19 = OpReg W64 (RegReal (RealRegSingle 19))
+x20 = OpReg W64 (RegReal (RealRegSingle 20))
+x21 = OpReg W64 (RegReal (RealRegSingle 21))
+x22 = OpReg W64 (RegReal (RealRegSingle 22))
+x23 = OpReg W64 (RegReal (RealRegSingle 23))
+x24 = OpReg W64 (RegReal (RealRegSingle 24))
+x25 = OpReg W64 (RegReal (RealRegSingle 25))
+x26 = OpReg W64 (RegReal (RealRegSingle 26))
+x27 = OpReg W64 (RegReal (RealRegSingle 27))
+x28 = OpReg W64 (RegReal (RealRegSingle 28))
+x29 = OpReg W64 (RegReal (RealRegSingle 29))
+x30 = OpReg W64 (RegReal (RealRegSingle 30))
+x31 = OpReg W64 (RegReal (RealRegSingle 31))
+
+_d :: Int -> Operand
+_d = OpReg W64 . RegReal . RealRegSingle
+d0,  d1,  d2,  d3,  d4,  d5,  d6,  d7  :: Operand
+d8,  d9,  d10, d11, d12, d13, d14, d15 :: Operand
+d16, d17, d18, d19, d20, d21, d22, d23 :: Operand
+d24, d25, d26, d27, d28, d29, d30, d31 :: Operand
+d0  = OpReg W64 (RegReal (RealRegSingle 32))
+d1  = OpReg W64 (RegReal (RealRegSingle 33))
+d2  = OpReg W64 (RegReal (RealRegSingle 34))
+d3  = OpReg W64 (RegReal (RealRegSingle 35))
+d4  = OpReg W64 (RegReal (RealRegSingle 36))
+d5  = OpReg W64 (RegReal (RealRegSingle 37))
+d6  = OpReg W64 (RegReal (RealRegSingle 38))
+d7  = OpReg W64 (RegReal (RealRegSingle 39))
+d8  = OpReg W64 (RegReal (RealRegSingle 40))
+d9  = OpReg W64 (RegReal (RealRegSingle 41))
+d10 = OpReg W64 (RegReal (RealRegSingle 42))
+d11 = OpReg W64 (RegReal (RealRegSingle 43))
+d12 = OpReg W64 (RegReal (RealRegSingle 44))
+d13 = OpReg W64 (RegReal (RealRegSingle 45))
+d14 = OpReg W64 (RegReal (RealRegSingle 46))
+d15 = OpReg W64 (RegReal (RealRegSingle 47))
+d16 = OpReg W64 (RegReal (RealRegSingle 48))
+d17 = OpReg W64 (RegReal (RealRegSingle 49))
+d18 = OpReg W64 (RegReal (RealRegSingle 50))
+d19 = OpReg W64 (RegReal (RealRegSingle 51))
+d20 = OpReg W64 (RegReal (RealRegSingle 52))
+d21 = OpReg W64 (RegReal (RealRegSingle 53))
+d22 = OpReg W64 (RegReal (RealRegSingle 54))
+d23 = OpReg W64 (RegReal (RealRegSingle 55))
+d24 = OpReg W64 (RegReal (RealRegSingle 56))
+d25 = OpReg W64 (RegReal (RealRegSingle 57))
+d26 = OpReg W64 (RegReal (RealRegSingle 58))
+d27 = OpReg W64 (RegReal (RealRegSingle 59))
+d28 = OpReg W64 (RegReal (RealRegSingle 60))
+d29 = OpReg W64 (RegReal (RealRegSingle 61))
+d30 = OpReg W64 (RegReal (RealRegSingle 62))
+d31 = OpReg W64 (RegReal (RealRegSingle 63))
+
+opRegUExt :: Width -> Reg -> Operand
+opRegUExt W64 r = OpRegExt W64 r EUXTX 0
+opRegUExt W32 r = OpRegExt W32 r EUXTW 0
+opRegUExt W16 r = OpRegExt W16 r EUXTH 0
+opRegUExt W8  r = OpRegExt W8  r EUXTB 0
+opRegUExt w  _r = pprPanic "opRegUExt" (text $ show w)
+
+opRegSExt :: Width -> Reg -> Operand
+opRegSExt W64 r = OpRegExt W64 r ESXTX 0
+opRegSExt W32 r = OpRegExt W32 r ESXTW 0
+opRegSExt W16 r = OpRegExt W16 r ESXTH 0
+opRegSExt W8  r = OpRegExt W8  r ESXTB 0
+opRegSExt w  _r = pprPanic "opRegSExt" (text $ show w)
diff --git a/compiler/GHC/CmmToAsm/AArch64/Ppr.hs b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
new file mode 100644
index 0000000000..3f413339c2
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
@@ -0,0 +1,587 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+{-# LANGUAGE CPP #-}
+
+module GHC.CmmToAsm.AArch64.Ppr (pprNatCmmDecl, pprInstr) where
+
+import GHC.Prelude hiding (EQ)
+
+import Data.Word
+import qualified Data.Array.Unsafe as U ( castSTUArray )
+import Data.Array.ST
+import Control.Monad.ST
+
+import GHC.CmmToAsm.AArch64.Instr
+import GHC.CmmToAsm.AArch64.Regs
+import GHC.CmmToAsm.AArch64.Cond
+import GHC.CmmToAsm.Ppr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Config
+import GHC.CmmToAsm.Types
+import GHC.CmmToAsm.Utils
+
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Types.Basic (Alignment, mkAlignment, alignmentBytes)
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import GHC.Cmm.Ppr.Expr () -- For Outputable instances
+
+import GHC.Types.Unique ( pprUniqueAlways, getUnique )
+import GHC.Platform
+import GHC.Utils.Outputable
+
+import GHC.Utils.Panic
+
+pprProcAlignment :: NCGConfig -> SDoc
+pprProcAlignment config = maybe empty (pprAlign platform . mkAlignment) (ncgProcAlignment config)
+   where
+      platform = ncgPlatform config
+
+pprNatCmmDecl :: NCGConfig -> NatCmmDecl RawCmmStatics Instr -> SDoc
+pprNatCmmDecl config (CmmData section dats) =
+  pprSectionAlign config section $$ pprDatas config dats
+
+pprNatCmmDecl config proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
+  let platform = ncgPlatform config in
+  pprProcAlignment config $$
+  case topInfoTable proc of
+    Nothing ->
+        -- special case for code without info table:
+        pprSectionAlign config (Section Text lbl) $$
+        -- do not
+        -- pprProcAlignment config $$
+        pprLabel platform lbl $$ -- blocks guaranteed not null, so label needed
+        vcat (map (pprBasicBlock config top_info) blocks) $$
+        (if ncgDwarfEnabled config
+         then ppr (mkAsmTempEndLabel lbl) <> char ':' else empty) $$
+        pprSizeDecl platform lbl
+
+    Just (CmmStaticsRaw info_lbl _) ->
+      pprSectionAlign config (Section Text info_lbl) $$
+      -- pprProcAlignment config $$
+      (if platformHasSubsectionsViaSymbols platform
+          then ppr (mkDeadStripPreventer info_lbl) <> char ':'
+          else empty) $$
+      vcat (map (pprBasicBlock config top_info) blocks) $$
+      -- above: Even the first block gets a label, because with branch-chain
+      -- elimination, it might be the target of a goto.
+      (if platformHasSubsectionsViaSymbols platform
+       then -- See Note [Subsections Via Symbols]
+                text "\t.long "
+            <+> ppr info_lbl
+            <+> char '-'
+            <+> ppr (mkDeadStripPreventer info_lbl)
+       else empty) $$
+      pprSizeDecl platform info_lbl
+
+pprLabel :: Platform -> CLabel -> SDoc
+pprLabel platform lbl =
+   pprGloblDecl platform lbl
+   $$ pprTypeDecl platform lbl
+   $$ (pdoc platform lbl <> char ':')
+
+pprAlign :: Platform -> Alignment -> SDoc
+pprAlign _platform alignment
+        = text "\t.balign " <> int (alignmentBytes alignment)
+
+-- | Print appropriate alignment for the given section type.
+pprAlignForSection :: Platform -> SectionType -> SDoc
+pprAlignForSection _platform _seg
+    -- .balign is stable, whereas .align is platform dependent.
+    = text "\t.balign 8" --  always 8
+
+instance Outputable Instr where
+    ppr = pprInstr genericPlatform
+
+-- | Print section header and appropriate alignment for that section.
+--
+-- This one will emit the header:
+--
+--     .section .text
+--     .balign 8
+--
+pprSectionAlign :: NCGConfig -> Section -> SDoc
+pprSectionAlign _config (Section (OtherSection _) _) =
+     panic "AArch64.Ppr.pprSectionAlign: unknown section"
+pprSectionAlign config sec@(Section seg _) =
+    pprSectionHeader config sec
+    $$ pprAlignForSection (ncgPlatform config) seg
+
+-- | Output the ELF .size directive.
+pprSizeDecl :: Platform -> CLabel -> SDoc
+pprSizeDecl platform lbl
+ = if osElfTarget (platformOS platform)
+   then text "\t.size" <+> pdoc platform lbl <> text ", .-" <> pdoc platform lbl
+   else empty
+
+pprBasicBlock :: NCGConfig -> LabelMap RawCmmStatics -> NatBasicBlock Instr
+              -> SDoc
+pprBasicBlock config info_env (BasicBlock blockid instrs)
+  = maybe_infotable $
+    pprLabel platform asmLbl $$
+    vcat (map (pprInstr platform) (id {-detectTrivialDeadlock-} optInstrs)) $$
+    (if  ncgDwarfEnabled config
+      then ppr (mkAsmTempEndLabel asmLbl) <> char ':'
+      else empty
+    )
+  where
+    -- Filter out identity moves. E.g. mov x18, x18 will be dropped.
+    optInstrs = filter f instrs
+      where f (MOV o1 o2) | o1 == o2 = False
+            f _ = True
+
+    asmLbl = blockLbl blockid
+    platform = ncgPlatform config
+    maybe_infotable c = case mapLookup blockid info_env of
+       Nothing   -> c
+       Just (CmmStaticsRaw info_lbl info) ->
+          --  pprAlignForSection platform Text $$
+           infoTableLoc $$
+           vcat (map (pprData config) info) $$
+           pprLabel platform info_lbl $$
+           c $$
+           (if ncgDwarfEnabled config
+             then ppr (mkAsmTempEndLabel info_lbl) <> char ':'
+             else empty)
+    -- Make sure the info table has the right .loc for the block
+    -- coming right after it. See [Note: Info Offset]
+    infoTableLoc = case instrs of
+      (l@LOCATION{} : _) -> pprInstr platform l
+      _other             -> empty
+
+pprDatas :: NCGConfig -> RawCmmStatics -> SDoc
+-- See Note [emit-time elimination of static indirections] in "GHC.Cmm.CLabel".
+pprDatas config (CmmStaticsRaw alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
+  | lbl == mkIndStaticInfoLabel
+  , let labelInd (CmmLabelOff l _) = Just l
+        labelInd (CmmLabel l) = Just l
+        labelInd _ = Nothing
+  , Just ind' <- labelInd ind
+  , alias `mayRedirectTo` ind'
+  = pprGloblDecl (ncgPlatform config) alias
+    $$ text ".equiv" <+> pdoc (ncgPlatform config) alias <> comma <> pdoc (ncgPlatform config) (CmmLabel ind')
+
+pprDatas config (CmmStaticsRaw lbl dats)
+  = vcat (pprLabel platform lbl : map (pprData config) dats)
+   where
+      platform = ncgPlatform config
+
+pprData :: NCGConfig -> CmmStatic -> SDoc
+pprData _config (CmmString str) = pprString str
+pprData _config (CmmFileEmbed path) = pprFileEmbed path
+
+pprData config (CmmUninitialised bytes)
+ = let platform = ncgPlatform config
+   in if platformOS platform == OSDarwin
+         then text ".space " <> int bytes
+         else text ".skip "  <> int bytes
+
+pprData config (CmmStaticLit lit) = pprDataItem config lit
+
+pprGloblDecl :: Platform -> CLabel -> SDoc
+pprGloblDecl platform lbl
+  | not (externallyVisibleCLabel lbl) = empty
+  | otherwise = text "\t.globl " <> pdoc platform lbl
+
+-- Note [Always use objects for info tables]
+-- See discussion in X86.Ppr
+-- for why this is necessary.  Essentially we need to ensure that we never
+-- pass function symbols when we migth want to lookup the info table.  If we
+-- did, we could end up with procedure linking tables (PLT)s, and thus the
+-- lookup wouldn't point to the function, but into the jump table.
+--
+-- Fun fact: The LLVMMangler exists to patch this issue su on the LLVM side as
+-- well.
+pprLabelType' :: Platform -> CLabel -> SDoc
+pprLabelType' platform lbl =
+  if isCFunctionLabel lbl || functionOkInfoTable then
+    text "@function"
+  else
+    text "@object"
+  where
+    functionOkInfoTable = platformTablesNextToCode platform &&
+      isInfoTableLabel lbl && not (isConInfoTableLabel lbl)
+
+-- this is called pprTypeAndSizeDecl in PPC.Ppr
+pprTypeDecl :: Platform -> CLabel -> SDoc
+pprTypeDecl platform lbl
+    = if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl
+      then text ".type " <> pdoc platform lbl <> text ", " <> pprLabelType' platform lbl
+      else empty
+
+pprDataItem :: NCGConfig -> CmmLit -> SDoc
+pprDataItem config lit
+  = vcat (ppr_item (cmmTypeFormat $ cmmLitType platform lit) lit)
+    where
+        platform = ncgPlatform config
+
+        imm = litToImm lit
+
+        ppr_item II8  _ = [text "\t.byte\t"  <> pprImm platform imm]
+        ppr_item II16 _ = [text "\t.short\t" <> pprImm platform imm]
+        ppr_item II32 _ = [text "\t.long\t"  <> pprImm platform imm]
+        ppr_item II64 _ = [text "\t.quad\t"  <> pprImm platform imm]
+
+        ppr_item FF32  (CmmFloat r _)
+           = let bs = floatToBytes (fromRational r)
+             in  map (\b -> text "\t.byte\t" <> pprImm platform (ImmInt b)) bs
+
+        ppr_item FF64 (CmmFloat r _)
+           = let bs = doubleToBytes (fromRational r)
+             in  map (\b -> text "\t.byte\t" <> pprImm platform (ImmInt b)) bs
+
+        ppr_item _ _ = pprPanic "pprDataItem:ppr_item" (text $ show lit)
+
+floatToBytes :: Float -> [Int]
+floatToBytes f
+   = runST (do
+        arr <- newArray_ ((0::Int),3)
+        writeArray arr 0 f
+        arr <- castFloatToWord8Array arr
+        i0 <- readArray arr 0
+        i1 <- readArray arr 1
+        i2 <- readArray arr 2
+        i3 <- readArray arr 3
+        return (map fromIntegral [i0,i1,i2,i3])
+     )
+
+castFloatToWord8Array :: STUArray s Int Float -> ST s (STUArray s Int Word8)
+castFloatToWord8Array = U.castSTUArray
+
+pprImm :: Platform -> Imm -> SDoc
+pprImm _ (ImmInt i)     = int i
+pprImm _ (ImmInteger i) = integer i
+pprImm p (ImmCLbl l)    = pdoc p l
+pprImm p (ImmIndex l i) = pdoc p l <> char '+' <> int i
+pprImm _ (ImmLit s)     = s
+
+-- TODO: See pprIm below for why this is a bad idea!
+pprImm _ (ImmFloat f)
+  | f == 0 = text "wzr"
+  | otherwise = float (fromRational f)
+pprImm _ (ImmDouble d)
+  | d == 0 = text "xzr"
+  | otherwise = double (fromRational d)
+
+pprImm p (ImmConstantSum a b) = pprImm p a <> char '+' <> pprImm p b
+pprImm p (ImmConstantDiff a b) = pprImm p a <> char '-'
+                   <> lparen <> pprImm p b <> rparen
+
+
+-- aarch64 GNU as uses // for comments.
+asmComment :: SDoc -> SDoc
+asmComment c = whenPprDebug $ text "#" <+> c
+
+asmDoubleslashComment :: SDoc -> SDoc
+asmDoubleslashComment c = whenPprDebug $ text "//" <+> c
+
+asmMultilineComment :: SDoc -> SDoc
+asmMultilineComment c = whenPprDebug $ text "/*" $+$ c $+$ text "*/"
+
+pprIm :: Platform -> Imm -> SDoc
+pprIm platform im = case im of
+  ImmInt i     -> char '#' <> int i
+  ImmInteger i -> char '#' <> integer i
+
+  -- TODO: This will only work for
+  -- The floating point value must be expressable as ±n ÷ 16 × 2^r,
+  -- where n and r are integers such that 16 ≤ n ≤ 31 and -3 ≤ r ≤ 4.
+  -- and 0 needs to be encoded as wzr/xzr.
+  --
+  -- Except for 0, we might want to either split it up into enough
+  -- ADD operations into an Integer register and then just bit copy it into
+  -- the double register? See the toBytes + fromRational above for data items.
+  -- This is something the x86 backend does.
+  --
+  -- We could also just turn them into statics :-/ Which is what the
+  -- PowerPC backend odes.
+  ImmFloat f | f == 0 -> text "wzr"
+  ImmFloat f -> char '#' <> float (fromRational f)
+  ImmDouble d | d == 0 -> text "xzr"
+  ImmDouble d -> char '#' <> double (fromRational d)
+  -- =<lbl> pseudo instruction!
+  ImmCLbl l    -> char '=' <> pdoc platform l
+  ImmIndex l o -> text "[=" <> pdoc platform l <> comma <+> char '#' <> int o <> char ']'
+  _            -> panic "AArch64.pprIm"
+
+pprExt :: ExtMode -> SDoc
+pprExt EUXTB = text "uxtb"
+pprExt EUXTH = text "uxth"
+pprExt EUXTW = text "uxtw"
+pprExt EUXTX = text "uxtx"
+pprExt ESXTB = text "sxtb"
+pprExt ESXTH = text "sxth"
+pprExt ESXTW = text "sxtw"
+pprExt ESXTX = text "sxtx"
+
+pprShift :: ShiftMode -> SDoc
+pprShift SLSL = text "lsl"
+pprShift SLSR = text "lsr"
+pprShift SASR = text "asr"
+pprShift SROR = text "ror"
+
+pprOp :: Platform -> Operand -> SDoc
+pprOp plat op = case op of
+  OpReg w r           -> pprReg w r
+  OpRegExt w r x 0 -> pprReg w r <> comma <+> pprExt x
+  OpRegExt w r x i -> pprReg w r <> comma <+> pprExt x <> comma <+> char '#' <> int i
+  OpRegShift w r s i -> pprReg w r <> comma <+> pprShift s <> comma <+> char '#' <> int i
+  OpImm im          -> pprIm plat im
+  OpImmShift im s i -> pprIm plat im <> comma <+> pprShift s <+> char '#' <> int i
+  -- TODO: Address compuation always use registers as 64bit -- is this correct?
+  OpAddr (AddrRegReg r1 r2) -> char '[' <+> pprReg W64 r1 <> comma <+> pprReg W64 r2 <+> char ']'
+  OpAddr (AddrRegImm r1 im) -> char '[' <+> pprReg W64 r1 <> comma <+> pprImm plat im <+> char ']'
+  OpAddr (AddrReg r1)       -> char '[' <+> pprReg W64 r1 <+> char ']'
+
+pprReg :: Width -> Reg -> SDoc
+pprReg w r = case r of
+  RegReal    (RealRegSingle i) -> ppr_reg_no w i
+  RegReal    (RealRegPair{})   -> panic "AArch64.pprReg: no reg pairs on this arch!"
+  -- virtual regs should not show up, but this is helpful for debugging.
+  RegVirtual (VirtualRegI u)   -> text "%vI_" <> pprUniqueAlways u
+  RegVirtual (VirtualRegF u)   -> text "%vF_" <> pprUniqueAlways u
+  RegVirtual (VirtualRegD u)   -> text "%vD_" <> pprUniqueAlways u
+  _                            -> pprPanic "AArch64.pprReg" (text $ show r)
+
+  where
+    ppr_reg_no :: Width -> Int -> SDoc
+    ppr_reg_no w 31
+         | w == W64 = text "sp"
+         | w == W32 = text "wsp"
+
+    ppr_reg_no w i
+         | i < 0, w == W32 = text "wzr"
+         | i < 0, w == W64 = text "xzr"
+         | i < 0 = pprPanic "Invalid Zero Reg" (ppr w <+> int i)
+         -- General Purpose Registers
+         | i <= 31, w == W8  = text "w" <> int i      -- there are no byte or half
+         | i <= 31, w == W16 = text "w" <> int i      -- words... word will do.
+         | i <= 31, w == W32 = text "w" <> int i
+         | i <= 31, w == W64 = text "x" <> int i
+         | i <= 31 = pprPanic "Invalid Reg" (ppr w <+> int i)
+         -- Floating Point Registers
+         | i <= 63, w == W8  = text "b" <> int (i-32)
+         | i <= 63, w == W16 = text "h" <> int (i-32)
+         | i <= 63, w == W32 = text "s" <> int (i-32)
+         | i <= 63, w == W64 = text "d" <> int (i-32)
+         -- no support for 'q'uad in GHC's NCG yet.
+         | otherwise = text "very naughty powerpc register"
+
+isFloatOp :: Operand -> Bool
+isFloatOp (OpReg _ (RegReal (RealRegSingle i))) | i > 31 = True
+isFloatOp (OpReg _ (RegVirtual (VirtualRegF _))) = True
+isFloatOp (OpReg _ (RegVirtual (VirtualRegD _))) = True
+isFloatOp _ = False
+
+pprInstr :: Platform -> Instr -> SDoc
+pprInstr platform instr = case instr of
+  -- Meta Instructions ---------------------------------------------------------
+  COMMENT s  -> asmComment s
+  MULTILINE_COMMENT s -> asmMultilineComment s
+  ANN d i -> pprInstr platform i <+> asmDoubleslashComment d
+  LOCATION file line col _name
+    -> text "\t.loc" <+> ppr file <+> ppr line <+> ppr col
+  DELTA d    -> asmComment $ text ("\tdelta = " ++ show d)
+  NEWBLOCK _ -> panic "PprInstr: NEWBLOCK"
+  LDATA _ _  -> panic "pprInstr: LDATA"
+
+  -- Pseudo Instructions -------------------------------------------------------
+
+  PUSH_STACK_FRAME -> text "\tstp x29, x30, [sp, #-16]!"
+                   $$ text "\tmov x29, sp"
+
+  POP_STACK_FRAME -> text "\tldp x29, x30, [sp], #16"
+  -- ===========================================================================
+  -- AArch64 Instruction Set
+  -- 1. Arithmetic Instructions ------------------------------------------------
+  ADD  o1 o2 o3
+    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> text "\tfadd"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+    | otherwise -> text "\tadd"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  CMN  o1 o2    -> text "\tcmn"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  CMP  o1 o2
+    | isFloatOp o1 && isFloatOp o2 -> text "\tfcmp"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+    | otherwise -> text "\tcmp" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  MSUB o1 o2 o3 o4 -> text "\tmsub" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
+  MUL  o1 o2 o3
+    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> text "\tfmul"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+    | otherwise -> text "\tmul"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  NEG  o1 o2
+    | isFloatOp o1 && isFloatOp o2 -> text "\tfneg"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+    | otherwise -> text "\tneg"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  SDIV o1 o2 o3 | isFloatOp o1 && isFloatOp o2 && isFloatOp o3
+    -> text "\tfdiv" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  SDIV o1 o2 o3 -> text "\tsdiv" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+
+  SUB  o1 o2 o3
+    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> text "\tfsub"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+    | otherwise -> text "\tsub"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  UDIV o1 o2 o3 -> text "\tudiv" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+
+  -- 2. Bit Manipulation Instructions ------------------------------------------
+  SBFM o1 o2 o3 o4 -> text "\tsbfm" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
+  UBFM o1 o2 o3 o4 -> text "\tubfm" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
+  -- 3. Logical and Move Instructions ------------------------------------------
+  AND o1 o2 o3  -> text "\tand" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ANDS o1 o2 o3 -> text "\tands" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ASR o1 o2 o3  -> text "\tasr" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  BIC o1 o2 o3  -> text "\tbic" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  BICS o1 o2 o3 -> text "\tbics" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  EON o1 o2 o3  -> text "\teon" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  EOR o1 o2 o3  -> text "\teor" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  LSL o1 o2 o3  -> text "\tlsl" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  LSR o1 o2 o3  -> text "\tlsr" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  MOV o1 o2
+    | isFloatOp o1 || isFloatOp o2 -> text "\tfmov" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+    | otherwise -> text "\tmov" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  MOVK o1 o2    -> text "\tmovk" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  MVN o1 o2     -> text "\tmvn" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  ORN o1 o2 o3  -> text "\torn" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ORR o1 o2 o3  -> text "\torr" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ROR o1 o2 o3  -> text "\tror" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  TST o1 o2     -> text "\ttst" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+  -- 4. Branch Instructions ----------------------------------------------------
+  J t            -> pprInstr platform (B t)
+  B (TBlock bid) -> text "\tb" <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  B (TLabel lbl) -> text "\tb" <+> pdoc platform lbl
+  B (TReg r)     -> text "\tbr" <+> pprReg W64 r
+
+  BL (TBlock bid) _ _ -> text "\tbl" <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  BL (TLabel lbl) _ _ -> text "\tbl" <+> pdoc platform lbl
+  BL (TReg r)     _ _ -> text "\tblr" <+> pprReg W64 r
+
+  BCOND c (TBlock bid) -> text "\t" <> pprBcond c <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  BCOND c (TLabel lbl) -> text "\t" <> pprBcond c <+> pdoc platform lbl
+  BCOND _ (TReg _)     -> panic "AArch64.ppr: No conditional branching to registers!"
+
+  -- 5. Atomic Instructions ----------------------------------------------------
+  -- 6. Conditional Instructions -----------------------------------------------
+  CSET o c  -> text "\tcset" <+> pprOp platform o <> comma <+> pprCond c
+
+  CBZ o (TBlock bid) -> text "\tcbz" <+> pprOp platform o <> comma <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  CBZ o (TLabel lbl) -> text "\tcbz" <+> pprOp platform o <> comma <+> pdoc platform lbl
+  CBZ _ (TReg _)     -> panic "AArch64.ppr: No conditional (cbz) branching to registers!"
+
+  CBNZ o (TBlock bid) -> text "\tcbnz" <+> pprOp platform o <> comma <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  CBNZ o (TLabel lbl) -> text "\tcbnz" <+> pprOp platform o <> comma <+> pdoc platform lbl
+  CBNZ _ (TReg _)     -> panic "AArch64.ppr: No conditional (cbnz) branching to registers!"
+
+  -- 7. Load and Store Instructions --------------------------------------------
+  -- NOTE: GHC may do whacky things where it only load the lower part of an
+  --       address. Not observing the correct size when loading will lead
+  --       inevitably to crashes.
+  STR _f o1@(OpReg W8 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tstrb" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  STR _f o1@(OpReg W16 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tstrh" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  STR _f o1 o2 -> text "\tstr" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+#if defined(darwin_HOST_OS)
+  LDR _f o1 (OpImm (ImmIndex lbl' off)) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@page" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@pageoff" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmCLbl lbl')) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@page" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@pageoff"
+#else
+  LDR _f o1 (OpImm (ImmIndex lbl' off)) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text ":lo12:" <> pdoc platform lbl $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmCLbl lbl')) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text ":lo12:" <> pdoc platform lbl
+#endif
+
+  LDR _f o1@(OpReg W8 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tldrsb" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  LDR _f o1@(OpReg W16 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tldrsh" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  LDR _f o1 o2 -> text "\tldr" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+  STP _f o1 o2 o3 -> text "\tstp" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  LDP _f o1 o2 o3 -> text "\tldp" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+
+  -- 8. Synchronization Instructions -------------------------------------------
+  DMBSY -> text "\tdmb sy"
+  -- 8. Synchronization Instructions -------------------------------------------
+  FCVT o1 o2 -> text "\tfcvt" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  SCVTF o1 o2 -> text "\tscvtf" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  FCVTZS o1 o2 -> text "\tfcvtzs" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+pprBcond :: Cond -> SDoc
+pprBcond c = text "b." <> pprCond c
+
+pprCond :: Cond -> SDoc
+pprCond c = case c of
+  ALWAYS -> text "al" -- Always
+  EQ     -> text "eq" -- Equal
+  NE     -> text "ne" -- Not Equal
+
+  SLT    -> text "lt" -- Signed less than                  ; Less than, or unordered
+  SLE    -> text "le" -- Signed less than or equal         ; Less than or equal, or unordered
+  SGE    -> text "ge" -- Signed greater than or equal      ; Greater than or equal
+  SGT    -> text "gt" -- Signed greater than               ; Greater than
+
+  ULT    -> text "lo" -- Carry clear/ unsigned lower       ; less than
+  ULE    -> text "ls" -- Unsigned lower or same            ; Less than or equal
+  UGE    -> text "hs" -- Carry set/unsigned higher or same ; Greater than or equal, or unordered
+  UGT    -> text "hi" -- Unsigned higher                   ; Greater than, or unordered
+
+  NEVER  -> text "nv" -- Never
+  VS     -> text "vs" -- Overflow                          ; Unordered (at least one NaN operand)
+  VC     -> text "vc" -- No overflow                       ; Not unordered
+
+  -- Orderd variants.  Respecting NaN.
+  OLT    -> text "mi"
+  OLE    -> text "ls"
+  OGE    -> text "ge"
+  OGT    -> text "gt"
+
+  -- Unordered
+  UOLT   -> text "lt"
+  UOLE   -> text "le"
+  UOGE   -> text "pl"
+  UOGT   -> text "hi"
diff --git a/compiler/GHC/CmmToAsm/AArch64/RegInfo.hs b/compiler/GHC/CmmToAsm/AArch64/RegInfo.hs
new file mode 100644
index 0000000000..8c3d081e92
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/RegInfo.hs
@@ -0,0 +1,31 @@
+module GHC.CmmToAsm.AArch64.RegInfo where
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.AArch64.Instr
+import GHC.Cmm.BlockId
+import GHC.Cmm
+
+import GHC.Utils.Outputable
+
+data JumpDest = DestBlockId BlockId
+
+-- Debug Instance
+instance Outputable JumpDest where
+  ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
+
+-- TODO: documen what this does. See Ticket 19914
+getJumpDestBlockId :: JumpDest -> Maybe BlockId
+getJumpDestBlockId (DestBlockId bid) = Just bid
+
+-- TODO: document what this does. See Ticket 19914
+canShortcut :: Instr -> Maybe JumpDest
+canShortcut _ = Nothing
+
+-- TODO: document what this does. See Ticket 19914
+shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics
+shortcutStatics _ other_static = other_static
+
+-- TODO: document what this does. See Ticket 19914
+shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
+shortcutJump _ other = other
diff --git a/compiler/GHC/CmmToAsm/AArch64/Regs.hs b/compiler/GHC/CmmToAsm/AArch64/Regs.hs
new file mode 100644
index 0000000000..fd1669eeac
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/Regs.hs
@@ -0,0 +1,167 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+module GHC.CmmToAsm.AArch64.Regs where
+
+import GHC.Prelude
+
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+import GHC.CmmToAsm.Format
+
+import GHC.Cmm
+import GHC.Cmm.CLabel           ( CLabel )
+import GHC.Types.Unique
+
+import GHC.Platform.Regs
+import GHC.Utils.Outputable
+import GHC.Utils.Panic
+import GHC.Platform
+
+allMachRegNos   :: [RegNo]
+allMachRegNos   = [0..31] ++ [32..63]
+-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
+-- i.e., these are the regs for which we are prepared to allow the
+-- register allocator to attempt to map VRegs to.
+allocatableRegs :: Platform -> [RealReg]
+allocatableRegs platform
+   = let isFree i = freeReg platform i
+     in  map RealRegSingle $ filter isFree allMachRegNos
+
+
+-- argRegs is the set of regs which are read for an n-argument call to C.
+allGpArgRegs :: [Reg]
+allGpArgRegs = map regSingle [0..7]
+allFpArgRegs :: [Reg]
+allFpArgRegs = map regSingle [32..39]
+
+-- STG:
+-- 19: Base
+-- 20: Sp
+-- 21: Hp
+-- 22-27: R1-R6
+-- 28: SpLim
+
+-- This is the STG Sp reg.
+-- sp :: Reg
+-- sp = regSingle 20
+
+-- addressing modes ------------------------------------------------------------
+
+data AddrMode
+        = AddrRegReg    Reg Reg
+        | AddrRegImm    Reg Imm
+        | AddrReg       Reg
+        deriving (Eq, Show)
+
+-- -----------------------------------------------------------------------------
+-- Immediates
+
+data Imm
+  = ImmInt      Int
+  | ImmInteger  Integer     -- Sigh.
+  | ImmCLbl     CLabel      -- AbstractC Label (with baggage)
+  | ImmLit      SDoc        -- Simple string
+  | ImmIndex    CLabel Int
+  | ImmFloat    Rational
+  | ImmDouble   Rational
+  | ImmConstantSum Imm Imm
+  | ImmConstantDiff Imm Imm
+  deriving (Eq, Show)
+
+instance Show SDoc where
+  show = showPprUnsafe . ppr
+
+instance Eq SDoc where
+  lhs == rhs = show lhs == show rhs
+
+strImmLit :: String -> Imm
+strImmLit s = ImmLit (text s)
+
+
+litToImm :: CmmLit -> Imm
+litToImm (CmmInt i w)        = ImmInteger (narrowS w i)
+                -- narrow to the width: a CmmInt might be out of
+                -- range, but we assume that ImmInteger only contains
+                -- in-range values.  A signed value should be fine here.
+litToImm (CmmFloat f W32)    = ImmFloat f
+litToImm (CmmFloat f W64)    = ImmDouble f
+litToImm (CmmLabel l)        = ImmCLbl l
+litToImm (CmmLabelOff l off) = ImmIndex l off
+litToImm (CmmLabelDiffOff l1 l2 off _)
+                             = ImmConstantSum
+                               (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
+                               (ImmInt off)
+litToImm _                   = panic "AArch64.Regs.litToImm: no match"
+
+
+-- == To satisfy GHC.CmmToAsm.Reg.Target =======================================
+
+-- squeese functions for the graph allocator -----------------------------------
+-- | regSqueeze_class reg
+--      Calculate the maximum number of register colors that could be
+--      denied to a node of this class due to having this reg
+--      as a neighbour.
+--
+{-# INLINE virtualRegSqueeze #-}
+virtualRegSqueeze :: RegClass -> VirtualReg -> Int
+virtualRegSqueeze cls vr
+ = case cls of
+        RcInteger
+         -> case vr of
+                VirtualRegI{}           -> 1
+                VirtualRegHi{}          -> 1
+                _other                  -> 0
+
+        RcDouble
+         -> case vr of
+                VirtualRegD{}           -> 1
+                VirtualRegF{}           -> 0
+                _other                  -> 0
+
+        _other -> 0
+
+{-# INLINE realRegSqueeze #-}
+realRegSqueeze :: RegClass -> RealReg -> Int
+realRegSqueeze cls rr
+ = case cls of
+        RcInteger
+         -> case rr of
+                RealRegSingle regNo
+                        | regNo < 32    -> 1     -- first fp reg is 32
+                        | otherwise     -> 0
+
+                RealRegPair{}           -> 0
+
+        RcDouble
+         -> case rr of
+                RealRegSingle regNo
+                        | regNo < 32    -> 0
+                        | otherwise     -> 1
+
+                RealRegPair{}           -> 0
+
+        _other -> 0
+
+mkVirtualReg :: Unique -> Format -> VirtualReg
+mkVirtualReg u format
+   | not (isFloatFormat format) = VirtualRegI u
+   | otherwise
+   = case format of
+        FF32    -> VirtualRegD u
+        FF64    -> VirtualRegD u
+        _       -> panic "AArch64.mkVirtualReg"
+
+{-# INLINE classOfRealReg      #-}
+classOfRealReg :: RealReg -> RegClass
+classOfRealReg (RealRegSingle i)
+        | i < 32        = RcInteger
+        | otherwise     = RcDouble
+
+classOfRealReg (RealRegPair{})
+        = panic "regClass(ppr): no reg pairs on this architecture"
+
+regDotColor :: RealReg -> SDoc
+regDotColor reg
+ = case classOfRealReg reg of
+        RcInteger       -> text "blue"
+        RcFloat         -> text "red"
+        RcDouble        -> text "green"
diff --git a/compiler/GHC/CmmToAsm/Dwarf/Constants.hs b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
index e9047256e8..b8fb5706cb 100644
--- a/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
+++ b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
@@ -216,6 +216,7 @@ dwarfRegNo p r = case platformArch p of
     | r == xmm14 -> 31
     | r == xmm15 -> 32
   ArchPPC_64 _ -> fromIntegral $ toRegNo r
+  ArchAArch64  -> fromIntegral $ toRegNo r
   _other -> error "dwarfRegNo: Unsupported platform or unknown register!"
 
 -- | Virtual register number to use for return address.
@@ -228,4 +229,5 @@ dwarfReturnRegNo p
     ArchX86    -> 8  -- eip
     ArchX86_64 -> 16 -- rip
     ArchPPC_64 ELF_V2 -> 65 -- lr (link register)
+    ArchAArch64-> 30
     _other     -> error "dwarfReturnRegNo: Unsupported platform!"
diff --git a/compiler/GHC/CmmToAsm/Format.hs b/compiler/GHC/CmmToAsm/Format.hs
index 207de095ae..390ef29bd2 100644
--- a/compiler/GHC/CmmToAsm/Format.hs
+++ b/compiler/GHC/CmmToAsm/Format.hs
@@ -12,6 +12,7 @@ module GHC.CmmToAsm.Format (
     Format(..),
     intFormat,
     floatFormat,
+    isIntFormat,
     isFloatFormat,
     cmmTypeFormat,
     formatToWidth,
@@ -73,6 +74,9 @@ floatFormat width
 
         other   -> pprPanic "Format.floatFormat" (ppr other)
 
+-- | Check if a format represent an integer value.
+isIntFormat :: Format -> Bool
+isIntFormat = not . isFloatFormat
 
 -- | Check if a format represents a floating point value.
 isFloatFormat :: Format -> Bool
diff --git a/compiler/GHC/CmmToAsm/Instr.hs b/compiler/GHC/CmmToAsm/Instr.hs
index 0a62c1d3bb..bc2e2969e6 100644
--- a/compiler/GHC/CmmToAsm/Instr.hs
+++ b/compiler/GHC/CmmToAsm/Instr.hs
@@ -31,6 +31,7 @@ data RegUsage
                 reads :: [Reg],
                 writes :: [Reg]
                 }
+        deriving Show
 
 -- | No regs read or written to.
 noUsage :: RegUsage
@@ -90,7 +91,7 @@ class Instruction instr where
                 -> Reg          -- ^ the reg to spill
                 -> Int          -- ^ the current stack delta
                 -> Int          -- ^ spill slot to use
-                -> instr
+                -> [instr]        -- ^ instructions
 
 
         -- | An instruction to reload a register from a spill slot.
@@ -99,7 +100,7 @@ class Instruction instr where
                 -> Reg          -- ^ the reg to reload.
                 -> Int          -- ^ the current stack delta
                 -> Int          -- ^ the spill slot to use
-                -> instr
+                -> [instr]        -- ^ instructions
 
         -- | See if this instruction is telling us the current C stack delta
         takeDeltaInstr
@@ -157,3 +158,6 @@ class Instruction instr where
 
         -- | Pretty-print an instruction
         pprInstr :: Platform -> instr -> SDoc
+
+        -- Create a comment instruction
+        mkComment :: SDoc -> [instr]
diff --git a/compiler/GHC/CmmToAsm/PIC.hs b/compiler/GHC/CmmToAsm/PIC.hs
index 7fe90c3ec6..81ce9d34a9 100644
--- a/compiler/GHC/CmmToAsm/PIC.hs
+++ b/compiler/GHC/CmmToAsm/PIC.hs
@@ -122,6 +122,15 @@ cmmMakeDynamicReference config referenceKind lbl
               addImport stub
               return $ CmmLit $ CmmLabel stub
 
+        -- GOT relative loads work differently on AArch64.  We don't do two
+        -- step loads. The got symbol is loaded directly, and not through an
+        -- additional load. Thus we do not need the CmmLoad decoration we have
+        -- on other platforms.
+        AccessViaSymbolPtr | ArchAArch64 <- platformArch platform -> do
+              let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl
+              addImport symbolPtr
+              return $ cmmMakePicReference config symbolPtr
+
         AccessViaSymbolPtr -> do
               let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl
               addImport symbolPtr
@@ -135,7 +144,6 @@ cmmMakeDynamicReference config referenceKind lbl
                 -- so just jump there if it's a call or a jump
               _ -> return $ CmmLit $ CmmLabel lbl
 
-
 -- -----------------------------------------------------------------------------
 -- Create a position independent reference to a label.
 -- (but do not bother with dynamic linking).
@@ -150,6 +158,11 @@ cmmMakePicReference config lbl
   | OSMinGW32 <- platformOS platform
   = CmmLit $ CmmLabel lbl
 
+  -- no pic base reg on AArch64, however indicate this symbol should go through
+  -- the global offset table (GOT).
+  | ArchAArch64 <- platformArch platform
+  = CmmLit $ CmmLabel lbl
+
   | OSAIX <- platformOS platform
   = CmmMachOp (MO_Add W32)
           [ CmmReg (CmmGlobal PicBaseReg)
@@ -241,6 +254,20 @@ howToAccessLabel config _arch OSMinGW32 _kind lbl
         | otherwise
         = AccessDirectly
 
+-- On AArch64, relocations for JUMP and CALL will be emitted with 26bits, this
+-- is enough for ~64MB of range. Anything else will need to go through a veneer,
+-- which is the job of the linker to build.  We might only want to lookup
+-- Data References through the GOT.
+howToAccessLabel config ArchAArch64 _os _kind lbl
+        | not (ncgExternalDynamicRefs config)
+        = AccessDirectly
+
+        | labelDynamic config lbl
+        = AccessViaSymbolPtr
+
+        | otherwise
+        = AccessDirectly
+
 
 -- Mach-O (Darwin, Mac OS X)
 --
@@ -275,7 +302,7 @@ howToAccessLabel config arch OSDarwin JumpReference lbl
         -- dyld code stubs don't work for tailcalls because the
         -- stack alignment is only right for regular calls.
         -- Therefore, we have to go via a symbol pointer:
-        | arch == ArchX86 || arch == ArchX86_64
+        | arch == ArchX86 || arch == ArchX86_64 || arch == ArchAArch64
         , labelDynamic config lbl
         = AccessViaSymbolPtr
 
@@ -283,15 +310,15 @@ howToAccessLabel config arch OSDarwin JumpReference lbl
 howToAccessLabel config arch OSDarwin _kind lbl
         -- Code stubs are the usual method of choice for imported code;
         -- not needed on x86_64 because Apple's new linker, ld64, generates
-        -- them automatically.
+        -- them automatically, neither on Aarch64 (arm64).
         | arch /= ArchX86_64
+        , arch /= ArchAArch64
         , labelDynamic config lbl
         = AccessViaStub
 
         | otherwise
         = AccessDirectly
 
-
 ----------------------------------------------------------------------------
 -- AIX
 
@@ -616,7 +643,9 @@ pprImportedSymbol config importedLbl = case (arch,os) of
         | otherwise
         -> empty
 
-   (_, OSDarwin) -> empty
+   (ArchAArch64, OSDarwin)
+        -> empty
+
 
 
    -- XCOFF / AIX
diff --git a/compiler/GHC/CmmToAsm/PPC.hs b/compiler/GHC/CmmToAsm/PPC.hs
index 148fd1b4b2..d38eb84c64 100644
--- a/compiler/GHC/CmmToAsm/PPC.hs
+++ b/compiler/GHC/CmmToAsm/PPC.hs
@@ -57,5 +57,4 @@ instance Instruction PPC.Instr where
    mkStackAllocInstr   = PPC.mkStackAllocInstr
    mkStackDeallocInstr = PPC.mkStackDeallocInstr
    pprInstr            = PPC.pprInstr
-
-
+   mkComment           = const []
diff --git a/compiler/GHC/CmmToAsm/PPC/Instr.hs b/compiler/GHC/CmmToAsm/PPC/Instr.hs
index 92567989ed..54a73f24a9 100644
--- a/compiler/GHC/CmmToAsm/PPC/Instr.hs
+++ b/compiler/GHC/CmmToAsm/PPC/Instr.hs
@@ -533,7 +533,7 @@ mkSpillInstr
    -> Reg       -- register to spill
    -> Int       -- current stack delta
    -> Int       -- spill slot to use
-   -> Instr
+   -> [Instr]
 
 mkSpillInstr config reg delta slot
   = let platform = ncgPlatform config
@@ -550,7 +550,7 @@ mkSpillInstr config reg delta slot
                 Just _  -> ST
                 Nothing -> STFAR -- pseudo instruction: 32 bit offsets
 
-    in instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))
+    in [instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))]
 
 
 mkLoadInstr
@@ -558,7 +558,7 @@ mkLoadInstr
    -> Reg       -- register to load
    -> Int       -- current stack delta
    -> Int       -- spill slot to use
-   -> Instr
+   -> [Instr]
 
 mkLoadInstr config reg delta slot
   = let platform = ncgPlatform config
@@ -575,7 +575,7 @@ mkLoadInstr config reg delta slot
                 Just _  -> LD
                 Nothing -> LDFAR -- pseudo instruction: 32 bit offsets
 
-    in instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))
+    in [instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))]
 
 
 -- | The size of a minimal stackframe header including minimal
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
index 1050fbaa96..83f581cac4 100644
--- a/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
@@ -111,7 +111,10 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts excl
                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
                             ArchPPC_64 _  -> 15
                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
-                            ArchAArch64   -> panic "trivColorable ArchAArch64"
+                            -- We should be able to allocate *a lot* more in princple.
+                            -- essentially all 32 - SP, so 31, we'd trash the link reg
+                            -- as well as the platform and all others though.
+                            ArchAArch64   -> 18
                             ArchAlpha     -> panic "trivColorable ArchAlpha"
                             ArchMipseb    -> panic "trivColorable ArchMipseb"
                             ArchMipsel    -> panic "trivColorable ArchMipsel"
@@ -143,7 +146,10 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclus
                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
                             ArchPPC_64 _  -> 0
                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
-                            ArchAArch64   -> panic "trivColorable ArchAArch64"
+                            -- we can in princple address all the float regs as
+                            -- segments. So we could have 64 Float regs. Or
+                            -- 128 Half regs, or even 256 Byte regs.
+                            ArchAArch64   -> 0
                             ArchAlpha     -> panic "trivColorable ArchAlpha"
                             ArchMipseb    -> panic "trivColorable ArchMipseb"
                             ArchMipsel    -> panic "trivColorable ArchMipsel"
@@ -177,7 +183,7 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclu
                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
                             ArchPPC_64 _  -> 20
                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
-                            ArchAArch64   -> panic "trivColorable ArchAArch64"
+                            ArchAArch64   -> 32
                             ArchAlpha     -> panic "trivColorable ArchAlpha"
                             ArchMipseb    -> panic "trivColorable ArchMipseb"
                             ArchMipsel    -> panic "trivColorable ArchMipsel"
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear.hs b/compiler/GHC/CmmToAsm/Reg/Linear.hs
index 88fdcd6bce..a9a4545f62 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear.hs
@@ -110,10 +110,11 @@ import GHC.CmmToAsm.Reg.Linear.StackMap
 import GHC.CmmToAsm.Reg.Linear.FreeRegs
 import GHC.CmmToAsm.Reg.Linear.Stats
 import GHC.CmmToAsm.Reg.Linear.JoinToTargets
-import qualified GHC.CmmToAsm.Reg.Linear.PPC    as PPC
-import qualified GHC.CmmToAsm.Reg.Linear.SPARC  as SPARC
-import qualified GHC.CmmToAsm.Reg.Linear.X86    as X86
-import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.PPC     as PPC
+import qualified GHC.CmmToAsm.Reg.Linear.SPARC   as SPARC
+import qualified GHC.CmmToAsm.Reg.Linear.X86     as X86
+import qualified GHC.CmmToAsm.Reg.Linear.X86_64  as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64
 import GHC.CmmToAsm.Reg.Target
 import GHC.CmmToAsm.Reg.Liveness
 import GHC.CmmToAsm.Reg.Utils
@@ -121,6 +122,7 @@ import GHC.CmmToAsm.Instr
 import GHC.CmmToAsm.Config
 import GHC.CmmToAsm.Types
 import GHC.Platform.Reg
+import GHC.Platform.Reg.Class (RegClass(..))
 
 import GHC.Cmm.BlockId
 import GHC.Cmm.Dataflow.Collections
@@ -202,7 +204,7 @@ regAlloc _ (CmmProc _ _ _ _)
 --   an entry in the block map or it is the first block.
 --
 linearRegAlloc
-        :: forall instr. Instruction instr
+        :: forall instr. (Instruction instr)
         => NCGConfig
         -> [BlockId] -- ^ entry points
         -> BlockMap RegSet
@@ -220,7 +222,7 @@ linearRegAlloc config entry_ids block_live sccs
       ArchSPARC64    -> panic "linearRegAlloc ArchSPARC64"
       ArchPPC        -> go $ (frInitFreeRegs platform :: PPC.FreeRegs)
       ArchARM _ _ _  -> panic "linearRegAlloc ArchARM"
-      ArchAArch64    -> panic "linearRegAlloc ArchAArch64"
+      ArchAArch64    -> go $ (frInitFreeRegs platform :: AArch64.FreeRegs)
       ArchPPC_64 _   -> go $ (frInitFreeRegs platform :: PPC.FreeRegs)
       ArchAlpha      -> panic "linearRegAlloc ArchAlpha"
       ArchMipseb     -> panic "linearRegAlloc ArchMipseb"
@@ -487,7 +489,7 @@ isInReg src assig | Just (InReg _) <- lookupUFM assig src = True
 
 
 genRaInsn :: forall freeRegs instr.
-             OutputableRegConstraint freeRegs instr
+             (OutputableRegConstraint freeRegs instr)
           => BlockMap RegSet
           -> [instr]
           -> BlockId
@@ -497,7 +499,7 @@ genRaInsn :: forall freeRegs instr.
           -> RegM freeRegs ([instr], [NatBasicBlock instr])
 
 genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
---   pprTraceM "genRaInsn" $ ppr (block_id, instr)
+-- pprTraceM "genRaInsn" $ ppr (block_id, instr)
   platform <- getPlatform
   case regUsageOfInstr platform instr of { RU read written ->
     do
@@ -509,19 +511,20 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
     -- so using nub isn't a problem).
     let virt_read       = nub [ vr      | (RegVirtual vr) <- read ] :: [VirtualReg]
 
-    -- debugging
-{-    freeregs <- getFreeRegsR
-    assig    <- getAssigR
-    pprDebugAndThen (defaultDynFlags Settings{ sTargetPlatform=platform } undefined) trace "genRaInsn"
-        (ppr instr
-                $$ text "r_dying      = " <+> ppr r_dying
-                $$ text "w_dying      = " <+> ppr w_dying
-                $$ text "virt_read    = " <+> ppr virt_read
-                $$ text "virt_written = " <+> ppr virt_written
-                $$ text "freeregs     = " <+> text (show freeregs)
-                $$ text "assig        = " <+> ppr assig)
-        $ do
--}
+--     do
+--         let real_read       = nub [ rr      | (RegReal rr) <- read]
+--         freeregs <- getFreeRegsR
+--         assig    <- getAssigR
+
+--         pprTraceM "genRaInsn"
+--                 (          text "block        = " <+> ppr block_id
+--                         $$ text "instruction  = " <+> ppr instr
+--                         $$ text "r_dying      = " <+> ppr r_dying
+--                         $$ text "w_dying      = " <+> ppr w_dying
+--                         $$ text "read         = " <+> ppr real_read    <+> ppr virt_read
+--                         $$ text "written      = " <+> ppr real_written <+> ppr virt_written
+--                         $$ text "freeregs     = " <+> ppr freeregs
+--                         $$ text "assign       = " <+> ppr assig)
 
     -- (a), (b) allocate real regs for all regs read by this instruction.
     (r_spills, r_allocd) <-
@@ -580,7 +583,6 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
                         Nothing -> x
                         Just y  -> y
 
-
     -- (j) free up stack slots for dead spilled regs
     -- TODO (can't be bothered right now)
 
@@ -592,7 +594,32 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
                                  | src == dst   -> []
                                 _               -> [patched_instr]
 
-    let code = concat [ squashed_instr, w_spills, reverse r_spills, clobber_saves, new_instrs ]
+    -- On the use of @reverse@ below.
+    -- Since we can have spills and reloads produce multiple instructions
+    -- we need to ensure they are emitted in the correct order.  We used to only
+    -- emit single instructions in mkSpill/mkReload/mkRegRegMove.
+    -- As such order of spills and reloads didn't matter.  However,  with
+    -- mutliple instructions potentially issued by those functions we need to be
+    -- careful to not break execution order. Reversing the spills (clobber will
+    -- also spill), will ensure they are emitted in the right order.
+    --
+    -- See also Ticket 19910 for changing the return type from [] to OrdList.
+
+    -- For debugging, uncomment the follow line and the mkComment lines.
+    -- u <- getUniqueR
+    let code = concat [ --  mkComment (text "<genRaInsn(" <> ppr u <> text ")>")
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):squashed>")]
+                        squashed_instr
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):w_spills>")
+                      , reverse w_spills
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):r_spills>")
+                      , reverse r_spills
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):clobber_saves>")
+                      , reverse clobber_saves
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):new_instrs>")
+                      , new_instrs
+                        -- ,mkComment (text "</genRaInsn(" <> ppr u <> text ")>")
+                      ]
 
 --    pprTrace "patched-code" ((vcat $ map (docToSDoc . pprInstr) code)) $ do
 --    pprTrace "pached-fixup" ((ppr fixup_blocks)) $ do
@@ -609,6 +636,7 @@ releaseRegs regs = do
   platform <- getPlatform
   assig <- getAssigR
   free <- getFreeRegsR
+
   let loop assig !free [] = do setAssigR assig; setFreeRegsR free; return ()
       loop assig !free (RegReal rr : rs) = loop assig (frReleaseReg platform rr free) rs
       loop assig !free (r:rs) =
@@ -662,8 +690,9 @@ saveClobberedTemps clobbered dying
 
         (instrs,assig') <- clobber assig [] to_spill
         setAssigR assig'
-        return instrs
-
+        return $ -- mkComment (text "<saveClobberedTemps>") ++
+                 instrs
+--              ++ mkComment (text "</saveClobberedTemps>")
    where
      -- See Note [UniqFM and the register allocator]
      clobber :: RegMap Loc -> [instr] -> [(Unique,RealReg)] -> RegM freeRegs ([instr], RegMap Loc)
@@ -700,7 +729,7 @@ saveClobberedTemps clobbered dying
 
                   let new_assign  = addToUFM_Directly assig temp (InBoth reg slot)
 
-                  clobber new_assign (spill : instrs) rest
+                  clobber new_assign (spill ++ instrs) rest
 
 
 
@@ -714,7 +743,17 @@ clobberRegs []
 clobberRegs clobbered
  = do   platform <- getPlatform
         freeregs <- getFreeRegsR
-        setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs clobbered
+
+        let gpRegs  = frGetFreeRegs platform RcInteger freeregs :: [RealReg]
+            fltRegs = frGetFreeRegs platform RcFloat   freeregs :: [RealReg]
+            dblRegs = frGetFreeRegs platform RcDouble  freeregs :: [RealReg]
+
+        let extra_clobbered = [ r | r <- clobbered
+                                  , r `elem` (gpRegs ++ fltRegs ++ dblRegs) ]
+
+        setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs extra_clobbered
+
+        -- setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs clobbered
 
         assig           <- getAssigR
         setAssigR $! clobber assig (nonDetUFMToList assig)
@@ -909,10 +948,7 @@ allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
                         | (temp_to_push_out, (my_reg :: RealReg)) : _
                                         <- candidates_inReg
                         = do
-                                (spill_insn, slot) <- spillR (RegReal my_reg) temp_to_push_out
-                                let spill_store  = (if reading then id else reverse)
-                                                        [ -- COMMENT (fsLit "spill alloc")
-                                                           spill_insn ]
+                                (spill_store, slot) <- spillR (RegReal my_reg) temp_to_push_out
 
                                 -- record that this temp was spilled
                                 recordSpill (SpillAlloc temp_to_push_out)
@@ -962,7 +998,7 @@ loadTemp vreg (ReadMem slot) hreg spills
  = do
         insn <- loadR (RegReal hreg) slot
         recordSpill (SpillLoad $ getUnique vreg)
-        return  $  {- COMMENT (fsLit "spill load") : -} insn : spills
+        return  $  {- mkComment (text "spill load") : -} insn ++ spills
 
 loadTemp _ _ _ spills =
    return spills
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/AArch64.hs b/compiler/GHC/CmmToAsm/Reg/Linear/AArch64.hs
new file mode 100644
index 0000000000..50299c4e74
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/AArch64.hs
@@ -0,0 +1,137 @@
+module GHC.CmmToAsm.Reg.Linear.AArch64 where
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.AArch64.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GHC.Utils.Outputable
+import GHC.Utils.Panic
+import GHC.Platform
+
+import Data.Word
+
+import GHC.Stack
+-- AArch64 has 32 64bit general purpose register r0..r30, and zr/sp
+-- AArch64 has 32 128bit floating point registers v0..v31 as part of the NEON
+-- extension in Armv8-A.
+--
+-- Armv8-A is a fundamental change to the Arm architecture. It supports the
+-- 64-bit Execution state called “AArch64”, and a new 64-bit instruction set
+-- “A64”. To provide compatibility with the Armv7-A (32-bit architecture)
+-- instruction set, a 32-bit variant of Armv8-A “AArch32” is provided. Most of
+-- existing Armv7-A code can be run in the AArch32 execution state of Armv8-A.
+--
+-- these can be addresses as q/d/s/h/b 0..31, or v.f<size>[idx]
+-- where size is 64, 32, 16, 8, ... and the index i allows us
+-- to access the given part.
+--
+-- History of Arm Adv SIMD
+-- .---------------------------------------------------------------------------.
+-- | Armv6                  | Armv7-A                | Armv8-A AArch64         |
+-- | SIMD extension         | NEON                   | NEON                    |
+-- |===========================================================================|
+-- | - Operates on 32-bit   | - Separate reg. bank,  | - Separate reg. bank,   |
+-- |   GP ARM registers     |    32x64-bit NEON regs |   32x128-bit NEON regs  |
+-- | - 8-bit/16-bit integer | - 8/16/32/64-bit int   | - 8/16/32/64-bit int    |
+-- |                        | - Single percision fp  | - Single percision fp   |
+-- |                        |                        | - Double precision fp   |
+-- |                        |                        | - Single/Double fp are  |
+-- |                        |                        |   IEEE compliant        |
+-- | - 2x16-bit/4x8-bit ops | - Up to 16x8-bit ops   | - Up to 16x8-bit ops    |
+-- |   per instruction      |   per instruction      |   per instruction       |
+-- '---------------------------------------------------------------------------'
+
+data FreeRegs = FreeRegs !Word32 !Word32
+
+instance Show FreeRegs where
+  show (FreeRegs g f) = "FreeRegs: " ++ showBits g ++ "; " ++ showBits f
+
+instance Outputable FreeRegs where
+    ppr (FreeRegs g f) = text "   " <+> foldr (\i x -> pad_int i    <+> x) (text "") [0..31]
+                      $$ text "GPR" <+> foldr (\i x -> show_bit g i <+> x) (text "") [0..31]
+                      $$ text "FPR" <+> foldr (\i x -> show_bit f i <+> x) (text "") [0..31]
+      where pad_int i | i < 10 = char ' ' <> int i
+            pad_int i = int i
+            -- remember bit = 1 means it's available.
+            show_bit bits bit | testBit bits bit = text "  "
+            show_bit _    _ = text " x"
+
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0 0
+
+showBits :: Word32 -> String
+showBits w = map (\i -> if testBit w i then '1' else '0') [0..31]
+
+-- FR instance implementation (See Linear.FreeRegs)
+allocateReg :: HasCallStack => RealReg -> FreeRegs -> FreeRegs
+allocateReg (RealRegSingle r) (FreeRegs g f)
+    | r > 31 && testBit f (r - 32) = FreeRegs g (clearBit f (r - 32))
+    | r < 32 && testBit g r = FreeRegs (clearBit g r) f
+    | r > 31 = panic $ "Linear.AArch64.allocReg: double allocation of float reg v" ++ show (r - 32) ++ "; " ++ showBits f
+    | otherwise = pprPanic "Linear.AArch64.allocReg" $ text ("double allocation of gp reg x" ++ show r ++ "; " ++ showBits g)
+allocateReg _ _ = panic "Linear.AArch64.allocReg: bad reg"
+
+-- we start from 28 downwards... the logic is similar to the ppc logic.
+-- 31 is Stack Pointer
+-- 30 is Link Register
+-- 29 is Stack Frame (by convention)
+-- 19-28 are callee save
+-- the lower ones are all caller save
+
+-- For this reason someone decided to give aarch64 only 6 regs for
+-- STG:
+-- 19: Base
+-- 20: Sp
+-- 21: Hp
+-- 22-27: R1-R6
+-- 28: SpLim
+
+-- For LLVM code gen interop:
+-- See https://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20150119/253722.html
+-- and the current ghccc implementation here:
+-- https://github.com/llvm/llvm-project/blob/161ae1f39816edf667aaa190bce702a86879c7bd/llvm/lib/Target/AArch64/AArch64CallingConvention.td#L324-L363
+-- and https://gitlab.haskell.org/ghc/ghc/-/wikis/commentary/compiler/generated-code
+-- for the STG discussion.
+{- For reference the ghcc from the link above:
+let Entry = 1 in
+def CC_AArch64_GHC : CallingConv<[
+  CCIfType<[iPTR], CCBitConvertToType<i64>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+  CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>,
+  CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>,
+
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+  CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
+]>;
+-}
+
+getFreeRegs :: RegClass -> FreeRegs -> [RealReg]
+getFreeRegs cls (FreeRegs g f)
+  | RcFloat   <- cls = [] -- For now we only support double and integer registers, floats will need to be promoted.
+  | RcDouble  <- cls = go 32 f 31
+  | RcInteger <- cls = go  0 g 18
+    where
+        go _   _ i | i < 0 = []
+        go off x i | testBit x i = RealRegSingle (off + i) : (go off x $! i - 1)
+                   | otherwise   = go off x $! i - 1
+
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform)
+
+releaseReg :: HasCallStack => RealReg -> FreeRegs -> FreeRegs
+releaseReg (RealRegSingle r) (FreeRegs g f)
+  | r > 31 && testBit f (r - 32) = pprPanic "Linear.AArch64.releaseReg" (text  "can't release non-allocated reg v" <> int (r - 32))
+  | r < 32 && testBit g r = pprPanic "Linear.AArch64.releaseReg" (text "can't release non-allocated reg x" <> int r)
+  | r > 31 = FreeRegs g (setBit f (r - 32))
+  | otherwise = FreeRegs (setBit g r) f
+releaseReg _ _ = pprPanic "Linear.AArch64.releaseReg" (text "bad reg")
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
index d501718c4a..3ae0fa140d 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
@@ -25,14 +25,16 @@ import GHC.Platform
 --      getFreeRegs cls f = filter ( (==cls) . regClass . RealReg ) f
 --      allocateReg f r = filter (/= r) f
 
-import qualified GHC.CmmToAsm.Reg.Linear.PPC    as PPC
-import qualified GHC.CmmToAsm.Reg.Linear.SPARC  as SPARC
-import qualified GHC.CmmToAsm.Reg.Linear.X86    as X86
-import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.PPC     as PPC
+import qualified GHC.CmmToAsm.Reg.Linear.SPARC   as SPARC
+import qualified GHC.CmmToAsm.Reg.Linear.X86     as X86
+import qualified GHC.CmmToAsm.Reg.Linear.X86_64  as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64
 
-import qualified GHC.CmmToAsm.PPC.Instr   as PPC.Instr
-import qualified GHC.CmmToAsm.SPARC.Instr as SPARC.Instr
-import qualified GHC.CmmToAsm.X86.Instr   as X86.Instr
+import qualified GHC.CmmToAsm.PPC.Instr     as PPC.Instr
+import qualified GHC.CmmToAsm.SPARC.Instr   as SPARC.Instr
+import qualified GHC.CmmToAsm.X86.Instr     as X86.Instr
+import qualified GHC.CmmToAsm.AArch64.Instr as AArch64.Instr
 
 class Show freeRegs => FR freeRegs where
     frAllocateReg :: Platform -> RealReg -> freeRegs -> freeRegs
@@ -58,6 +60,12 @@ instance FR PPC.FreeRegs where
     frInitFreeRegs = PPC.initFreeRegs
     frReleaseReg   = \_ -> PPC.releaseReg
 
+instance FR AArch64.FreeRegs where
+    frAllocateReg = \_ -> AArch64.allocateReg
+    frGetFreeRegs = \_ -> AArch64.getFreeRegs
+    frInitFreeRegs = AArch64.initFreeRegs
+    frReleaseReg = \_ -> AArch64.releaseReg
+
 instance FR SPARC.FreeRegs where
     frAllocateReg  = SPARC.allocateReg
     frGetFreeRegs  = \_ -> SPARC.getFreeRegs
@@ -73,7 +81,7 @@ maxSpillSlots config = case platformArch (ncgPlatform config) of
    ArchSPARC     -> SPARC.Instr.maxSpillSlots config
    ArchSPARC64   -> panic "maxSpillSlots ArchSPARC64"
    ArchARM _ _ _ -> panic "maxSpillSlots ArchARM"
-   ArchAArch64   -> panic "maxSpillSlots ArchAArch64"
+   ArchAArch64   -> AArch64.Instr.maxSpillSlots config
    ArchPPC_64 _  -> PPC.Instr.maxSpillSlots config
    ArchAlpha     -> panic "maxSpillSlots ArchAlpha"
    ArchMipseb    -> panic "maxSpillSlots ArchMipseb"
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
index d0330a4f6a..cbdf5d030b 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
@@ -26,6 +26,7 @@ import GHC.Cmm.BlockId
 import GHC.Cmm.Dataflow.Collections
 import GHC.Data.Graph.Directed
 import GHC.Utils.Panic
+import GHC.Utils.Monad (concatMapM)
 import GHC.Types.Unique
 import GHC.Types.Unique.FM
 import GHC.Types.Unique.Set
@@ -306,7 +307,7 @@ handleComponent
 --      go via a spill slot.
 --
 handleComponent delta _  (AcyclicSCC (DigraphNode vreg src dsts))
-        = mapM (makeMove delta vreg src) dsts
+        = concatMapM (makeMove delta vreg src) dsts
 
 
 -- Handle some cyclic moves.
@@ -340,7 +341,7 @@ handleComponent delta instr
 
         -- make sure to do all the reloads after all the spills,
         --      so we don't end up clobbering the source values.
-        return ([instrSpill] ++ concat remainingFixUps ++ [instrLoad])
+        return (instrSpill ++ concat remainingFixUps ++ instrLoad)
 
 handleComponent _ _ (CyclicSCC _)
  = panic "Register Allocator: handleComponent cyclic"
@@ -354,7 +355,7 @@ makeMove
     -> Unique   -- ^ unique of the vreg that we're moving.
     -> Loc      -- ^ source location.
     -> Loc      -- ^ destination location.
-    -> RegM freeRegs instr  -- ^ move instruction.
+    -> RegM freeRegs [instr]  -- ^ move instruction.
 
 makeMove delta vreg src dst
  = do config <- getConfig
@@ -363,7 +364,7 @@ makeMove delta vreg src dst
       case (src, dst) of
           (InReg s, InReg d) ->
               do recordSpill (SpillJoinRR vreg)
-                 return $ mkRegRegMoveInstr platform (RegReal s) (RegReal d)
+                 return $ [mkRegRegMoveInstr platform (RegReal s) (RegReal d)]
           (InMem s, InReg d) ->
               do recordSpill (SpillJoinRM vreg)
                  return $ mkLoadInstr config (RegReal d) delta s
@@ -377,4 +378,3 @@ makeMove delta vreg src dst
               panic ("makeMove " ++ show vreg ++ " (" ++ show src ++ ") ("
                   ++ show dst ++ ")"
                   ++ " we don't handle mem->mem moves.")
-
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
index 24a75121b8..ec1cd517ea 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
@@ -121,7 +121,7 @@ makeRAStats state
 
 
 spillR :: Instruction instr
-       => Reg -> Unique -> RegM freeRegs (instr, Int)
+       => Reg -> Unique -> RegM freeRegs ([instr], Int)
 
 spillR reg temp = mkRegM $ \s ->
   let (stack1,slot) = getStackSlotFor (ra_stack s) temp
@@ -131,7 +131,7 @@ spillR reg temp = mkRegM $ \s ->
 
 
 loadR :: Instruction instr
-      => Reg -> Int -> RegM freeRegs instr
+      => Reg -> Int -> RegM freeRegs [instr]
 
 loadR reg slot = mkRegM $ \s ->
   RA_Result s (mkLoadInstr (ra_config s) reg (ra_delta s) slot)
diff --git a/compiler/GHC/CmmToAsm/Reg/Liveness.hs b/compiler/GHC/CmmToAsm/Reg/Liveness.hs
index 4d70533624..ad8190270f 100644
--- a/compiler/GHC/CmmToAsm/Reg/Liveness.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Liveness.hs
@@ -174,6 +174,8 @@ instance Instruction instr => Instruction (InstrSR instr) where
 
         pprInstr platform i = ppr (fmap (pprInstr platform) i)
 
+        mkComment               = fmap Instr . mkComment
+
 
 -- | An instruction with liveness information.
 data LiveInstr instr
@@ -565,16 +567,20 @@ stripLiveBlock config (BasicBlock i lis)
  where  (instrs', _)
                 = runState (spillNat [] lis) 0
 
+        -- spillNat :: [instr] -> [LiveInstr instr] -> State Int [instr]
+        spillNat :: Instruction instr => [instr] -> [LiveInstr instr] -> State Int [instr]
         spillNat acc []
          =      return (reverse acc)
 
+        -- The SPILL/RELOAD cases do not appear to be exercised by our codegens
+        --
         spillNat acc (LiveInstr (SPILL reg slot) _ : instrs)
          = do   delta   <- get
-                spillNat (mkSpillInstr config reg delta slot : acc) instrs
+                spillNat (mkSpillInstr config reg delta slot ++ acc) instrs
 
         spillNat acc (LiveInstr (RELOAD slot reg) _ : instrs)
          = do   delta   <- get
-                spillNat (mkLoadInstr config reg delta slot : acc) instrs
+                spillNat (mkLoadInstr config reg delta slot ++ acc) instrs
 
         spillNat acc (LiveInstr (Instr instr) _ : instrs)
          | Just i <- takeDeltaInstr instr
diff --git a/compiler/GHC/CmmToAsm/Reg/Target.hs b/compiler/GHC/CmmToAsm/Reg/Target.hs
index 4611c3a8e8..22b22e21cc 100644
--- a/compiler/GHC/CmmToAsm/Reg/Target.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Target.hs
@@ -34,6 +34,8 @@ import qualified GHC.CmmToAsm.X86.Regs       as X86
 import qualified GHC.CmmToAsm.X86.RegInfo    as X86
 import qualified GHC.CmmToAsm.PPC.Regs       as PPC
 import qualified GHC.CmmToAsm.SPARC.Regs     as SPARC
+import qualified GHC.CmmToAsm.AArch64.Regs   as AArch64
+
 
 targetVirtualRegSqueeze :: Platform -> RegClass -> VirtualReg -> Int
 targetVirtualRegSqueeze platform
@@ -46,7 +48,7 @@ targetVirtualRegSqueeze platform
       ArchSPARC64   -> panic "targetVirtualRegSqueeze ArchSPARC64"
       ArchPPC_64 _  -> PPC.virtualRegSqueeze
       ArchARM _ _ _ -> panic "targetVirtualRegSqueeze ArchARM"
-      ArchAArch64   -> panic "targetVirtualRegSqueeze ArchAArch64"
+      ArchAArch64   -> AArch64.virtualRegSqueeze
       ArchAlpha     -> panic "targetVirtualRegSqueeze ArchAlpha"
       ArchMipseb    -> panic "targetVirtualRegSqueeze ArchMipseb"
       ArchMipsel    -> panic "targetVirtualRegSqueeze ArchMipsel"
@@ -66,7 +68,7 @@ targetRealRegSqueeze platform
       ArchSPARC64   -> panic "targetRealRegSqueeze ArchSPARC64"
       ArchPPC_64 _  -> PPC.realRegSqueeze
       ArchARM _ _ _ -> panic "targetRealRegSqueeze ArchARM"
-      ArchAArch64   -> panic "targetRealRegSqueeze ArchAArch64"
+      ArchAArch64   -> AArch64.realRegSqueeze
       ArchAlpha     -> panic "targetRealRegSqueeze ArchAlpha"
       ArchMipseb    -> panic "targetRealRegSqueeze ArchMipseb"
       ArchMipsel    -> panic "targetRealRegSqueeze ArchMipsel"
@@ -85,7 +87,7 @@ targetClassOfRealReg platform
       ArchSPARC64   -> panic "targetClassOfRealReg ArchSPARC64"
       ArchPPC_64 _  -> PPC.classOfRealReg
       ArchARM _ _ _ -> panic "targetClassOfRealReg ArchARM"
-      ArchAArch64   -> panic "targetClassOfRealReg ArchAArch64"
+      ArchAArch64   -> AArch64.classOfRealReg
       ArchAlpha     -> panic "targetClassOfRealReg ArchAlpha"
       ArchMipseb    -> panic "targetClassOfRealReg ArchMipseb"
       ArchMipsel    -> panic "targetClassOfRealReg ArchMipsel"
@@ -104,7 +106,7 @@ targetMkVirtualReg platform
       ArchSPARC64   -> panic "targetMkVirtualReg ArchSPARC64"
       ArchPPC_64 _  -> PPC.mkVirtualReg
       ArchARM _ _ _ -> panic "targetMkVirtualReg ArchARM"
-      ArchAArch64   -> panic "targetMkVirtualReg ArchAArch64"
+      ArchAArch64   -> AArch64.mkVirtualReg
       ArchAlpha     -> panic "targetMkVirtualReg ArchAlpha"
       ArchMipseb    -> panic "targetMkVirtualReg ArchMipseb"
       ArchMipsel    -> panic "targetMkVirtualReg ArchMipsel"
@@ -123,7 +125,7 @@ targetRegDotColor platform
       ArchSPARC64   -> panic "targetRegDotColor ArchSPARC64"
       ArchPPC_64 _  -> PPC.regDotColor
       ArchARM _ _ _ -> panic "targetRegDotColor ArchARM"
-      ArchAArch64   -> panic "targetRegDotColor ArchAArch64"
+      ArchAArch64   -> AArch64.regDotColor
       ArchAlpha     -> panic "targetRegDotColor ArchAlpha"
       ArchMipseb    -> panic "targetRegDotColor ArchMipseb"
       ArchMipsel    -> panic "targetRegDotColor ArchMipsel"
diff --git a/compiler/GHC/CmmToAsm/SPARC.hs b/compiler/GHC/CmmToAsm/SPARC.hs
index 7d9a671932..cac72de6d3 100644
--- a/compiler/GHC/CmmToAsm/SPARC.hs
+++ b/compiler/GHC/CmmToAsm/SPARC.hs
@@ -69,7 +69,6 @@ instance Instruction SPARC.Instr where
    takeRegRegMoveInstr     = SPARC.takeRegRegMoveInstr
    mkJumpInstr             = SPARC.mkJumpInstr
    pprInstr                = SPARC.pprInstr
+   mkComment               = const []
    mkStackAllocInstr       = panic "no sparc_mkStackAllocInstr"
    mkStackDeallocInstr     = panic "no sparc_mkStackDeallocInstr"
-
-
diff --git a/compiler/GHC/CmmToAsm/SPARC/Instr.hs b/compiler/GHC/CmmToAsm/SPARC/Instr.hs
index 3aeeb4d976..a5c9e46936 100644
--- a/compiler/GHC/CmmToAsm/SPARC/Instr.hs
+++ b/compiler/GHC/CmmToAsm/SPARC/Instr.hs
@@ -362,7 +362,7 @@ mkSpillInstr
     -> Reg      -- ^ register to spill
     -> Int      -- ^ current stack delta
     -> Int      -- ^ spill slot to use
-    -> Instr
+    -> [Instr]
 
 mkSpillInstr config reg _ slot
  = let  platform = ncgPlatform config
@@ -373,7 +373,7 @@ mkSpillInstr config reg _ slot
                         RcFloat   -> FF32
                         RcDouble  -> FF64
 
-    in ST fmt reg (fpRel (negate off_w))
+    in [ST fmt reg (fpRel (negate off_w))]
 
 
 -- | Make a spill reload instruction.
@@ -382,7 +382,7 @@ mkLoadInstr
     -> Reg      -- ^ register to load into
     -> Int      -- ^ current stack delta
     -> Int      -- ^ spill slot to use
-    -> Instr
+    -> [Instr]
 
 mkLoadInstr config reg _ slot
   = let platform = ncgPlatform config
@@ -393,7 +393,7 @@ mkLoadInstr config reg _ slot
                         RcFloat   -> FF32
                         RcDouble  -> FF64
 
-        in LD fmt (fpRel (- off_w)) reg
+        in [LD fmt (fpRel (- off_w)) reg]
 
 
 --------------------------------------------------------------------------------
diff --git a/compiler/GHC/CmmToAsm/X86.hs b/compiler/GHC/CmmToAsm/X86.hs
index dbeeddc184..3d9780a99c 100644
--- a/compiler/GHC/CmmToAsm/X86.hs
+++ b/compiler/GHC/CmmToAsm/X86.hs
@@ -62,4 +62,4 @@ instance Instruction X86.Instr where
    mkStackAllocInstr       = X86.mkStackAllocInstr
    mkStackDeallocInstr     = X86.mkStackDeallocInstr
    pprInstr                = X86.pprInstr
-
+   mkComment               = const []
diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs
index e48d0922d8..9410537ed8 100644
--- a/compiler/GHC/CmmToAsm/X86/Instr.hs
+++ b/compiler/GHC/CmmToAsm/X86/Instr.hs
@@ -672,15 +672,15 @@ mkSpillInstr
     -> Reg      -- register to spill
     -> Int      -- current stack delta
     -> Int      -- spill slot to use
-    -> Instr
+    -> [Instr]
 
 mkSpillInstr config reg delta slot
   = let off     = spillSlotToOffset platform slot - delta
     in
     case targetClassOfReg platform reg of
-           RcInteger   -> MOV (archWordFormat is32Bit)
-                              (OpReg reg) (OpAddr (spRel platform off))
-           RcDouble    -> MOV FF64 (OpReg reg) (OpAddr (spRel platform off))
+           RcInteger   -> [MOV (archWordFormat is32Bit)
+                                   (OpReg reg) (OpAddr (spRel platform off))]
+           RcDouble    -> [MOV FF64 (OpReg reg) (OpAddr (spRel platform off))]
            _         -> panic "X86.mkSpillInstr: no match"
     where platform = ncgPlatform config
           is32Bit = target32Bit platform
@@ -691,16 +691,16 @@ mkLoadInstr
     -> Reg      -- register to load
     -> Int      -- current stack delta
     -> Int      -- spill slot to use
-    -> Instr
+    -> [Instr]
 
 mkLoadInstr config reg delta slot
   = let off     = spillSlotToOffset platform slot - delta
     in
         case targetClassOfReg platform reg of
-              RcInteger -> MOV (archWordFormat is32Bit)
-                               (OpAddr (spRel platform off)) (OpReg reg)
-              RcDouble  -> MOV FF64 (OpAddr (spRel platform off)) (OpReg reg)
-              _           -> panic "X86.mkLoadInstr"
+              RcInteger -> ([MOV (archWordFormat is32Bit)
+                                 (OpAddr (spRel platform off)) (OpReg reg)])
+              RcDouble  -> ([MOV FF64 (OpAddr (spRel platform off)) (OpReg reg)])
+              _         -> panic "X86.mkLoadInstr"
     where platform = ncgPlatform config
           is32Bit = target32Bit platform
 
diff --git a/compiler/GHC/Driver/Backend.hs b/compiler/GHC/Driver/Backend.hs
index 39789607d9..71cfdd9dec 100644
--- a/compiler/GHC/Driver/Backend.hs
+++ b/compiler/GHC/Driver/Backend.hs
@@ -104,6 +104,7 @@ platformNcgSupported platform = if
          ArchPPC       -> True
          ArchPPC_64 {} -> True
          ArchSPARC     -> True
+         ArchAArch64   -> True
          _             -> False
 
 -- | Will this backend produce an object file on the disk?
diff --git a/compiler/GHC/Linker/Static.hs b/compiler/GHC/Linker/Static.hs
index 4e2367f9e6..b0b8f1c541 100644
--- a/compiler/GHC/Linker/Static.hs
+++ b/compiler/GHC/Linker/Static.hs
@@ -222,7 +222,7 @@ linkBinary' staticLink logger tmpfs dflags unit_env o_files dep_units = do
                              not staticLink &&
                              (platformOS platform == OSDarwin) &&
                              case platformArch platform of
-                               ArchX86 -> True
+                               ArchX86     -> True
                                ArchX86_64  -> True
                                ArchARM {}  -> True
                                ArchAArch64 -> True
diff --git a/compiler/GHC/Platform.hs b/compiler/GHC/Platform.hs
index b3ab1b4020..70c8f9b658 100644
--- a/compiler/GHC/Platform.hs
+++ b/compiler/GHC/Platform.hs
@@ -40,6 +40,7 @@ module GHC.Platform
    , platformSOName
    , platformHsSOName
    , platformSOExt
+   , genericPlatform
    )
 where
 
@@ -83,6 +84,21 @@ platformConstants platform = case platform_constants platform of
   Nothing -> panic "Platform constants not available!"
   Just c  -> c
 
+genericPlatform :: Platform
+genericPlatform = Platform
+   { platformArchOS                  = ArchOS ArchX86_64 OSLinux
+   , platformWordSize                = PW8
+   , platformByteOrder               = LittleEndian
+   , platformUnregisterised          = False
+   , platformHasGnuNonexecStack      = False
+   , platformHasIdentDirective       = False
+   , platformHasSubsectionsViaSymbols= False
+   , platformIsCrossCompiling        = False
+   , platformLeadingUnderscore       = False
+   , platformTablesNextToCode        = True
+   , platform_constants               = Nothing
+   }
+
 data PlatformWordSize
   = PW4 -- ^ A 32-bit platform
   | PW8 -- ^ A 64-bit platform
@@ -223,7 +239,6 @@ data BmiVersion
    | BMI2
    deriving (Eq, Ord)
 
-
 -- | Platform-specific settings formerly hard-coded in Config.hs.
 --
 -- These should probably be all be triaged whether they can be computed from
diff --git a/compiler/GHC/Platform/Reg.hs b/compiler/GHC/Platform/Reg.hs
index c011a59eb0..5edd39df51 100644
--- a/compiler/GHC/Platform/Reg.hs
+++ b/compiler/GHC/Platform/Reg.hs
@@ -181,7 +181,7 @@ realRegsAlias rr1 rr2
 data Reg
         = RegVirtual !VirtualReg
         | RegReal    !RealReg
-        deriving (Eq, Ord)
+        deriving (Eq, Ord, Show)
 
 regSingle :: RegNo -> Reg
 regSingle regNo = RegReal (realRegSingle regNo)
diff --git a/compiler/GHC/StgToCmm/Foreign.hs b/compiler/GHC/StgToCmm/Foreign.hs
index 39f25a7b86..78a01d06d6 100644
--- a/compiler/GHC/StgToCmm/Foreign.hs
+++ b/compiler/GHC/StgToCmm/Foreign.hs
@@ -84,6 +84,7 @@ cgForeignCall (CCall (CCallSpec target cconv safety)) typ stg_args res_ty
             arg_size (arg, _) = max (widthInBytes $ typeWidth $ cmmExprType platform arg)
                                      (platformWordSizeInBytes platform)
         ; cmm_args <- getFCallArgs stg_args typ
+        -- ; traceM $ show cmm_args
         ; (res_regs, res_hints) <- newUnboxedTupleRegs res_ty
         ; let ((call_args, arg_hints), cmm_target)
                 = case target of
diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in
index 40cfde0d3a..5d8d1f9b22 100644
--- a/compiler/ghc.cabal.in
+++ b/compiler/ghc.cabal.in
@@ -203,6 +203,13 @@ Library
         GHC.Cmm.Switch.Implement
         GHC.CmmToAsm
         GHC.Cmm.LRegSet
+        GHC.CmmToAsm.AArch64
+        GHC.CmmToAsm.AArch64.CodeGen
+        GHC.CmmToAsm.AArch64.Cond
+        GHC.CmmToAsm.AArch64.Instr
+        GHC.CmmToAsm.AArch64.Ppr
+        GHC.CmmToAsm.AArch64.RegInfo
+        GHC.CmmToAsm.AArch64.Regs
         GHC.CmmToAsm.BlockLayout
         GHC.CmmToAsm.CFG
         GHC.CmmToAsm.CFG.Dominators
@@ -234,6 +241,7 @@ Library
         GHC.CmmToAsm.Reg.Graph.TrivColorable
         GHC.CmmToAsm.Reg.Graph.X86
         GHC.CmmToAsm.Reg.Linear
+        GHC.CmmToAsm.Reg.Linear.AArch64
         GHC.CmmToAsm.Reg.Linear.Base
         GHC.CmmToAsm.Reg.Linear.FreeRegs
         GHC.CmmToAsm.Reg.Linear.JoinToTargets
diff --git a/includes/CodeGen.Platform.hs b/includes/CodeGen.Platform.hs
index 8c942662e6..0dfac62a3f 100644
--- a/includes/CodeGen.Platform.hs
+++ b/includes/CodeGen.Platform.hs
@@ -1,7 +1,8 @@
 
 import GHC.Cmm.Expr
 #if !(defined(MACHREGS_i386) || defined(MACHREGS_x86_64) \
-    || defined(MACHREGS_sparc) || defined(MACHREGS_powerpc))
+    || defined(MACHREGS_sparc) || defined(MACHREGS_powerpc) \
+    || defined(MACHREGS_aarch64))
 import GHC.Utils.Panic.Plain
 #endif
 import GHC.Platform.Reg
@@ -1016,6 +1017,98 @@ freeReg REG_HpLim = False
 # endif
 freeReg _ = True
 
+#elif defined(MACHREGS_aarch64)
+
+-- stack pointer / zero reg
+freeReg 31 = False
+-- link register
+freeReg 30 = False
+-- frame pointer
+freeReg 29 = False
+-- ip0 -- used for spill offset computations
+freeReg 16 = False
+
+# if defined(REG_Base)
+freeReg REG_Base  = False
+# endif
+# if defined(REG_Sp)
+freeReg REG_Sp    = False
+# endif
+# if defined(REG_SpLim)
+freeReg REG_SpLim = False
+# endif
+# if defined(REG_Hp)
+freeReg REG_Hp    = False
+# endif
+# if defined(REG_HpLim)
+freeReg REG_HpLim = False
+# endif
+
+# if defined(REG_R1)
+freeReg REG_R1    = False
+# endif
+# if defined(REG_R2)
+freeReg REG_R2    = False
+# endif
+# if defined(REG_R3)
+freeReg REG_R3    = False
+# endif
+# if defined(REG_R4)
+freeReg REG_R4    = False
+# endif
+# if defined(REG_R5)
+freeReg REG_R5    = False
+# endif
+# if defined(REG_R6)
+freeReg REG_R6    = False
+# endif
+# if defined(REG_R7)
+freeReg REG_R7    = False
+# endif
+# if defined(REG_R8)
+freeReg REG_R8    = False
+# endif
+
+# if defined(REG_F1)
+freeReg REG_F1    = False
+# endif
+# if defined(REG_F2)
+freeReg REG_F2    = False
+# endif
+# if defined(REG_F3)
+freeReg REG_F3    = False
+# endif
+# if defined(REG_F4)
+freeReg REG_F4    = False
+# endif
+# if defined(REG_F5)
+freeReg REG_F5    = False
+# endif
+# if defined(REG_F6)
+freeReg REG_F6    = False
+# endif
+
+# if defined(REG_D1)
+freeReg REG_D1    = False
+# endif
+# if defined(REG_D2)
+freeReg REG_D2    = False
+# endif
+# if defined(REG_D3)
+freeReg REG_D3    = False
+# endif
+# if defined(REG_D4)
+freeReg REG_D4    = False
+# endif
+# if defined(REG_D5)
+freeReg REG_D5    = False
+# endif
+# if defined(REG_D6)
+freeReg REG_D6    = False
+# endif
+
+freeReg _ = True
+
 #elif defined(MACHREGS_sparc)
 
 -- SPARC regs used by the OS / ABI
diff --git a/includes/rts/storage/GC.h b/includes/rts/storage/GC.h
index 12f0e32f0f..0258811728 100644
--- a/includes/rts/storage/GC.h
+++ b/includes/rts/storage/GC.h
@@ -208,7 +208,6 @@ void flushExec(W_ len, AdjustorExecutable exec_addr);
 #if defined(darwin_HOST_OS)
 AdjustorWritable execToWritable(AdjustorExecutable exec);
 #endif
-
 #if RTS_LINKER_USE_MMAP
 AdjustorWritable allocateWrite(W_ bytes);
 void markExec(W_ bytes, AdjustorWritable writ);
diff --git a/includes/stg/MachRegs.h b/includes/stg/MachRegs.h
index 4b0991891e..d50969b66a 100644
--- a/includes/stg/MachRegs.h
+++ b/includes/stg/MachRegs.h
@@ -548,7 +548,7 @@ the stack. See Note [Overlapping global registers] for implications.
      r30    |  LR     | The Link Register
      r29    |  FP     | The Frame Pointer
    r19-r28  |         | Callee-saved registers
-     r18    |         | The Platform Register, if needed; 
+     r18    |         | The Platform Register, if needed;
             |         | or temporary register
      r17    |  IP1    | The second intra-procedure-call temporary register
      r16    |  IP0    | The first intra-procedure-call scratch register
diff --git a/mk/flavours/devel-cross-ncg.mk b/mk/flavours/devel-cross-ncg.mk
new file mode 100644
index 0000000000..0d5325dd25
--- /dev/null
+++ b/mk/flavours/devel-cross-ncg.mk
@@ -0,0 +1,18 @@
+SRC_HC_OPTS        = -O0 -H64m
+GhcStage1HcOpts    = -O2 -DDEBUG
+GhcStage2HcOpts    = -O0
+GhcLibHcOpts       = -O
+BUILD_PROF_LIBS    = NO
+SplitSections      = NO
+HADDOCK_DOCS       = NO
+BUILD_SPHINX_HTML  = NO
+BUILD_SPHINX_PDF   = NO
+BUILD_MAN          = NO
+WITH_TERMINFO      = NO
+
+BIGNUM_BACKEND       = native
+Stage1Only           = YES
+DYNAMIC_BY_DEFAULT   = NO
+DYNAMIC_GHC_PROGRAMS = NO
+
+libraries/Cabal_dist-install_HC_OPTS += -O0
diff --git a/rts/RtsSymbols.c b/rts/RtsSymbols.c
index 2380ac7135..993af91528 100644
--- a/rts/RtsSymbols.c
+++ b/rts/RtsSymbols.c
@@ -1020,6 +1020,7 @@
       SymI_HasProto(registerInfoProvList)                               \
       SymI_HasProto(lookupIPE)                                          \
       RTS_USER_SIGNALS_SYMBOLS                                          \
+      RTS_LINKER_USE_MMAP_SYMBOLS                                       \
       RTS_INTCHAR_SYMBOLS
 
 
diff --git a/rts/linker/Elf.c b/rts/linker/Elf.c
index 84cb72bd6b..f6a1754257 100644
--- a/rts/linker/Elf.c
+++ b/rts/linker/Elf.c
@@ -774,9 +774,9 @@ ocGetNames_ELF ( ObjectCode* oc )
 
           void * mem = mmapAnonForLinker(size+stub_space);
 
-          if( mem == NULL ) {
-              barf("failed to mmap allocated memory to load section %d. "
-                   "errno = %d", i, errno);
+          if( mem == MAP_FAILED ) {
+            barf("failed to mmap allocated memory to load section %d. "
+                 "errno = %d", i, errno);
           }
 
           /* copy only the image part over; we don't want to copy data
diff --git a/testsuite/tests/ghci/linking/all.T b/testsuite/tests/ghci/linking/all.T
index 1739129a6a..5802c907c2 100644
--- a/testsuite/tests/ghci/linking/all.T
+++ b/testsuite/tests/ghci/linking/all.T
@@ -14,6 +14,9 @@ test('ghcilink003',
      [ unless(doing_ghci, skip),
        # libstdc++ is GCC-specific on FreeBSD. FreeBSD has libc++ though.
        when(opsys('freebsd'), fragile(17739)),
+       # from Big Sur onwards, we can't dlopen libstdc++.dylib
+       # anymore. Will produce:
+       #   dlopen(libstdc++.dylib, 5): image not found
        when(opsys('darwin'), fragile(16083))
      ], makefile_test, ['ghcilink003'])
 
@@ -34,6 +37,9 @@ test('ghcilink006',
      [ unless(doing_ghci, skip),
        # libstdc++ is GCC-specific on FreeBSD. FreeBSD has libc++ though.
        when(opsys('freebsd'), fragile(17739)),
+       # from Big Sur onwards, we can't dlopen libstdc++.dylib
+       # anymore. Will produce:
+       #   dlopen(libstdc++.dylib, 5): image not found
        when(opsys('darwin'), fragile(16083))
      ], makefile_test, ['ghcilink006'])
 
diff --git a/testsuite/tests/llvm/should_compile/all.T b/testsuite/tests/llvm/should_compile/all.T
index dca858dec8..4fb98dac7b 100644
--- a/testsuite/tests/llvm/should_compile/all.T
+++ b/testsuite/tests/llvm/should_compile/all.T
@@ -5,11 +5,19 @@ def f( name, opts ):
 
 setTestOpts(f)
 
+# Apples LLVM Toolchain knows about a `vortex` cpu (and possibly others), that
+# the stock LLVM toolchain doesn't know abotu and will warn about. Let's not
+# have test fail just because of processor name differences due to different
+# LLVM Toolchains.  GHC tries to pass what apple expects (on darwin), but can
+# be used with the stock LLVM toolchain as well.
+def ignore_llvm_and_vortex( msg ):
+     return re.sub(r".* is not a recognized processor for this target.*\n",r"",msg)
+
 # test('T5486', normal, compile, [''])
-test('T5681', normal, compile, [''])
+test('T5681', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile, [''])
 test('T6158', [reqlib('vector'), reqlib('primitive')], compile, ['-package vector -package primitive'])
-test('T7571', cmm_src, compile, ['-no-hs-main'])
+test('T7571', [cmm_src, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile, ['-no-hs-main'])
 test('T7575', unless(wordsize(32), skip), compile, [''])
-test('T8131b', normal, compile, [''])
-test('T11649', normal, compile, [''])
+test('T8131b', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile, [''])
+test('T11649', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile, [''])
 test('T17920fail', cmm_src, compile_fail, ['-no-hs-main'])
diff --git a/testsuite/tests/llvm/should_run/subsections_via_symbols/all.T b/testsuite/tests/llvm/should_run/subsections_via_symbols/all.T
index 16a30e6f0f..22d7cb2b42 100644
--- a/testsuite/tests/llvm/should_run/subsections_via_symbols/all.T
+++ b/testsuite/tests/llvm/should_run/subsections_via_symbols/all.T
@@ -3,9 +3,15 @@
 #
 # Please refer to https://gitlab.haskell.org/ghc/ghc/issues/5019
 # for the subsections_via_symbols.stderr
+def ignore_llvm_and_vortex( msg ):
+     return re.sub(r"You are using an unsupported version of LLVM!.*\n",r"",
+               re.sub(r"Currently only [^ ]* is supported. System LLVM version: .*\n", r"",
+                    re.sub(r"We will try though.*\n",r"",
+                         re.sub(r".* is not a recognized processor for this target.*\n",r"",msg))))
 
 test('subsections_via_symbols',
      [when(not opsys('darwin'), skip),
       only_ways(['optllvm', 'llvm', 'debugllvm']),
-      extra_files(['SubsectionsViaSymbols.hs'])],
+      extra_files(['SubsectionsViaSymbols.hs']),
+      normalise_errmsg_fun(ignore_llvm_and_vortex)],
      makefile_test, [])
diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T
index 085e4b1f12..cd4a92dd7b 100644
--- a/testsuite/tests/rts/all.T
+++ b/testsuite/tests/rts/all.T
@@ -37,6 +37,7 @@ test('derefnull',
       # The output under OS X is too unstable to readily compare
       when(platform('i386-apple-darwin'), [ignore_stderr, exit_code(139)]),
       when(platform('x86_64-apple-darwin'), [ignore_stderr, exit_code(139)]),
+      when(platform('aarch64-apple-darwin'), [ignore_stderr, exit_code(139)]),
       when(opsys('mingw32'), [ignore_stderr, exit_code(11)]),
       when(opsys('mingw32'), [fragile(18548)]),
       # ThreadSanitizer changes the output
author	Moritz Angermann <moritz.angermann@gmail.com>	2020-10-22 12:08:34 +0800
committer	Marge Bot <ben+marge-bot@smart-cactus.org>	2021-06-05 10:29:57 -0400
commit	3b1aa7dba7c006b473855a6b199b15fa3a3d77a0 (patch)
tree	631816f4aef8d28ee596b51b70a0a0ba2df12230
parent	8c90e6c758769b068aea2891b26cc17577b6d36a (diff)
download	haskell-3b1aa7dba7c006b473855a6b199b15fa3a3d77a0.tar.gz