8 files changed, 252 insertions, 57 deletions
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
index 1050fbaa96..83f581cac4 100644
--- a/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
@@ -111,7 +111,10 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts excl
                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
                             ArchPPC_64 _  -> 15
                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
-                            ArchAArch64   -> panic "trivColorable ArchAArch64"
+                            -- We should be able to allocate *a lot* more in princple.
+                            -- essentially all 32 - SP, so 31, we'd trash the link reg
+                            -- as well as the platform and all others though.
+                            ArchAArch64   -> 18
                             ArchAlpha     -> panic "trivColorable ArchAlpha"
                             ArchMipseb    -> panic "trivColorable ArchMipseb"
                             ArchMipsel    -> panic "trivColorable ArchMipsel"
@@ -143,7 +146,10 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclus
                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
                             ArchPPC_64 _  -> 0
                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
-                            ArchAArch64   -> panic "trivColorable ArchAArch64"
+                            -- we can in princple address all the float regs as
+                            -- segments. So we could have 64 Float regs. Or
+                            -- 128 Half regs, or even 256 Byte regs.
+                            ArchAArch64   -> 0
                             ArchAlpha     -> panic "trivColorable ArchAlpha"
                             ArchMipseb    -> panic "trivColorable ArchMipseb"
                             ArchMipsel    -> panic "trivColorable ArchMipsel"
@@ -177,7 +183,7 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclu
                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
                             ArchPPC_64 _  -> 20
                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
-                            ArchAArch64   -> panic "trivColorable ArchAArch64"
+                            ArchAArch64   -> 32
                             ArchAlpha     -> panic "trivColorable ArchAlpha"
                             ArchMipseb    -> panic "trivColorable ArchMipseb"
                             ArchMipsel    -> panic "trivColorable ArchMipsel"
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear.hs b/compiler/GHC/CmmToAsm/Reg/Linear.hs
index 88fdcd6bce..a9a4545f62 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear.hs
@@ -110,10 +110,11 @@ import GHC.CmmToAsm.Reg.Linear.StackMap
 import GHC.CmmToAsm.Reg.Linear.FreeRegs
 import GHC.CmmToAsm.Reg.Linear.Stats
 import GHC.CmmToAsm.Reg.Linear.JoinToTargets
-import qualified GHC.CmmToAsm.Reg.Linear.PPC    as PPC
-import qualified GHC.CmmToAsm.Reg.Linear.SPARC  as SPARC
-import qualified GHC.CmmToAsm.Reg.Linear.X86    as X86
-import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.PPC     as PPC
+import qualified GHC.CmmToAsm.Reg.Linear.SPARC   as SPARC
+import qualified GHC.CmmToAsm.Reg.Linear.X86     as X86
+import qualified GHC.CmmToAsm.Reg.Linear.X86_64  as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64
 import GHC.CmmToAsm.Reg.Target
 import GHC.CmmToAsm.Reg.Liveness
 import GHC.CmmToAsm.Reg.Utils
@@ -121,6 +122,7 @@ import GHC.CmmToAsm.Instr
 import GHC.CmmToAsm.Config
 import GHC.CmmToAsm.Types
 import GHC.Platform.Reg
+import GHC.Platform.Reg.Class (RegClass(..))
 
 import GHC.Cmm.BlockId
 import GHC.Cmm.Dataflow.Collections
@@ -202,7 +204,7 @@ regAlloc _ (CmmProc _ _ _ _)
 --   an entry in the block map or it is the first block.
 --
 linearRegAlloc
-        :: forall instr. Instruction instr
+        :: forall instr. (Instruction instr)
         => NCGConfig
         -> [BlockId] -- ^ entry points
         -> BlockMap RegSet
@@ -220,7 +222,7 @@ linearRegAlloc config entry_ids block_live sccs
       ArchSPARC64    -> panic "linearRegAlloc ArchSPARC64"
       ArchPPC        -> go $ (frInitFreeRegs platform :: PPC.FreeRegs)
       ArchARM _ _ _  -> panic "linearRegAlloc ArchARM"
-      ArchAArch64    -> panic "linearRegAlloc ArchAArch64"
+      ArchAArch64    -> go $ (frInitFreeRegs platform :: AArch64.FreeRegs)
       ArchPPC_64 _   -> go $ (frInitFreeRegs platform :: PPC.FreeRegs)
       ArchAlpha      -> panic "linearRegAlloc ArchAlpha"
       ArchMipseb     -> panic "linearRegAlloc ArchMipseb"
@@ -487,7 +489,7 @@ isInReg src assig | Just (InReg _) <- lookupUFM assig src = True
 
 
 genRaInsn :: forall freeRegs instr.
-             OutputableRegConstraint freeRegs instr
+             (OutputableRegConstraint freeRegs instr)
           => BlockMap RegSet
           -> [instr]
           -> BlockId
@@ -497,7 +499,7 @@ genRaInsn :: forall freeRegs instr.
           -> RegM freeRegs ([instr], [NatBasicBlock instr])
 
 genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
---   pprTraceM "genRaInsn" $ ppr (block_id, instr)
+-- pprTraceM "genRaInsn" $ ppr (block_id, instr)
   platform <- getPlatform
   case regUsageOfInstr platform instr of { RU read written ->
     do
@@ -509,19 +511,20 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
     -- so using nub isn't a problem).
     let virt_read       = nub [ vr      | (RegVirtual vr) <- read ] :: [VirtualReg]
 
-    -- debugging
-{-    freeregs <- getFreeRegsR
-    assig    <- getAssigR
-    pprDebugAndThen (defaultDynFlags Settings{ sTargetPlatform=platform } undefined) trace "genRaInsn"
-        (ppr instr
-                $$ text "r_dying      = " <+> ppr r_dying
-                $$ text "w_dying      = " <+> ppr w_dying
-                $$ text "virt_read    = " <+> ppr virt_read
-                $$ text "virt_written = " <+> ppr virt_written
-                $$ text "freeregs     = " <+> text (show freeregs)
-                $$ text "assig        = " <+> ppr assig)
-        $ do
--}
+--     do
+--         let real_read       = nub [ rr      | (RegReal rr) <- read]
+--         freeregs <- getFreeRegsR
+--         assig    <- getAssigR
+
+--         pprTraceM "genRaInsn"
+--                 (          text "block        = " <+> ppr block_id
+--                         $$ text "instruction  = " <+> ppr instr
+--                         $$ text "r_dying      = " <+> ppr r_dying
+--                         $$ text "w_dying      = " <+> ppr w_dying
+--                         $$ text "read         = " <+> ppr real_read    <+> ppr virt_read
+--                         $$ text "written      = " <+> ppr real_written <+> ppr virt_written
+--                         $$ text "freeregs     = " <+> ppr freeregs
+--                         $$ text "assign       = " <+> ppr assig)
 
     -- (a), (b) allocate real regs for all regs read by this instruction.
     (r_spills, r_allocd) <-
@@ -580,7 +583,6 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
                         Nothing -> x
                         Just y  -> y
 
-
     -- (j) free up stack slots for dead spilled regs
     -- TODO (can't be bothered right now)
 
@@ -592,7 +594,32 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
                                  | src == dst   -> []
                                 _               -> [patched_instr]
 
-    let code = concat [ squashed_instr, w_spills, reverse r_spills, clobber_saves, new_instrs ]
+    -- On the use of @reverse@ below.
+    -- Since we can have spills and reloads produce multiple instructions
+    -- we need to ensure they are emitted in the correct order.  We used to only
+    -- emit single instructions in mkSpill/mkReload/mkRegRegMove.
+    -- As such order of spills and reloads didn't matter.  However,  with
+    -- mutliple instructions potentially issued by those functions we need to be
+    -- careful to not break execution order. Reversing the spills (clobber will
+    -- also spill), will ensure they are emitted in the right order.
+    --
+    -- See also Ticket 19910 for changing the return type from [] to OrdList.
+
+    -- For debugging, uncomment the follow line and the mkComment lines.
+    -- u <- getUniqueR
+    let code = concat [ --  mkComment (text "<genRaInsn(" <> ppr u <> text ")>")
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):squashed>")]
+                        squashed_instr
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):w_spills>")
+                      , reverse w_spills
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):r_spills>")
+                      , reverse r_spills
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):clobber_saves>")
+                      , reverse clobber_saves
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):new_instrs>")
+                      , new_instrs
+                        -- ,mkComment (text "</genRaInsn(" <> ppr u <> text ")>")
+                      ]
 
 --    pprTrace "patched-code" ((vcat $ map (docToSDoc . pprInstr) code)) $ do
 --    pprTrace "pached-fixup" ((ppr fixup_blocks)) $ do
@@ -609,6 +636,7 @@ releaseRegs regs = do
   platform <- getPlatform
   assig <- getAssigR
   free <- getFreeRegsR
+
   let loop assig !free [] = do setAssigR assig; setFreeRegsR free; return ()
       loop assig !free (RegReal rr : rs) = loop assig (frReleaseReg platform rr free) rs
       loop assig !free (r:rs) =
@@ -662,8 +690,9 @@ saveClobberedTemps clobbered dying
 
         (instrs,assig') <- clobber assig [] to_spill
         setAssigR assig'
-        return instrs
-
+        return $ -- mkComment (text "<saveClobberedTemps>") ++
+                 instrs
+--              ++ mkComment (text "</saveClobberedTemps>")
    where
      -- See Note [UniqFM and the register allocator]
      clobber :: RegMap Loc -> [instr] -> [(Unique,RealReg)] -> RegM freeRegs ([instr], RegMap Loc)
@@ -700,7 +729,7 @@ saveClobberedTemps clobbered dying
 
                   let new_assign  = addToUFM_Directly assig temp (InBoth reg slot)
 
-                  clobber new_assign (spill : instrs) rest
+                  clobber new_assign (spill ++ instrs) rest
 
 
 
@@ -714,7 +743,17 @@ clobberRegs []
 clobberRegs clobbered
  = do   platform <- getPlatform
         freeregs <- getFreeRegsR
-        setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs clobbered
+
+        let gpRegs  = frGetFreeRegs platform RcInteger freeregs :: [RealReg]
+            fltRegs = frGetFreeRegs platform RcFloat   freeregs :: [RealReg]
+            dblRegs = frGetFreeRegs platform RcDouble  freeregs :: [RealReg]
+
+        let extra_clobbered = [ r | r <- clobbered
+                                  , r `elem` (gpRegs ++ fltRegs ++ dblRegs) ]
+
+        setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs extra_clobbered
+
+        -- setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs clobbered
 
         assig           <- getAssigR
         setAssigR $! clobber assig (nonDetUFMToList assig)
@@ -909,10 +948,7 @@ allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
                         | (temp_to_push_out, (my_reg :: RealReg)) : _
                                         <- candidates_inReg
                         = do
-                                (spill_insn, slot) <- spillR (RegReal my_reg) temp_to_push_out
-                                let spill_store  = (if reading then id else reverse)
-                                                        [ -- COMMENT (fsLit "spill alloc")
-                                                           spill_insn ]
+                                (spill_store, slot) <- spillR (RegReal my_reg) temp_to_push_out
 
                                 -- record that this temp was spilled
                                 recordSpill (SpillAlloc temp_to_push_out)
@@ -962,7 +998,7 @@ loadTemp vreg (ReadMem slot) hreg spills
  = do
         insn <- loadR (RegReal hreg) slot
         recordSpill (SpillLoad $ getUnique vreg)
-        return  $  {- COMMENT (fsLit "spill load") : -} insn : spills
+        return  $  {- mkComment (text "spill load") : -} insn ++ spills
 
 loadTemp _ _ _ spills =
    return spills
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/AArch64.hs b/compiler/GHC/CmmToAsm/Reg/Linear/AArch64.hs
new file mode 100644
index 0000000000..50299c4e74
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/AArch64.hs
@@ -0,0 +1,137 @@
+module GHC.CmmToAsm.Reg.Linear.AArch64 where
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.AArch64.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GHC.Utils.Outputable
+import GHC.Utils.Panic
+import GHC.Platform
+
+import Data.Word
+
+import GHC.Stack
+-- AArch64 has 32 64bit general purpose register r0..r30, and zr/sp
+-- AArch64 has 32 128bit floating point registers v0..v31 as part of the NEON
+-- extension in Armv8-A.
+--
+-- Armv8-A is a fundamental change to the Arm architecture. It supports the
+-- 64-bit Execution state called “AArch64”, and a new 64-bit instruction set
+-- “A64”. To provide compatibility with the Armv7-A (32-bit architecture)
+-- instruction set, a 32-bit variant of Armv8-A “AArch32” is provided. Most of
+-- existing Armv7-A code can be run in the AArch32 execution state of Armv8-A.
+--
+-- these can be addresses as q/d/s/h/b 0..31, or v.f<size>[idx]
+-- where size is 64, 32, 16, 8, ... and the index i allows us
+-- to access the given part.
+--
+-- History of Arm Adv SIMD
+-- .---------------------------------------------------------------------------.
+-- | Armv6                  | Armv7-A                | Armv8-A AArch64         |
+-- | SIMD extension         | NEON                   | NEON                    |
+-- |===========================================================================|
+-- | - Operates on 32-bit   | - Separate reg. bank,  | - Separate reg. bank,   |
+-- |   GP ARM registers     |    32x64-bit NEON regs |   32x128-bit NEON regs  |
+-- | - 8-bit/16-bit integer | - 8/16/32/64-bit int   | - 8/16/32/64-bit int    |
+-- |                        | - Single percision fp  | - Single percision fp   |
+-- |                        |                        | - Double precision fp   |
+-- |                        |                        | - Single/Double fp are  |
+-- |                        |                        |   IEEE compliant        |
+-- | - 2x16-bit/4x8-bit ops | - Up to 16x8-bit ops   | - Up to 16x8-bit ops    |
+-- |   per instruction      |   per instruction      |   per instruction       |
+-- '---------------------------------------------------------------------------'
+
+data FreeRegs = FreeRegs !Word32 !Word32
+
+instance Show FreeRegs where
+  show (FreeRegs g f) = "FreeRegs: " ++ showBits g ++ "; " ++ showBits f
+
+instance Outputable FreeRegs where
+    ppr (FreeRegs g f) = text "   " <+> foldr (\i x -> pad_int i    <+> x) (text "") [0..31]
+                      $$ text "GPR" <+> foldr (\i x -> show_bit g i <+> x) (text "") [0..31]
+                      $$ text "FPR" <+> foldr (\i x -> show_bit f i <+> x) (text "") [0..31]
+      where pad_int i | i < 10 = char ' ' <> int i
+            pad_int i = int i
+            -- remember bit = 1 means it's available.
+            show_bit bits bit | testBit bits bit = text "  "
+            show_bit _    _ = text " x"
+
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0 0
+
+showBits :: Word32 -> String
+showBits w = map (\i -> if testBit w i then '1' else '0') [0..31]
+
+-- FR instance implementation (See Linear.FreeRegs)
+allocateReg :: HasCallStack => RealReg -> FreeRegs -> FreeRegs
+allocateReg (RealRegSingle r) (FreeRegs g f)
+    | r > 31 && testBit f (r - 32) = FreeRegs g (clearBit f (r - 32))
+    | r < 32 && testBit g r = FreeRegs (clearBit g r) f
+    | r > 31 = panic $ "Linear.AArch64.allocReg: double allocation of float reg v" ++ show (r - 32) ++ "; " ++ showBits f
+    | otherwise = pprPanic "Linear.AArch64.allocReg" $ text ("double allocation of gp reg x" ++ show r ++ "; " ++ showBits g)
+allocateReg _ _ = panic "Linear.AArch64.allocReg: bad reg"
+
+-- we start from 28 downwards... the logic is similar to the ppc logic.
+-- 31 is Stack Pointer
+-- 30 is Link Register
+-- 29 is Stack Frame (by convention)
+-- 19-28 are callee save
+-- the lower ones are all caller save
+
+-- For this reason someone decided to give aarch64 only 6 regs for
+-- STG:
+-- 19: Base
+-- 20: Sp
+-- 21: Hp
+-- 22-27: R1-R6
+-- 28: SpLim
+
+-- For LLVM code gen interop:
+-- See https://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20150119/253722.html
+-- and the current ghccc implementation here:
+-- https://github.com/llvm/llvm-project/blob/161ae1f39816edf667aaa190bce702a86879c7bd/llvm/lib/Target/AArch64/AArch64CallingConvention.td#L324-L363
+-- and https://gitlab.haskell.org/ghc/ghc/-/wikis/commentary/compiler/generated-code
+-- for the STG discussion.
+{- For reference the ghcc from the link above:
+let Entry = 1 in
+def CC_AArch64_GHC : CallingConv<[
+  CCIfType<[iPTR], CCBitConvertToType<i64>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+  CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>,
+  CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>,
+
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+  CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
+]>;
+-}
+
+getFreeRegs :: RegClass -> FreeRegs -> [RealReg]
+getFreeRegs cls (FreeRegs g f)
+  | RcFloat   <- cls = [] -- For now we only support double and integer registers, floats will need to be promoted.
+  | RcDouble  <- cls = go 32 f 31
+  | RcInteger <- cls = go  0 g 18
+    where
+        go _   _ i | i < 0 = []
+        go off x i | testBit x i = RealRegSingle (off + i) : (go off x $! i - 1)
+                   | otherwise   = go off x $! i - 1
+
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform)
+
+releaseReg :: HasCallStack => RealReg -> FreeRegs -> FreeRegs
+releaseReg (RealRegSingle r) (FreeRegs g f)
+  | r > 31 && testBit f (r - 32) = pprPanic "Linear.AArch64.releaseReg" (text  "can't release non-allocated reg v" <> int (r - 32))
+  | r < 32 && testBit g r = pprPanic "Linear.AArch64.releaseReg" (text "can't release non-allocated reg x" <> int r)
+  | r > 31 = FreeRegs g (setBit f (r - 32))
+  | otherwise = FreeRegs (setBit g r) f
+releaseReg _ _ = pprPanic "Linear.AArch64.releaseReg" (text "bad reg")
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
index d501718c4a..3ae0fa140d 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
@@ -25,14 +25,16 @@ import GHC.Platform
 --      getFreeRegs cls f = filter ( (==cls) . regClass . RealReg ) f
 --      allocateReg f r = filter (/= r) f
 
-import qualified GHC.CmmToAsm.Reg.Linear.PPC    as PPC
-import qualified GHC.CmmToAsm.Reg.Linear.SPARC  as SPARC
-import qualified GHC.CmmToAsm.Reg.Linear.X86    as X86
-import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.PPC     as PPC
+import qualified GHC.CmmToAsm.Reg.Linear.SPARC   as SPARC
+import qualified GHC.CmmToAsm.Reg.Linear.X86     as X86
+import qualified GHC.CmmToAsm.Reg.Linear.X86_64  as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64
 
-import qualified GHC.CmmToAsm.PPC.Instr   as PPC.Instr
-import qualified GHC.CmmToAsm.SPARC.Instr as SPARC.Instr
-import qualified GHC.CmmToAsm.X86.Instr   as X86.Instr
+import qualified GHC.CmmToAsm.PPC.Instr     as PPC.Instr
+import qualified GHC.CmmToAsm.SPARC.Instr   as SPARC.Instr
+import qualified GHC.CmmToAsm.X86.Instr     as X86.Instr
+import qualified GHC.CmmToAsm.AArch64.Instr as AArch64.Instr
 
 class Show freeRegs => FR freeRegs where
     frAllocateReg :: Platform -> RealReg -> freeRegs -> freeRegs
@@ -58,6 +60,12 @@ instance FR PPC.FreeRegs where
     frInitFreeRegs = PPC.initFreeRegs
     frReleaseReg   = \_ -> PPC.releaseReg
 
+instance FR AArch64.FreeRegs where
+    frAllocateReg = \_ -> AArch64.allocateReg
+    frGetFreeRegs = \_ -> AArch64.getFreeRegs
+    frInitFreeRegs = AArch64.initFreeRegs
+    frReleaseReg = \_ -> AArch64.releaseReg
+
 instance FR SPARC.FreeRegs where
     frAllocateReg  = SPARC.allocateReg
     frGetFreeRegs  = \_ -> SPARC.getFreeRegs
@@ -73,7 +81,7 @@ maxSpillSlots config = case platformArch (ncgPlatform config) of
    ArchSPARC     -> SPARC.Instr.maxSpillSlots config
    ArchSPARC64   -> panic "maxSpillSlots ArchSPARC64"
    ArchARM _ _ _ -> panic "maxSpillSlots ArchARM"
-   ArchAArch64   -> panic "maxSpillSlots ArchAArch64"
+   ArchAArch64   -> AArch64.Instr.maxSpillSlots config
    ArchPPC_64 _  -> PPC.Instr.maxSpillSlots config
    ArchAlpha     -> panic "maxSpillSlots ArchAlpha"
    ArchMipseb    -> panic "maxSpillSlots ArchMipseb"
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
index d0330a4f6a..cbdf5d030b 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
@@ -26,6 +26,7 @@ import GHC.Cmm.BlockId
 import GHC.Cmm.Dataflow.Collections
 import GHC.Data.Graph.Directed
 import GHC.Utils.Panic
+import GHC.Utils.Monad (concatMapM)
 import GHC.Types.Unique
 import GHC.Types.Unique.FM
 import GHC.Types.Unique.Set
@@ -306,7 +307,7 @@ handleComponent
 --      go via a spill slot.
 --
 handleComponent delta _  (AcyclicSCC (DigraphNode vreg src dsts))
-        = mapM (makeMove delta vreg src) dsts
+        = concatMapM (makeMove delta vreg src) dsts
 
 
 -- Handle some cyclic moves.
@@ -340,7 +341,7 @@ handleComponent delta instr
 
         -- make sure to do all the reloads after all the spills,
         --      so we don't end up clobbering the source values.
-        return ([instrSpill] ++ concat remainingFixUps ++ [instrLoad])
+        return (instrSpill ++ concat remainingFixUps ++ instrLoad)
 
 handleComponent _ _ (CyclicSCC _)
  = panic "Register Allocator: handleComponent cyclic"
@@ -354,7 +355,7 @@ makeMove
     -> Unique   -- ^ unique of the vreg that we're moving.
     -> Loc      -- ^ source location.
     -> Loc      -- ^ destination location.
-    -> RegM freeRegs instr  -- ^ move instruction.
+    -> RegM freeRegs [instr]  -- ^ move instruction.
 
 makeMove delta vreg src dst
  = do config <- getConfig
@@ -363,7 +364,7 @@ makeMove delta vreg src dst
       case (src, dst) of
           (InReg s, InReg d) ->
               do recordSpill (SpillJoinRR vreg)
-                 return $ mkRegRegMoveInstr platform (RegReal s) (RegReal d)
+                 return $ [mkRegRegMoveInstr platform (RegReal s) (RegReal d)]
           (InMem s, InReg d) ->
               do recordSpill (SpillJoinRM vreg)
                  return $ mkLoadInstr config (RegReal d) delta s
@@ -377,4 +378,3 @@ makeMove delta vreg src dst
               panic ("makeMove " ++ show vreg ++ " (" ++ show src ++ ") ("
                   ++ show dst ++ ")"
                   ++ " we don't handle mem->mem moves.")
-
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
index 24a75121b8..ec1cd517ea 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
@@ -121,7 +121,7 @@ makeRAStats state
 
 
 spillR :: Instruction instr
-       => Reg -> Unique -> RegM freeRegs (instr, Int)
+       => Reg -> Unique -> RegM freeRegs ([instr], Int)
 
 spillR reg temp = mkRegM $ \s ->
   let (stack1,slot) = getStackSlotFor (ra_stack s) temp
@@ -131,7 +131,7 @@ spillR reg temp = mkRegM $ \s ->
 
 
 loadR :: Instruction instr
-      => Reg -> Int -> RegM freeRegs instr
+      => Reg -> Int -> RegM freeRegs [instr]
 
 loadR reg slot = mkRegM $ \s ->
   RA_Result s (mkLoadInstr (ra_config s) reg (ra_delta s) slot)
diff --git a/compiler/GHC/CmmToAsm/Reg/Liveness.hs b/compiler/GHC/CmmToAsm/Reg/Liveness.hs
index 4d70533624..ad8190270f 100644
--- a/compiler/GHC/CmmToAsm/Reg/Liveness.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Liveness.hs
@@ -174,6 +174,8 @@ instance Instruction instr => Instruction (InstrSR instr) where
 
         pprInstr platform i = ppr (fmap (pprInstr platform) i)
 
+        mkComment               = fmap Instr . mkComment
+
 
 -- | An instruction with liveness information.
 data LiveInstr instr
@@ -565,16 +567,20 @@ stripLiveBlock config (BasicBlock i lis)
  where  (instrs', _)
                 = runState (spillNat [] lis) 0
 
+        -- spillNat :: [instr] -> [LiveInstr instr] -> State Int [instr]
+        spillNat :: Instruction instr => [instr] -> [LiveInstr instr] -> State Int [instr]
         spillNat acc []
          =      return (reverse acc)
 
+        -- The SPILL/RELOAD cases do not appear to be exercised by our codegens
+        --
         spillNat acc (LiveInstr (SPILL reg slot) _ : instrs)
          = do   delta   <- get
-                spillNat (mkSpillInstr config reg delta slot : acc) instrs
+                spillNat (mkSpillInstr config reg delta slot ++ acc) instrs
 
         spillNat acc (LiveInstr (RELOAD slot reg) _ : instrs)
          = do   delta   <- get
-                spillNat (mkLoadInstr config reg delta slot : acc) instrs
+                spillNat (mkLoadInstr config reg delta slot ++ acc) instrs
 
         spillNat acc (LiveInstr (Instr instr) _ : instrs)
          | Just i <- takeDeltaInstr instr
diff --git a/compiler/GHC/CmmToAsm/Reg/Target.hs b/compiler/GHC/CmmToAsm/Reg/Target.hs
index 4611c3a8e8..22b22e21cc 100644
--- a/compiler/GHC/CmmToAsm/Reg/Target.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Target.hs
@@ -34,6 +34,8 @@ import qualified GHC.CmmToAsm.X86.Regs       as X86
 import qualified GHC.CmmToAsm.X86.RegInfo    as X86
 import qualified GHC.CmmToAsm.PPC.Regs       as PPC
 import qualified GHC.CmmToAsm.SPARC.Regs     as SPARC
+import qualified GHC.CmmToAsm.AArch64.Regs   as AArch64
+
 
 targetVirtualRegSqueeze :: Platform -> RegClass -> VirtualReg -> Int
 targetVirtualRegSqueeze platform
@@ -46,7 +48,7 @@ targetVirtualRegSqueeze platform
       ArchSPARC64   -> panic "targetVirtualRegSqueeze ArchSPARC64"
       ArchPPC_64 _  -> PPC.virtualRegSqueeze
       ArchARM _ _ _ -> panic "targetVirtualRegSqueeze ArchARM"
-      ArchAArch64   -> panic "targetVirtualRegSqueeze ArchAArch64"
+      ArchAArch64   -> AArch64.virtualRegSqueeze
       ArchAlpha     -> panic "targetVirtualRegSqueeze ArchAlpha"
       ArchMipseb    -> panic "targetVirtualRegSqueeze ArchMipseb"
       ArchMipsel    -> panic "targetVirtualRegSqueeze ArchMipsel"
@@ -66,7 +68,7 @@ targetRealRegSqueeze platform
       ArchSPARC64   -> panic "targetRealRegSqueeze ArchSPARC64"
       ArchPPC_64 _  -> PPC.realRegSqueeze
       ArchARM _ _ _ -> panic "targetRealRegSqueeze ArchARM"
-      ArchAArch64   -> panic "targetRealRegSqueeze ArchAArch64"
+      ArchAArch64   -> AArch64.realRegSqueeze
       ArchAlpha     -> panic "targetRealRegSqueeze ArchAlpha"
       ArchMipseb    -> panic "targetRealRegSqueeze ArchMipseb"
       ArchMipsel    -> panic "targetRealRegSqueeze ArchMipsel"
@@ -85,7 +87,7 @@ targetClassOfRealReg platform
       ArchSPARC64   -> panic "targetClassOfRealReg ArchSPARC64"
       ArchPPC_64 _  -> PPC.classOfRealReg
       ArchARM _ _ _ -> panic "targetClassOfRealReg ArchARM"
-      ArchAArch64   -> panic "targetClassOfRealReg ArchAArch64"
+      ArchAArch64   -> AArch64.classOfRealReg
       ArchAlpha     -> panic "targetClassOfRealReg ArchAlpha"
       ArchMipseb    -> panic "targetClassOfRealReg ArchMipseb"
       ArchMipsel    -> panic "targetClassOfRealReg ArchMipsel"
@@ -104,7 +106,7 @@ targetMkVirtualReg platform
       ArchSPARC64   -> panic "targetMkVirtualReg ArchSPARC64"
       ArchPPC_64 _  -> PPC.mkVirtualReg
       ArchARM _ _ _ -> panic "targetMkVirtualReg ArchARM"
-      ArchAArch64   -> panic "targetMkVirtualReg ArchAArch64"
+      ArchAArch64   -> AArch64.mkVirtualReg
       ArchAlpha     -> panic "targetMkVirtualReg ArchAlpha"
       ArchMipseb    -> panic "targetMkVirtualReg ArchMipseb"
       ArchMipsel    -> panic "targetMkVirtualReg ArchMipsel"
@@ -123,7 +125,7 @@ targetRegDotColor platform
       ArchSPARC64   -> panic "targetRegDotColor ArchSPARC64"
       ArchPPC_64 _  -> PPC.regDotColor
       ArchARM _ _ _ -> panic "targetRegDotColor ArchARM"
-      ArchAArch64   -> panic "targetRegDotColor ArchAArch64"
+      ArchAArch64   -> AArch64.regDotColor
       ArchAlpha     -> panic "targetRegDotColor ArchAlpha"
       ArchMipseb    -> panic "targetRegDotColor ArchMipseb"
       ArchMipsel    -> panic "targetRegDotColor ArchMipsel"