author    Sylvain Henry <sylvain@haskus.fr>  2020-02-22 15:05:20 +0100
committer Marge Bot <ben+marge-bot@smart-cactus.org>  2020-02-24 20:55:25 -0500
commit    1b1067d14b656bbbfa7c47f156ec2700c9751549 (patch)
tree      32346e3c4c3f89117190b36364144d85dc260e05 /compiler/GHC
parent    354e2787be08fb6d973de1a39e58080ff8e107f8 (diff)
Modules: CmmToAsm (#13009)
Diffstat (limited to 'compiler/GHC')
-rw-r--r-- compiler/GHC/CmmToAsm.hs | 1236
-rw-r--r-- compiler/GHC/CmmToAsm/BlockLayout.hs | 895
-rw-r--r-- compiler/GHC/CmmToAsm/CFG.hs | 1320
-rw-r--r-- compiler/GHC/CmmToAsm/CFG/Dominators.hs | 597
-rw-r--r-- compiler/GHC/CmmToAsm/CPrim.hs | 133
-rw-r--r-- compiler/GHC/CmmToAsm/Dwarf.hs | 269
-rw-r--r-- compiler/GHC/CmmToAsm/Dwarf/Constants.hs | 229
-rw-r--r-- compiler/GHC/CmmToAsm/Dwarf/Types.hs | 612
-rw-r--r-- compiler/GHC/CmmToAsm/Format.hs | 105
-rw-r--r-- compiler/GHC/CmmToAsm/Instr.hs | 202
-rw-r--r-- compiler/GHC/CmmToAsm/Monad.hs | 294
-rw-r--r-- compiler/GHC/CmmToAsm/PIC.hs | 837
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/CodeGen.hs | 2455
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/Cond.hs | 63
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/Instr.hs | 713
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/Ppr.hs | 994
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/RegInfo.hs | 80
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/Regs.hs | 333
-rw-r--r-- compiler/GHC/CmmToAsm/Ppr.hs | 275
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph.hs | 472
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/Base.hs | 163
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs | 99
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs | 382
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs | 616
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs | 317
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs | 346
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs | 274
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/X86.hs | 161
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear.hs | 920
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/Base.hs | 141
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs | 89
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs | 378
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs | 60
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs | 188
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs | 61
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/State.hs | 184
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs | 87
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/X86.hs | 52
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs | 53
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Liveness.hs | 1025
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Target.hs | 135
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/AddrMode.hs | 44
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Base.hs | 77
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen.hs | 700
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs | 74
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs | 119
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs | 110
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs | 156
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs | 692
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot | 16
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs | 216
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs | 69
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Cond.hs | 54
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Imm.hs | 67
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Instr.hs | 481
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Ppr.hs | 645
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Regs.hs | 259
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs | 74
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Stack.hs | 59
-rw-r--r-- compiler/GHC/CmmToAsm/X86/CodeGen.hs | 3747
-rw-r--r-- compiler/GHC/CmmToAsm/X86/Cond.hs | 109
-rw-r--r-- compiler/GHC/CmmToAsm/X86/Instr.hs | 1056
-rw-r--r-- compiler/GHC/CmmToAsm/X86/Ppr.hs | 1014
-rw-r--r-- compiler/GHC/CmmToAsm/X86/RegInfo.hs | 73
-rw-r--r-- compiler/GHC/CmmToAsm/X86/Regs.hs | 442
-rw-r--r-- compiler/GHC/CmmToC.hs | 2
-rw-r--r-- compiler/GHC/Driver/CodeOutput.hs | 2
-rw-r--r-- compiler/GHC/Llvm/Types.hs | 2
-rw-r--r-- compiler/GHC/Platform/Reg.hs | 241
-rw-r--r-- compiler/GHC/Platform/Reg/Class.hs | 32
-rw-r--r-- compiler/GHC/Platform/Regs.hs | 2
71 files changed, 28475 insertions, 4 deletions
diff --git a/compiler/GHC/CmmToAsm.hs b/compiler/GHC/CmmToAsm.hs
new file mode 100644
index 0000000000..8dc9b61198
--- /dev/null
+++ b/compiler/GHC/CmmToAsm.hs
@@ -0,0 +1,1236 @@
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-- This is the top-level module in the native code generator.
+--
+-- -----------------------------------------------------------------------------
+
+{-# LANGUAGE BangPatterns, CPP, GADTs, ScopedTypeVariables, PatternSynonyms,
+ DeriveFunctor #-}
+
+#if !defined(GHC_LOADED_INTO_GHCI)
+{-# LANGUAGE UnboxedTuples #-}
+#endif
+
+{-# OPTIONS_GHC -Wno-incomplete-record-updates #-}
+
+module GHC.CmmToAsm (
+ -- * Module entry point
+ nativeCodeGen
+
+ -- * Test-only exports: see trac #12744
+ -- used by testGraphNoSpills, which needs to access
+ -- the register allocator intermediate data structures
+ -- cmmNativeGen emits
+ , cmmNativeGen
+ , NcgImpl(..)
+ , x86NcgImpl
+ ) where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import qualified GHC.CmmToAsm.X86.CodeGen as X86.CodeGen
+import qualified GHC.CmmToAsm.X86.Regs as X86.Regs
+import qualified GHC.CmmToAsm.X86.Instr as X86.Instr
+import qualified GHC.CmmToAsm.X86.Ppr as X86.Ppr
+
+import qualified GHC.CmmToAsm.SPARC.CodeGen as SPARC.CodeGen
+import qualified GHC.CmmToAsm.SPARC.Regs as SPARC.Regs
+import qualified GHC.CmmToAsm.SPARC.Instr as SPARC.Instr
+import qualified GHC.CmmToAsm.SPARC.Ppr as SPARC.Ppr
+import qualified GHC.CmmToAsm.SPARC.ShortcutJump as SPARC.ShortcutJump
+import qualified GHC.CmmToAsm.SPARC.CodeGen.Expand as SPARC.CodeGen.Expand
+
+import qualified GHC.CmmToAsm.PPC.CodeGen as PPC.CodeGen
+import qualified GHC.CmmToAsm.PPC.Regs as PPC.Regs
+import qualified GHC.CmmToAsm.PPC.RegInfo as PPC.RegInfo
+import qualified GHC.CmmToAsm.PPC.Instr as PPC.Instr
+import qualified GHC.CmmToAsm.PPC.Ppr as PPC.Ppr
+
+import GHC.CmmToAsm.Reg.Liveness
+import qualified GHC.CmmToAsm.Reg.Linear as Linear
+
+import qualified GraphColor as Color
+import qualified GHC.CmmToAsm.Reg.Graph as Color
+import qualified GHC.CmmToAsm.Reg.Graph.Stats as Color
+import qualified GHC.CmmToAsm.Reg.Graph.TrivColorable as Color
+
+import AsmUtils
+import GHC.CmmToAsm.Reg.Target
+import GHC.Platform
+import GHC.CmmToAsm.BlockLayout as BlockLayout
+import Config
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.PIC
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.CFG
+import GHC.CmmToAsm.Dwarf
+import GHC.Cmm.DebugBlock
+
+import GHC.Cmm.BlockId
+import GHC.StgToCmm.CgUtils ( fixStgRegisters )
+import GHC.Cmm
+import GHC.Cmm.Utils
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Opt ( cmmMachOpFold )
+import GHC.Cmm.Ppr
+import GHC.Cmm.CLabel
+
+import UniqFM
+import UniqSupply
+import GHC.Driver.Session
+import Util
+
+import BasicTypes ( Alignment )
+import qualified Pretty
+import BufWrite
+import Outputable
+import FastString
+import UniqSet
+import ErrUtils
+import Module
+import Stream (Stream)
+import qualified Stream
+
+-- DEBUGGING ONLY
+--import OrdList
+
+import Data.List
+import Data.Maybe
+import Data.Ord ( comparing )
+import Control.Exception
+import Control.Monad
+import System.IO
+
+{-
+The native-code generator has machine-independent and
+machine-dependent modules.
+
+This module ("AsmCodeGen") is the top-level machine-independent
+module. Before entering machine-dependent land, we do some
+machine-independent optimisations (defined below) on the
+'CmmStmts's.
+
+We convert to the machine-specific 'Instr' datatype with
+'cmmCodeGen', assuming an infinite supply of registers. We then use
+a machine-independent register allocator ('regAlloc') to rejoin
+reality. Obviously, 'regAlloc' has machine-specific helper
+functions (see about "RegAllocInfo" below).
+
+Finally, we order the basic blocks of the function so as to minimise
+the number of jumps between blocks, by utilising fallthrough wherever
+possible.
+
+The machine-dependent bits break down as follows:
+
+ * ["MachRegs"] Everything about the target platform's machine
+ registers (and immediate operands, and addresses, which tend to
+ intermingle/interact with registers).
+
+ * ["MachInstrs"] Includes the 'Instr' datatype (possibly should
+ have a module of its own), plus a miscellany of other things
+ (e.g., 'targetDoubleSize', 'smStablePtrTable', ...)
+
+ * ["MachCodeGen"] is where 'Cmm' stuff turns into
+ machine instructions.
+
+ * ["PprMach"] 'pprInstr' turns an 'Instr' into text (well, really
+ a 'SDoc').
+
+ * ["RegAllocInfo"] In the register allocator, we manipulate
+ 'MRegsState's, which are 'BitSet's, one bit per machine register.
+ When we want to say something about a specific machine register
+ (e.g., ``it gets clobbered by this instruction''), we set/unset
+ its bit. Obviously, we do this 'BitSet' thing for efficiency
+ reasons.
+
+ The 'RegAllocInfo' module collects together the machine-specific
+ info needed to do register allocation.
+
+ * ["RegisterAlloc"] The (machine-independent) register allocator.
+-}
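+
+-- A rough, illustrative sketch of the per-procedure flow described above (the
+-- real driver is 'cmmNativeGen' below; the bindings here are not actual code):
+--
+-- >  (opt_cmm, imports) = cmmToCmm dflags this_mod cmm   -- generic Cmm-to-Cmm passes
+-- >  native             = genMachCode ... opt_cmm        -- instruction selection, virtual regs
+-- >  alloced            = regAlloc native                -- Linear.regAlloc or Color.regAlloc
+-- >  sequenced          = sequenceTop ... alloced        -- block layout, maximise fallthrough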
+
+--------------------
+nativeCodeGen :: forall a . DynFlags -> Module -> ModLocation -> Handle -> UniqSupply
+ -> Stream IO RawCmmGroup a
+ -> IO a
+nativeCodeGen dflags this_mod modLoc h us cmms
+ = let platform = targetPlatform dflags
+ nCG' :: ( Outputable statics, Outputable instr
+ , Outputable jumpDest, Instruction instr)
+ => NcgImpl statics instr jumpDest -> IO a
+ nCG' ncgImpl = nativeCodeGen' dflags this_mod modLoc ncgImpl h us cmms
+ in case platformArch platform of
+ ArchX86 -> nCG' (x86NcgImpl dflags)
+ ArchX86_64 -> nCG' (x86_64NcgImpl dflags)
+ ArchPPC -> nCG' (ppcNcgImpl dflags)
+ ArchS390X -> panic "nativeCodeGen: No NCG for S390X"
+ ArchSPARC -> nCG' (sparcNcgImpl dflags)
+ ArchSPARC64 -> panic "nativeCodeGen: No NCG for SPARC64"
+ ArchARM {} -> panic "nativeCodeGen: No NCG for ARM"
+ ArchARM64 -> panic "nativeCodeGen: No NCG for ARM64"
+ ArchPPC_64 _ -> nCG' (ppcNcgImpl dflags)
+ ArchAlpha -> panic "nativeCodeGen: No NCG for Alpha"
+ ArchMipseb -> panic "nativeCodeGen: No NCG for mipseb"
+ ArchMipsel -> panic "nativeCodeGen: No NCG for mipsel"
+ ArchUnknown -> panic "nativeCodeGen: No NCG for unknown arch"
+ ArchJavaScript-> panic "nativeCodeGen: No NCG for JavaScript"
+
+x86NcgImpl :: DynFlags -> NcgImpl (Alignment, RawCmmStatics)
+ X86.Instr.Instr X86.Instr.JumpDest
+x86NcgImpl dflags
+ = (x86_64NcgImpl dflags)
+
+x86_64NcgImpl :: DynFlags -> NcgImpl (Alignment, RawCmmStatics)
+ X86.Instr.Instr X86.Instr.JumpDest
+x86_64NcgImpl dflags
+ = NcgImpl {
+ cmmTopCodeGen = X86.CodeGen.cmmTopCodeGen
+ ,generateJumpTableForInstr = X86.CodeGen.generateJumpTableForInstr dflags
+ ,getJumpDestBlockId = X86.Instr.getJumpDestBlockId
+ ,canShortcut = X86.Instr.canShortcut
+ ,shortcutStatics = X86.Instr.shortcutStatics
+ ,shortcutJump = X86.Instr.shortcutJump
+ ,pprNatCmmDecl = X86.Ppr.pprNatCmmDecl
+ ,maxSpillSlots = X86.Instr.maxSpillSlots dflags
+ ,allocatableRegs = X86.Regs.allocatableRegs platform
+ ,ncgAllocMoreStack = X86.Instr.allocMoreStack platform
+ ,ncgExpandTop = id
+ ,ncgMakeFarBranches = const id
+ ,extractUnwindPoints = X86.CodeGen.extractUnwindPoints
+ ,invertCondBranches = X86.CodeGen.invertCondBranches
+ }
+ where platform = targetPlatform dflags
+
+ppcNcgImpl :: DynFlags -> NcgImpl RawCmmStatics PPC.Instr.Instr PPC.RegInfo.JumpDest
+ppcNcgImpl dflags
+ = NcgImpl {
+ cmmTopCodeGen = PPC.CodeGen.cmmTopCodeGen
+ ,generateJumpTableForInstr = PPC.CodeGen.generateJumpTableForInstr dflags
+ ,getJumpDestBlockId = PPC.RegInfo.getJumpDestBlockId
+ ,canShortcut = PPC.RegInfo.canShortcut
+ ,shortcutStatics = PPC.RegInfo.shortcutStatics
+ ,shortcutJump = PPC.RegInfo.shortcutJump
+ ,pprNatCmmDecl = PPC.Ppr.pprNatCmmDecl
+ ,maxSpillSlots = PPC.Instr.maxSpillSlots dflags
+ ,allocatableRegs = PPC.Regs.allocatableRegs platform
+ ,ncgAllocMoreStack = PPC.Instr.allocMoreStack platform
+ ,ncgExpandTop = id
+ ,ncgMakeFarBranches = PPC.Instr.makeFarBranches
+ ,extractUnwindPoints = const []
+ ,invertCondBranches = \_ _ -> id
+ }
+ where platform = targetPlatform dflags
+
+sparcNcgImpl :: DynFlags -> NcgImpl RawCmmStatics SPARC.Instr.Instr SPARC.ShortcutJump.JumpDest
+sparcNcgImpl dflags
+ = NcgImpl {
+ cmmTopCodeGen = SPARC.CodeGen.cmmTopCodeGen
+ ,generateJumpTableForInstr = SPARC.CodeGen.generateJumpTableForInstr dflags
+ ,getJumpDestBlockId = SPARC.ShortcutJump.getJumpDestBlockId
+ ,canShortcut = SPARC.ShortcutJump.canShortcut
+ ,shortcutStatics = SPARC.ShortcutJump.shortcutStatics
+ ,shortcutJump = SPARC.ShortcutJump.shortcutJump
+ ,pprNatCmmDecl = SPARC.Ppr.pprNatCmmDecl
+ ,maxSpillSlots = SPARC.Instr.maxSpillSlots dflags
+ ,allocatableRegs = SPARC.Regs.allocatableRegs
+ ,ncgAllocMoreStack = noAllocMoreStack
+ ,ncgExpandTop = map SPARC.CodeGen.Expand.expandTop
+ ,ncgMakeFarBranches = const id
+ ,extractUnwindPoints = const []
+ ,invertCondBranches = \_ _ -> id
+ }
+
+--
+-- Allocating more stack space for spilling is currently only
+-- supported for the linear register allocator on x86/x86_64; the rest
+-- default to the panic below. To support allocating extra stack on
+-- more platforms, provide a definition of ncgAllocMoreStack.
+--
+noAllocMoreStack :: Int -> NatCmmDecl statics instr
+ -> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)])
+noAllocMoreStack amount _
+ = panic $ "Register allocator: out of stack slots (need " ++ show amount ++ ")\n"
+ ++ " If you are trying to compile SHA1.hs from the crypto library then this\n"
+ ++ " is a known limitation in the linear allocator.\n"
+ ++ "\n"
+ ++ " Try enabling the graph colouring allocator with -fregs-graph instead."
+ ++ " You can still file a bug report if you like.\n"
+
+
+-- | Data accumulated during code generation. Mostly about statistics,
+-- but also collects debug data for DWARF generation.
+data NativeGenAcc statics instr
+ = NGS { ngs_imports :: ![[CLabel]]
+ , ngs_natives :: ![[NatCmmDecl statics instr]]
+ -- ^ Native code generated, for statistics. This might
+ -- hold a lot of data, so it is important to clear this
+ -- field as early as possible if it isn't actually
+ -- required.
+ , ngs_colorStats :: ![[Color.RegAllocStats statics instr]]
+ , ngs_linearStats :: ![[Linear.RegAllocStats]]
+ , ngs_labels :: ![Label]
+ , ngs_debug :: ![DebugBlock]
+ , ngs_dwarfFiles :: !DwarfFiles
+ , ngs_unwinds :: !(LabelMap [UnwindPoint])
+ -- ^ see Note [Unwinding information in the NCG]
+ -- and Note [What is this unwinding business?] in Debug.
+ }
+
+{-
+Note [Unwinding information in the NCG]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Unwind information is a type of metadata which allows a debugging tool
+to reconstruct the values of machine registers at the time a procedure was
+entered. For the most part, the production of unwind information is handled by
+the Cmm stage, where it is represented by CmmUnwind nodes.
+
+Unfortunately, the Cmm stage doesn't know everything necessary to produce
+accurate unwinding information. For instance, the x86-64 calling convention
+requires that the stack pointer be aligned to 16 bytes, which in turn means that
+GHC must sometimes add padding to $sp prior to performing a foreign call. When
+this happens unwind information must be updated accordingly.
+For this reason, we make the NCG backends responsible for producing
+unwinding tables (with the extractUnwindPoints function in NcgImpl).
+
+We accumulate the produced unwind tables over CmmGroups in the ngs_unwinds
+field of NativeGenAcc. This is a label map which contains an entry for each
+procedure, containing a list of unwinding points (e.g. a label and an associated
+unwinding table).
+
+See also Note [What is this unwinding business?] in Debug.
+-}
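+
+-- A minimal, illustrative sketch (not part of the pipeline): each generated
+-- chunk's unwind table is merged into the accumulator with 'mapUnion', which
+-- is what 'cmmNativeGens' does below.
+--
+-- > recordUnwinds :: LabelMap [UnwindPoint] -> NativeGenAcc statics instr
+-- >               -> NativeGenAcc statics instr
+-- > recordUnwinds new ngs = ngs { ngs_unwinds = ngs_unwinds ngs `mapUnion` new }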
+
+nativeCodeGen' :: (Outputable statics, Outputable instr,Outputable jumpDest,
+ Instruction instr)
+ => DynFlags
+ -> Module -> ModLocation
+ -> NcgImpl statics instr jumpDest
+ -> Handle
+ -> UniqSupply
+ -> Stream IO RawCmmGroup a
+ -> IO a
+nativeCodeGen' dflags this_mod modLoc ncgImpl h us cmms
+ = do
+ -- BufHandle is a performance hack. We could hide it inside
+ -- Pretty if it weren't for the fact that we do lots of little
+ -- printDocs here (in order to do codegen in constant space).
+ bufh <- newBufHandle h
+ let ngs0 = NGS [] [] [] [] [] [] emptyUFM mapEmpty
+ (ngs, us', a) <- cmmNativeGenStream dflags this_mod modLoc ncgImpl bufh us
+ cmms ngs0
+ _ <- finishNativeGen dflags modLoc bufh us' ngs
+ return a
+
+finishNativeGen :: Instruction instr
+ => DynFlags
+ -> ModLocation
+ -> BufHandle
+ -> UniqSupply
+ -> NativeGenAcc statics instr
+ -> IO UniqSupply
+finishNativeGen dflags modLoc bufh@(BufHandle _ _ h) us ngs
+ = withTimingSilent dflags (text "NCG") (`seq` ()) $ do
+ -- Write debug data and finish
+ let emitDw = debugLevel dflags > 0
+ us' <- if not emitDw then return us else do
+ (dwarf, us') <- dwarfGen dflags modLoc us (ngs_debug ngs)
+ emitNativeCode dflags bufh dwarf
+ return us'
+ bFlush bufh
+
+ -- dump global NCG stats for graph coloring allocator
+ let stats = concat (ngs_colorStats ngs)
+ unless (null stats) $ do
+
+ -- build the global register conflict graph
+ let graphGlobal
+ = foldl' Color.union Color.initGraph
+ $ [ Color.raGraph stat
+ | stat@Color.RegAllocStatsStart{} <- stats]
+
+ dump_stats (Color.pprStats stats graphGlobal)
+
+ let platform = targetPlatform dflags
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_conflicts "Register conflict graph"
+ FormatText
+ $ Color.dotGraph
+ (targetRegDotColor platform)
+ (Color.trivColorable platform
+ (targetVirtualRegSqueeze platform)
+ (targetRealRegSqueeze platform))
+ $ graphGlobal
+
+
+ -- dump global NCG stats for linear allocator
+ let linearStats = concat (ngs_linearStats ngs)
+ unless (null linearStats) $
+ dump_stats (Linear.pprStats (concat (ngs_natives ngs)) linearStats)
+
+ -- write out the imports
+ printSDocLn Pretty.LeftMode dflags h (mkCodeStyle AsmStyle)
+ $ makeImportsDoc dflags (concat (ngs_imports ngs))
+ return us'
+ where
+ dump_stats = dumpAction dflags (mkDumpStyle dflags alwaysQualify)
+ (dumpOptionsFromFlag Opt_D_dump_asm_stats) "NCG stats"
+ FormatText
+
+cmmNativeGenStream :: (Outputable statics, Outputable instr
+ ,Outputable jumpDest, Instruction instr)
+ => DynFlags
+ -> Module -> ModLocation
+ -> NcgImpl statics instr jumpDest
+ -> BufHandle
+ -> UniqSupply
+ -> Stream IO RawCmmGroup a
+ -> NativeGenAcc statics instr
+ -> IO (NativeGenAcc statics instr, UniqSupply, a)
+
+cmmNativeGenStream dflags this_mod modLoc ncgImpl h us cmm_stream ngs
+ = do r <- Stream.runStream cmm_stream
+ case r of
+ Left a ->
+ return (ngs { ngs_imports = reverse $ ngs_imports ngs
+ , ngs_natives = reverse $ ngs_natives ngs
+ , ngs_colorStats = reverse $ ngs_colorStats ngs
+ , ngs_linearStats = reverse $ ngs_linearStats ngs
+ },
+ us,
+ a)
+ Right (cmms, cmm_stream') -> do
+ (us', ngs'') <-
+ withTimingSilent
+ dflags
+ ncglabel (\(a, b) -> a `seq` b `seq` ()) $ do
+ -- Generate debug information
+ let debugFlag = debugLevel dflags > 0
+ !ndbgs | debugFlag = cmmDebugGen modLoc cmms
+ | otherwise = []
+ dbgMap = debugToMap ndbgs
+
+ -- Generate native code
+ (ngs',us') <- cmmNativeGens dflags this_mod modLoc ncgImpl h
+ dbgMap us cmms ngs 0
+
+ -- Link native code information into debug blocks
+ -- See Note [What is this unwinding business?] in Debug.
+ let !ldbgs = cmmDebugLink (ngs_labels ngs') (ngs_unwinds ngs') ndbgs
+ unless (null ldbgs) $
+ dumpIfSet_dyn dflags Opt_D_dump_debug "Debug Infos" FormatText
+ (vcat $ map ppr ldbgs)
+
+ -- Accumulate debug information for emission in finishNativeGen.
+ let ngs'' = ngs' { ngs_debug = ngs_debug ngs' ++ ldbgs, ngs_labels = [] }
+ return (us', ngs'')
+
+ cmmNativeGenStream dflags this_mod modLoc ncgImpl h us'
+ cmm_stream' ngs''
+
+ where ncglabel = text "NCG"
+
+-- | Do native code generation on all these cmms.
+--
+cmmNativeGens :: forall statics instr jumpDest.
+ (Outputable statics, Outputable instr
+ ,Outputable jumpDest, Instruction instr)
+ => DynFlags
+ -> Module -> ModLocation
+ -> NcgImpl statics instr jumpDest
+ -> BufHandle
+ -> LabelMap DebugBlock
+ -> UniqSupply
+ -> [RawCmmDecl]
+ -> NativeGenAcc statics instr
+ -> Int
+ -> IO (NativeGenAcc statics instr, UniqSupply)
+
+cmmNativeGens dflags this_mod modLoc ncgImpl h dbgMap = go
+ where
+ go :: UniqSupply -> [RawCmmDecl]
+ -> NativeGenAcc statics instr -> Int
+ -> IO (NativeGenAcc statics instr, UniqSupply)
+
+ go us [] ngs !_ =
+ return (ngs, us)
+
+ go us (cmm : cmms) ngs count = do
+ let fileIds = ngs_dwarfFiles ngs
+ (us', fileIds', native, imports, colorStats, linearStats, unwinds)
+ <- {-# SCC "cmmNativeGen" #-}
+ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap
+ cmm count
+
+ -- Generate .file directives for every new file that has been
+ -- used. Note that it is important that we generate these in
+          -- ascending order, as Clang 3.6's assembler complains otherwise.
+ let newFileIds = sortBy (comparing snd) $
+ nonDetEltsUFM $ fileIds' `minusUFM` fileIds
+ -- See Note [Unique Determinism and code generation]
+ pprDecl (f,n) = text "\t.file " <> ppr n <+>
+ pprFilePathString (unpackFS f)
+
+ emitNativeCode dflags h $ vcat $
+ map pprDecl newFileIds ++
+ map (pprNatCmmDecl ncgImpl) native
+
+          -- force evaluation of all this stuff to avoid space leaks
+ {-# SCC "seqString" #-} evaluate $ seqList (showSDoc dflags $ vcat $ map ppr imports) ()
+
+ let !labels' = if debugLevel dflags > 0
+ then cmmDebugLabels isMetaInstr native else []
+ !natives' = if dopt Opt_D_dump_asm_stats dflags
+ then native : ngs_natives ngs else []
+
+ mCon = maybe id (:)
+ ngs' = ngs{ ngs_imports = imports : ngs_imports ngs
+ , ngs_natives = natives'
+ , ngs_colorStats = colorStats `mCon` ngs_colorStats ngs
+ , ngs_linearStats = linearStats `mCon` ngs_linearStats ngs
+ , ngs_labels = ngs_labels ngs ++ labels'
+ , ngs_dwarfFiles = fileIds'
+ , ngs_unwinds = ngs_unwinds ngs `mapUnion` unwinds
+ }
+ go us' cmms ngs' (count + 1)
+
+
+emitNativeCode :: DynFlags -> BufHandle -> SDoc -> IO ()
+emitNativeCode dflags h sdoc = do
+
+ {-# SCC "pprNativeCode" #-} bufLeftRenderSDoc dflags h
+ (mkCodeStyle AsmStyle) sdoc
+
+ -- dump native code
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm "Asm code" FormatASM
+ sdoc
+
+-- | Complete native code generation phase for a single top-level chunk of Cmm,
+-- dumping the output of each stage along the way.
+-- Also collects data for the global conflict graph and NCG stats.
+cmmNativeGen
+ :: forall statics instr jumpDest. (Instruction instr,
+ Outputable statics, Outputable instr, Outputable jumpDest)
+ => DynFlags
+ -> Module -> ModLocation
+ -> NcgImpl statics instr jumpDest
+ -> UniqSupply
+ -> DwarfFiles
+ -> LabelMap DebugBlock
+ -> RawCmmDecl -- ^ the cmm to generate code for
+ -> Int -- ^ sequence number of this top thing
+ -> IO ( UniqSupply
+ , DwarfFiles
+ , [NatCmmDecl statics instr] -- native code
+ , [CLabel] -- things imported by this cmm
+ , Maybe [Color.RegAllocStats statics instr] -- stats for the coloring register allocator
+ , Maybe [Linear.RegAllocStats] -- stats for the linear register allocators
+ , LabelMap [UnwindPoint] -- unwinding information for blocks
+ )
+
+cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
+ = do
+ let platform = targetPlatform dflags
+
+ let proc_name = case cmm of
+ (CmmProc _ entry_label _ _) -> ppr entry_label
+ _ -> text "DataChunk"
+
+ -- rewrite assignments to global regs
+ let fixed_cmm =
+ {-# SCC "fixStgRegisters" #-}
+ fixStgRegisters dflags cmm
+
+ -- cmm to cmm optimisations
+ let (opt_cmm, imports) =
+ {-# SCC "cmmToCmm" #-}
+ cmmToCmm dflags this_mod fixed_cmm
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_opt_cmm "Optimised Cmm" FormatCMM
+ (pprCmmGroup [opt_cmm])
+
+ let cmmCfg = {-# SCC "getCFG" #-}
+ getCfgProc (cfgWeightInfo dflags) opt_cmm
+
+ -- generate native code from cmm
+ let ((native, lastMinuteImports, fileIds', nativeCfgWeights), usGen) =
+ {-# SCC "genMachCode" #-}
+ initUs us $ genMachCode dflags this_mod modLoc
+ (cmmTopCodeGen ncgImpl)
+ fileIds dbgMap opt_cmm cmmCfg
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_native "Native code" FormatASM
+ (vcat $ map (pprNatCmmDecl ncgImpl) native)
+
+ maybeDumpCfg dflags (Just nativeCfgWeights) "CFG Weights - Native" proc_name
+
+ -- tag instructions with register liveness information
+ -- also drops dead code. We don't keep the cfg in sync on
+ -- some backends, so don't use it there.
+ let livenessCfg = if (backendMaintainsCfg dflags)
+ then Just nativeCfgWeights
+ else Nothing
+ let (withLiveness, usLive) =
+ {-# SCC "regLiveness" #-}
+ initUs usGen
+ $ mapM (cmmTopLiveness livenessCfg platform) native
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_liveness "Liveness annotations added"
+ FormatCMM
+ (vcat $ map ppr withLiveness)
+
+ -- allocate registers
+ (alloced, usAlloc, ppr_raStatsColor, ppr_raStatsLinear, raStats, stack_updt_blks) <-
+ if ( gopt Opt_RegsGraph dflags
+ || gopt Opt_RegsIterative dflags )
+ then do
+ -- the regs usable for allocation
+ let (alloc_regs :: UniqFM (UniqSet RealReg))
+ = foldr (\r -> plusUFM_C unionUniqSets
+ $ unitUFM (targetClassOfRealReg platform r) (unitUniqSet r))
+ emptyUFM
+ $ allocatableRegs ncgImpl
+
+ -- do the graph coloring register allocation
+ let ((alloced, maybe_more_stack, regAllocStats), usAlloc)
+ = {-# SCC "RegAlloc-color" #-}
+ initUs usLive
+ $ Color.regAlloc
+ dflags
+ alloc_regs
+ (mkUniqSet [0 .. maxSpillSlots ncgImpl])
+ (maxSpillSlots ncgImpl)
+ withLiveness
+ livenessCfg
+
+ let ((alloced', stack_updt_blks), usAlloc')
+ = initUs usAlloc $
+ case maybe_more_stack of
+ Nothing -> return (alloced, [])
+ Just amount -> do
+ (alloced',stack_updt_blks) <- unzip <$>
+ (mapM ((ncgAllocMoreStack ncgImpl) amount) alloced)
+ return (alloced', concat stack_updt_blks )
+
+
+ -- dump out what happened during register allocation
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_regalloc "Registers allocated"
+ FormatCMM
+ (vcat $ map (pprNatCmmDecl ncgImpl) alloced)
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_regalloc_stages "Build/spill stages"
+ FormatText
+ (vcat $ map (\(stage, stats)
+ -> text "# --------------------------"
+ $$ text "# cmm " <> int count <> text " Stage " <> int stage
+ $$ ppr stats)
+ $ zip [0..] regAllocStats)
+
+ let mPprStats =
+ if dopt Opt_D_dump_asm_stats dflags
+ then Just regAllocStats else Nothing
+
+ -- force evaluation of the Maybe to avoid space leak
+ mPprStats `seq` return ()
+
+ return ( alloced', usAlloc'
+ , mPprStats
+ , Nothing
+ , [], stack_updt_blks)
+
+ else do
+ -- do linear register allocation
+ let reg_alloc proc = do
+ (alloced, maybe_more_stack, ra_stats) <-
+ Linear.regAlloc dflags proc
+ case maybe_more_stack of
+ Nothing -> return ( alloced, ra_stats, [] )
+ Just amount -> do
+ (alloced',stack_updt_blks) <-
+ ncgAllocMoreStack ncgImpl amount alloced
+ return (alloced', ra_stats, stack_updt_blks )
+
+ let ((alloced, regAllocStats, stack_updt_blks), usAlloc)
+ = {-# SCC "RegAlloc-linear" #-}
+ initUs usLive
+ $ liftM unzip3
+ $ mapM reg_alloc withLiveness
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_regalloc "Registers allocated"
+ FormatCMM
+ (vcat $ map (pprNatCmmDecl ncgImpl) alloced)
+
+ let mPprStats =
+ if dopt Opt_D_dump_asm_stats dflags
+ then Just (catMaybes regAllocStats) else Nothing
+
+ -- force evaluation of the Maybe to avoid space leak
+ mPprStats `seq` return ()
+
+ return ( alloced, usAlloc
+ , Nothing
+ , mPprStats, (catMaybes regAllocStats)
+ , concat stack_updt_blks )
+
+        -- Fixup blocks inserted by the register allocator (from, regMoves, to)
+ let cfgRegAllocUpdates :: [(BlockId,BlockId,BlockId)]
+ cfgRegAllocUpdates = (concatMap Linear.ra_fixupList raStats)
+
+ let cfgWithFixupBlks =
+ (\cfg -> addNodesBetween cfg cfgRegAllocUpdates) <$> livenessCfg
+
+ -- Insert stack update blocks
+ let postRegCFG =
+ pure (foldl' (\m (from,to) -> addImmediateSuccessor from to m ))
+ <*> cfgWithFixupBlks
+ <*> pure stack_updt_blks
+
+ ---- generate jump tables
+ let tabled =
+ {-# SCC "generateJumpTables" #-}
+ generateJumpTables ncgImpl alloced
+
+ when (not $ null nativeCfgWeights) $ dumpIfSet_dyn dflags
+ Opt_D_dump_cfg_weights "CFG Update information"
+ FormatText
+ ( text "stack:" <+> ppr stack_updt_blks $$
+ text "linearAlloc:" <+> ppr cfgRegAllocUpdates )
+
+ ---- shortcut branches
+ let (shorted, postShortCFG) =
+ {-# SCC "shortcutBranches" #-}
+ shortcutBranches dflags ncgImpl tabled postRegCFG
+
+ let optimizedCFG :: Maybe CFG
+ optimizedCFG =
+ optimizeCFG (cfgWeightInfo dflags) cmm <$!> postShortCFG
+
+ maybeDumpCfg dflags optimizedCFG "CFG Weights - Final" proc_name
+
+ --TODO: Partially check validity of the cfg.
+ let getBlks (CmmProc _info _lbl _live (ListGraph blocks)) = blocks
+ getBlks _ = []
+
+ when ( backendMaintainsCfg dflags &&
+ (gopt Opt_DoAsmLinting dflags || debugIsOn )) $ do
+ let blocks = concatMap getBlks shorted
+ let labels = setFromList $ fmap blockId blocks :: LabelSet
+ let cfg = fromJust optimizedCFG
+ return $! seq (sanityCheckCfg cfg labels $
+ text "cfg not in lockstep") ()
+
+ ---- sequence blocks
+ let sequenced :: [NatCmmDecl statics instr]
+ sequenced =
+ checkLayout shorted $
+ {-# SCC "sequenceBlocks" #-}
+ map (BlockLayout.sequenceTop
+ dflags
+ ncgImpl optimizedCFG)
+ shorted
+
+ let branchOpt :: [NatCmmDecl statics instr]
+ branchOpt =
+ {-# SCC "invertCondBranches" #-}
+ map invert sequenced
+ where
+ invertConds :: LabelMap RawCmmStatics -> [NatBasicBlock instr]
+ -> [NatBasicBlock instr]
+ invertConds = invertCondBranches ncgImpl optimizedCFG
+ invert top@CmmData {} = top
+ invert (CmmProc info lbl live (ListGraph blocks)) =
+ CmmProc info lbl live (ListGraph $ invertConds info blocks)
+
+ ---- expansion of SPARC synthetic instrs
+ let expanded =
+ {-# SCC "sparc_expand" #-}
+ ncgExpandTop ncgImpl branchOpt
+ --ncgExpandTop ncgImpl sequenced
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_expanded "Synthetic instructions expanded"
+ FormatCMM
+ (vcat $ map (pprNatCmmDecl ncgImpl) expanded)
+
+ -- generate unwinding information from cmm
+ let unwinds :: BlockMap [UnwindPoint]
+ unwinds =
+ {-# SCC "unwindingInfo" #-}
+ foldl' addUnwind mapEmpty expanded
+ where
+ addUnwind acc proc =
+ acc `mapUnion` computeUnwinding dflags ncgImpl proc
+
+ return ( usAlloc
+ , fileIds'
+ , expanded
+ , lastMinuteImports ++ imports
+ , ppr_raStatsColor
+ , ppr_raStatsLinear
+ , unwinds )
+
+maybeDumpCfg :: DynFlags -> Maybe CFG -> String -> SDoc -> IO ()
+maybeDumpCfg _dflags Nothing _ _ = return ()
+maybeDumpCfg dflags (Just cfg) msg proc_name
+ | null cfg = return ()
+ | otherwise
+ = dumpIfSet_dyn
+ dflags Opt_D_dump_cfg_weights msg
+ FormatText
+ (proc_name <> char ':' $$ pprEdgeWeights cfg)
+
+-- | Make sure all blocks we want the layout algorithm to place have been placed.
+checkLayout :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr]
+ -> [NatCmmDecl statics instr]
+checkLayout procsUnsequenced procsSequenced =
+ ASSERT2(setNull diff,
+ ppr "Block sequencing dropped blocks:" <> ppr diff)
+ procsSequenced
+ where
+ blocks1 = foldl' (setUnion) setEmpty $
+ map getBlockIds procsUnsequenced :: LabelSet
+ blocks2 = foldl' (setUnion) setEmpty $
+ map getBlockIds procsSequenced
+ diff = setDifference blocks1 blocks2
+
+ getBlockIds (CmmData _ _) = setEmpty
+ getBlockIds (CmmProc _ _ _ (ListGraph blocks)) =
+ setFromList $ map blockId blocks
+
+-- | Compute unwinding tables for the blocks of a procedure
+computeUnwinding :: Instruction instr
+ => DynFlags -> NcgImpl statics instr jumpDest
+ -> NatCmmDecl statics instr
+ -- ^ the native code generated for the procedure
+ -> LabelMap [UnwindPoint]
+ -- ^ unwinding tables for all points of all blocks of the
+ -- procedure
+computeUnwinding dflags _ _
+ | debugLevel dflags == 0 = mapEmpty
+computeUnwinding _ _ (CmmData _ _) = mapEmpty
+computeUnwinding _ ncgImpl (CmmProc _ _ _ (ListGraph blks)) =
+ -- In general we would need to push unwinding information down the
+ -- block-level call-graph to ensure that we fully account for all
+ -- relevant register writes within a procedure.
+ --
+ -- However, the only unwinding information that we care about in GHC is for
+ -- Sp. The fact that GHC.Cmm.LayoutStack already ensures that we have unwind
+ -- information at the beginning of every block means that there is no need
+ -- to perform this sort of push-down.
+ mapFromList [ (blk_lbl, extractUnwindPoints ncgImpl instrs)
+ | BasicBlock blk_lbl instrs <- blks ]
+
+-- | Build a doc for all the imports.
+--
+makeImportsDoc :: DynFlags -> [CLabel] -> SDoc
+makeImportsDoc dflags imports
+ = dyld_stubs imports
+ $$
+ -- On recent versions of Darwin, the linker supports
+ -- dead-stripping of code and data on a per-symbol basis.
+ -- There's a hack to make this work in PprMach.pprNatCmmDecl.
+ (if platformHasSubsectionsViaSymbols platform
+ then text ".subsections_via_symbols"
+ else Outputable.empty)
+ $$
+ -- On recent GNU ELF systems one can mark an object file
+ -- as not requiring an executable stack. If all objects
+ -- linked into a program have this note then the program
+ -- will not use an executable stack, which is good for
+ -- security. GHC generated code does not need an executable
+ -- stack so add the note in:
+ (if platformHasGnuNonexecStack platform
+ then text ".section .note.GNU-stack,\"\"," <> sectionType "progbits"
+ else Outputable.empty)
+ $$
+ -- And just because every other compiler does, let's stick in
+ -- an identifier directive: .ident "GHC x.y.z"
+ (if platformHasIdentDirective platform
+ then let compilerIdent = text "GHC" <+> text cProjectVersion
+ in text ".ident" <+> doubleQuotes compilerIdent
+ else Outputable.empty)
+
+ where
+ platform = targetPlatform dflags
+ arch = platformArch platform
+ os = platformOS platform
+
+ -- Generate "symbol stubs" for all external symbols that might
+ -- come from a dynamic library.
+ dyld_stubs :: [CLabel] -> SDoc
+{- dyld_stubs imps = vcat $ map pprDyldSymbolStub $
+ map head $ group $ sort imps-}
+ -- (Hack) sometimes two Labels pretty-print the same, but have
+ -- different uniques; so we compare their text versions...
+ dyld_stubs imps
+ | needImportedSymbols dflags arch os
+ = vcat $
+ (pprGotDeclaration dflags arch os :) $
+ map ( pprImportedSymbol dflags platform . fst . head) $
+ groupBy (\(_,a) (_,b) -> a == b) $
+ sortBy (\(_,a) (_,b) -> compare a b) $
+ map doPpr $
+ imps
+ | otherwise
+ = Outputable.empty
+
+ doPpr lbl = (lbl, renderWithStyle
+ (initSDocContext dflags astyle)
+ (pprCLabel dflags lbl))
+ astyle = mkCodeStyle AsmStyle
+
+-- -----------------------------------------------------------------------------
+-- Generate jump tables
+
+-- Analyzes all native code and generates data sections for all jump
+-- table instructions.
+generateJumpTables
+ :: NcgImpl statics instr jumpDest
+ -> [NatCmmDecl statics instr] -> [NatCmmDecl statics instr]
+generateJumpTables ncgImpl xs = concatMap f xs
+ where f p@(CmmProc _ _ _ (ListGraph xs)) = p : concatMap g xs
+ f p = [p]
+ g (BasicBlock _ xs) = catMaybes (map (generateJumpTableForInstr ncgImpl) xs)
+
+-- -----------------------------------------------------------------------------
+-- Shortcut branches
+
+shortcutBranches
+ :: forall statics instr jumpDest. (Outputable jumpDest) => DynFlags
+ -> NcgImpl statics instr jumpDest
+ -> [NatCmmDecl statics instr]
+ -> Maybe CFG
+ -> ([NatCmmDecl statics instr],Maybe CFG)
+
+shortcutBranches dflags ncgImpl tops weights
+ | gopt Opt_AsmShortcutting dflags
+ = ( map (apply_mapping ncgImpl mapping) tops'
+ , shortcutWeightMap mappingBid <$!> weights )
+ | otherwise
+ = (tops, weights)
+ where
+ (tops', mappings) = mapAndUnzip (build_mapping ncgImpl) tops
+ mapping = mapUnions mappings :: LabelMap jumpDest
+ mappingBid = fmap (getJumpDestBlockId ncgImpl) mapping
+
+build_mapping :: forall instr t d statics jumpDest.
+ NcgImpl statics instr jumpDest
+ -> GenCmmDecl d (LabelMap t) (ListGraph instr)
+ -> (GenCmmDecl d (LabelMap t) (ListGraph instr)
+ ,LabelMap jumpDest)
+build_mapping _ top@(CmmData _ _) = (top, mapEmpty)
+build_mapping _ (CmmProc info lbl live (ListGraph []))
+ = (CmmProc info lbl live (ListGraph []), mapEmpty)
+build_mapping ncgImpl (CmmProc info lbl live (ListGraph (head:blocks)))
+ = (CmmProc info lbl live (ListGraph (head:others)), mapping)
+ -- drop the shorted blocks, but don't ever drop the first one,
+ -- because it is pointed to by a global label.
+ where
+ -- find all the blocks that just consist of a jump that can be
+ -- shorted.
+ -- Don't completely eliminate loops here -- that can leave a dangling jump!
+ shortcut_blocks :: [(BlockId, jumpDest)]
+ (_, shortcut_blocks, others) =
+ foldl' split (setEmpty :: LabelSet, [], []) blocks
+ split (s, shortcut_blocks, others) b@(BasicBlock id [insn])
+ | Just jd <- canShortcut ncgImpl insn
+ , Just dest <- getJumpDestBlockId ncgImpl jd
+ , not (has_info id)
+ , (setMember dest s) || dest == id -- loop checks
+ = (s, shortcut_blocks, b : others)
+ split (s, shortcut_blocks, others) (BasicBlock id [insn])
+ | Just dest <- canShortcut ncgImpl insn
+ , not (has_info id)
+ = (setInsert id s, (id,dest) : shortcut_blocks, others)
+ split (s, shortcut_blocks, others) other = (s, shortcut_blocks, other : others)
+
+ -- do not eliminate blocks that have an info table
+ has_info l = mapMember l info
+
+ -- build a mapping from BlockId to JumpDest for shorting branches
+ mapping = mapFromList shortcut_blocks
+
+apply_mapping :: NcgImpl statics instr jumpDest
+ -> LabelMap jumpDest
+ -> GenCmmDecl statics h (ListGraph instr)
+ -> GenCmmDecl statics h (ListGraph instr)
+apply_mapping ncgImpl ufm (CmmData sec statics)
+ = CmmData sec (shortcutStatics ncgImpl (\bid -> mapLookup bid ufm) statics)
+apply_mapping ncgImpl ufm (CmmProc info lbl live (ListGraph blocks))
+ = CmmProc info lbl live (ListGraph $ map short_bb blocks)
+ where
+ short_bb (BasicBlock id insns) = BasicBlock id $! map short_insn insns
+ short_insn i = shortcutJump ncgImpl (\bid -> mapLookup bid ufm) i
+ -- shortcutJump should apply the mapping repeatedly,
+ -- just in case we can short multiple branches.
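+          -- For instance (illustrative): if the mapping has B |-> (jump to C)
+          -- and C |-> (jump to D), an instruction "jmp B" should come out as
+          -- "jmp D"; the exact rewriting is target-specific and lives in each
+          -- backend's shortcutJump.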
+
+-- -----------------------------------------------------------------------------
+-- Instruction selection
+
+-- Native code instruction selection for a chunk of Cmm code. For
+-- this part of the computation, we switch from the UniqSM monad to
+-- the NatM monad. The latter carries not only a Unique, but also an
+-- Int denoting the current C stack pointer offset in the generated
+-- code; this is needed for creating correct spill offsets on
+-- architectures which don't offer, or for which it would be
+-- prohibitively expensive to employ, a frame pointer register. Viz,
+-- x86.
+
+-- The offset is measured in bytes, and indicates the difference
+-- between the current (simulated) C stack-ptr and the value it was at
+-- the beginning of the block. For stacks which grow down, this value
+-- should be either zero or negative.
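+-- For example (illustrative): if code in the middle of a block moves the
+-- simulated C stack pointer down by 8 bytes the delta becomes -8; once those
+-- 8 bytes are given back it returns to 0, which is what 'genMachCode' below
+-- checks (final delta == 0) at the end of each top-level chunk.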
+
+-- Along with the stack pointer offset, we also carry along a LabelMap of
+-- DebugBlocks, which we read to generate .location directives.
+--
+-- Switching between the two monads whilst carrying along the same
+-- Unique supply breaks abstraction. Is that bad?
+
+genMachCode
+ :: DynFlags
+ -> Module -> ModLocation
+ -> (RawCmmDecl -> NatM [NatCmmDecl statics instr])
+ -> DwarfFiles
+ -> LabelMap DebugBlock
+ -> RawCmmDecl
+ -> CFG
+ -> UniqSM
+ ( [NatCmmDecl statics instr]
+ , [CLabel]
+ , DwarfFiles
+ , CFG
+ )
+
+genMachCode dflags this_mod modLoc cmmTopCodeGen fileIds dbgMap cmm_top cmm_cfg
+ = do { initial_us <- getUniqueSupplyM
+ ; let initial_st = mkNatM_State initial_us 0 dflags this_mod
+ modLoc fileIds dbgMap cmm_cfg
+ (new_tops, final_st) = initNat initial_st (cmmTopCodeGen cmm_top)
+ final_delta = natm_delta final_st
+ final_imports = natm_imports final_st
+ final_cfg = natm_cfg final_st
+ ; if final_delta == 0
+ then return (new_tops, final_imports
+ , natm_fileid final_st, final_cfg)
+ else pprPanic "genMachCode: nonzero final delta" (int final_delta)
+ }
+
+-- -----------------------------------------------------------------------------
+-- Generic Cmm optimiser
+
+{-
+Here we do:
+
+  (a) Constant folding
+  (b) Position independent code and dynamic linking
+    (i) introduce the appropriate indirections
+        and position independent refs
+    (ii) compile a list of imported symbols
+  (c) Some arch-specific optimizations
+
+(a) will be moving to the new Hoopl pipeline; however, (b) and
+(c) are only needed by the native backend and will continue to live
+here.
+
+Ideas for other things we could do (put these in Hoopl please!):
+
+ - shortcut jumps-to-jumps
+ - simple CSE: if an expr is assigned to a temp, then replace later occs of
+ that expr with the temp, until the expr is no longer valid (can push through
+ temp assignments, and certain assigns to mem...)
+-}
+
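+-- As an illustration of (a): constant folding via 'cmmMachOpFold' (used by
+-- 'cmmExprCon' below) turns a fully-literal operation into a literal, roughly:
+--
+-- >   CmmMachOp (MO_Add w) [CmmLit (CmmInt 1 w), CmmLit (CmmInt 2 w)]
+-- >     ==>  CmmLit (CmmInt 3 w)
+--
+-- (A sketch only; width handling elided.)
+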
+cmmToCmm :: DynFlags -> Module -> RawCmmDecl -> (RawCmmDecl, [CLabel])
+cmmToCmm _ _ top@(CmmData _ _) = (top, [])
+cmmToCmm dflags this_mod (CmmProc info lbl live graph)
+ = runCmmOpt dflags this_mod $
+ do blocks' <- mapM cmmBlockConFold (toBlockList graph)
+ return $ CmmProc info lbl live (ofBlockList (g_entry graph) blocks')
+
+-- Avoids using unboxed tuples when loading into GHCi
+#if !defined(GHC_LOADED_INTO_GHCI)
+
+type OptMResult a = (# a, [CLabel] #)
+
+pattern OptMResult :: a -> b -> (# a, b #)
+pattern OptMResult x y = (# x, y #)
+{-# COMPLETE OptMResult #-}
+#else
+
+data OptMResult a = OptMResult !a ![CLabel] deriving (Functor)
+#endif
+
+newtype CmmOptM a = CmmOptM (DynFlags -> Module -> [CLabel] -> OptMResult a)
+ deriving (Functor)
+
+instance Applicative CmmOptM where
+ pure x = CmmOptM $ \_ _ imports -> OptMResult x imports
+ (<*>) = ap
+
+instance Monad CmmOptM where
+ (CmmOptM f) >>= g =
+ CmmOptM $ \dflags this_mod imports0 ->
+ case f dflags this_mod imports0 of
+ OptMResult x imports1 ->
+ case g x of
+ CmmOptM g' -> g' dflags this_mod imports1
+
+instance CmmMakeDynamicReferenceM CmmOptM where
+ addImport = addImportCmmOpt
+ getThisModule = CmmOptM $ \_ this_mod imports -> OptMResult this_mod imports
+
+addImportCmmOpt :: CLabel -> CmmOptM ()
+addImportCmmOpt lbl = CmmOptM $ \_ _ imports -> OptMResult () (lbl:imports)
+
+instance HasDynFlags CmmOptM where
+ getDynFlags = CmmOptM $ \dflags _ imports -> OptMResult dflags imports
+
+runCmmOpt :: DynFlags -> Module -> CmmOptM a -> (a, [CLabel])
+runCmmOpt dflags this_mod (CmmOptM f) =
+ case f dflags this_mod [] of
+ OptMResult result imports -> (result, imports)
+
+cmmBlockConFold :: CmmBlock -> CmmOptM CmmBlock
+cmmBlockConFold block = do
+ let (entry, middle, last) = blockSplit block
+ stmts = blockToList middle
+ stmts' <- mapM cmmStmtConFold stmts
+ last' <- cmmStmtConFold last
+ return $ blockJoin entry (blockFromList stmts') last'
+
+-- This does three optimizations, but they're very quick to check, so we don't
+-- bother turning them off even when the Hoopl code is active. Since
+-- this is on the old Cmm representation, we can't reuse the code either:
+-- * reg = reg --> nop
+-- * if 0 then jump --> nop
+-- * if 1 then jump --> jump
+-- We might be tempted to skip this step entirely if not Opt_PIC, but
+-- there is some PowerPC code for the non-PIC case, which would also
+-- have to be separated.
+cmmStmtConFold :: CmmNode e x -> CmmOptM (CmmNode e x)
+cmmStmtConFold stmt
+ = case stmt of
+ CmmAssign reg src
+ -> do src' <- cmmExprConFold DataReference src
+ return $ case src' of
+ CmmReg reg' | reg == reg' -> CmmComment (fsLit "nop")
+ new_src -> CmmAssign reg new_src
+
+ CmmStore addr src
+ -> do addr' <- cmmExprConFold DataReference addr
+ src' <- cmmExprConFold DataReference src
+ return $ CmmStore addr' src'
+
+ CmmCall { cml_target = addr }
+ -> do addr' <- cmmExprConFold JumpReference addr
+ return $ stmt { cml_target = addr' }
+
+ CmmUnsafeForeignCall target regs args
+ -> do target' <- case target of
+ ForeignTarget e conv -> do
+ e' <- cmmExprConFold CallReference e
+ return $ ForeignTarget e' conv
+ PrimTarget _ ->
+ return target
+ args' <- mapM (cmmExprConFold DataReference) args
+ return $ CmmUnsafeForeignCall target' regs args'
+
+ CmmCondBranch test true false likely
+ -> do test' <- cmmExprConFold DataReference test
+ return $ case test' of
+ CmmLit (CmmInt 0 _) -> CmmBranch false
+ CmmLit (CmmInt _ _) -> CmmBranch true
+ _other -> CmmCondBranch test' true false likely
+
+ CmmSwitch expr ids
+ -> do expr' <- cmmExprConFold DataReference expr
+ return $ CmmSwitch expr' ids
+
+ other
+ -> return other
+
+cmmExprConFold :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr
+cmmExprConFold referenceKind expr = do
+ dflags <- getDynFlags
+
+ -- With -O1 and greater, the cmmSink pass does constant-folding, so
+ -- we don't need to do it again here.
+ let expr' = if optLevel dflags >= 1
+ then expr
+ else cmmExprCon dflags expr
+
+ cmmExprNative referenceKind expr'
+
+cmmExprCon :: DynFlags -> CmmExpr -> CmmExpr
+cmmExprCon dflags (CmmLoad addr rep) = CmmLoad (cmmExprCon dflags addr) rep
+cmmExprCon dflags (CmmMachOp mop args)
+ = cmmMachOpFold dflags mop (map (cmmExprCon dflags) args)
+cmmExprCon _ other = other
+
+-- handles both PIC and non-PIC cases... a very strange mixture
+-- of things to do.
+cmmExprNative :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr
+cmmExprNative referenceKind expr = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ arch = platformArch platform
+ case expr of
+ CmmLoad addr rep
+ -> do addr' <- cmmExprNative DataReference addr
+ return $ CmmLoad addr' rep
+
+ CmmMachOp mop args
+ -> do args' <- mapM (cmmExprNative DataReference) args
+ return $ CmmMachOp mop args'
+
+ CmmLit (CmmBlock id)
+ -> cmmExprNative referenceKind (CmmLit (CmmLabel (infoTblLbl id)))
+ -- we must convert block Ids to CLabels here, because we
+ -- might have to do the PIC transformation. Hence we must
+ -- not modify BlockIds beyond this point.
+
+ CmmLit (CmmLabel lbl)
+ -> do
+ cmmMakeDynamicReference dflags referenceKind lbl
+ CmmLit (CmmLabelOff lbl off)
+ -> do
+ dynRef <- cmmMakeDynamicReference dflags referenceKind lbl
+ -- need to optimize here, since it's late
+ return $ cmmMachOpFold dflags (MO_Add (wordWidth dflags)) [
+ dynRef,
+ (CmmLit $ CmmInt (fromIntegral off) (wordWidth dflags))
+ ]
+
+ -- On powerpc (non-PIC), it's easier to jump directly to a label than
+ -- to use the register table, so we replace these registers
+ -- with the corresponding labels:
+ CmmReg (CmmGlobal EagerBlackholeInfo)
+ | arch == ArchPPC && not (positionIndependent dflags)
+ -> cmmExprNative referenceKind $
+ CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_EAGER_BLACKHOLE_info")))
+ CmmReg (CmmGlobal GCEnter1)
+ | arch == ArchPPC && not (positionIndependent dflags)
+ -> cmmExprNative referenceKind $
+ CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_gc_enter_1")))
+ CmmReg (CmmGlobal GCFun)
+ | arch == ArchPPC && not (positionIndependent dflags)
+ -> cmmExprNative referenceKind $
+ CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_gc_fun")))
+
+ other
+ -> return other
diff --git a/compiler/GHC/CmmToAsm/BlockLayout.hs b/compiler/GHC/CmmToAsm/BlockLayout.hs
new file mode 100644
index 0000000000..01a1388b5f
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/BlockLayout.hs
@@ -0,0 +1,895 @@
+--
+-- Copyright (c) 2018 Andreas Klebinger
+--
+
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE FlexibleContexts #-}
+
+module GHC.CmmToAsm.BlockLayout
+ ( sequenceTop )
+where
+
+#include "HsVersions.h"
+import GhcPrelude
+
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.CFG
+
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+
+import GHC.Driver.Session (gopt, GeneralFlag(..), DynFlags, backendMaintainsCfg)
+import UniqFM
+import Util
+import Unique
+
+import Digraph
+import Outputable
+import Maybes
+
+-- DEBUGGING ONLY
+--import GHC.Cmm.DebugBlock
+--import Debug.Trace
+import ListSetOps (removeDups)
+
+import OrdList
+import Data.List
+import Data.Foldable (toList)
+
+import qualified Data.Set as Set
+import Data.STRef
+import Control.Monad.ST.Strict
+import Control.Monad (foldM)
+
+{-
+ Note [CFG based code layout]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  The major steps in placing blocks are as follows:
+  * Compute a CFG based on the Cmm AST, see getCfgProc.
+    This CFG will have edge weights representing a guess
+    at how important they are.
+  * After we convert Cmm to Asm we run `optimizeCFG` which
+    adds a few more "educated guesses" to the equation.
+  * Then we run loop analysis on the CFG (`loopInfo`) which tells us
+    about loop headers, loop nesting levels and so on.
+  * Based on the CFG and loop information we refine the edge weights
+    in the CFG and normalize them relative to the most often visited
+    node. (See `mkGlobalWeights`)
+  * Feed this CFG into the block layout code (`sequenceTop`) in this
+    module, which will then produce a code layout based on the input weights.
+
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~ Note [Chain based CFG serialization]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ For additional information also look at
+ https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/code-layout
+
+ We have a CFG with edge weights based on which we try to place blocks next to
+ each other.
+
+  Edge weights not only represent the likelihood of control transfer between blocks
+  but also how much a block would benefit from being placed sequentially after
+  its predecessor.
+  For example, blocks which are preceded by an info table are more likely to end
+  up in a different cache line than their predecessor, and we can't eliminate the
+  jump, so there is less benefit to placing them sequentially.
+
+  For example, consider this code:
+
+ A: ...
+ jmp cond D (weak successor)
+ jmp B
+ B: ...
+ jmp C
+ C: ...
+ jmp X
+ D: ...
+ jmp B (weak successor)
+
+ We determine a block layout by building up chunks (calling them chains) of
+ possible control flows for which blocks will be placed sequentially.
+
+  E.g. for our example we might end up with two chains like:
+  [A->B->C->X],[D]. Blocks inside chains will always be placed sequentially.
+  However, there is no particular order in which chains are placed since
+  (hopefully) the blocks for which sequentiality is important have already
+  been placed in the same chain.
+
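+  In terms of the chain primitives defined later in this module this looks
+  roughly like (a sketch; a, b, c, x stand for block ids):
+
+    chainSnoc (chainFromList [a,b]) c                        -- a chain [a,b,c]
+    chainConcat (chainFromList [a,b,c]) (chainFromList [x])  -- a chain [a,b,c,x]
+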
+ -----------------------------------------------------------------------------
+ 1) First try to create a list of good chains.
+ -----------------------------------------------------------------------------
+
+  Good chains are those which allow us to eliminate jump instructions,
+  preferring to eliminate often-executed jumps first.
+
+ We do so by:
+
+  *) Ignore edges which represent instructions which cannot be replaced
+     by fall-through control flow. Primarily calls and edges to blocks which
+     are prefixed by an info table we have to jump across.
+
+ *) Then process remaining edges in order of frequency taken and:
+
+ +) If source and target have not been placed build a new chain from them.
+
+ +) If source and target have been placed, and are ends of differing chains
+ try to merge the two chains.
+
+      +) If one side of the edge is an end/front of a chain, add the other
+         block of the edge to the same chain.
+
+ Eg if we look at edge (B -> C) and already have the chain (A -> B)
+ then we extend the chain to (A -> B -> C).
+
+ +) If the edge was used to modify or build a new chain remove the edge from
+ our working list.
+
+  *) If there are any blocks not placed into a chain after these steps we place
+     each of them into a chain consisting of only that block.
+
+  Because we rank edges by their taken frequency, if
+  two edges compete for fall-through on the same target block, the one taken
+  more often will automatically win out, resulting in fewer instructions being
+  executed.
+
+ Creating singleton chains is required for situations where we have code of the
+ form:
+
+ A: goto B:
+ <infoTable>
+ B: goto C:
+ <infoTable>
+ C: ...
+
+ As the code in block B is only connected to the rest of the program via edges
+ which will be ignored in this step we make sure that B still ends up in a chain
+ this way.
+
+ -----------------------------------------------------------------------------
+ 2) We also try to fuse chains.
+ -----------------------------------------------------------------------------
+
+  As a result of the above step we still end up with multiple chains which
+  represent sequential control flow chunks. But they are not yet suitable for
+  code layout as we need to place *all* blocks into a single sequence.
+
+  In this step we combine the chains resulting from the above step as follows:
+
+ *) Look at the ranked list of *all* edges, including calls/jumps across info tables
+ and the like.
+
+ *) Look at each edge and
+
+ +) Given an edge (A -> B) try to find two chains for which
+ * Block A is at the end of one chain
+ * Block B is at the front of the other chain.
+    +) If we find such chains we "fuse" them into a single chain, remove the
+       edge from the working set and continue.
+ +) If we can't find such chains we skip the edge and continue.
+
+ -----------------------------------------------------------------------------
+ 3) Place indirect successors (neighbours) after each other
+ -----------------------------------------------------------------------------
+
+ We might have chains [A,B,C,X],[E] in a CFG of the sort:
+
+ A ---> B ---> C --------> X(exit)
+ \- ->E- -/
+
+  While E does not follow X it's still beneficial to place them near each other.
+  This can be advantageous if, e.g., C, X and E end up in the same cache line.
+
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~ Note [Triangle Control Flow]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Checking if an argument is already evaluated leads to a somewhat
+ special case which looks like this:
+
+ A:
+ if (R1 & 7 != 0) goto Leval; else goto Lwork;
+ Leval: // global
+ call (I64[R1])(R1) returns to Lwork, args: 8, res: 8, upd: 8;
+ Lwork: // global
+ ...
+
+ A
+ |\
+ | Leval
+ |/ - (This edge can be missing because of optimizations)
+ Lwork
+
+ Once we hit the metal the call instruction is just 2-3 bytes large
+ depending on the register used. So we lay out the assembly like this:
+
+ movq %rbx,%rax
+ andl $7,%eax
+ cmpq $1,%rax
+ jne Lwork
+ Leval:
+ jmp *(%rbx) # encoded in 2-3 bytes.
+ <info table>
+ Lwork:
+ ...
+
+ We could explicitly check for this control flow pattern.
+
+ This is advantageous because:
+ * It's optimal if the argument isn't evaluated.
+ * If it's evaluated we only have the extra cost of jumping over
+ the 2-3 bytes for the call.
+ * Guarantees the smaller encoding for the conditional jump.
+
+ However given that Lwork usually has an info table we
+ penalize this edge. So Leval should get placed first
+ either way and things work out for the best.
+
+ Optimizing for the evaluated case instead would penalize
+  the other code path. It adds a jump as we can't fall through
+ to Lwork because of the info table.
+ Assuming that Lwork is large the chance that the "call" ends up
+ in the same cache line is also fairly small.
+
+-}
+
+
+-- | Look at X number of blocks in two chains to determine
+-- if they are "neighbours".
+neighbourOverlapp :: Int
+neighbourOverlapp = 2
+
+-- | Maps blocks near the end of a chain to its chain AND
+-- the other blocks near the end.
+-- [A,B,C,D,E] gives entries like (B -> ([A,B], [A,B,C,D,E]))
+-- where [A,B] are blocks in the end region of a chain.
+-- This is cheaper than recomputing the ends multiple times.
+type FrontierMap = LabelMap ([BlockId],BlockChain)
+
+-- | A non-empty ordered sequence of basic blocks.
+-- It is suitable for serialization in this order.
+--
+-- We use OrdList instead of [] to allow fast append on both sides
+-- when combining chains.
+newtype BlockChain
+ = BlockChain { chainBlocks :: (OrdList BlockId) }
+
+-- All chains are constructed the same way so comparison
+-- including structure is faster.
+instance Eq BlockChain where
+ BlockChain b1 == BlockChain b2 = strictlyEqOL b1 b2
+
+-- Useful for things like sets and debugging purposes, sorts by blocks
+-- in the chain.
+instance Ord (BlockChain) where
+ (BlockChain lbls1) `compare` (BlockChain lbls2)
+ = ASSERT(toList lbls1 /= toList lbls2 || lbls1 `strictlyEqOL` lbls2)
+ strictlyOrdOL lbls1 lbls2
+
+instance Outputable (BlockChain) where
+ ppr (BlockChain blks) =
+ parens (text "Chain:" <+> ppr (fromOL $ blks) )
+
+chainFoldl :: (b -> BlockId -> b) -> b -> BlockChain -> b
+chainFoldl f z (BlockChain blocks) = foldl' f z blocks
+
+noDups :: [BlockChain] -> Bool
+noDups chains =
+ let chainBlocks = concatMap chainToBlocks chains :: [BlockId]
+ (_blocks, dups) = removeDups compare chainBlocks
+ in if null dups then True
+ else pprTrace "Duplicates:" (ppr (map toList dups) $$ text "chains" <+> ppr chains ) False
+
+inFront :: BlockId -> BlockChain -> Bool
+inFront bid (BlockChain seq)
+ = headOL seq == bid
+
+chainSingleton :: BlockId -> BlockChain
+chainSingleton lbl
+ = BlockChain (unitOL lbl)
+
+chainFromList :: [BlockId] -> BlockChain
+chainFromList = BlockChain . toOL
+
+chainSnoc :: BlockChain -> BlockId -> BlockChain
+chainSnoc (BlockChain blks) lbl
+ = BlockChain (blks `snocOL` lbl)
+
+chainCons :: BlockId -> BlockChain -> BlockChain
+chainCons lbl (BlockChain blks)
+ = BlockChain (lbl `consOL` blks)
+
+chainConcat :: BlockChain -> BlockChain -> BlockChain
+chainConcat (BlockChain blks1) (BlockChain blks2)
+ = BlockChain (blks1 `appOL` blks2)
+
+chainToBlocks :: BlockChain -> [BlockId]
+chainToBlocks (BlockChain blks) = fromOL blks
+
+-- | Given the chain A -> B -> C -> D, breaking it at C
+-- gives us the two chains (A -> B, C -> D) as result.
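+-- An illustrative sketch (block names hypothetical):
+--   breakChainAt c (chainFromList [a,b,c,d])  ~>  (chainFromList [a,b], chainFromList [c,d])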
+breakChainAt :: BlockId -> BlockChain
+ -> (BlockChain,BlockChain)
+breakChainAt bid (BlockChain blks)
+ | not (bid == head rblks)
+ = panic "Block not in chain"
+ | otherwise
+ = (BlockChain (toOL lblks),
+ BlockChain (toOL rblks))
+ where
+ (lblks, rblks) = break (\lbl -> lbl == bid) (fromOL blks)
+
+takeR :: Int -> BlockChain -> [BlockId]
+takeR n (BlockChain blks) =
+ take n . fromOLReverse $ blks
+
+takeL :: Int -> BlockChain -> [BlockId]
+takeL n (BlockChain blks) =
+ take n . fromOL $ blks
+
+-- Note [Combining neighborhood chains]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+-- See also Note [Chain based CFG serialization]
+-- We have the chains (A-B-C-D) and (E-F) and an Edge C->E.
+--
+-- While placing the latter after the former doesn't result in sequential
+-- control flow it is still beneficial. As block C and E might end
+-- up in the same cache line.
+--
+-- So we place these chains next to each other even if we can't fuse them.
+--
+-- A -> B -> C -> D
+-- v
+-- - -> E -> F ...
+--
+-- A simple heuristic to choose which chains we want to combine:
+-- * Process edges in descending priority.
+-- * Check if there is an edge near the end of one chain which goes
+-- to a block near the start of another chain.
+--
+-- While we could take into account the space between the two blocks which
+-- share an edge this blows up compile times quite a bit. It requires
+-- us to find all edges between two chains, check the distance for all edges,
+-- rank them based on the distance and only then we can select two chains
+-- to combine. Which would add a lot of complexity for little gain.
+--
+-- So instead we just rank by the strength of the edge and use the first pair we
+-- find.
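+--
+-- As a rough sketch, using the chains above and the current value
+-- neighbourOverlapp = 2: the end frontier maps C and D to the chain
+-- (A-B-C-D), the start frontier maps E and F to (E-F). The edge C->E
+-- then hits both frontiers and the two chains get placed next to each other.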
+
+-- | For a given list of chains and edges try to combine chains with strong
+-- edges between them.
+combineNeighbourhood :: [CfgEdge] -- ^ Edges to consider
+ -> [BlockChain] -- ^ Current chains of blocks
+ -> ([BlockChain], Set.Set (BlockId,BlockId))
+ -- ^ Resulting list of block chains, and a set of edges which
+ -- were used to fuse chains and as such no longer need to be
+ -- considered.
+combineNeighbourhood edges chains
+ = -- pprTraceIt "Neighbours" $
+ -- pprTrace "combineNeighbours" (ppr edges) $
+ applyEdges edges endFrontier startFrontier (Set.empty)
+ where
+ --Build maps from chain ends to chains
+ endFrontier, startFrontier :: FrontierMap
+ endFrontier =
+ mapFromList $ concatMap (\chain ->
+ let ends = getEnds chain :: [BlockId]
+ entry = (ends,chain)
+ in map (\x -> (x,entry)) ends ) chains
+ startFrontier =
+ mapFromList $ concatMap (\chain ->
+ let front = getFronts chain
+ entry = (front,chain)
+ in map (\x -> (x,entry)) front) chains
+ applyEdges :: [CfgEdge] -> FrontierMap -> FrontierMap -> Set.Set (BlockId, BlockId)
+ -> ([BlockChain], Set.Set (BlockId,BlockId))
+ applyEdges [] chainEnds _chainFronts combined =
+ (ordNub $ map snd $ mapElems chainEnds, combined)
+ applyEdges ((CfgEdge from to _w):edges) chainEnds chainFronts combined
+ | Just (c1_e,c1) <- mapLookup from chainEnds
+ , Just (c2_f,c2) <- mapLookup to chainFronts
+ , c1 /= c2 -- Avoid trying to concat a chain with itself.
+ = let newChain = chainConcat c1 c2
+ newChainFrontier = getFronts newChain
+ newChainEnds = getEnds newChain
+ newFronts :: FrontierMap
+ newFronts =
+ let withoutOld =
+ foldl' (\m b -> mapDelete b m :: FrontierMap) chainFronts (c2_f ++ getFronts c1)
+ entry =
+ (newChainFrontier,newChain) --let bound to ensure sharing
+ in foldl' (\m x -> mapInsert x entry m)
+ withoutOld newChainFrontier
+
+ newEnds =
+ let withoutOld = foldl' (\m b -> mapDelete b m) chainEnds (c1_e ++ getEnds c2)
+ entry = (newChainEnds,newChain) --let bound to ensure sharing
+ in foldl' (\m x -> mapInsert x entry m)
+ withoutOld newChainEnds
+ in
+ -- pprTrace "ApplyEdges"
+ -- (text "before" $$
+ -- text "fronts" <+> ppr chainFronts $$
+ -- text "ends" <+> ppr chainEnds $$
+
+ -- text "various" $$
+ -- text "newChain" <+> ppr newChain $$
+ -- text "newChainFrontier" <+> ppr newChainFrontier $$
+ -- text "newChainEnds" <+> ppr newChainEnds $$
+ -- text "drop" <+> ppr ((c2_f ++ getFronts c1) ++ (c1_e ++ getEnds c2)) $$
+
+ -- text "after" $$
+ -- text "fronts" <+> ppr newFronts $$
+ -- text "ends" <+> ppr newEnds
+ -- )
+ applyEdges edges newEnds newFronts (Set.insert (from,to) combined)
+ | otherwise
+ = applyEdges edges chainEnds chainFronts combined
+ where
+
+ getFronts chain = takeL neighbourOverlapp chain
+ getEnds chain = takeR neighbourOverlapp chain
+
+-- In the last step we combine all chains into a single one,
+-- trying to place chains with strong edges next to each other.
+mergeChains :: [CfgEdge] -> [BlockChain]
+ -> (BlockChain)
+mergeChains edges chains
+ = -- pprTrace "combine" (ppr edges) $
+ runST $ do
+ let addChain m0 chain = do
+ ref <- newSTRef chain
+ return $ chainFoldl (\m' b -> mapInsert b ref m') m0 chain
+ chainMap' <- foldM (\m0 c -> addChain m0 c) mapEmpty chains
+ merge edges chainMap'
+ where
+ -- We keep a map from ALL blocks to their respective chain (sigh)
+ -- This is required since when looking at an edge we need to find
+ -- the associated chains quickly.
+ -- We use a map of STRefs, maintaining an invariant of one STRef per chain.
+ -- When merging chains we can update the
+ -- STRef of one chain once (instead of writing to the map for each block).
+ -- We then overwrite the STRefs for the other chain so there is again only
+ -- a single STRef for the combined chain.
+ -- The difference in terms of allocations saved is ~0.2% with -O so actually
+ -- significant compared to using a regular map.
+
+ merge :: forall s. [CfgEdge] -> LabelMap (STRef s BlockChain) -> ST s BlockChain
+ merge [] chains = do
+ chains' <- ordNub <$> (mapM readSTRef $ mapElems chains) :: ST s [BlockChain]
+ return $ foldl' chainConcat (head chains') (tail chains')
+ merge ((CfgEdge from to _):edges) chains
+ -- | pprTrace "merge" (ppr (from,to) <> ppr chains) False
+ -- = undefined
+ | cFrom == cTo
+ = merge edges chains
+ | otherwise
+ = do
+ chains' <- mergeComb cFrom cTo
+ merge edges chains'
+ where
+ mergeComb :: STRef s BlockChain -> STRef s BlockChain -> ST s (LabelMap (STRef s BlockChain))
+ mergeComb refFrom refTo = do
+ cRight <- readSTRef refTo
+ chain <- pure chainConcat <*> readSTRef refFrom <*> pure cRight
+ writeSTRef refFrom chain
+ return $ chainFoldl (\m b -> mapInsert b refFrom m) chains cRight
+
+ cFrom = expectJust "mergeChains:chainMap:from" $ mapLookup from chains
+ cTo = expectJust "mergeChains:chainMap:to" $ mapLookup to chains
+
+
+-- See Note [Chain based CFG serialization] for the general idea.
+-- This creates and fuses chains at the same time for performance reasons.
+
+-- Try to build chains from a list of edges.
+-- Edges must be sorted **descending** by their priority.
+-- Returns the constructed chains, along with all edges which
+-- are irrelevant past this point. This information doesn't need
+-- to be complete - it's only used to speed up the process.
+-- An edge is irrelevant if both of its ends are part of the same chain.
+-- We say these edges are already linked.
+buildChains :: [CfgEdge] -> [BlockId]
+ -> ( LabelMap BlockChain -- Resulting chains, indexed by end of chain.
+ , Set.Set (BlockId, BlockId)) --List of fused edges.
+buildChains edges blocks
+ = runST $ buildNext setEmpty mapEmpty mapEmpty edges Set.empty
+ where
+ -- buildNext builds up chains from edges one at a time.
+
+ -- We keep a map from the ends of chains to the chains.
+ -- This way we can easily check if a block should be appended to an
+ -- existing chain!
+ -- We store them using STRefs so we don't have to rebuild the spine of both
+ -- maps every time we update a chain.
+ buildNext :: forall s. LabelSet
+ -> LabelMap (STRef s BlockChain) -- Map from end of chain to chain.
+ -> LabelMap (STRef s BlockChain) -- Map from start of chain to chain.
+ -> [CfgEdge] -- Edges to check - ordered by decreasing weight
+ -> Set.Set (BlockId, BlockId) -- Used edges
+ -> ST s ( LabelMap BlockChain -- Chains by end
+ , Set.Set (BlockId, BlockId) --List of fused edges
+ )
+ buildNext placed _chainStarts chainEnds [] linked = do
+ ends' <- sequence $ mapMap readSTRef chainEnds :: ST s (LabelMap BlockChain)
+ -- Any remaining blocks have to be turned into singleton chains.
+ -- They might be combined with other chains later on outside this function.
+ let unplaced = filter (\x -> not (setMember x placed)) blocks
+ singletons = map (\x -> (x,chainSingleton x)) unplaced :: [(BlockId,BlockChain)]
+ return (foldl' (\m (k,v) -> mapInsert k v m) ends' singletons , linked)
+ buildNext placed chainStarts chainEnds (edge:todo) linked
+ | from == to
+ -- We skip self edges
+ = buildNext placed chainStarts chainEnds todo (Set.insert (from,to) linked)
+ | not (alreadyPlaced from) &&
+ not (alreadyPlaced to)
+ = do
+ --pprTraceM "Edge-Chain:" (ppr edge)
+ chain' <- newSTRef $ chainFromList [from,to]
+ buildNext
+ (setInsert to (setInsert from placed))
+ (mapInsert from chain' chainStarts)
+ (mapInsert to chain' chainEnds)
+ todo
+ (Set.insert (from,to) linked)
+
+ | (alreadyPlaced from) &&
+ (alreadyPlaced to)
+ , Just predChain <- mapLookup from chainEnds
+ , Just succChain <- mapLookup to chainStarts
+ , predChain /= succChain -- Otherwise we try to create a cycle.
+ = do
+ -- pprTraceM "Fusing edge" (ppr edge)
+ fuseChain predChain succChain
+
+ | (alreadyPlaced from) &&
+ (alreadyPlaced to)
+ = --pprTraceM "Skipping:" (ppr edge) >>
+ buildNext placed chainStarts chainEnds todo linked
+
+ | otherwise
+ = do -- pprTraceM "Finding chain for:" (ppr edge $$
+ -- text "placed" <+> ppr placed)
+ findChain
+ where
+ from = edgeFrom edge
+ to = edgeTo edge
+ alreadyPlaced blkId = (setMember blkId placed)
+
+ -- Combine two chains into a single one.
+ fuseChain :: STRef s BlockChain -> STRef s BlockChain
+ -> ST s ( LabelMap BlockChain -- Chains by end
+ , Set.Set (BlockId, BlockId) --List of fused edges
+ )
+ fuseChain fromRef toRef = do
+ fromChain <- readSTRef fromRef
+ toChain <- readSTRef toRef
+ let newChain = chainConcat fromChain toChain
+ ref <- newSTRef newChain
+ let start = head $ takeL 1 newChain
+ let end = head $ takeR 1 newChain
+ -- chains <- sequence $ mapMap readSTRef chainStarts
+ -- pprTraceM "pre-fuse chains:" $ ppr chains
+ buildNext
+ placed
+ (mapInsert start ref $ mapDelete to $ chainStarts)
+ (mapInsert end ref $ mapDelete from $ chainEnds)
+ todo
+ (Set.insert (from,to) linked)
+
+
+ --Add the block to an existing chain or create a new chain.
+ findChain :: ST s ( LabelMap BlockChain -- Chains by end
+ , Set.Set (BlockId, BlockId) --List of fused edges
+ )
+ findChain
+ -- We can attach the block to the end of a chain
+ | alreadyPlaced from
+ , Just predChain <- mapLookup from chainEnds
+ = do
+ chain <- readSTRef predChain
+ let newChain = chainSnoc chain to
+ writeSTRef predChain newChain
+ let chainEnds' = mapInsert to predChain $ mapDelete from chainEnds
+ -- chains <- sequence $ mapMap readSTRef chainStarts
+ -- pprTraceM "from chains:" $ ppr chains
+ buildNext (setInsert to placed) chainStarts chainEnds' todo (Set.insert (from,to) linked)
+ -- We can attach it to the front of a chain
+ | alreadyPlaced to
+ , Just succChain <- mapLookup to chainStarts
+ = do
+ chain <- readSTRef succChain
+ let newChain = from `chainCons` chain
+ writeSTRef succChain newChain
+ let chainStarts' = mapInsert from succChain $ mapDelete to chainStarts
+ -- chains <- sequence $ mapMap readSTRef chainStarts'
+ -- pprTraceM "to chains:" $ ppr chains
+ buildNext (setInsert from placed) chainStarts' chainEnds todo (Set.insert (from,to) linked)
+ -- The placed end of the edge is part of a chain already and not an end.
+ | otherwise
+ = do
+ let block = if alreadyPlaced to then from else to
+ --pprTraceM "Singleton" $ ppr block
+ let newChain = chainSingleton block
+ ref <- newSTRef newChain
+ buildNext (setInsert block placed) (mapInsert block ref chainStarts)
+ (mapInsert block ref chainEnds) todo (linked)
+ where
+ alreadyPlaced blkId = (setMember blkId placed)
+
+-- | Place basic blocks based on the given CFG.
+-- See Note [Chain based CFG serialization]
+sequenceChain :: forall a i. (Instruction i, Outputable i)
+ => LabelMap a -- ^ Keys indicate an info table on the block.
+ -> CFG -- ^ Control flow graph and some meta data.
+ -> [GenBasicBlock i] -- ^ List of basic blocks to be placed.
+ -> [GenBasicBlock i] -- ^ Blocks placed in sequence.
+sequenceChain _info _weights [] = []
+sequenceChain _info _weights [x] = [x]
+sequenceChain info weights' blocks@((BasicBlock entry _):_) =
+ let weights :: CFG
+ weights = --pprTrace "cfg'" (pprEdgeWeights cfg')
+ cfg'
+ where
+ (_, globalEdgeWeights) = {-# SCC mkGlobalWeights #-} mkGlobalWeights entry weights'
+ cfg' = {-# SCC rewriteEdges #-}
+ mapFoldlWithKey
+ (\cfg from m ->
+ mapFoldlWithKey
+ (\cfg to w -> setEdgeWeight cfg (EdgeWeight w) from to )
+ cfg m )
+ weights'
+ globalEdgeWeights
+
+ directEdges :: [CfgEdge]
+ directEdges = sortBy (flip compare) $ catMaybes . map relevantWeight $ (infoEdgeList weights)
+ where
+ relevantWeight :: CfgEdge -> Maybe CfgEdge
+ relevantWeight edge@(CfgEdge from to edgeInfo)
+ | (EdgeInfo CmmSource { trans_cmmNode = CmmCall {} } _) <- edgeInfo
+ -- Ignore edges across calls
+ = Nothing
+ | mapMember to info
+ , w <- edgeWeight edgeInfo
+ -- The payoff is small if we jump over an info table
+ = Just (CfgEdge from to edgeInfo { edgeWeight = w/8 })
+ | otherwise
+ = Just edge
+
+ blockMap :: LabelMap (GenBasicBlock i)
+ blockMap
+ = foldl' (\m blk@(BasicBlock lbl _ins) ->
+ mapInsert lbl blk m)
+ mapEmpty blocks
+
+ (builtChains, builtEdges)
+ = {-# SCC "buildChains" #-}
+ --pprTraceIt "generatedChains" $
+ --pprTrace "blocks" (ppr (mapKeys blockMap)) $
+ buildChains directEdges (mapKeys blockMap)
+
+ rankedEdges :: [CfgEdge]
+ -- Sort descending by weight, remove fused edges
+ rankedEdges =
+ filter (\edge -> not (Set.member (edgeFrom edge,edgeTo edge) builtEdges)) $
+ directEdges
+
+ (neighbourChains, combined)
+ = ASSERT(noDups $ mapElems builtChains)
+ {-# SCC "groupNeighbourChains" #-}
+ -- pprTraceIt "NeighbourChains" $
+ combineNeighbourhood rankedEdges (mapElems builtChains)
+
+
+ allEdges :: [CfgEdge]
+ allEdges = {-# SCC allEdges #-}
+ sortOn (relevantWeight) $ filter (not . deadEdge) $ (infoEdgeList weights)
+ where
+ deadEdge :: CfgEdge -> Bool
+ deadEdge (CfgEdge from to _) = let e = (from,to) in Set.member e combined || Set.member e builtEdges
+ relevantWeight :: CfgEdge -> EdgeWeight
+ relevantWeight (CfgEdge _ _ edgeInfo)
+ | EdgeInfo (CmmSource { trans_cmmNode = CmmCall {}}) _ <- edgeInfo
+ -- Penalize edges across calls
+ = weight/(64.0)
+ | otherwise
+ = weight
+ where
+ -- negate to sort descending
+ weight = negate (edgeWeight edgeInfo)
+
+ masterChain =
+ {-# SCC "mergeChains" #-}
+ -- pprTraceIt "MergedChains" $
+ mergeChains allEdges neighbourChains
+
+ --Make sure the first block stays first
+ prepedChains
+ | inFront entry masterChain
+ = [masterChain]
+ | (rest,entry) <- breakChainAt entry masterChain
+ = [entry,rest]
+#if __GLASGOW_HASKELL__ <= 810
+ | otherwise = pprPanic "Entry point eliminated" $
+ ppr masterChain
+#endif
+
+ blockList
+ = ASSERT(noDups [masterChain])
+ (concatMap fromOL $ map chainBlocks prepedChains)
+
+ --chainPlaced = setFromList $ map blockId blockList :: LabelSet
+ chainPlaced = setFromList $ blockList :: LabelSet
+ unplaced =
+ let blocks = mapKeys blockMap
+ isPlaced b = setMember (b) chainPlaced
+ in filter (\block -> not (isPlaced block)) blocks
+
+ placedBlocks =
+ -- We want debug builds to catch this as it's a good indicator for
+ -- issues with CFG invariants. But we don't want to blow up production
+ -- builds if something slips through.
+ ASSERT(null unplaced)
+ --pprTraceIt "placedBlocks" $
+ -- ++ [] is still kinda expensive
+ if null unplaced then blockList else blockList ++ unplaced
+ getBlock bid = expectJust "Block placement" $ mapLookup bid blockMap
+ in
+ --Assert we placed all blocks given as input
+ ASSERT(all (\bid -> mapMember bid blockMap) placedBlocks)
+ dropJumps info $ map getBlock placedBlocks
+
+{-# SCC dropJumps #-}
+-- | Remove redundant jumps between blocks when we can rely on
+-- fall through.
+dropJumps :: forall a i. Instruction i => LabelMap a -> [GenBasicBlock i]
+ -> [GenBasicBlock i]
+dropJumps _ [] = []
+dropJumps info ((BasicBlock lbl ins):todo)
+ | not . null $ ins --This can happen because of shortcutting
+ , [dest] <- jumpDestsOfInstr (last ins)
+ , ((BasicBlock nextLbl _) : _) <- todo
+ , not (mapMember dest info)
+ , nextLbl == dest
+ = BasicBlock lbl (init ins) : dropJumps info todo
+ | otherwise
+ = BasicBlock lbl ins : dropJumps info todo
+
+
+-- -----------------------------------------------------------------------------
+-- Sequencing the basic blocks
+
+-- Cmm BasicBlocks are self-contained entities: they always end in a
+-- jump, either non-local or to another basic block in the same proc.
+-- In this phase, we attempt to place the basic blocks in a sequence
+-- such that as many of the local jumps as possible turn into
+-- fallthroughs.
+
+sequenceTop
+ :: (Instruction instr, Outputable instr)
+ => DynFlags -- Determine which layout algo to use
+ -> NcgImpl statics instr jumpDest
+ -> Maybe CFG -- ^ CFG if we have one.
+ -> NatCmmDecl statics instr -- ^ Function to serialize
+ -> NatCmmDecl statics instr
+
+sequenceTop _ _ _ top@(CmmData _ _) = top
+sequenceTop dflags ncgImpl edgeWeights
+ (CmmProc info lbl live (ListGraph blocks))
+ | (gopt Opt_CfgBlocklayout dflags) && backendMaintainsCfg dflags
+ --Use chain based algorithm
+ , Just cfg <- edgeWeights
+ = CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $
+ {-# SCC layoutBlocks #-}
+ sequenceChain info cfg blocks )
+ | otherwise
+ --Use old algorithm
+ = let cfg = if dontUseCfg then Nothing else edgeWeights
+ in CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $
+ {-# SCC layoutBlocks #-}
+ sequenceBlocks cfg info blocks)
+ where
+ dontUseCfg = gopt Opt_WeightlessBlocklayout dflags ||
+ (not $ backendMaintainsCfg dflags)
+
+-- The old algorithm:
+-- It is very simple (and stupid): We make a graph out of
+-- the blocks where there is an edge from one block to another iff the
+-- first block ends by jumping to the second. Then we topologically
+-- sort this graph. Then traverse the list: for each block, we first
+-- output the block, then if it has an out edge, we move the
+-- destination of the out edge to the front of the list, and continue.
+
+-- FYI, the classic layout for basic blocks uses postorder DFS; this
+-- algorithm is implemented in Hoopl.
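+--
+-- A rough sketch (hypothetical blocks): given blocks A, B, C where A ends in
+-- a jump to C, we output A and then immediately place its jump target C,
+-- giving the order A, C, B. The now-redundant A -> C jump is then removed
+-- by dropJumps (turning it into a fallthrough), unless C has an info table.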
+
+sequenceBlocks :: Instruction inst => Maybe CFG -> LabelMap a
+ -> [GenBasicBlock inst] -> [GenBasicBlock inst]
+sequenceBlocks _edgeWeight _ [] = []
+sequenceBlocks edgeWeights infos (entry:blocks) =
+ let entryNode = mkNode edgeWeights entry
+ bodyNodes = reverse
+ (flattenSCCs (sccBlocks edgeWeights blocks))
+ in dropJumps infos . seqBlocks infos $ ( entryNode : bodyNodes)
+ -- the first block is the entry point ==> it must remain at the start.
+
+sccBlocks
+ :: Instruction instr
+ => Maybe CFG -> [NatBasicBlock instr]
+ -> [SCC (Node BlockId (NatBasicBlock instr))]
+sccBlocks edgeWeights blocks =
+ stronglyConnCompFromEdgedVerticesUniqR
+ (map (mkNode edgeWeights) blocks)
+
+mkNode :: (Instruction t)
+ => Maybe CFG -> GenBasicBlock t
+ -> Node BlockId (GenBasicBlock t)
+mkNode edgeWeights block@(BasicBlock id instrs) =
+ DigraphNode block id outEdges
+ where
+ outEdges :: [BlockId]
+ outEdges
+ --Select the heaviest successor, ignore weights <= zero
+ = successor
+ where
+ successor
+ | Just successors <- fmap (`getSuccEdgesSorted` id)
+ edgeWeights -- :: Maybe [(Label, EdgeInfo)]
+ = case successors of
+ [] -> []
+ ((target,info):_)
+ | length successors > 2 || edgeWeight info <= 0 -> []
+ | otherwise -> [target]
+ | otherwise
+ = case jumpDestsOfInstr (last instrs) of
+ [one] -> [one]
+ _many -> []
+
+
+seqBlocks :: LabelMap i -> [Node BlockId (GenBasicBlock t1)]
+ -> [GenBasicBlock t1]
+seqBlocks infos blocks = placeNext pullable0 todo0
+ where
+ -- pullable: Blocks that are not yet placed
+ -- todo: Original order of blocks, to be followed if we have no good
+ -- reason not to;
+ -- may include blocks that have already been placed, but then
+ -- these are not in pullable
+ pullable0 = listToUFM [ (i,(b,n)) | DigraphNode b i n <- blocks ]
+ todo0 = map node_key blocks
+
+ placeNext _ [] = []
+ placeNext pullable (i:rest)
+ | Just (block, pullable') <- lookupDeleteUFM pullable i
+ = place pullable' rest block
+ | otherwise
+ -- We already placed this block, so ignore
+ = placeNext pullable rest
+
+ place pullable todo (block,[])
+ = block : placeNext pullable todo
+ place pullable todo (block@(BasicBlock id instrs),[next])
+ | mapMember next infos
+ = block : placeNext pullable todo
+ | Just (nextBlock, pullable') <- lookupDeleteUFM pullable next
+ = BasicBlock id instrs : place pullable' todo nextBlock
+ | otherwise
+ = block : placeNext pullable todo
+ place _ _ (_,tooManyNextNodes)
+ = pprPanic "seqBlocks" (ppr tooManyNextNodes)
+
+
+lookupDeleteUFM :: Uniquable key => UniqFM elt -> key
+ -> Maybe (elt, UniqFM elt)
+lookupDeleteUFM m k = do -- Maybe monad
+ v <- lookupUFM m k
+ return (v, delFromUFM m k)
diff --git a/compiler/GHC/CmmToAsm/CFG.hs b/compiler/GHC/CmmToAsm/CFG.hs
new file mode 100644
index 0000000000..f52ff514b1
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/CFG.hs
@@ -0,0 +1,1320 @@
+--
+-- Copyright (c) 2018 Andreas Klebinger
+--
+
+{-# LANGUAGE TypeFamilies, ScopedTypeVariables #-}
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE Rank2Types #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DataKinds #-}
+
+module GHC.CmmToAsm.CFG
+ ( CFG, CfgEdge(..), EdgeInfo(..), EdgeWeight(..)
+ , TransitionSource(..)
+
+ --Modify the CFG
+ , addWeightEdge, addEdge
+ , delEdge, delNode
+ , addNodesBetween, shortcutWeightMap
+ , reverseEdges, filterEdges
+ , addImmediateSuccessor
+ , mkWeightInfo, adjustEdgeWeight, setEdgeWeight
+
+ --Query the CFG
+ , infoEdgeList, edgeList
+ , getSuccessorEdges, getSuccessors
+ , getSuccEdgesSorted
+ , getEdgeInfo
+ , getCfgNodes, hasNode
+
+ -- Loop Information
+ , loopMembers, loopLevels, loopInfo
+
+ --Construction/Misc
+ , getCfg, getCfgProc, pprEdgeWeights, sanityCheckCfg
+
+ --Find backedges and update their weight
+ , optimizeCFG
+ , mkGlobalWeights
+
+ )
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Cmm.BlockId
+import GHC.Cmm as Cmm
+
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.Dataflow.Block
+import qualified GHC.Cmm.Dataflow.Graph as G
+
+import Util
+import Digraph
+import Maybes
+
+import Unique
+import qualified GHC.CmmToAsm.CFG.Dominators as Dom
+import Data.IntMap.Strict (IntMap)
+import Data.IntSet (IntSet)
+
+import qualified Data.IntMap.Strict as IM
+import qualified Data.Map as M
+import qualified Data.IntSet as IS
+import qualified Data.Set as S
+import Data.Tree
+import Data.Bifunctor
+
+import Outputable
+-- DEBUGGING ONLY
+--import GHC.Cmm.DebugBlock
+--import OrdList
+--import GHC.Cmm.DebugBlock.Trace
+import GHC.Cmm.Ppr () -- For Outputable instances
+import qualified GHC.Driver.Session as D
+
+import Data.List (sort, nub, partition)
+import Data.STRef.Strict
+import Control.Monad.ST
+
+import Data.Array.MArray
+import Data.Array.ST
+import Data.Array.IArray
+import Data.Array.Unsafe (unsafeFreeze)
+import Data.Array.Base (unsafeRead, unsafeWrite)
+
+import Control.Monad
+
+type Prob = Double
+
+type Edge = (BlockId, BlockId)
+type Edges = [Edge]
+
+newtype EdgeWeight
+ = EdgeWeight { weightToDouble :: Double }
+ deriving (Eq,Ord,Enum,Num,Real,Fractional)
+
+instance Outputable EdgeWeight where
+ ppr (EdgeWeight w) = doublePrec 5 w
+
+type EdgeInfoMap edgeInfo = LabelMap (LabelMap edgeInfo)
+
+-- | A control flow graph where edges have been annotated with a weight.
+-- Implemented as IntMap (IntMap <edgeData>)
+-- We must uphold the invariant that for each edge A -> B we must have:
+-- An entry for B in the outer map.
+-- An entry for B in the map we get when looking up A.
+-- Maintaining this invariant is useful as any failed lookup now indicates
+-- an actual error in code which might go unnoticed for a while
+-- otherwise.
+type CFG = EdgeInfoMap EdgeInfo
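+
+-- As an illustrative sketch (block names hypothetical): a CFG with the
+-- single edge A -> B would be represented roughly as
+--   mapFromList [(A, mapFromList [(B, infoAB)]), (B, mapEmpty)]
+-- i.e. the destination B is also a key of the outer map even though it
+-- has no outgoing edges (see addEdge below, which maintains this).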
+
+data CfgEdge
+ = CfgEdge
+ { edgeFrom :: !BlockId
+ , edgeTo :: !BlockId
+ , edgeInfo :: !EdgeInfo
+ }
+
+-- | Careful! Since we assume there is at most one edge from A to B
+-- the Eq instance does not consider weight.
+instance Eq CfgEdge where
+ (==) (CfgEdge from1 to1 _) (CfgEdge from2 to2 _)
+ = from1 == from2 && to1 == to2
+
+-- | Edges are sorted ascending pointwise by weight, source and destination
+instance Ord CfgEdge where
+ compare (CfgEdge from1 to1 (EdgeInfo {edgeWeight = weight1}))
+ (CfgEdge from2 to2 (EdgeInfo {edgeWeight = weight2}))
+ | weight1 < weight2 || weight1 == weight2 && from1 < from2 ||
+ weight1 == weight2 && from1 == from2 && to1 < to2
+ = LT
+ | from1 == from2 && to1 == to2 && weight1 == weight2
+ = EQ
+ | otherwise
+ = GT
+
+instance Outputable CfgEdge where
+ ppr (CfgEdge from1 to1 edgeInfo)
+ = parens (ppr from1 <+> text "-(" <> ppr edgeInfo <> text ")->" <+> ppr to1)
+
+-- | Can we trace back an edge to a specific Cmm Node
+-- or has it been introduced during assembly codegen. We use this to maintain
+-- some information which would otherwise be lost during the
+-- Cmm <-> asm transition.
+-- See also Note [Inverting Conditional Branches]
+data TransitionSource
+ = CmmSource { trans_cmmNode :: (CmmNode O C)
+ , trans_info :: BranchInfo }
+ | AsmCodeGen
+ deriving (Eq)
+
+data BranchInfo = NoInfo -- ^ Unknown, but not heap or stack check.
+ | HeapStackCheck -- ^ Heap or stack check
+ deriving Eq
+
+instance Outputable BranchInfo where
+ ppr NoInfo = text "regular"
+ ppr HeapStackCheck = text "heap/stack"
+
+isHeapOrStackCheck :: TransitionSource -> Bool
+isHeapOrStackCheck (CmmSource { trans_info = HeapStackCheck}) = True
+isHeapOrStackCheck _ = False
+
+-- | Information about edges
+data EdgeInfo
+ = EdgeInfo
+ { transitionSource :: !TransitionSource
+ , edgeWeight :: !EdgeWeight
+ } deriving (Eq)
+
+instance Outputable EdgeInfo where
+ ppr edgeInfo = text "weight:" <+> ppr (edgeWeight edgeInfo)
+
+-- | Convenience function, generate edge info based
+-- on weight not originating from cmm.
+mkWeightInfo :: EdgeWeight -> EdgeInfo
+mkWeightInfo = EdgeInfo AsmCodeGen
+
+-- | Adjust the weight between the blocks using the given function.
+-- If there is no such edge returns the original map.
+adjustEdgeWeight :: CFG -> (EdgeWeight -> EdgeWeight)
+ -> BlockId -> BlockId -> CFG
+adjustEdgeWeight cfg f from to
+ | Just info <- getEdgeInfo from to cfg
+ , !weight <- edgeWeight info
+ , !newWeight <- f weight
+ = addEdge from to (info { edgeWeight = newWeight}) cfg
+ | otherwise = cfg
+
+-- | Set the weight between the blocks to the given weight.
+-- If there is no such edge returns the original map.
+setEdgeWeight :: CFG -> EdgeWeight
+ -> BlockId -> BlockId -> CFG
+setEdgeWeight cfg !weight from to
+ | Just info <- getEdgeInfo from to cfg
+ = addEdge from to (info { edgeWeight = weight}) cfg
+ | otherwise = cfg
+
+
+getCfgNodes :: CFG -> [BlockId]
+getCfgNodes m =
+ mapKeys m
+
+-- | Is this block part of this graph?
+hasNode :: CFG -> BlockId -> Bool
+hasNode m node =
+ -- Check the invariant that each node must exist in the first map or not at all.
+ ASSERT( found || not (any (mapMember node) m))
+ found
+ where
+ found = mapMember node m
+
+
+
+-- | Check if the nodes in the cfg and the set of blocks are the same.
+-- In case of a mismatch we panic and show the difference.
+sanityCheckCfg :: CFG -> LabelSet -> SDoc -> Bool
+sanityCheckCfg m blockSet msg
+ | blockSet == cfgNodes
+ = True
+ | otherwise =
+ pprPanic "Block list and cfg nodes don't match" (
+ text "difference:" <+> ppr diff $$
+ text "blocks:" <+> ppr blockSet $$
+ text "cfg:" <+> pprEdgeWeights m $$
+ msg )
+ False
+ where
+ cfgNodes = setFromList $ getCfgNodes m :: LabelSet
+ diff = (setUnion cfgNodes blockSet) `setDifference` (setIntersection cfgNodes blockSet) :: LabelSet
+
+-- | Filter the CFG with a custom function f.
+-- Parameters are `f from to edgeInfo`
+filterEdges :: (BlockId -> BlockId -> EdgeInfo -> Bool) -> CFG -> CFG
+filterEdges f cfg =
+ mapMapWithKey filterSources cfg
+ where
+ filterSources from m =
+ mapFilterWithKey (\to w -> f from to w) m
+
+
+{- Note [Updating the CFG during shortcutting]
+
+See Note [What is shortcutting] in the control flow optimization
+code (GHC.Cmm.ContFlowOpt) for a slightly more in depth explanation on shortcutting.
+
+In the native backend we shortcut jumps at the assembly level. (AsmCodeGen.hs)
+This means we remove blocks containing only one jump from the code
+and instead redirect all jumps targeting this block to the deleted
+block's jump target.
+
+However we want to have an accurate representation of control
+flow in the CFG. So we add/remove edges accordingly to account
+for the eliminated blocks and new edges.
+
+If we shortcut A -> B -> C to A -> C:
+* We delete edges A -> B and B -> C
+* Replacing them with the edge A -> C
+
+We also try to preserve jump weights while doing so.
+
+Note that:
+* The edge B -> C can't have interesting weights since
+ the block B consists of a single unconditional jump without branching.
+* We delete the edge A -> B and add the edge A -> C.
+* The edge A -> B can be one of many edges originating from A so likely
+ has edge weights we want to preserve.
+
+For this reason we simply store the edge info from the original A -> B
+edge and apply this information to the new edge A -> C.
+
+Sometimes we have a scenario where jump target C is not represented by a
+BlockId but an immediate value. I'm only aware of this happening without
+tables next to code currently.
+
+Then we go from A ---> B - -> IMM to A - -> IMM where the dashed arrows
+are not stored in the CFG.
+
+In that case we simply delete the edge A -> B.
+
+In terms of implementation the native backend first builds a mapping
+from blocks suitable for shortcutting to their jump targets.
+Then it redirects all jump instructions to these blocks using the
+built up mapping.
+This function (shortcutWeightMap) takes the same mapping and
+applies the mapping to the CFG in the way laid out above.
+
+-}
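+
+-- A minimal usage sketch (block names hypothetical): for blocks A -> B -> C,
+-- where B consists of a single jump to C, the backend builds the mapping
+--   mapFromList [(B, Just C)]
+-- and shortcutWeightMap applied to it removes B from the CFG and turns the
+-- old A -> B edge (keeping its EdgeInfo) into an A -> C edge.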
+shortcutWeightMap :: LabelMap (Maybe BlockId) -> CFG -> CFG
+shortcutWeightMap cuts cfg =
+ foldl' applyMapping cfg $ mapToList cuts
+ where
+-- takes the tuple (B,C) from the notation in [Updating the CFG during shortcutting]
+ applyMapping :: CFG -> (BlockId,Maybe BlockId) -> CFG
+ --Shortcut immediate
+ applyMapping m (from, Nothing) =
+ mapDelete from .
+ fmap (mapDelete from) $ m
+ --Regular shortcut
+ applyMapping m (from, Just to) =
+ let updatedMap :: CFG
+ updatedMap
+ = fmap (shortcutEdge (from,to)) $
+ (mapDelete from m :: CFG )
+ --Sometimes we can shortcut multiple blocks like so:
+ -- A -> B -> C -> D -> E => A -> E
+ -- so we check for such chains.
+ in case mapLookup to cuts of
+ Nothing -> updatedMap
+ Just dest -> applyMapping updatedMap (to, dest)
+ --Redirect edge from B to C
+ shortcutEdge :: (BlockId, BlockId) -> LabelMap EdgeInfo -> LabelMap EdgeInfo
+ shortcutEdge (from, to) m =
+ case mapLookup from m of
+ Just info -> mapInsert to info $ mapDelete from m
+ Nothing -> m
+
+-- | Sometimes we insert a block which should unconditionally be executed
+-- after a given block. This function updates the CFG for these cases.
+-- So we get A -> B => A -> A' -> B
+-- \ \
+-- -> C => -> C
+--
+addImmediateSuccessor :: BlockId -> BlockId -> CFG -> CFG
+addImmediateSuccessor node follower cfg
+ = updateEdges . addWeightEdge node follower uncondWeight $ cfg
+ where
+ uncondWeight = fromIntegral . D.uncondWeight .
+ D.cfgWeightInfo $ D.unsafeGlobalDynFlags
+ targets = getSuccessorEdges cfg node
+ successors = map fst targets :: [BlockId]
+ updateEdges = addNewSuccs . remOldSuccs
+ remOldSuccs m = foldl' (flip (delEdge node)) m successors
+ addNewSuccs m =
+ foldl' (\m' (t,info) -> addEdge follower t info m') m targets
+
+-- | Adds a new edge, overwrites existing edges if present
+addEdge :: BlockId -> BlockId -> EdgeInfo -> CFG -> CFG
+addEdge from to info cfg =
+ mapAlter addFromToEdge from $
+ mapAlter addDestNode to cfg
+ where
+ -- Simply insert the edge into the edge list.
+ addFromToEdge Nothing = Just $ mapSingleton to info
+ addFromToEdge (Just wm) = Just $ mapInsert to info wm
+ -- We must add the destination node explicitly
+ addDestNode Nothing = Just $ mapEmpty
+ addDestNode n@(Just _) = n
+
+
+-- | Adds an edge with the given weight to the cfg
+-- If there already existed an edge it is overwritten.
+-- `addWeightEdge from to weight cfg`
+addWeightEdge :: BlockId -> BlockId -> EdgeWeight -> CFG -> CFG
+addWeightEdge from to weight cfg =
+ addEdge from to (mkWeightInfo weight) cfg
+
+delEdge :: BlockId -> BlockId -> CFG -> CFG
+delEdge from to m =
+ mapAlter remDest from m
+ where
+ remDest Nothing = Nothing
+ remDest (Just wm) = Just $ mapDelete to wm
+
+delNode :: BlockId -> CFG -> CFG
+delNode node cfg =
+ fmap (mapDelete node) -- < Edges to the node
+ (mapDelete node cfg) -- < Edges from the node
+
+-- | Destinations from bid ordered by weight (descending)
+getSuccEdgesSorted :: CFG -> BlockId -> [(BlockId,EdgeInfo)]
+getSuccEdgesSorted m bid =
+ let destMap = mapFindWithDefault mapEmpty bid m
+ cfgEdges = mapToList destMap
+ sortedEdges = sortWith (negate . edgeWeight . snd) cfgEdges
+ in --pprTrace "getSuccEdgesSorted" (ppr bid <+> text "map:" <+> ppr m)
+ sortedEdges
+
+-- | Get successors of a given node with edge weights.
+getSuccessorEdges :: HasDebugCallStack => CFG -> BlockId -> [(BlockId,EdgeInfo)]
+getSuccessorEdges m bid = maybe lookupError mapToList (mapLookup bid m)
+ where
+ lookupError = pprPanic "getSuccessorEdges: Block does not exist" $
+ ppr bid <+> pprEdgeWeights m
+
+getEdgeInfo :: BlockId -> BlockId -> CFG -> Maybe EdgeInfo
+getEdgeInfo from to m
+ | Just wm <- mapLookup from m
+ , Just info <- mapLookup to wm
+ = Just $! info
+ | otherwise
+ = Nothing
+
+getEdgeWeight :: CFG -> BlockId -> BlockId -> EdgeWeight
+getEdgeWeight cfg from to =
+ edgeWeight $ expectJust "Edge weight for non-existing block" $
+ getEdgeInfo from to cfg
+
+getTransitionSource :: BlockId -> BlockId -> CFG -> TransitionSource
+getTransitionSource from to cfg = transitionSource $ expectJust "Source info for non-existing block" $
+ getEdgeInfo from to cfg
+
+reverseEdges :: CFG -> CFG
+reverseEdges cfg = mapFoldlWithKey (\cfg from toMap -> go (addNode cfg from) from toMap) mapEmpty cfg
+ where
+ -- We must preserve nodes without outgoing edges!
+ addNode :: CFG -> BlockId -> CFG
+ addNode cfg b = mapInsertWith mapUnion b mapEmpty cfg
+ go :: CFG -> BlockId -> (LabelMap EdgeInfo) -> CFG
+ go cfg from toMap = mapFoldlWithKey (\cfg to info -> addEdge to from info cfg) cfg toMap :: CFG
+
+
+-- | Returns an unordered list of all edges with info
+infoEdgeList :: CFG -> [CfgEdge]
+infoEdgeList m =
+ go (mapToList m) []
+ where
+ -- We avoid foldMap to avoid thunk buildup
+ go :: [(BlockId,LabelMap EdgeInfo)] -> [CfgEdge] -> [CfgEdge]
+ go [] acc = acc
+ go ((from,toMap):xs) acc
+ = go' xs from (mapToList toMap) acc
+ go' :: [(BlockId,LabelMap EdgeInfo)] -> BlockId -> [(BlockId,EdgeInfo)] -> [CfgEdge] -> [CfgEdge]
+ go' froms _ [] acc = go froms acc
+ go' froms from ((to,info):tos) acc
+ = go' froms from tos (CfgEdge from to info : acc)
+
+-- | Returns an unordered list of all edges without weights
+edgeList :: CFG -> [Edge]
+edgeList m =
+ go (mapToList m) []
+ where
+ -- We avoid foldMap to avoid thunk buildup
+ go :: [(BlockId,LabelMap EdgeInfo)] -> [Edge] -> [Edge]
+ go [] acc = acc
+ go ((from,toMap):xs) acc
+ = go' xs from (mapKeys toMap) acc
+ go' :: [(BlockId,LabelMap EdgeInfo)] -> BlockId -> [BlockId] -> [Edge] -> [Edge]
+ go' froms _ [] acc = go froms acc
+ go' froms from (to:tos) acc
+ = go' froms from tos ((from,to) : acc)
+
+-- | Get successors of a given node without edge weights.
+getSuccessors :: HasDebugCallStack => CFG -> BlockId -> [BlockId]
+getSuccessors m bid
+ | Just wm <- mapLookup bid m
+ = mapKeys wm
+ | otherwise = lookupError
+ where
+ lookupError = pprPanic "getSuccessors: Block does not exist" $
+ ppr bid <+> pprEdgeWeights m
+
+pprEdgeWeights :: CFG -> SDoc
+pprEdgeWeights m =
+ let edges = sort $ infoEdgeList m :: [CfgEdge]
+ printEdge (CfgEdge from to (EdgeInfo { edgeWeight = weight }))
+ = text "\t" <> ppr from <+> text "->" <+> ppr to <>
+ text "[label=\"" <> ppr weight <> text "\",weight=\"" <>
+ ppr weight <> text "\"];\n"
+ --for the case that there are no edges from/to this node.
+ --This should rarely happen but it can save a lot of time
+ --to immediately see it when it does.
+ printNode node
+ = text "\t" <> ppr node <> text ";\n"
+ getEdgeNodes (CfgEdge from to _) = [from,to]
+ edgeNodes = setFromList $ concatMap getEdgeNodes edges :: LabelSet
+ nodes = filter (\n -> (not . setMember n) edgeNodes) . mapKeys $ mapFilter null m
+ in
+ text "digraph {\n" <>
+ (foldl' (<>) empty (map printEdge edges)) <>
+ (foldl' (<>) empty (map printNode nodes)) <>
+ text "}\n"
+
+{-# INLINE updateEdgeWeight #-} --Allows eliminating the tuple when possible
+-- | Invariant: The edge **must** exist already in the graph.
+updateEdgeWeight :: (EdgeWeight -> EdgeWeight) -> Edge -> CFG -> CFG
+updateEdgeWeight f (from, to) cfg
+ | Just oldInfo <- getEdgeInfo from to cfg
+ = let !oldWeight = edgeWeight oldInfo
+ !newWeight = f oldWeight
+ in addEdge from to (oldInfo {edgeWeight = newWeight}) cfg
+ | otherwise
+ = panic "Trying to update invalid edge"
+
+-- from to oldWeight => newWeight
+mapWeights :: (BlockId -> BlockId -> EdgeWeight -> EdgeWeight) -> CFG -> CFG
+mapWeights f cfg =
+ foldl' (\cfg (CfgEdge from to info) ->
+ let oldWeight = edgeWeight info
+ newWeight = f from to oldWeight
+ in addEdge from to (info {edgeWeight = newWeight}) cfg)
+ cfg (infoEdgeList cfg)
+
+
+-- | Insert a block in the control flow between two other blocks.
+-- We pass a list of tuples (A,B,C) where
+-- * A -> C: Old edge
+-- * A -> B -> C : New Arc, where B is the new block.
+-- It's possible that a block has two jumps to the same block
+-- in the assembly code. However we still only store a single edge for
+-- these cases.
+-- We assign the old edge info to the edge A -> B and assign B -> C the
+-- weight of an unconditional jump.
+addNodesBetween :: CFG -> [(BlockId,BlockId,BlockId)] -> CFG
+addNodesBetween m updates =
+ foldl' updateWeight m .
+ weightUpdates $ updates
+ where
+ weight = fromIntegral . D.uncondWeight .
+ D.cfgWeightInfo $ D.unsafeGlobalDynFlags
+ -- We might add two blocks for different jumps along a single
+ -- edge. So we end up with edges: A -> B -> C , A -> D -> C
+ -- in this case after applying the first update the weight for A -> C
+ -- is no longer available. So we calculate future weights before updates.
+ weightUpdates = map getWeight
+ getWeight :: (BlockId,BlockId,BlockId) -> (BlockId,BlockId,BlockId,EdgeInfo)
+ getWeight (from,between,old)
+ | Just edgeInfo <- getEdgeInfo from old m
+ = (from,between,old,edgeInfo)
+ | otherwise
+ = pprPanic "Can't find weight for edge that should have one" (
+ text "triple" <+> ppr (from,between,old) $$
+ text "updates" <+> ppr updates $$
+ text "cfg:" <+> pprEdgeWeights m )
+ updateWeight :: CFG -> (BlockId,BlockId,BlockId,EdgeInfo) -> CFG
+ updateWeight m (from,between,old,edgeInfo)
+ = addEdge from between edgeInfo .
+ addWeightEdge between old weight .
+ delEdge from old $ m
+
+{-
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~ Note [CFG Edge Weights] ~~~
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Edge weights assigned do not currently represent a specific
+ cost model. Rather they are just a ranking of which blocks should
+ be placed next to each other given their connection type in
+ the CFG.
+ This is especially relevant whenever two blocks will
+ jump to the same target.
+
+ A B
+ \ /
+ C
+
+ Should A or B be placed in front of C? The block layout algorithm
+ decides this based on which edge (A,C)/(B,C) is heavier. So we
+ make an educated guess on which branch should be preferred.
+
+ We rank edges in this order:
+ * Unconditional Control Transfer - They will always
+ transfer control to their target. Unless there is a info table
+ we can turn the jump into a fallthrough as well.
+ We use 20k as default, so it's easy to spot if values have been
+ modified but unlikely that we run into issues with overflow.
+ * If branches (likely) - We assume branches marked as likely
+ are taken more than 80% of the time.
+ By ranking them below unconditional jumps we make sure we
+ prefer the unconditional if there is a conditional and
+ unconditional edge towards a block.
+ * If branches (regular) - The false branch can potentially be turned
+ into a fallthrough so we prefer it slightly over the true branch.
+ * Unlikely branches - These can be assumed to be taken less than 20%
+ of the time. So we give them one of the lowest priorities.
+ * Switches - Switches at this level are implemented as jump tables
+ so have a larger number of successors. So without more information
+ we can only say that each individual successor is unlikely to be
+ jumped to and we rank them accordingly.
+ * Calls - We currently ignore calls completely:
+ * By the time we return from a call there is a good chance
+ that the address we return to has already been evicted from
+ cache eliminating a main advantage sequential placement brings.
+ * Calls always require an info table in front of their return
+ address. This reduces the chance that we return to the same
+ cache line further.
+
+-}
+-- | Generate weights for a Cmm proc based on some simple heuristics.
+getCfgProc :: D.CfgWeights -> RawCmmDecl -> CFG
+getCfgProc _ (CmmData {}) = mapEmpty
+getCfgProc weights (CmmProc _info _lab _live graph) = getCfg weights graph
+
+getCfg :: D.CfgWeights -> CmmGraph -> CFG
+getCfg weights graph =
+ foldl' insertEdge edgelessCfg $ concatMap getBlockEdges blocks
+ where
+ D.CFGWeights
+ { D.uncondWeight = uncondWeight
+ , D.condBranchWeight = condBranchWeight
+ , D.switchWeight = switchWeight
+ , D.callWeight = callWeight
+ , D.likelyCondWeight = likelyCondWeight
+ , D.unlikelyCondWeight = unlikelyCondWeight
+ -- Last two are used in other places
+ --, D.infoTablePenalty = infoTablePenalty
+ --, D.backEdgeBonus = backEdgeBonus
+ } = weights
+ -- Explicitly add all nodes to the map to ensure that even blocks
+ -- without outgoing edges are part of the CFG.
+ edgelessCfg = mapFromList $ zip (map G.entryLabel blocks) (repeat mapEmpty)
+ insertEdge :: CFG -> ((BlockId,BlockId),EdgeInfo) -> CFG
+ insertEdge m ((from,to),weight) =
+ mapAlter f from m
+ where
+ f :: Maybe (LabelMap EdgeInfo) -> Maybe (LabelMap EdgeInfo)
+ f Nothing = Just $ mapSingleton to weight
+ f (Just destMap) = Just $ mapInsert to weight destMap
+ getBlockEdges :: CmmBlock -> [((BlockId,BlockId),EdgeInfo)]
+ getBlockEdges block =
+ case branch of
+ CmmBranch dest -> [mkEdge dest uncondWeight]
+ CmmCondBranch cond t f l
+ | l == Nothing ->
+ [mkEdge f condBranchWeight, mkEdge t condBranchWeight]
+ | l == Just True ->
+ [mkEdge f unlikelyCondWeight, mkEdge t likelyCondWeight]
+ | l == Just False ->
+ [mkEdge f likelyCondWeight, mkEdge t unlikelyCondWeight]
+ where
+ mkEdgeInfo = -- pprTrace "Info" (ppr branchInfo <+> ppr cond)
+ EdgeInfo (CmmSource branch branchInfo) . fromIntegral
+ mkEdge target weight = ((bid,target), mkEdgeInfo weight)
+ branchInfo =
+ foldRegsUsed
+ (panic "foldRegsDynFlags")
+ (\info r -> if r == SpLim || r == HpLim || r == BaseReg
+ then HeapStackCheck else info)
+ NoInfo cond
+
+ (CmmSwitch _e ids) ->
+ let switchTargets = switchTargetsToList ids
+ --Compiler performance hack - for very wide switches don't
+ --consider targets for layout.
+ adjustedWeight =
+ if (length switchTargets > 10) then -1 else switchWeight
+ in map (\x -> mkEdge x adjustedWeight) switchTargets
+ (CmmCall { cml_cont = Just cont}) -> [mkEdge cont callWeight]
+ (CmmForeignCall {Cmm.succ = cont}) -> [mkEdge cont callWeight]
+ (CmmCall { cml_cont = Nothing }) -> []
+ other ->
+ panic "Foo" $
+ ASSERT2(False, ppr "Unknown successor cause:" <>
+ (ppr branch <+> text "=>" <> ppr (G.successors other)))
+ map (\x -> ((bid,x),mkEdgeInfo 0)) $ G.successors other
+ where
+ bid = G.entryLabel block
+ mkEdgeInfo = EdgeInfo (CmmSource branch NoInfo) . fromIntegral
+ mkEdge target weight = ((bid,target), mkEdgeInfo weight)
+ branch = lastNode block :: CmmNode O C
+
+ blocks = revPostorder graph :: [CmmBlock]
+
+--Find back edges by BFS
+findBackEdges :: HasDebugCallStack => BlockId -> CFG -> Edges
+findBackEdges root cfg =
+ --pprTraceIt "Backedges:" $
+ map fst .
+ filter (\x -> snd x == Backward) $ typedEdges
+ where
+ edges = edgeList cfg :: [(BlockId,BlockId)]
+ getSuccs = getSuccessors cfg :: BlockId -> [BlockId]
+ typedEdges =
+ classifyEdges root getSuccs edges :: [((BlockId,BlockId),EdgeType)]
+
+
+optimizeCFG :: D.CfgWeights -> RawCmmDecl -> CFG -> CFG
+optimizeCFG _ (CmmData {}) cfg = cfg
+optimizeCFG weights (CmmProc info _lab _live graph) cfg =
+ {-# SCC optimizeCFG #-}
+ -- pprTrace "Initial:" (pprEdgeWeights cfg) $
+ -- pprTrace "Initial:" (ppr $ mkGlobalWeights (g_entry graph) cfg) $
+
+ -- pprTrace "LoopInfo:" (ppr $ loopInfo cfg (g_entry graph)) $
+ favourFewerPreds .
+ penalizeInfoTables info .
+ increaseBackEdgeWeight (g_entry graph) $ cfg
+ where
+
+ -- | Increase the weight of all backedges in the CFG
+ -- this helps to make loop jumpbacks the heaviest edges
+ increaseBackEdgeWeight :: BlockId -> CFG -> CFG
+ increaseBackEdgeWeight root cfg =
+ let backedges = findBackEdges root cfg
+ update weight
+ --Keep irrelevant edges irrelevant
+ | weight <= 0 = 0
+ | otherwise
+ = weight + fromIntegral (D.backEdgeBonus weights)
+ in foldl' (\cfg edge -> updateEdgeWeight update edge cfg)
+ cfg backedges
+
+ -- | Since we can't fall through to info tables we penalize these.
+ penalizeInfoTables :: LabelMap a -> CFG -> CFG
+ penalizeInfoTables info cfg =
+ mapWeights fupdate cfg
+ where
+ fupdate :: BlockId -> BlockId -> EdgeWeight -> EdgeWeight
+ fupdate _ to weight
+ | mapMember to info
+ = weight - (fromIntegral $ D.infoTablePenalty weights)
+ | otherwise = weight
+
+ -- | If a block has two successors, favour the one with fewer
+ -- predecessors and/or the one allowing fall through.
+ favourFewerPreds :: CFG -> CFG
+ favourFewerPreds cfg =
+ let
+ revCfg =
+ reverseEdges $ filterEdges
+ (\_from -> fallthroughTarget) cfg
+
+ predCount n = length $ getSuccessorEdges revCfg n
+ nodes = getCfgNodes cfg
+
+ modifiers :: Int -> Int -> (EdgeWeight, EdgeWeight)
+ modifiers preds1 preds2
+ | preds1 < preds2 = ( 1,-1)
+ | preds1 == preds2 = ( 0, 0)
+ | otherwise = (-1, 1)
+
+ update :: CFG -> BlockId -> CFG
+ update cfg node
+ | [(s1,e1),(s2,e2)] <- getSuccessorEdges cfg node
+ , !w1 <- edgeWeight e1
+ , !w2 <- edgeWeight e2
+ --Only change the weights if there isn't already an ordering.
+ , w1 == w2
+ , (mod1,mod2) <- modifiers (predCount s1) (predCount s2)
+ = (\cfg' ->
+ (adjustEdgeWeight cfg' (+mod2) node s2))
+ (adjustEdgeWeight cfg (+mod1) node s1)
+ | otherwise
+ = cfg
+ in foldl' update cfg nodes
+ where
+ fallthroughTarget :: BlockId -> EdgeInfo -> Bool
+ fallthroughTarget to (EdgeInfo source _weight)
+ | mapMember to info = False
+ | AsmCodeGen <- source = True
+ | CmmSource { trans_cmmNode = CmmBranch {} } <- source = True
+ | CmmSource { trans_cmmNode = CmmCondBranch {} } <- source = True
+ | otherwise = False
+
+-- | Determine loop membership of blocks based on SCC analysis
+-- This is faster but only gives yes/no answers.
+loopMembers :: HasDebugCallStack => CFG -> LabelMap Bool
+loopMembers cfg =
+ foldl' (flip setLevel) mapEmpty sccs
+ where
+ mkNode :: BlockId -> Node BlockId BlockId
+ mkNode bid = DigraphNode bid bid (getSuccessors cfg bid)
+ nodes = map mkNode (getCfgNodes cfg)
+
+ sccs = stronglyConnCompFromEdgedVerticesOrd nodes
+
+ setLevel :: SCC BlockId -> LabelMap Bool -> LabelMap Bool
+ setLevel (AcyclicSCC bid) m = mapInsert bid False m
+ setLevel (CyclicSCC bids) m = foldl' (\m k -> mapInsert k True m) m bids
+
+loopLevels :: CFG -> BlockId -> LabelMap Int
+loopLevels cfg root = liLevels loopInfos
+ where
+ loopInfos = loopInfo cfg root
+
+data LoopInfo = LoopInfo
+ { liBackEdges :: [(Edge)] -- ^ List of back edges
+ , liLevels :: LabelMap Int -- ^ BlockId -> LoopLevel mapping
+ , liLoops :: [(Edge, LabelSet)] -- ^ (backEdge, loopBody), body includes header
+ }
+
+instance Outputable LoopInfo where
+ ppr (LoopInfo _ _lvls loops) =
+ text "Loops:(backEdge, bodyNodes)" $$
+ (vcat $ map ppr loops)
+
+{- Note [Determining the loop body]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Starting with the knowledge that:
+ * head dominates the loop
+ * `tail` -> `head` is a backedge
+
+ We can determine all nodes by:
+ * Deleting the loop head from the graph.
+ * Collecting all blocks which are reachable from the `tail`.
+
+ We do so by performing a BFS from the tail node towards the head.
+ -}
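+
+-- A small worked sketch (hypothetical blocks): for a loop 1 -> 2 -> 3 -> 1
+-- with back edge (3,1) we delete the head 1 and walk the reversed graph
+-- from the tail 3, reaching {3,2}; adding the head back gives the loop
+-- body {1,2,3}.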
+
+-- | Determine loop membership of blocks based on Dominator analysis.
+-- This is slower but gives loop levels instead of just loop membership.
+-- However it only detects natural loops. Irreducible control flow is not
+-- recognized even if it loops. But that is rare enough that we don't have
+-- to care about that special case.
+loopInfo :: HasDebugCallStack => CFG -> BlockId -> LoopInfo
+loopInfo cfg root = LoopInfo { liBackEdges = backEdges
+ , liLevels = mapFromList loopCounts
+ , liLoops = loopBodies }
+ where
+ revCfg = reverseEdges cfg
+
+ graph = -- pprTrace "CFG - loopInfo" (pprEdgeWeights cfg) $
+ fmap (setFromList . mapKeys ) cfg :: LabelMap LabelSet
+
+
+ --TODO - This should be a no op: Export constructors? Use unsafeCoerce? ...
+ rooted = ( fromBlockId root
+ , toIntMap $ fmap toIntSet graph) :: (Int, IntMap IntSet)
+ tree = fmap toBlockId $ Dom.domTree rooted :: Tree BlockId
+
+ -- Map from Nodes to their dominators
+ domMap :: LabelMap LabelSet
+ domMap = mkDomMap tree
+
+ edges = edgeList cfg :: [(BlockId, BlockId)]
+ -- We can't recompute nodes from edges, there might be blocks not connected via edges.
+ nodes = getCfgNodes cfg :: [BlockId]
+
+ -- identify back edges
+ isBackEdge (from,to)
+ | Just doms <- mapLookup from domMap
+ , setMember to doms
+ = True
+ | otherwise = False
+
+ -- See Note [Determining the loop body]
+ -- Get the loop body associated with a back edge.
+ findBody edge@(tail, head)
+ = ( edge, setInsert head $ go (setSingleton tail) (setSingleton tail) )
+ where
+ -- See Note [Determining the loop body]
+ cfg' = delNode head revCfg
+
+ go :: LabelSet -> LabelSet -> LabelSet
+ go found current
+ | setNull current = found
+ | otherwise = go (setUnion newSuccessors found)
+ newSuccessors
+ where
+ -- Really predecessors, since we use the reversed cfg.
+ newSuccessors = setFilter (\n -> not $ setMember n found) successors :: LabelSet
+ successors = setFromList $ concatMap
+ (getSuccessors cfg')
+ -- we filter head as it's no longer part of the cfg.
+ (filter (/= head) $ setElems current) :: LabelSet
+
+ backEdges = filter isBackEdge edges
+ loopBodies = map findBody backEdges :: [(Edge, LabelSet)]
+
+ -- Block b is part of n loop bodies => loop nest level of n
+ loopCounts =
+ let bodies = map (first snd) loopBodies -- [(Header, Body)]
+ loopCount n = length $ nub . map fst . filter (setMember n . snd) $ bodies
+ in map (\n -> (n, loopCount n)) $ nodes :: [(BlockId, Int)]
+
+ toIntSet :: LabelSet -> IntSet
+ toIntSet s = IS.fromList . map fromBlockId . setElems $ s
+ toIntMap :: LabelMap a -> IntMap a
+ toIntMap m = IM.fromList $ map (\(x,y) -> (fromBlockId x,y)) $ mapToList m
+
+ mkDomMap :: Tree BlockId -> LabelMap LabelSet
+ mkDomMap root = mapFromList $ go setEmpty root
+ where
+ go :: LabelSet -> Tree BlockId -> [(Label,LabelSet)]
+ go parents (Node lbl [])
+ = [(lbl, parents)]
+ go parents (Node _ leaves)
+ = let nodes = map rootLabel leaves
+ entries = map (\x -> (x,parents)) nodes
+ in entries ++ concatMap
+ (\n -> go (setInsert (rootLabel n) parents) n)
+ leaves
+
+ fromBlockId :: BlockId -> Int
+ fromBlockId = getKey . getUnique
+
+ toBlockId :: Int -> BlockId
+ toBlockId = mkBlockId . mkUniqueGrimily
+
+-- We make the CFG a Hoopl Graph, so we can reuse revPostOrder.
+newtype BlockNode (e :: Extensibility) (x :: Extensibility) = BN (BlockId,[BlockId])
+
+instance G.NonLocal (BlockNode) where
+ entryLabel (BN (lbl,_)) = lbl
+ successors (BN (_,succs)) = succs
+
+revPostorderFrom :: HasDebugCallStack => CFG -> BlockId -> [BlockId]
+revPostorderFrom cfg root =
+ map fromNode $ G.revPostorderFrom hooplGraph root
+ where
+ nodes = getCfgNodes cfg
+ hooplGraph = foldl' (\m n -> mapInsert n (toNode n) m) mapEmpty nodes
+
+ fromNode :: BlockNode C C -> BlockId
+ fromNode (BN x) = fst x
+
+ toNode :: BlockId -> BlockNode C C
+ toNode bid =
+ BN (bid,getSuccessors cfg $ bid)
+
+
+-- | We take in a CFG which has on its edges weights which are
+-- relative only to other edges originating from the same node.
+--
+-- We return a CFG for which each edge represents a GLOBAL weight.
+-- This means edge weights are comparable across the whole graph.
+--
+-- For irreducible control flow results might be imprecise, otherwise they
+-- are reliable.
+--
+-- The algorithm is based on the Paper
+-- "Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus
+-- The only big change is that we go over the nodes in the body of loops in
+-- reverse post order, which is required for diamond control flow to work properly.
+--
+-- We also apply a few prediction heuristics (based on the same paper)
+
+{-# NOINLINE mkGlobalWeights #-}
+{-# SCC mkGlobalWeights #-}
+mkGlobalWeights :: HasDebugCallStack => BlockId -> CFG -> (LabelMap Double, LabelMap (LabelMap Double))
+mkGlobalWeights root localCfg
+ | null localCfg = panic "Error - Empty CFG"
+ | otherwise
+ = (blockFreqs', edgeFreqs')
+ where
+ -- Calculate fixpoints
+ (blockFreqs, edgeFreqs) = calcFreqs nodeProbs backEdges' bodies' revOrder'
+ blockFreqs' = mapFromList $ map (first fromVertex) (assocs blockFreqs) :: LabelMap Double
+ edgeFreqs' = fmap fromVertexMap $ fromVertexMap edgeFreqs
+
+ fromVertexMap :: IM.IntMap x -> LabelMap x
+ fromVertexMap m = mapFromList . map (first fromVertex) $ IM.toList m
+
+ revOrder = revPostorderFrom localCfg root :: [BlockId]
+ loopResults@(LoopInfo backedges _levels bodies) = loopInfo localCfg root
+
+ revOrder' = map toVertex revOrder
+ backEdges' = map (bimap toVertex toVertex) backedges
+ bodies' = map calcBody bodies
+
+ estimatedCfg = staticBranchPrediction root loopResults localCfg
+ -- Normalize the weights to probabilities and apply heuristics
+ nodeProbs = cfgEdgeProbabilities estimatedCfg toVertex
+
+ -- By mapping vertices to numbers in reverse post order we can bring any subset into reverse post
+ -- order simply by sorting.
+ -- TODO: The sort is redundant if we can guarantee that setElems returns elements ascending
+ calcBody (backedge, blocks) =
+ (toVertex $ snd backedge, sort . map toVertex $ (setElems blocks))
+
+ vertexMapping = mapFromList $ zip revOrder [0..] :: LabelMap Int
+ blockMapping = listArray (0,mapSize vertexMapping - 1) revOrder :: Array Int BlockId
+ -- Map from blockId to indices starting at zero
+ toVertex :: BlockId -> Int
+ toVertex blockId = expectJust "mkGlobalWeights" $ mapLookup blockId vertexMapping
+ -- Map from indices starting at zero to blockIds
+ fromVertex :: Int -> BlockId
+ fromVertex vertex = blockMapping ! vertex
+
+{- Note [Static Branch Prediction]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The work here has been based on the paper
+"Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus.
+
+The primary difference is that if we branch on the result of a heap
+check we do not apply any of the heuristics.
+The reason is simple: such branches look like loops in the control flow graph
+but are usually never entered, and if they are, at most once.
+
+Currently implemented is a heuristic to predict that we do not exit
+loops (lehPredicts) and one to predict that backedges are more likely
+than any other edge.
+
+The back edge case is special as it supersedes any other heuristic if it
+applies.
+
+Do NOT rely solely on nofib results for benchmarking this. I recommend at least
+comparing megaparsec and container benchmarks. Nofib does not seem to have
+many instances of "loopy" Cmm where these make a difference.
+
+TODO:
+* The paper contains more heuristics which should be implemented.
+* If we turn the likelihood on if/else branches into a probability
+ instead of true/false we could implement this as a Cmm pass.
+ + The complete Cmm code still exists and can be accessed by the heuristics.
+ + There is no chance of register allocation/codegen inserting branches/blocks
+   and thereby making the TransitionSource info wrong.
+ + Potential to use this information in CmmPasses.
+ - Requires refactoring of all the code relying on the binary nature of likelihood.
+ - Requires refactoring `loopInfo` to work on both Cmm graphs and the backend CFG.
+-}
+
+-- | Combination of target node id and information about the branch
+-- we are looking at.
+type TargetNodeInfo = (BlockId, EdgeInfo)
+
+
+-- | Update branch weights based on certain heuristics.
+-- See Note [Static Branch Prediction]
+-- TODO: This should be combined with optimizeCFG
+{-# SCC staticBranchPrediction #-}
+staticBranchPrediction :: BlockId -> LoopInfo -> CFG -> CFG
+staticBranchPrediction _root (LoopInfo l_backEdges loopLevels l_loops) cfg =
+ -- pprTrace "staticEstimatesOn" (ppr (cfg)) $
+ foldl' update cfg nodes
+ where
+ nodes = getCfgNodes cfg
+ backedges = S.fromList $ l_backEdges
+ -- Loops keyed by their back edge
+ loops = M.fromList $ l_loops :: M.Map Edge LabelSet
+ loopHeads = S.fromList $ map snd $ M.keys loops
+
+ update :: CFG -> BlockId -> CFG
+ update cfg node
+ -- No successors, nothing to do.
+ | null successors = cfg
+
+ -- Mix of backedges and others:
+ -- Always predict the backedges.
+ | not (null m) && length m < length successors
+ -- Heap/Stack checks "loop", but only once.
+ -- So we simply exclude any case involving them.
+ , not $ any (isHeapOrStackCheck . transitionSource . snd) successors
+ = let loopChance = repeat $! pred_LBH / (fromIntegral $ length m)
+ exitChance = repeat $! (1 - pred_LBH) / fromIntegral (length not_m)
+ updates = zip (map fst m) loopChance ++ zip (map fst not_m) exitChance
+ in -- pprTrace "mix" (ppr (node,successors)) $
+ foldl' (\cfg (to,weight) -> setEdgeWeight cfg weight node to) cfg updates
+
+ -- For (regular) non-binary branches we keep the weights from the STG -> Cmm translation.
+ | length successors /= 2
+ = cfg
+
+ -- Only backedges - no need to adjust
+ | length m > 0
+ = cfg
+
+ -- A regular binary branch; we can plug additional predictors in here.
+ | [(s1,s1_info),(s2,s2_info)] <- successors
+ , not $ any (isHeapOrStackCheck . transitionSource . snd) successors
+ = -- Normalize weights to total of 1
+ let !w1 = max (edgeWeight s1_info) (0)
+ !w2 = max (edgeWeight s2_info) (0)
+ -- If both weights are <= 0 we set both to 0.5
+ normalizeWeight w = if w1 + w2 == 0 then 0.5 else w/(w1+w2)
+ !cfg' = setEdgeWeight cfg (normalizeWeight w1) node s1
+ !cfg'' = setEdgeWeight cfg' (normalizeWeight w2) node s2
+
+ -- Figure out which heuristics apply to these successors
+ heuristics = map ($ ((s1,s1_info),(s2,s2_info)))
+ [lehPredicts, phPredicts, ohPredicts, ghPredicts, lhhPredicts, chPredicts
+ , shPredicts, rhPredicts]
+ -- Apply result of a heuristic. Argument is the likelihood
+ -- predicted for s1.
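+ -- As a worked example (illustrative numbers only): with current
+ -- probabilities s1_old = 0.5 and s2_old = 0.5 and a heuristic predicting
+ -- 0.75 for s1, d = 0.5*0.75 + 0.5*0.25 = 0.5, so s1 is updated to
+ -- 0.375/0.5 = 0.75 and s2 to 0.125/0.5 = 0.25.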
+ applyHeuristic :: CFG -> Maybe Prob -> CFG
+ applyHeuristic cfg Nothing = cfg
+ applyHeuristic cfg (Just (s1_pred :: Double))
+ | s1_old == 0 || s2_old == 0 ||
+ isHeapOrStackCheck (transitionSource s1_info) ||
+ isHeapOrStackCheck (transitionSource s2_info)
+ = cfg
+ | otherwise =
+ let -- Predictions from heuristic
+ s1_prob = EdgeWeight s1_pred :: EdgeWeight
+ s2_prob = 1.0 - s1_prob
+ -- Update
+ d = (s1_old * s1_prob) + (s2_old * s2_prob) :: EdgeWeight
+ s1_prob' = s1_old * s1_prob / d
+ !s2_prob' = s2_old * s2_prob / d
+ !cfg_s1 = setEdgeWeight cfg s1_prob' node s1
+ in -- pprTrace "Applying heuristic!" (ppr (node,s1,s2) $$ ppr (s1_prob', s2_prob')) $
+ setEdgeWeight cfg_s1 s2_prob' node s2
+ where
+ -- Old weights
+ s1_old = getEdgeWeight cfg node s1
+ s2_old = getEdgeWeight cfg node s2
+
+ in
+ -- pprTraceIt "RegularCfgResult" $
+ foldl' applyHeuristic cfg'' heuristics
+
+ -- Branch on heap/stack check
+ | otherwise = cfg
+
+ where
+ -- Chance that loops are taken.
+ pred_LBH = 0.875
+ -- successors
+ successors = getSuccessorEdges cfg node
+ -- backedges
+ (m,not_m) = partition (\succ -> S.member (node, fst succ) backedges) successors
+
+ -- Heuristics return Nothing if they don't say anything about this branch,
+ -- or Just prob_s1, where prob_s1 is the likelihood of s1 being the
+ -- taken branch. s1 is the branch taken in the true case.
+
+ -- Loop exit heuristic.
+ -- We are unlikely to leave a loop unless it's to enter another one.
+ pred_LEH = 0.75
+ -- If and only if no successor is a loop header,
+ -- then we will likely not exit the current loop body.
+ lehPredicts :: (TargetNodeInfo,TargetNodeInfo) -> Maybe Prob
+ lehPredicts ((s1,_s1_info),(s2,_s2_info))
+ | S.member s1 loopHeads || S.member s2 loopHeads
+ = Nothing
+
+ | otherwise
+ = --pprTrace "lehPredict:" (ppr $ compare s1Level s2Level) $
+ case compare s1Level s2Level of
+ EQ -> Nothing
+ LT -> Just (1-pred_LEH) --s1 exits to a shallower loop level (exits loop)
+ GT -> Just (pred_LEH) --s1 exits to a deeper loop level
+ where
+ s1Level = mapLookup s1 loopLevels
+ s2Level = mapLookup s2 loopLevels
+
+ -- A comparison against a constant is unlikely to come out equal.
+ ohPredicts (s1,_s2)
+ | CmmSource { trans_cmmNode = src1 } <- getTransitionSource node (fst s1) cfg
+ , CmmCondBranch cond ltrue _lfalse likely <- src1
+ , likely == Nothing
+ , CmmMachOp mop args <- cond
+ , MO_Eq {} <- mop
+ , not (null [x | x@CmmLit{} <- args])
+ = if fst s1 == ltrue then Just 0.3 else Just 0.7
+
+ | otherwise
+ = Nothing
+
+ -- TODO: These are all the other heuristics from the paper.
+ -- Not all will apply, for now we just stub them out as Nothing.
+ phPredicts = const Nothing
+ ghPredicts = const Nothing
+ lhhPredicts = const Nothing
+ chPredicts = const Nothing
+ shPredicts = const Nothing
+ rhPredicts = const Nothing
+
+-- We normalize all edge weights as probabilities between 0 and 1.
+-- Ignoring rounding errors all outgoing edges sum up to 1.
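+-- For instance (toy weights): a node with outgoing edge weights 30 and 10
+-- ends up with probabilities 0.75 and 0.25. Negative weights are clamped to
+-- zero first, and if every outgoing weight is zero the edges share a uniform
+-- 1/n probability.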
+cfgEdgeProbabilities :: CFG -> (BlockId -> Int) -> IM.IntMap (IM.IntMap Prob)
+cfgEdgeProbabilities cfg toVertex
+ = mapFoldlWithKey foldEdges IM.empty cfg
+ where
+ foldEdges = (\m from toMap -> IM.insert (toVertex from) (normalize toMap) m)
+
+ normalize :: (LabelMap EdgeInfo) -> (IM.IntMap Prob)
+ normalize weightMap
+ | edgeCount <= 1 = mapFoldlWithKey (\m k _ -> IM.insert (toVertex k) 1.0 m) IM.empty weightMap
+ | otherwise = mapFoldlWithKey (\m k _ -> IM.insert (toVertex k) (normalWeight k) m) IM.empty weightMap
+ where
+ edgeCount = mapSize weightMap
+ -- Negative weights are generally allowed but are mapped to zero.
+ -- We then check if there is at least one non-zero edge and if not
+ -- assign uniform weights to all branches.
+ minWeight = 0 :: Prob
+ weightMap' = fmap (\w -> max (weightToDouble . edgeWeight $ w) minWeight) weightMap
+ totalWeight = sum weightMap'
+
+ normalWeight :: BlockId -> Prob
+ normalWeight bid
+ | totalWeight == 0
+ = 1.0 / fromIntegral edgeCount
+ | Just w <- mapLookup bid weightMap'
+ = w/totalWeight
+ | otherwise = panic "impossible"
+
+-- This is the fixpoint algorithm from
+-- "Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus
+-- The adaptation to Haskell is my own.
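+--
+-- As a rough sanity check (illustrative numbers): for a loop header that is
+-- entered once and whose back edge is taken with probability p, the fixpoint
+-- assigns the header a frequency of 1/(1-p), so p = 0.9 gives an expected
+-- execution count of 10 per loop entry. The cyclic probability is capped at
+-- 1 - 1/512 below, so (near-)certain back edges cannot blow the counts up
+-- beyond a factor of 512.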
+calcFreqs :: IM.IntMap (IM.IntMap Prob) -> [(Int,Int)] -> [(Int, [Int])] -> [Int]
+ -> (Array Int Double, IM.IntMap (IM.IntMap Prob))
+calcFreqs graph backEdges loops revPostOrder = runST $ do
+ visitedNodes <- newArray (0,nodeCount-1) False :: ST s (STUArray s Int Bool)
+ blockFreqs <- newArray (0,nodeCount-1) 0.0 :: ST s (STUArray s Int Double)
+ edgeProbs <- newSTRef graph
+ edgeBackProbs <- newSTRef graph
+
+ -- let traceArray a = do
+ -- vs <- forM [0..nodeCount-1] $ \i -> readArray a i >>= (\v -> return (i,v))
+ -- trace ("array: " ++ show vs) $ return ()
+
+ let -- See #1600, we need to inline or unboxing makes perf worse.
+ -- {-# INLINE getFreq #-}
+ {-# INLINE visited #-}
+ visited b = unsafeRead visitedNodes b
+ getFreq b = unsafeRead blockFreqs b
+ -- setFreq :: forall s. Int -> Double -> ST s ()
+ setFreq b f = unsafeWrite blockFreqs b f
+ -- setVisited :: forall s. Node -> ST s ()
+ setVisited b = unsafeWrite visitedNodes b True
+ -- Frequency/probability that edge is taken.
+ getProb' arr b1 b2 = readSTRef arr >>=
+ (\graph ->
+ return .
+ fromMaybe (error "getFreq 1") .
+ IM.lookup b2 .
+ fromMaybe (error "getFreq 2") $
+ (IM.lookup b1 graph)
+ )
+ setProb' arr b1 b2 prob = do
+ g <- readSTRef arr
+ let !m = fromMaybe (error "Foo") $ IM.lookup b1 g
+ !m' = IM.insert b2 prob m
+ writeSTRef arr $! (IM.insert b1 m' g)
+
+ getEdgeFreq b1 b2 = getProb' edgeProbs b1 b2
+ setEdgeFreq b1 b2 = setProb' edgeProbs b1 b2
+ getProb b1 b2 = fromMaybe (error "getProb") $ do
+ m' <- IM.lookup b1 graph
+ IM.lookup b2 m'
+
+ getBackProb b1 b2 = getProb' edgeBackProbs b1 b2
+ setBackProb b1 b2 = setProb' edgeBackProbs b1 b2
+
+
+ let -- calcOutFreqs :: Node -> ST s ()
+ calcOutFreqs bhead block = do
+ !f <- getFreq block
+ forM (successors block) $ \bi -> do
+ let !prob = getProb block bi
+ let !succFreq = f * prob
+ setEdgeFreq block bi succFreq
+ -- traceM $ "SetOut: " ++ show (block, bi, f, prob, succFreq)
+ when (bi == bhead) $ setBackProb block bi succFreq
+
+
+ let propFreq block head = do
+ -- traceM ("prop:" ++ show (block,head))
+ -- traceShowM block
+
+ !v <- visited block
+ if v then
+ return () -- Don't look at nodes twice
+ else if block == head then
+ setFreq block 1.0 -- Loop header frequency is always 1
+ else do
+ let preds = IS.elems $ predecessors block
+ irreducible <- (fmap or) $ forM preds $ \bp -> do
+ !bp_visited <- visited bp
+ let bp_backedge = isBackEdge bp block
+ return (not bp_visited && not bp_backedge)
+
+ if irreducible
+ then return () -- Rare we don't care
+ else do
+ setFreq block 0
+ !cycleProb <- sum <$> (forM preds $ \pred -> do
+ if isBackEdge pred block
+ then
+ getBackProb pred block
+ else do
+ !f <- getFreq block
+ !prob <- getEdgeFreq pred block
+ setFreq block $! f + prob
+ return 0)
+ -- traceM $ "cycleProb:" ++ show cycleProb
+ let limit = 1 - 1/512 -- Paper uses 1 - epsilon, but this works.
+ -- This determines how large likelihoods in loops can grow.
+ !cycleProb <- return $ min cycleProb limit -- <- return $ if cycleProb > limit then limit else cycleProb
+ -- traceM $ "cycleProb:" ++ show cycleProb
+
+ !f <- getFreq block
+ setFreq block (f / (1.0 - cycleProb))
+
+ setVisited block
+ calcOutFreqs head block
+
+ -- Loops, by nesting, inner to outer
+ forM_ loops $ \(head, body) -> do
+ forM_ [0 .. nodeCount - 1] (\i -> unsafeWrite visitedNodes i True) -- Mark all nodes as visited.
+ forM_ body (\i -> unsafeWrite visitedNodes i False) -- Mark all blocks reachable from head as not visited
+ forM_ body $ \block -> propFreq block head
+
+ -- After dealing with all loops, deal with non-looping parts of the CFG
+ forM_ [0 .. nodeCount - 1] (\i -> unsafeWrite visitedNodes i False) -- Everything in revPostOrder is reachable
+ forM_ revPostOrder $ \block -> propFreq block (head revPostOrder)
+
+ -- trace ("Final freqs:") $ return ()
+ -- let freqString = pprFreqs freqs
+ -- trace (unlines freqString) $ return ()
+ -- trace (pprFre) $ return ()
+ graph' <- readSTRef edgeProbs
+ freqs' <- unsafeFreeze blockFreqs
+
+ return (freqs', graph')
+ where
+ -- How can these lookups fail? Consider the CFG [A -> B]
+ predecessors :: Int -> IS.IntSet
+ predecessors b = fromMaybe IS.empty $ IM.lookup b revGraph
+ successors :: Int -> [Int]
+ successors b = fromMaybe (lookupError "succ" b graph)$ IM.keys <$> IM.lookup b graph
+ lookupError s b g = pprPanic ("Lookup error " ++ s) $
+ ( text "node" <+> ppr b $$
+ text "graph" <+>
+ vcat (map (\(k,m) -> ppr (k,m :: IM.IntMap Double)) $ IM.toList g)
+ )
+
+ nodeCount = IM.foldl' (\count toMap -> IM.foldlWithKey' countTargets count toMap) (IM.size graph) graph
+ where
+ countTargets = (\count k _ -> countNode k + count )
+ countNode n = if IM.member n graph then 0 else 1
+
+ isBackEdge from to = S.member (from,to) backEdgeSet
+ backEdgeSet = S.fromList backEdges
+
+ revGraph :: IntMap IntSet
+ revGraph = IM.foldlWithKey' (\m from toMap -> addEdges m from toMap) IM.empty graph
+ where
+ addEdges m0 from toMap = IM.foldlWithKey' (\m k _ -> addEdge m from k) m0 toMap
+ addEdge m0 from to = IM.insertWith IS.union to (IS.singleton from) m0
diff --git a/compiler/GHC/CmmToAsm/CFG/Dominators.hs b/compiler/GHC/CmmToAsm/CFG/Dominators.hs
new file mode 100644
index 0000000000..b9dcacd8cb
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/CFG/Dominators.hs
@@ -0,0 +1,597 @@
+{-# LANGUAGE RankNTypes, BangPatterns, FlexibleContexts, Strict #-}
+
+{- |
+ Module : GHC.CmmToAsm.CFG.Dominators
+ Copyright : (c) Matt Morrow 2009
+ License : BSD3
+ Maintainer : <morrow@moonpatio.com>
+ Stability : experimental
+ Portability : portable
+
+ The Lengauer-Tarjan graph dominators algorithm.
+
+ \[1\] Lengauer, Tarjan,
+ /A Fast Algorithm for Finding Dominators in a Flowgraph/, 1979.
+
+ \[2\] Muchnick,
+ /Advanced Compiler Design and Implementation/, 1997.
+
+ \[3\] Brisk, Sarrafzadeh,
+ /Interference Graphs for Procedures in Static Single/
+ /Information Form are Interval Graphs/, 2007.
+
+ Originally taken from the dom-lt package.
+-}
+
+module GHC.CmmToAsm.CFG.Dominators (
+ Node,Path,Edge
+ ,Graph,Rooted
+ ,idom,ipdom
+ ,domTree,pdomTree
+ ,dom,pdom
+ ,pddfs,rpddfs
+ ,fromAdj,fromEdges
+ ,toAdj,toEdges
+ ,asTree,asGraph
+ ,parents,ancestors
+) where
+
+import GhcPrelude
+
+import Data.Bifunctor
+import Data.Tuple (swap)
+
+import Data.Tree
+import Data.IntMap(IntMap)
+import Data.IntSet(IntSet)
+import qualified Data.IntMap.Strict as IM
+import qualified Data.IntSet as IS
+
+import Control.Monad
+import Control.Monad.ST.Strict
+
+import Data.Array.ST
+import Data.Array.Base hiding ((!))
+ -- (unsafeNewArray_
+ -- ,unsafeWrite,unsafeRead
+ -- ,readArray,writeArray)
+
+import Util (debugIsOn)
+
+-----------------------------------------------------------------------------
+
+type Node = Int
+type Path = [Node]
+type Edge = (Node,Node)
+type Graph = IntMap IntSet
+type Rooted = (Node, Graph)
+
+-----------------------------------------------------------------------------
+
+-- | /Dominators/.
+-- Complexity as for @idom@
+dom :: Rooted -> [(Node, Path)]
+dom = ancestors . domTree
+
+-- | /Post-dominators/.
+-- Complexity as for @idom@.
+pdom :: Rooted -> [(Node, Path)]
+pdom = ancestors . pdomTree
+
+-- | /Dominator tree/.
+-- Complexity as for @idom@.
+domTree :: Rooted -> Tree Node
+domTree a@(r,_) =
+ let is = filter ((/=r).fst) (idom a)
+ tg = fromEdges (fmap swap is)
+ in asTree (r,tg)
+
+-- | /Post-dominator tree/.
+-- Complexity as for @idom@.
+pdomTree :: Rooted -> Tree Node
+pdomTree a@(r,_) =
+ let is = filter ((/=r).fst) (ipdom a)
+ tg = fromEdges (fmap swap is)
+ in asTree (r,tg)
+
+-- | /Immediate dominators/.
+-- /O(|E|*alpha(|E|,|V|))/, where /alpha(m,n)/ is
+-- \"a functional inverse of Ackermann's function\".
+--
+-- This complexity bound assumes /O(1)/ indexing. Since we're
+-- using @IntMap@, it has an additional /lg |V|/ factor
+-- somewhere in there. I'm not sure where.
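+--
+-- A small usage sketch (diamond-shaped graph; result order may differ):
+--
+-- > idom (0, fromAdj [(0,[1,2]), (1,[3]), (2,[3]), (3,[])])
+--
+-- should map each of the nodes 1, 2 and 3 to the immediate dominator 0,
+-- since neither arm of the diamond dominates the join point.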
+idom :: Rooted -> [(Node,Node)]
+idom rg = runST (evalS idomM =<< initEnv (pruneReach rg))
+
+-- | /Immediate post-dominators/.
+-- Complexity as for @idom@.
+ipdom :: Rooted -> [(Node,Node)]
+ipdom rg = runST (evalS idomM =<< initEnv (pruneReach (second predG rg)))
+
+-----------------------------------------------------------------------------
+
+-- | /Post-dominated depth-first search/.
+pddfs :: Rooted -> [Node]
+pddfs = reverse . rpddfs
+
+-- | /Reverse post-dominated depth-first search/.
+rpddfs :: Rooted -> [Node]
+rpddfs = concat . levels . pdomTree
+
+-----------------------------------------------------------------------------
+
+type Dom s a = S s (Env s) a
+type NodeSet = IntSet
+type NodeMap a = IntMap a
+data Env s = Env
+ {succE :: !Graph
+ ,predE :: !Graph
+ ,bucketE :: !Graph
+ ,dfsE :: {-# UNPACK #-}!Int
+ ,zeroE :: {-# UNPACK #-}!Node
+ ,rootE :: {-# UNPACK #-}!Node
+ ,labelE :: {-# UNPACK #-}!(Arr s Node)
+ ,parentE :: {-# UNPACK #-}!(Arr s Node)
+ ,ancestorE :: {-# UNPACK #-}!(Arr s Node)
+ ,childE :: {-# UNPACK #-}!(Arr s Node)
+ ,ndfsE :: {-# UNPACK #-}!(Arr s Node)
+ ,dfnE :: {-# UNPACK #-}!(Arr s Int)
+ ,sdnoE :: {-# UNPACK #-}!(Arr s Int)
+ ,sizeE :: {-# UNPACK #-}!(Arr s Int)
+ ,domE :: {-# UNPACK #-}!(Arr s Node)
+ ,rnE :: {-# UNPACK #-}!(Arr s Node)}
+
+-----------------------------------------------------------------------------
+
+idomM :: Dom s [(Node,Node)]
+idomM = do
+ dfsDom =<< rootM
+ n <- gets dfsE
+ forM_ [n,n-1..1] (\i-> do
+ w <- ndfsM i
+ sw <- sdnoM w
+ ps <- predsM w
+ forM_ ps (\v-> do
+ u <- eval v
+ su <- sdnoM u
+ when (su < sw)
+ (store sdnoE w su))
+ z <- ndfsM =<< sdnoM w
+ modify(\e->e{bucketE=IM.adjust
+ (w`IS.insert`)
+ z (bucketE e)})
+ pw <- parentM w
+ link pw w
+ bps <- bucketM pw
+ forM_ bps (\v-> do
+ u <- eval v
+ su <- sdnoM u
+ sv <- sdnoM v
+ let dv = case su < sv of
+ True-> u
+ False-> pw
+ store domE v dv))
+ forM_ [1..n] (\i-> do
+ w <- ndfsM i
+ j <- sdnoM w
+ z <- ndfsM j
+ dw <- domM w
+ when (dw /= z)
+ (do ddw <- domM dw
+ store domE w ddw))
+ fromEnv
+
+-----------------------------------------------------------------------------
+
+eval :: Node -> Dom s Node
+eval v = do
+ n0 <- zeroM
+ a <- ancestorM v
+ case a==n0 of
+ True-> labelM v
+ False-> do
+ compress v
+ a <- ancestorM v
+ l <- labelM v
+ la <- labelM a
+ sl <- sdnoM l
+ sla <- sdnoM la
+ case sl <= sla of
+ True-> return l
+ False-> return la
+
+compress :: Node -> Dom s ()
+compress v = do
+ n0 <- zeroM
+ a <- ancestorM v
+ aa <- ancestorM a
+ when (aa /= n0) (do
+ compress a
+ a <- ancestorM v
+ aa <- ancestorM a
+ l <- labelM v
+ la <- labelM a
+ sl <- sdnoM l
+ sla <- sdnoM la
+ when (sla < sl)
+ (store labelE v la)
+ store ancestorE v aa)
+
+-----------------------------------------------------------------------------
+
+link :: Node -> Node -> Dom s ()
+link v w = do
+ n0 <- zeroM
+ lw <- labelM w
+ slw <- sdnoM lw
+ let balance s = do
+ c <- childM s
+ lc <- labelM c
+ slc <- sdnoM lc
+ case slw < slc of
+ False-> return s
+ True-> do
+ zs <- sizeM s
+ zc <- sizeM c
+ cc <- childM c
+ zcc <- sizeM cc
+ case 2*zc <= zs+zcc of
+ True-> do
+ store ancestorE c s
+ store childE s cc
+ balance s
+ False-> do
+ store sizeE c zs
+ store ancestorE s c
+ balance c
+ s <- balance w
+ lw <- labelM w
+ zw <- sizeM w
+ store labelE s lw
+ store sizeE v . (+zw) =<< sizeM v
+ let follow s = do
+ when (s /= n0) (do
+ store ancestorE s v
+ follow =<< childM s)
+ zv <- sizeM v
+ follow =<< case zv < 2*zw of
+ False-> return s
+ True-> do
+ cv <- childM v
+ store childE v s
+ return cv
+
+-----------------------------------------------------------------------------
+
+dfsDom :: Node -> Dom s ()
+dfsDom i = do
+ _ <- go i
+ n0 <- zeroM
+ r <- rootM
+ store parentE r n0
+ where go i = do
+ n <- nextM
+ store dfnE i n
+ store sdnoE i n
+ store ndfsE n i
+ store labelE i i
+ ss <- succsM i
+ forM_ ss (\j-> do
+ s <- sdnoM j
+ case s==0 of
+ False-> return()
+ True-> do
+ store parentE j i
+ go j)
+
+-----------------------------------------------------------------------------
+
+initEnv :: Rooted -> ST s (Env s)
+initEnv (r0,g0) = do
+ let (g,rnmap) = renum 1 g0
+ pred = predG g
+ r = rnmap IM.! r0
+ n = IM.size g
+ ns = [0..n]
+ m = n+1
+
+ let bucket = IM.fromList
+ (zip ns (repeat mempty))
+
+ rna <- newI m
+ writes rna (fmap swap
+ (IM.toList rnmap))
+
+ doms <- newI m
+ sdno <- newI m
+ size <- newI m
+ parent <- newI m
+ ancestor <- newI m
+ child <- newI m
+ label <- newI m
+ ndfs <- newI m
+ dfn <- newI m
+
+ forM_ [0..n] (doms.=0)
+ forM_ [0..n] (sdno.=0)
+ forM_ [1..n] (size.=1)
+ forM_ [0..n] (ancestor.=0)
+ forM_ [0..n] (child.=0)
+
+ (doms.=r) r
+ (size.=0) 0
+ (label.=0) 0
+
+ return (Env
+ {rnE = rna
+ ,dfsE = 0
+ ,zeroE = 0
+ ,rootE = r
+ ,labelE = label
+ ,parentE = parent
+ ,ancestorE = ancestor
+ ,childE = child
+ ,ndfsE = ndfs
+ ,dfnE = dfn
+ ,sdnoE = sdno
+ ,sizeE = size
+ ,succE = g
+ ,predE = pred
+ ,bucketE = bucket
+ ,domE = doms})
+
+fromEnv :: Dom s [(Node,Node)]
+fromEnv = do
+ dom <- gets domE
+ rn <- gets rnE
+ -- r <- gets rootE
+ (_,n) <- st (getBounds dom)
+ forM [1..n] (\i-> do
+ j <- st(rn!:i)
+ d <- st(dom!:i)
+ k <- st(rn!:d)
+ return (j,k))
+
+-----------------------------------------------------------------------------
+
+zeroM :: Dom s Node
+zeroM = gets zeroE
+domM :: Node -> Dom s Node
+domM = fetch domE
+rootM :: Dom s Node
+rootM = gets rootE
+succsM :: Node -> Dom s [Node]
+succsM i = gets (IS.toList . (! i) . succE)
+predsM :: Node -> Dom s [Node]
+predsM i = gets (IS.toList . (! i) . predE)
+bucketM :: Node -> Dom s [Node]
+bucketM i = gets (IS.toList . (! i) . bucketE)
+sizeM :: Node -> Dom s Int
+sizeM = fetch sizeE
+sdnoM :: Node -> Dom s Int
+sdnoM = fetch sdnoE
+-- dfnM :: Node -> Dom s Int
+-- dfnM = fetch dfnE
+ndfsM :: Int -> Dom s Node
+ndfsM = fetch ndfsE
+childM :: Node -> Dom s Node
+childM = fetch childE
+ancestorM :: Node -> Dom s Node
+ancestorM = fetch ancestorE
+parentM :: Node -> Dom s Node
+parentM = fetch parentE
+labelM :: Node -> Dom s Node
+labelM = fetch labelE
+nextM :: Dom s Int
+nextM = do
+ n <- gets dfsE
+ let n' = n+1
+ modify(\e->e{dfsE=n'})
+ return n'
+
+-----------------------------------------------------------------------------
+
+type A = STUArray
+type Arr s a = A s Int a
+
+infixl 9 !:
+infixr 2 .=
+
+(.=) :: (MArray (A s) a (ST s))
+ => Arr s a -> a -> Int -> ST s ()
+(v .= x) i
+ | debugIsOn = writeArray v i x
+ | otherwise = unsafeWrite v i x
+
+(!:) :: (MArray (A s) a (ST s))
+ => A s Int a -> Int -> ST s a
+a !: i
+ | debugIsOn = do
+ o <- readArray a i
+ return $! o
+ | otherwise = do
+ o <- unsafeRead a i
+ return $! o
+
+new :: (MArray (A s) a (ST s))
+ => Int -> ST s (Arr s a)
+new n = unsafeNewArray_ (0,n-1)
+
+newI :: Int -> ST s (Arr s Int)
+newI = new
+
+-- newD :: Int -> ST s (Arr s Double)
+-- newD = new
+
+-- dump :: (MArray (A s) a (ST s)) => Arr s a -> ST s [a]
+-- dump a = do
+-- (m,n) <- getBounds a
+-- forM [m..n] (\i -> a!:i)
+
+writes :: (MArray (A s) a (ST s))
+ => Arr s a -> [(Int,a)] -> ST s ()
+writes a xs = forM_ xs (\(i,x) -> (a.=x) i)
+
+-- arr :: (MArray (A s) a (ST s)) => [a] -> ST s (Arr s a)
+-- arr xs = do
+-- let n = length xs
+-- a <- new n
+-- go a n 0 xs
+-- return a
+-- where go _ _ _ [] = return ()
+-- go a n i (x:xs)
+-- | i <= n = (a.=x) i >> go a n (i+1) xs
+-- | otherwise = return ()
+
+-----------------------------------------------------------------------------
+
+(!) :: Monoid a => IntMap a -> Int -> a
+(!) g n = maybe mempty id (IM.lookup n g)
+
+fromAdj :: [(Node, [Node])] -> Graph
+fromAdj = IM.fromList . fmap (second IS.fromList)
+
+fromEdges :: [Edge] -> Graph
+fromEdges = collectI IS.union fst (IS.singleton . snd)
+
+toAdj :: Graph -> [(Node, [Node])]
+toAdj = fmap (second IS.toList) . IM.toList
+
+toEdges :: Graph -> [Edge]
+toEdges = concatMap (uncurry (fmap . (,))) . toAdj
+
+predG :: Graph -> Graph
+predG g = IM.unionWith IS.union (go g) g0
+ where g0 = fmap (const mempty) g
+ f :: IntMap IntSet -> Int -> IntSet -> IntMap IntSet
+ f m i a = foldl' (\m p -> IM.insertWith mappend p
+ (IS.singleton i) m)
+ m
+ (IS.toList a)
+ go :: IntMap IntSet -> IntMap IntSet
+ go = flip IM.foldlWithKey' mempty f
+
+pruneReach :: Rooted -> Rooted
+pruneReach (r,g) = (r,g2)
+ where is = reachable
+ (maybe mempty id
+ . flip IM.lookup g) $ r
+ g2 = IM.fromList
+ . fmap (second (IS.filter (`IS.member`is)))
+ . filter ((`IS.member`is) . fst)
+ . IM.toList $ g
+
+tip :: Tree a -> (a, [Tree a])
+tip (Node a ts) = (a, ts)
+
+parents :: Tree a -> [(a, a)]
+parents (Node i xs) = p i xs
+ ++ concatMap parents xs
+ where p i = fmap (flip (,) i . rootLabel)
+
+ancestors :: Tree a -> [(a, [a])]
+ancestors = go []
+ where go acc (Node i xs)
+ = let acc' = i:acc
+ in p acc' xs ++ concatMap (go acc') xs
+ p is = fmap (flip (,) is . rootLabel)
+
+asGraph :: Tree Node -> Rooted
+asGraph t@(Node a _) = let g = go t in (a, fromAdj g)
+ where go (Node a ts) = let as = (fst . unzip . fmap tip) ts
+ in (a, as) : concatMap go ts
+
+asTree :: Rooted -> Tree Node
+asTree (r,g) = let go a = Node a (fmap go ((IS.toList . f) a))
+ f = (g !)
+ in go r
+
+reachable :: (Node -> NodeSet) -> (Node -> NodeSet)
+reachable f a = go (IS.singleton a) a
+ where go seen a = let s = f a
+ as = IS.toList (s `IS.difference` seen)
+ in foldl' go (s `IS.union` seen) as
+
+collectI :: (c -> c -> c)
+ -> (a -> Int) -> (a -> c) -> [a] -> IntMap c
+collectI (<>) f g
+ = foldl' (\m a -> IM.insertWith (<>)
+ (f a)
+ (g a) m) mempty
+
+-- collect :: (Ord b) => (c -> c -> c)
+-- -> (a -> b) -> (a -> c) -> [a] -> Map b c
+-- collect (<>) f g
+-- = foldl' (\m a -> SM.insertWith (<>)
+-- (f a)
+-- (g a) m) mempty
+
+-- (renamed, old -> new)
+renum :: Int -> Graph -> (Graph, NodeMap Node)
+renum from = (\(_,m,g)->(g,m))
+ . IM.foldlWithKey'
+ f (from,mempty,mempty)
+ where
+ f :: (Int, NodeMap Node, IntMap IntSet) -> Node -> IntSet
+ -> (Int, NodeMap Node, IntMap IntSet)
+ f (!n,!env,!new) i ss =
+ let (j,n2,env2) = go n env i
+ (n3,env3,ss2) = IS.fold
+ (\k (!n,!env,!new)->
+ case go n env k of
+ (l,n2,env2)-> (n2,env2,l `IS.insert` new))
+ (n2,env2,mempty) ss
+ new2 = IM.insertWith IS.union j ss2 new
+ in (n3,env3,new2)
+ go :: Int
+ -> NodeMap Node
+ -> Node
+ -> (Node,Int,NodeMap Node)
+ go !n !env i =
+ case IM.lookup i env of
+ Just j -> (j,n,env)
+ Nothing -> (n,n+1,IM.insert i n env)
+
+-----------------------------------------------------------------------------
+
+newtype S z s a = S {unS :: forall o. (a -> s -> ST z o) -> s -> ST z o}
+instance Functor (S z s) where
+ fmap f (S g) = S (\k -> g (k . f))
+instance Monad (S z s) where
+ return = pure
+ S g >>= f = S (\k -> g (\a -> unS (f a) k))
+instance Applicative (S z s) where
+ pure a = S (\k -> k a)
+ (<*>) = ap
+-- get :: S z s s
+-- get = S (\k s -> k s s)
+gets :: (s -> a) -> S z s a
+gets f = S (\k s -> k (f s) s)
+-- set :: s -> S z s ()
+-- set s = S (\k _ -> k () s)
+modify :: (s -> s) -> S z s ()
+modify f = S (\k -> k () . f)
+-- runS :: S z s a -> s -> ST z (a, s)
+-- runS (S g) = g (\a s -> return (a,s))
+evalS :: S z s a -> s -> ST z a
+evalS (S g) = g ((return .) . const)
+-- execS :: S z s a -> s -> ST z s
+-- execS (S g) = g ((return .) . flip const)
+st :: ST z a -> S z s a
+st m = S (\k s-> do
+ a <- m
+ k a s)
+store :: (MArray (A z) a (ST z))
+ => (s -> Arr z a) -> Int -> a -> S z s ()
+store f i x = do
+ a <- gets f
+ st ((a.=x) i)
+fetch :: (MArray (A z) a (ST z))
+ => (s -> Arr z a) -> Int -> S z s a
+fetch f i = do
+ a <- gets f
+ st (a!:i)
+
diff --git a/compiler/GHC/CmmToAsm/CPrim.hs b/compiler/GHC/CmmToAsm/CPrim.hs
new file mode 100644
index 0000000000..34c3a7ff6a
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/CPrim.hs
@@ -0,0 +1,133 @@
+-- | Generating C symbol names emitted by the compiler.
+module GHC.CmmToAsm.CPrim
+ ( atomicReadLabel
+ , atomicWriteLabel
+ , atomicRMWLabel
+ , cmpxchgLabel
+ , popCntLabel
+ , pdepLabel
+ , pextLabel
+ , bSwapLabel
+ , bRevLabel
+ , clzLabel
+ , ctzLabel
+ , word2FloatLabel
+ ) where
+
+import GhcPrelude
+
+import GHC.Cmm.Type
+import GHC.Cmm.MachOp
+import Outputable
+
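+-- Each helper simply appends the operation width to a fixed prefix; as a
+-- quick orientation, popCntLabel W32 produces "hs_popcnt32" and
+-- atomicRMWLabel W64 AMO_Add produces "hs_atomic_add64".
+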
+popCntLabel :: Width -> String
+popCntLabel w = "hs_popcnt" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "popCntLabel: Unsupported word width " (ppr w)
+
+pdepLabel :: Width -> String
+pdepLabel w = "hs_pdep" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "pdepLabel: Unsupported word width " (ppr w)
+
+pextLabel :: Width -> String
+pextLabel w = "hs_pext" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "pextLabel: Unsupported word width " (ppr w)
+
+bSwapLabel :: Width -> String
+bSwapLabel w = "hs_bswap" ++ pprWidth w
+ where
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "bSwapLabel: Unsupported word width " (ppr w)
+
+bRevLabel :: Width -> String
+bRevLabel w = "hs_bitrev" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "bRevLabel: Unsupported word width " (ppr w)
+
+clzLabel :: Width -> String
+clzLabel w = "hs_clz" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "clzLabel: Unsupported word width " (ppr w)
+
+ctzLabel :: Width -> String
+ctzLabel w = "hs_ctz" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "ctzLabel: Unsupported word width " (ppr w)
+
+word2FloatLabel :: Width -> String
+word2FloatLabel w = "hs_word2float" ++ pprWidth w
+ where
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "word2FloatLabel: Unsupported word width " (ppr w)
+
+atomicRMWLabel :: Width -> AtomicMachOp -> String
+atomicRMWLabel w amop = "hs_atomic_" ++ pprFunName amop ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "atomicRMWLabel: Unsupported word width " (ppr w)
+
+ pprFunName AMO_Add = "add"
+ pprFunName AMO_Sub = "sub"
+ pprFunName AMO_And = "and"
+ pprFunName AMO_Nand = "nand"
+ pprFunName AMO_Or = "or"
+ pprFunName AMO_Xor = "xor"
+
+cmpxchgLabel :: Width -> String
+cmpxchgLabel w = "hs_cmpxchg" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "cmpxchgLabel: Unsupported word width " (ppr w)
+
+atomicReadLabel :: Width -> String
+atomicReadLabel w = "hs_atomicread" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "atomicReadLabel: Unsupported word width " (ppr w)
+
+atomicWriteLabel :: Width -> String
+atomicWriteLabel w = "hs_atomicwrite" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "atomicWriteLabel: Unsupported word width " (ppr w)
diff --git a/compiler/GHC/CmmToAsm/Dwarf.hs b/compiler/GHC/CmmToAsm/Dwarf.hs
new file mode 100644
index 0000000000..8cacd19023
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Dwarf.hs
@@ -0,0 +1,269 @@
+module GHC.CmmToAsm.Dwarf (
+ dwarfGen
+ ) where
+
+import GhcPrelude
+
+import GHC.Cmm.CLabel
+import GHC.Cmm.Expr ( GlobalReg(..) )
+import Config ( cProjectName, cProjectVersion )
+import CoreSyn ( Tickish(..) )
+import GHC.Cmm.DebugBlock
+import GHC.Driver.Session
+import Module
+import Outputable
+import GHC.Platform
+import Unique
+import UniqSupply
+
+import GHC.CmmToAsm.Dwarf.Constants
+import GHC.CmmToAsm.Dwarf.Types
+
+import Control.Arrow ( first )
+import Control.Monad ( mfilter )
+import Data.Maybe
+import Data.List ( sortBy )
+import Data.Ord ( comparing )
+import qualified Data.Map as Map
+import System.FilePath
+import System.Directory ( getCurrentDirectory )
+
+import qualified GHC.Cmm.Dataflow.Label as H
+import qualified GHC.Cmm.Dataflow.Collections as H
+
+-- | Generate DWARF/debug information
+dwarfGen :: DynFlags -> ModLocation -> UniqSupply -> [DebugBlock]
+ -> IO (SDoc, UniqSupply)
+dwarfGen _ _ us [] = return (empty, us)
+dwarfGen df modLoc us blocks = do
+
+ -- Convert debug data structures to DWARF info records
+ -- We strip out block information when running with -g0 or -g1.
+ let procs = debugSplitProcs blocks
+ stripBlocks dbg
+ | debugLevel df < 2 = dbg { dblBlocks = [] }
+ | otherwise = dbg
+ compPath <- getCurrentDirectory
+ let lowLabel = dblCLabel $ head procs
+ highLabel = mkAsmTempEndLabel $ dblCLabel $ last procs
+ dwarfUnit = DwarfCompileUnit
+ { dwChildren = map (procToDwarf df) (map stripBlocks procs)
+ , dwName = fromMaybe "" (ml_hs_file modLoc)
+ , dwCompDir = addTrailingPathSeparator compPath
+ , dwProducer = cProjectName ++ " " ++ cProjectVersion
+ , dwLowLabel = lowLabel
+ , dwHighLabel = highLabel
+ , dwLineLabel = dwarfLineLabel
+ }
+
+ -- Check whether we have any source code information, so we do not
+ -- end up writing a pointer to an empty .debug_line section
+ -- (dsymutil on Mac OS gets confused by this).
+ let haveSrcIn blk = isJust (dblSourceTick blk) && isJust (dblPosition blk)
+ || any haveSrcIn (dblBlocks blk)
+ haveSrc = any haveSrcIn procs
+
+ -- .debug_abbrev section: Declare the format we're using
+ let abbrevSct = pprAbbrevDecls haveSrc
+
+ -- .debug_info section: Information records on procedures and blocks
+ let -- unique to identify start and end compilation unit .debug_inf
+ (unitU, us') = takeUniqFromSupply us
+ infoSct = vcat [ ptext dwarfInfoLabel <> colon
+ , dwarfInfoSection
+ , compileUnitHeader unitU
+ , pprDwarfInfo haveSrc dwarfUnit
+ , compileUnitFooter unitU
+ ]
+
+ -- .debug_line section: Generated mainly by the assembler, but we
+ -- need to label it
+ let lineSct = dwarfLineSection $$
+ ptext dwarfLineLabel <> colon
+
+ -- .debug_frame section: Information about the layout of the GHC stack
+ let (framesU, us'') = takeUniqFromSupply us'
+ frameSct = dwarfFrameSection $$
+ ptext dwarfFrameLabel <> colon $$
+ pprDwarfFrame (debugFrame framesU procs)
+
+ -- .aranges section: Information about the bounds of compilation units
+ let aranges' | gopt Opt_SplitSections df = map mkDwarfARange procs
+ | otherwise = [DwarfARange lowLabel highLabel]
+ let aranges = dwarfARangesSection $$ pprDwarfARanges aranges' unitU
+
+ return (infoSct $$ abbrevSct $$ lineSct $$ frameSct $$ aranges, us'')
+
+-- | Build an address range entry for one proc.
+-- With split sections, each proc needs its own entry, since they may get
+-- scattered in the final binary. Without split sections, we could make a
+-- single arange based on the first/last proc.
+mkDwarfARange :: DebugBlock -> DwarfARange
+mkDwarfARange proc = DwarfARange start end
+ where
+ start = dblCLabel proc
+ end = mkAsmTempEndLabel start
+
+-- | Header for a compilation unit, establishing global format
+-- parameters
+compileUnitHeader :: Unique -> SDoc
+compileUnitHeader unitU = sdocWithPlatform $ \plat ->
+ let cuLabel = mkAsmTempLabel unitU -- sits right before initialLength field
+ length = ppr (mkAsmTempEndLabel cuLabel) <> char '-' <> ppr cuLabel
+ <> text "-4" -- length of initialLength field
+ in vcat [ ppr cuLabel <> colon
+ , text "\t.long " <> length -- compilation unit size
+ , pprHalf 3 -- DWARF version
+ , sectionOffset (ptext dwarfAbbrevLabel) (ptext dwarfAbbrevLabel)
+ -- abbrevs offset
+ , text "\t.byte " <> ppr (platformWordSizeInBytes plat) -- word size
+ ]
+
+-- | Compilation unit footer, mainly establishing size of debug sections
+compileUnitFooter :: Unique -> SDoc
+compileUnitFooter unitU =
+ let cuEndLabel = mkAsmTempEndLabel $ mkAsmTempLabel unitU
+ in ppr cuEndLabel <> colon
+
+-- | Splits the blocks by procedures. In the result all nested blocks
+-- will come from the same procedure as the top-level block. See
+-- Note [Splitting DebugBlocks] for details.
+debugSplitProcs :: [DebugBlock] -> [DebugBlock]
+debugSplitProcs b = concat $ H.mapElems $ mergeMaps $ map (split Nothing) b
+ where mergeMaps = foldr (H.mapUnionWithKey (const (++))) H.mapEmpty
+ split :: Maybe DebugBlock -> DebugBlock -> H.LabelMap [DebugBlock]
+ split parent blk = H.mapInsert prc [blk'] nested
+ where prc = dblProcedure blk
+ blk' = blk { dblBlocks = own_blks
+ , dblParent = parent
+ }
+ own_blks = fromMaybe [] $ H.mapLookup prc nested
+ nested = mergeMaps $ map (split parent') $ dblBlocks blk
+ -- Figure out who should be the parent of nested blocks.
+ -- If @blk@ is optimized out then it isn't a good choice
+ -- and we just use its parent.
+ parent'
+ | Nothing <- dblPosition blk = parent
+ | otherwise = Just blk
+
+{-
+Note [Splitting DebugBlocks]
+
+DWARF requires that we break up the nested DebugBlocks produced from
+the C-- AST. For instance, we begin with tick trees containing nested procs.
+For example,
+
+ proc A [tick1, tick2]
+ block B [tick3]
+ proc C [tick4]
+
+when producing DWARF we need the procs (which are represented in DWARF as
+TAG_subprogram DIEs) to be top-level DIEs. debugSplitProcs is responsible for
+this transform, pulling out the nested procs into top-level procs.
+
+However, in doing this we need to be careful to preserve the parentage of the
+nested procs. This is the reason DebugBlocks carry the dblParent field, allowing
+us to reorganize the above tree as,
+
+ proc A [tick1, tick2]
+ block B [tick3]
+ proc C [tick4] parent=B
+
+Here we have annotated the new proc C with an attribute giving its original
+parent, B.
+-}
+
+-- | Generate DWARF info for a procedure debug block
+procToDwarf :: DynFlags -> DebugBlock -> DwarfInfo
+procToDwarf df prc
+ = DwarfSubprogram { dwChildren = map (blockToDwarf df) (dblBlocks prc)
+ , dwName = case dblSourceTick prc of
+ Just s@SourceNote{} -> sourceName s
+ _otherwise -> showSDocDump df $ ppr $ dblLabel prc
+ , dwLabel = dblCLabel prc
+ , dwParent = fmap mkAsmTempDieLabel
+ $ mfilter goodParent
+ $ fmap dblCLabel (dblParent prc)
+ }
+ where
+ goodParent a | a == dblCLabel prc = False
+ -- Omit parent if it would be self-referential
+ goodParent a | not (externallyVisibleCLabel a)
+ , debugLevel df < 2 = False
+ -- We strip block information when running -g0 or -g1, don't
+ -- refer to blocks in that case. Fixes #14894.
+ goodParent _ = True
+
+-- | Generate DWARF info for a block
+blockToDwarf :: DynFlags -> DebugBlock -> DwarfInfo
+blockToDwarf df blk
+ = DwarfBlock { dwChildren = concatMap (tickToDwarf df) (dblTicks blk)
+ ++ map (blockToDwarf df) (dblBlocks blk)
+ , dwLabel = dblCLabel blk
+ , dwMarker = marker
+ }
+ where
+ marker
+ | Just _ <- dblPosition blk = Just $ mkAsmTempLabel $ dblLabel blk
+ | otherwise = Nothing -- block was optimized out
+
+tickToDwarf :: DynFlags -> Tickish () -> [DwarfInfo]
+tickToDwarf _ (SourceNote ss _) = [DwarfSrcNote ss]
+tickToDwarf _ _ = []
+
+-- | Generates the data for the debug frame section, which encodes the
+-- desired stack unwind behaviour for the debugger
+debugFrame :: Unique -> [DebugBlock] -> DwarfFrame
+debugFrame u procs
+ = DwarfFrame { dwCieLabel = mkAsmTempLabel u
+ , dwCieInit = initUws
+ , dwCieProcs = map (procToFrame initUws) procs
+ }
+ where
+ initUws :: UnwindTable
+ initUws = Map.fromList [(Sp, Just (UwReg Sp 0))]
+
+-- | Generates unwind information for a procedure debug block
+procToFrame :: UnwindTable -> DebugBlock -> DwarfFrameProc
+procToFrame initUws blk
+ = DwarfFrameProc { dwFdeProc = dblCLabel blk
+ , dwFdeHasInfo = dblHasInfoTbl blk
+ , dwFdeBlocks = map (uncurry blockToFrame)
+ (setHasInfo blockUws)
+ }
+ where blockUws :: [(DebugBlock, [UnwindPoint])]
+ blockUws = map snd $ sortBy (comparing fst) $ flatten blk
+
+ flatten :: DebugBlock
+ -> [(Int, (DebugBlock, [UnwindPoint]))]
+ flatten b@DebugBlock{ dblPosition=pos, dblUnwind=uws, dblBlocks=blocks }
+ | Just p <- pos = (p, (b, uws')):nested
+ | otherwise = nested -- block was optimized out
+ where uws' = addDefaultUnwindings initUws uws
+ nested = concatMap flatten blocks
+
+ -- | If the current procedure has an info table, then we also say that
+ -- its first block has one to ensure that it gets the necessary -1
+ -- offset applied to its start address.
+ -- See Note [Info Offset] in Dwarf.Types.
+ setHasInfo :: [(DebugBlock, [UnwindPoint])]
+ -> [(DebugBlock, [UnwindPoint])]
+ setHasInfo [] = []
+ setHasInfo (c0:cs) = first setIt c0 : cs
+ where
+ setIt child =
+ child { dblHasInfoTbl = dblHasInfoTbl child
+ || dblHasInfoTbl blk }
+
+blockToFrame :: DebugBlock -> [UnwindPoint] -> DwarfFrameBlock
+blockToFrame blk uws
+ = DwarfFrameBlock { dwFdeBlkHasInfo = dblHasInfoTbl blk
+ , dwFdeUnwind = uws
+ }
+
+addDefaultUnwindings :: UnwindTable -> [UnwindPoint] -> [UnwindPoint]
+addDefaultUnwindings tbl pts =
+ [ UnwindPoint lbl (tbl' `mappend` tbl)
+ -- mappend is left-biased
+ | UnwindPoint lbl tbl' <- pts
+ ]
diff --git a/compiler/GHC/CmmToAsm/Dwarf/Constants.hs b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
new file mode 100644
index 0000000000..4ab54b6629
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
@@ -0,0 +1,229 @@
+-- | Constants describing the DWARF format. Most of this simply
+-- mirrors /usr/include/dwarf.h.
+
+module GHC.CmmToAsm.Dwarf.Constants where
+
+import GhcPrelude
+
+import AsmUtils
+import FastString
+import GHC.Platform
+import Outputable
+
+import GHC.Platform.Reg
+import GHC.CmmToAsm.X86.Regs
+
+import Data.Word
+
+-- | Language ID used for Haskell.
+dW_LANG_Haskell :: Word
+dW_LANG_Haskell = 0x18
+ -- Thanks to Nathan Howell for getting us our very own language ID!
+
+-- * Dwarf tags
+dW_TAG_compile_unit, dW_TAG_subroutine_type,
+ dW_TAG_file_type, dW_TAG_subprogram, dW_TAG_lexical_block,
+ dW_TAG_base_type, dW_TAG_structure_type, dW_TAG_pointer_type,
+ dW_TAG_array_type, dW_TAG_subrange_type, dW_TAG_typedef,
+ dW_TAG_variable, dW_TAG_arg_variable, dW_TAG_auto_variable,
+ dW_TAG_ghc_src_note :: Word
+dW_TAG_array_type = 1
+dW_TAG_lexical_block = 11
+dW_TAG_pointer_type = 15
+dW_TAG_compile_unit = 17
+dW_TAG_structure_type = 19
+dW_TAG_typedef = 22
+dW_TAG_subroutine_type = 32
+dW_TAG_subrange_type = 33
+dW_TAG_base_type = 36
+dW_TAG_file_type = 41
+dW_TAG_subprogram = 46
+dW_TAG_variable = 52
+dW_TAG_auto_variable = 256
+dW_TAG_arg_variable = 257
+
+dW_TAG_ghc_src_note = 0x5b00
+
+-- * Dwarf attributes
+dW_AT_name, dW_AT_stmt_list, dW_AT_low_pc, dW_AT_high_pc, dW_AT_language,
+ dW_AT_comp_dir, dW_AT_producer, dW_AT_external, dW_AT_frame_base,
+ dW_AT_use_UTF8, dW_AT_MIPS_linkage_name :: Word
+dW_AT_name = 0x03
+dW_AT_stmt_list = 0x10
+dW_AT_low_pc = 0x11
+dW_AT_high_pc = 0x12
+dW_AT_language = 0x13
+dW_AT_comp_dir = 0x1b
+dW_AT_producer = 0x25
+dW_AT_external = 0x3f
+dW_AT_frame_base = 0x40
+dW_AT_use_UTF8 = 0x53
+dW_AT_MIPS_linkage_name = 0x2007
+
+-- * Custom DWARF attributes
+-- Chosen from a more or less random section of the vendor-extensible region
+
+-- ** Describing C-- blocks
+-- These appear in DW_TAG_lexical_scope DIEs corresponding to C-- blocks
+dW_AT_ghc_tick_parent :: Word
+dW_AT_ghc_tick_parent = 0x2b20
+
+-- ** Describing source notes
+-- These appear in DW_TAG_ghc_src_note DIEs
+dW_AT_ghc_span_file, dW_AT_ghc_span_start_line,
+ dW_AT_ghc_span_start_col, dW_AT_ghc_span_end_line,
+ dW_AT_ghc_span_end_col :: Word
+dW_AT_ghc_span_file = 0x2b00
+dW_AT_ghc_span_start_line = 0x2b01
+dW_AT_ghc_span_start_col = 0x2b02
+dW_AT_ghc_span_end_line = 0x2b03
+dW_AT_ghc_span_end_col = 0x2b04
+
+
+-- * Abbrev declarations
+dW_CHILDREN_no, dW_CHILDREN_yes :: Word8
+dW_CHILDREN_no = 0
+dW_CHILDREN_yes = 1
+
+dW_FORM_addr, dW_FORM_data2, dW_FORM_data4, dW_FORM_string, dW_FORM_flag,
+ dW_FORM_block1, dW_FORM_ref4, dW_FORM_ref_addr, dW_FORM_flag_present :: Word
+dW_FORM_addr = 0x01
+dW_FORM_data2 = 0x05
+dW_FORM_data4 = 0x06
+dW_FORM_string = 0x08
+dW_FORM_flag = 0x0c
+dW_FORM_block1 = 0x0a
+dW_FORM_ref_addr = 0x10
+dW_FORM_ref4 = 0x13
+dW_FORM_flag_present = 0x19
+
+-- * Dwarf native types
+dW_ATE_address, dW_ATE_boolean, dW_ATE_float, dW_ATE_signed,
+ dW_ATE_signed_char, dW_ATE_unsigned, dW_ATE_unsigned_char :: Word
+dW_ATE_address = 1
+dW_ATE_boolean = 2
+dW_ATE_float = 4
+dW_ATE_signed = 5
+dW_ATE_signed_char = 6
+dW_ATE_unsigned = 7
+dW_ATE_unsigned_char = 8
+
+-- * Call frame information
+dW_CFA_set_loc, dW_CFA_undefined, dW_CFA_same_value,
+ dW_CFA_def_cfa, dW_CFA_def_cfa_offset, dW_CFA_def_cfa_expression,
+ dW_CFA_expression, dW_CFA_offset_extended_sf, dW_CFA_def_cfa_offset_sf,
+ dW_CFA_def_cfa_sf, dW_CFA_val_offset, dW_CFA_val_expression,
+ dW_CFA_offset :: Word8
+dW_CFA_set_loc = 0x01
+dW_CFA_undefined = 0x07
+dW_CFA_same_value = 0x08
+dW_CFA_def_cfa = 0x0c
+dW_CFA_def_cfa_offset = 0x0e
+dW_CFA_def_cfa_expression = 0x0f
+dW_CFA_expression = 0x10
+dW_CFA_offset_extended_sf = 0x11
+dW_CFA_def_cfa_sf = 0x12
+dW_CFA_def_cfa_offset_sf = 0x13
+dW_CFA_val_offset = 0x14
+dW_CFA_val_expression = 0x16
+dW_CFA_offset = 0x80
+
+-- * Operations
+dW_OP_addr, dW_OP_deref, dW_OP_consts,
+ dW_OP_minus, dW_OP_mul, dW_OP_plus,
+ dW_OP_lit0, dW_OP_breg0, dW_OP_call_frame_cfa :: Word8
+dW_OP_addr = 0x03
+dW_OP_deref = 0x06
+dW_OP_consts = 0x11
+dW_OP_minus = 0x1c
+dW_OP_mul = 0x1e
+dW_OP_plus = 0x22
+dW_OP_lit0 = 0x30
+dW_OP_breg0 = 0x70
+dW_OP_call_frame_cfa = 0x9c
+
+-- * Dwarf section declarations
+dwarfInfoSection, dwarfAbbrevSection, dwarfLineSection,
+ dwarfFrameSection, dwarfGhcSection, dwarfARangesSection :: SDoc
+dwarfInfoSection = dwarfSection "info"
+dwarfAbbrevSection = dwarfSection "abbrev"
+dwarfLineSection = dwarfSection "line"
+dwarfFrameSection = dwarfSection "frame"
+dwarfGhcSection = dwarfSection "ghc"
+dwarfARangesSection = dwarfSection "aranges"
+
+dwarfSection :: String -> SDoc
+dwarfSection name = sdocWithPlatform $ \plat ->
+ case platformOS plat of
+ os | osElfTarget os
+ -> text "\t.section .debug_" <> text name <> text ",\"\","
+ <> sectionType "progbits"
+ | osMachOTarget os
+ -> text "\t.section __DWARF,__debug_" <> text name <> text ",regular,debug"
+ | otherwise
+ -> text "\t.section .debug_" <> text name <> text ",\"dr\""
+
+-- * Dwarf section labels
+dwarfInfoLabel, dwarfAbbrevLabel, dwarfLineLabel, dwarfFrameLabel :: PtrString
+dwarfInfoLabel = sLit ".Lsection_info"
+dwarfAbbrevLabel = sLit ".Lsection_abbrev"
+dwarfLineLabel = sLit ".Lsection_line"
+dwarfFrameLabel = sLit ".Lsection_frame"
+
+-- | Mapping of registers to DWARF register numbers
+dwarfRegNo :: Platform -> Reg -> Word8
+dwarfRegNo p r = case platformArch p of
+ ArchX86
+ | r == eax -> 0
+ | r == ecx -> 1 -- yes, no typo
+ | r == edx -> 2
+ | r == ebx -> 3
+ | r == esp -> 4
+ | r == ebp -> 5
+ | r == esi -> 6
+ | r == edi -> 7
+ ArchX86_64
+ | r == rax -> 0
+ | r == rdx -> 1 -- this neither. The order GCC allocates registers in?
+ | r == rcx -> 2
+ | r == rbx -> 3
+ | r == rsi -> 4
+ | r == rdi -> 5
+ | r == rbp -> 6
+ | r == rsp -> 7
+ | r == r8 -> 8
+ | r == r9 -> 9
+ | r == r10 -> 10
+ | r == r11 -> 11
+ | r == r12 -> 12
+ | r == r13 -> 13
+ | r == r14 -> 14
+ | r == r15 -> 15
+ | r == xmm0 -> 17
+ | r == xmm1 -> 18
+ | r == xmm2 -> 19
+ | r == xmm3 -> 20
+ | r == xmm4 -> 21
+ | r == xmm5 -> 22
+ | r == xmm6 -> 23
+ | r == xmm7 -> 24
+ | r == xmm8 -> 25
+ | r == xmm9 -> 26
+ | r == xmm10 -> 27
+ | r == xmm11 -> 28
+ | r == xmm12 -> 29
+ | r == xmm13 -> 30
+ | r == xmm14 -> 31
+ | r == xmm15 -> 32
+ _other -> error "dwarfRegNo: Unsupported platform or unknown register!"
+
+-- | Virtual register number to use for return address.
+dwarfReturnRegNo :: Platform -> Word8
+dwarfReturnRegNo p
+ -- We "overwrite" IP with our pseudo register - that makes sense, as
+ -- when using this mechanism gdb already knows the IP anyway. Clang
+ -- does this too, so it must be safe.
+ = case platformArch p of
+ ArchX86 -> 8 -- eip
+ ArchX86_64 -> 16 -- rip
+ _other -> error "dwarfReturnRegNo: Unsupported platform!"
diff --git a/compiler/GHC/CmmToAsm/Dwarf/Types.hs b/compiler/GHC/CmmToAsm/Dwarf/Types.hs
new file mode 100644
index 0000000000..5eda37a653
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Dwarf/Types.hs
@@ -0,0 +1,612 @@
+module GHC.CmmToAsm.Dwarf.Types
+ ( -- * Dwarf information
+ DwarfInfo(..)
+ , pprDwarfInfo
+ , pprAbbrevDecls
+ -- * Dwarf address range table
+ , DwarfARange(..)
+ , pprDwarfARanges
+ -- * Dwarf frame
+ , DwarfFrame(..), DwarfFrameProc(..), DwarfFrameBlock(..)
+ , pprDwarfFrame
+ -- * Utilities
+ , pprByte
+ , pprHalf
+ , pprData4'
+ , pprDwWord
+ , pprWord
+ , pprLEBWord
+ , pprLEBInt
+ , wordAlign
+ , sectionOffset
+ )
+ where
+
+import GhcPrelude
+
+import GHC.Cmm.DebugBlock
+import GHC.Cmm.CLabel
+import GHC.Cmm.Expr ( GlobalReg(..) )
+import Encoding
+import FastString
+import Outputable
+import GHC.Platform
+import Unique
+import GHC.Platform.Reg
+import SrcLoc
+import Util
+
+import GHC.CmmToAsm.Dwarf.Constants
+
+import qualified Data.ByteString as BS
+import qualified Control.Monad.Trans.State.Strict as S
+import Control.Monad (zipWithM, join)
+import Data.Bits
+import qualified Data.Map as Map
+import Data.Word
+import Data.Char
+
+import GHC.Platform.Regs
+
+-- | Individual dwarf records. Each one will be encoded as an entry in
+-- the @.debug_info@ section.
+data DwarfInfo
+ = DwarfCompileUnit { dwChildren :: [DwarfInfo]
+ , dwName :: String
+ , dwProducer :: String
+ , dwCompDir :: String
+ , dwLowLabel :: CLabel
+ , dwHighLabel :: CLabel
+ , dwLineLabel :: PtrString }
+ | DwarfSubprogram { dwChildren :: [DwarfInfo]
+ , dwName :: String
+ , dwLabel :: CLabel
+ , dwParent :: Maybe CLabel
+ -- ^ label of DIE belonging to the parent tick
+ }
+ | DwarfBlock { dwChildren :: [DwarfInfo]
+ , dwLabel :: CLabel
+ , dwMarker :: Maybe CLabel
+ }
+ | DwarfSrcNote { dwSrcSpan :: RealSrcSpan
+ }
+
+-- | Abbreviation codes used for encoding above records in the
+-- @.debug_info@ section.
+data DwarfAbbrev
+ = DwAbbrNull -- ^ Pseudo, used for marking the end of lists
+ | DwAbbrCompileUnit
+ | DwAbbrSubprogram
+ | DwAbbrSubprogramWithParent
+ | DwAbbrBlockWithoutCode
+ | DwAbbrBlock
+ | DwAbbrGhcSrcNote
+ deriving (Eq, Enum)
+
+-- | Generate assembly for the given abbreviation code
+pprAbbrev :: DwarfAbbrev -> SDoc
+pprAbbrev = pprLEBWord . fromIntegral . fromEnum
+
+-- | Abbreviation declaration. This explains the binary encoding we
+-- use for representing 'DwarfInfo'. Be aware that this must be updated
+-- along with 'pprDwarfInfo'.
+pprAbbrevDecls :: Bool -> SDoc
+pprAbbrevDecls haveDebugLine =
+ let mkAbbrev abbr tag chld flds =
+ let fld (tag, form) = pprLEBWord tag $$ pprLEBWord form
+ in pprAbbrev abbr $$ pprLEBWord tag $$ pprByte chld $$
+ vcat (map fld flds) $$ pprByte 0 $$ pprByte 0
+ -- These are shared between DwAbbrSubprogram and
+ -- DwAbbrSubprogramWithParent
+ subprogramAttrs =
+ [ (dW_AT_name, dW_FORM_string)
+ , (dW_AT_MIPS_linkage_name, dW_FORM_string)
+ , (dW_AT_external, dW_FORM_flag)
+ , (dW_AT_low_pc, dW_FORM_addr)
+ , (dW_AT_high_pc, dW_FORM_addr)
+ , (dW_AT_frame_base, dW_FORM_block1)
+ ]
+ in dwarfAbbrevSection $$
+ ptext dwarfAbbrevLabel <> colon $$
+ mkAbbrev DwAbbrCompileUnit dW_TAG_compile_unit dW_CHILDREN_yes
+ ([(dW_AT_name, dW_FORM_string)
+ , (dW_AT_producer, dW_FORM_string)
+ , (dW_AT_language, dW_FORM_data4)
+ , (dW_AT_comp_dir, dW_FORM_string)
+ , (dW_AT_use_UTF8, dW_FORM_flag_present) -- not represented in body
+ , (dW_AT_low_pc, dW_FORM_addr)
+ , (dW_AT_high_pc, dW_FORM_addr)
+ ] ++
+ (if haveDebugLine
+ then [ (dW_AT_stmt_list, dW_FORM_data4) ]
+ else [])) $$
+ mkAbbrev DwAbbrSubprogram dW_TAG_subprogram dW_CHILDREN_yes
+ subprogramAttrs $$
+ mkAbbrev DwAbbrSubprogramWithParent dW_TAG_subprogram dW_CHILDREN_yes
+ (subprogramAttrs ++ [(dW_AT_ghc_tick_parent, dW_FORM_ref_addr)]) $$
+ mkAbbrev DwAbbrBlockWithoutCode dW_TAG_lexical_block dW_CHILDREN_yes
+ [ (dW_AT_name, dW_FORM_string)
+ ] $$
+ mkAbbrev DwAbbrBlock dW_TAG_lexical_block dW_CHILDREN_yes
+ [ (dW_AT_name, dW_FORM_string)
+ , (dW_AT_low_pc, dW_FORM_addr)
+ , (dW_AT_high_pc, dW_FORM_addr)
+ ] $$
+ mkAbbrev DwAbbrGhcSrcNote dW_TAG_ghc_src_note dW_CHILDREN_no
+ [ (dW_AT_ghc_span_file, dW_FORM_string)
+ , (dW_AT_ghc_span_start_line, dW_FORM_data4)
+ , (dW_AT_ghc_span_start_col, dW_FORM_data2)
+ , (dW_AT_ghc_span_end_line, dW_FORM_data4)
+ , (dW_AT_ghc_span_end_col, dW_FORM_data2)
+ ] $$
+ pprByte 0
+
+-- | Generate assembly for DWARF data
+pprDwarfInfo :: Bool -> DwarfInfo -> SDoc
+pprDwarfInfo haveSrc d
+ = case d of
+ DwarfCompileUnit {} -> hasChildren
+ DwarfSubprogram {} -> hasChildren
+ DwarfBlock {} -> hasChildren
+ DwarfSrcNote {} -> noChildren
+ where
+ hasChildren =
+ pprDwarfInfoOpen haveSrc d $$
+ vcat (map (pprDwarfInfo haveSrc) (dwChildren d)) $$
+ pprDwarfInfoClose
+ noChildren = pprDwarfInfoOpen haveSrc d
+
+-- | Prints assembler data corresponding to DWARF info records. Note
+-- that the binary format of this is parameterized in @abbrevDecls@ and
+-- has to be kept in synch.
+pprDwarfInfoOpen :: Bool -> DwarfInfo -> SDoc
+pprDwarfInfoOpen haveSrc (DwarfCompileUnit _ name producer compDir lowLabel
+ highLabel lineLbl) =
+ pprAbbrev DwAbbrCompileUnit
+ $$ pprString name
+ $$ pprString producer
+ $$ pprData4 dW_LANG_Haskell
+ $$ pprString compDir
+ $$ pprWord (ppr lowLabel)
+ $$ pprWord (ppr highLabel)
+ $$ if haveSrc
+ then sectionOffset (ptext lineLbl) (ptext dwarfLineLabel)
+ else empty
+pprDwarfInfoOpen _ (DwarfSubprogram _ name label
+ parent) = sdocWithDynFlags $ \df ->
+ ppr (mkAsmTempDieLabel label) <> colon
+ $$ pprAbbrev abbrev
+ $$ pprString name
+ $$ pprString (renderWithStyle (initSDocContext df (mkCodeStyle CStyle)) (ppr label))
+ $$ pprFlag (externallyVisibleCLabel label)
+ $$ pprWord (ppr label)
+ $$ pprWord (ppr $ mkAsmTempEndLabel label)
+ $$ pprByte 1
+ $$ pprByte dW_OP_call_frame_cfa
+ $$ parentValue
+ where
+ abbrev = case parent of Nothing -> DwAbbrSubprogram
+ Just _ -> DwAbbrSubprogramWithParent
+ parentValue = maybe empty pprParentDie parent
+ pprParentDie sym = sectionOffset (ppr sym) (ptext dwarfInfoLabel)
+pprDwarfInfoOpen _ (DwarfBlock _ label Nothing) = sdocWithDynFlags $ \df ->
+ ppr (mkAsmTempDieLabel label) <> colon
+ $$ pprAbbrev DwAbbrBlockWithoutCode
+ $$ pprString (renderWithStyle (initSDocContext df (mkCodeStyle CStyle)) (ppr label))
+pprDwarfInfoOpen _ (DwarfBlock _ label (Just marker)) = sdocWithDynFlags $ \df ->
+ ppr (mkAsmTempDieLabel label) <> colon
+ $$ pprAbbrev DwAbbrBlock
+ $$ pprString (renderWithStyle (initSDocContext df (mkCodeStyle CStyle)) (ppr label))
+ $$ pprWord (ppr marker)
+ $$ pprWord (ppr $ mkAsmTempEndLabel marker)
+pprDwarfInfoOpen _ (DwarfSrcNote ss) =
+ pprAbbrev DwAbbrGhcSrcNote
+ $$ pprString' (ftext $ srcSpanFile ss)
+ $$ pprData4 (fromIntegral $ srcSpanStartLine ss)
+ $$ pprHalf (fromIntegral $ srcSpanStartCol ss)
+ $$ pprData4 (fromIntegral $ srcSpanEndLine ss)
+ $$ pprHalf (fromIntegral $ srcSpanEndCol ss)
+
+-- | Close a DWARF info record with children
+pprDwarfInfoClose :: SDoc
+pprDwarfInfoClose = pprAbbrev DwAbbrNull
+
+-- | A DWARF address range. This is used by the debugger to quickly locate
+-- which compilation unit a given address belongs to. This type assumes
+-- a non-segmented address-space.
+data DwarfARange
+ = DwarfARange
+ { dwArngStartLabel :: CLabel
+ , dwArngEndLabel :: CLabel
+ }
+
+-- | Print assembler directives corresponding to a DWARF @.debug_aranges@
+-- address table entry.
+pprDwarfARanges :: [DwarfARange] -> Unique -> SDoc
+pprDwarfARanges arngs unitU = sdocWithPlatform $ \plat ->
+ let wordSize = platformWordSizeInBytes plat
+ paddingSize = 4 :: Int
+ -- header is 12 bytes long.
+ -- entry is 8 bytes (32-bit platform) or 16 bytes (64-bit platform).
+ -- pad such that first entry begins at multiple of entry size.
+ pad n = vcat $ replicate n $ pprByte 0
+ -- Fix for #17428
+ initialLength = 8 + paddingSize + (1 + length arngs) * 2 * wordSize
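+ -- Worked example (illustrative): on a 64-bit target with 3 procs this is
+ -- 8 + 4 + (1 + 3) * 2 * 8 = 76 bytes, where the extra entry accounts for
+ -- the terminating zero pair emitted below.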
+ in pprDwWord (ppr initialLength)
+ $$ pprHalf 2
+ $$ sectionOffset (ppr $ mkAsmTempLabel $ unitU)
+ (ptext dwarfInfoLabel)
+ $$ pprByte (fromIntegral wordSize)
+ $$ pprByte 0
+ $$ pad paddingSize
+ -- body
+ $$ vcat (map pprDwarfARange arngs)
+ -- terminus
+ $$ pprWord (char '0')
+ $$ pprWord (char '0')
+
+pprDwarfARange :: DwarfARange -> SDoc
+pprDwarfARange arng = pprWord (ppr $ dwArngStartLabel arng) $$ pprWord length
+ where
+ length = ppr (dwArngEndLabel arng)
+ <> char '-' <> ppr (dwArngStartLabel arng)
+
+-- | Information about unwind instructions for a procedure. This
+-- corresponds to a "Common Information Entry" (CIE) in DWARF.
+data DwarfFrame
+ = DwarfFrame
+ { dwCieLabel :: CLabel
+ , dwCieInit :: UnwindTable
+ , dwCieProcs :: [DwarfFrameProc]
+ }
+
+-- | Unwind instructions for an individual procedure. Corresponds to a
+-- "Frame Description Entry" (FDE) in DWARF.
+data DwarfFrameProc
+ = DwarfFrameProc
+ { dwFdeProc :: CLabel
+ , dwFdeHasInfo :: Bool
+ , dwFdeBlocks :: [DwarfFrameBlock]
+ -- ^ List of blocks. Order must match asm!
+ }
+
+-- | Unwind instructions for a block. Will become part of the
+-- containing FDE.
+data DwarfFrameBlock
+ = DwarfFrameBlock
+ { dwFdeBlkHasInfo :: Bool
+ , dwFdeUnwind :: [UnwindPoint]
+ -- ^ these unwind points must occur in the same order as they occur
+ -- in the block
+ }
+
+instance Outputable DwarfFrameBlock where
+ ppr (DwarfFrameBlock hasInfo unwinds) = braces $ ppr hasInfo <+> ppr unwinds
+
+-- | Header for the @.debug_frame@ section. Here we emit the "Common
+-- Information Entry" record that establishes general call frame
+-- parameters and the default stack layout.
+pprDwarfFrame :: DwarfFrame -> SDoc
+pprDwarfFrame DwarfFrame{dwCieLabel=cieLabel,dwCieInit=cieInit,dwCieProcs=procs}
+ = sdocWithPlatform $ \plat ->
+ let cieStartLabel= mkAsmTempDerivedLabel cieLabel (fsLit "_start")
+ cieEndLabel = mkAsmTempEndLabel cieLabel
+ length = ppr cieEndLabel <> char '-' <> ppr cieStartLabel
+ spReg = dwarfGlobalRegNo plat Sp
+ retReg = dwarfReturnRegNo plat
+ wordSize = platformWordSizeInBytes plat
+ pprInit :: (GlobalReg, Maybe UnwindExpr) -> SDoc
+ pprInit (g, uw) = pprSetUnwind plat g (Nothing, uw)
+
+ -- Preserve C stack pointer: This is necessary to override the default
+ -- unwinding behavior of setting $sp = CFA.
+ preserveSp = case platformArch plat of
+ ArchX86 -> pprByte dW_CFA_same_value $$ pprLEBWord 4
+ ArchX86_64 -> pprByte dW_CFA_same_value $$ pprLEBWord 7
+ _ -> empty
+ in vcat [ ppr cieLabel <> colon
+ , pprData4' length -- Length of CIE
+ , ppr cieStartLabel <> colon
+ , pprData4' (text "-1")
+ -- Common Information Entry marker (-1 = 0xf..f)
+ , pprByte 3 -- CIE version (we require DWARF 3)
+ , pprByte 0 -- Augmentation (none)
+ , pprByte 1 -- Code offset multiplicator
+ , pprByte (128-fromIntegral wordSize)
+ -- Data offset multiplicator
+ -- (stacks grow down => "-w" in signed LEB128)
+ , pprByte retReg -- virtual register holding return address
+ ] $$
+ -- Initial unwind table
+ vcat (map pprInit $ Map.toList cieInit) $$
+ vcat [ -- RET = *CFA
+ pprByte (dW_CFA_offset+retReg)
+ , pprByte 0
+
+ -- Preserve C stack pointer
+ , preserveSp
+
+ -- Sp' = CFA
+ -- (we need to set this manually as our (STG) Sp register is
+ -- often not the architecture's default stack register)
+ , pprByte dW_CFA_val_offset
+ , pprLEBWord (fromIntegral spReg)
+ , pprLEBWord 0
+ ] $$
+ wordAlign $$
+ ppr cieEndLabel <> colon $$
+ -- Procedure unwind tables
+ vcat (map (pprFrameProc cieLabel cieInit) procs)
+
+-- | Writes a "Frame Description Entry" for a procedure. This consists
+-- mainly of referencing the CIE and writing state machine
+-- instructions to describe how the frame base (CFA) changes.
+pprFrameProc :: CLabel -> UnwindTable -> DwarfFrameProc -> SDoc
+pprFrameProc frameLbl initUw (DwarfFrameProc procLbl hasInfo blocks)
+ = let fdeLabel = mkAsmTempDerivedLabel procLbl (fsLit "_fde")
+ fdeEndLabel = mkAsmTempDerivedLabel procLbl (fsLit "_fde_end")
+ procEnd = mkAsmTempEndLabel procLbl
+ ifInfo str = if hasInfo then text str else empty
+ -- see [Note: Info Offset]
+ in vcat [ whenPprDebug $ text "# Unwinding for" <+> ppr procLbl <> colon
+ , pprData4' (ppr fdeEndLabel <> char '-' <> ppr fdeLabel)
+ , ppr fdeLabel <> colon
+ , pprData4' (ppr frameLbl <> char '-' <>
+ ptext dwarfFrameLabel) -- Reference to CIE
+ , pprWord (ppr procLbl <> ifInfo "-1") -- Code pointer
+ , pprWord (ppr procEnd <> char '-' <>
+ ppr procLbl <> ifInfo "+1") -- Block byte length
+ ] $$
+ vcat (S.evalState (mapM pprFrameBlock blocks) initUw) $$
+ wordAlign $$
+ ppr fdeEndLabel <> colon
+
+-- | Generates unwind information for a block. We only generate
+-- instructions where unwind information actually changes. This small
+-- optimisation saves a lot of space, as subsequent blocks often have
+-- the same unwind information.
+pprFrameBlock :: DwarfFrameBlock -> S.State UnwindTable SDoc
+pprFrameBlock (DwarfFrameBlock hasInfo uws0) =
+ vcat <$> zipWithM pprFrameDecl (True : repeat False) uws0
+ where
+ pprFrameDecl :: Bool -> UnwindPoint -> S.State UnwindTable SDoc
+ pprFrameDecl firstDecl (UnwindPoint lbl uws) = S.state $ \oldUws ->
+ let -- Did a register's unwind expression change?
+ isChanged :: GlobalReg -> Maybe UnwindExpr
+ -> Maybe (Maybe UnwindExpr, Maybe UnwindExpr)
+ isChanged g new
+ -- the value didn't change
+ | Just new == old = Nothing
+ -- the value was and still is undefined
+ | Nothing <- old
+ , Nothing <- new = Nothing
+ -- the value changed
+ | otherwise = Just (join old, new)
+ where
+ old = Map.lookup g oldUws
+
+ changed = Map.toList $ Map.mapMaybeWithKey isChanged uws
+
+ in if oldUws == uws
+ then (empty, oldUws)
+ else let -- see [Note: Info Offset]
+ needsOffset = firstDecl && hasInfo
+ lblDoc = ppr lbl <>
+ if needsOffset then text "-1" else empty
+ doc = sdocWithPlatform $ \plat ->
+ pprByte dW_CFA_set_loc $$ pprWord lblDoc $$
+ vcat (map (uncurry $ pprSetUnwind plat) changed)
+ in (doc, uws)
+
+-- Note [Info Offset]
+--
+-- GDB was pretty much written with C-like programs in mind, and as a
+-- result it assumes that once you have a return address, it is a
+-- good idea to look at (PC-1) to unwind further - as that's where the
+-- "call" instruction is supposed to be.
+--
+-- Now on one hand, code generated by GHC looks nothing like what GDB
+-- expects, and in fact going up from a return pointer is guaranteed
+-- to land us inside an info table! On the other hand, that actually
+-- gives us some wiggle room, as we expect IP to never *actually* end
+-- up inside the info table, so we can "cheat" by putting whatever GDB
+-- expects to see there. This is probably pretty safe, as GDB cannot
+-- assume (PC-1) to be a valid code pointer in the first place - and I
+-- have seen no code trying to correct this.
+--
+-- Note that this will not prevent GDB from failing to look up the
+-- correct function name for the frame, as that uses the symbol table,
+-- which we cannot manipulate as easily.
+--
+-- There's a GDB patch to address this at [1]. At the moment of writing
+-- it's not merged, so I recommend building GDB with the patch if you
+-- care about unwinding. The hack above doesn't cover every case.
+--
+-- [1] https://sourceware.org/ml/gdb-patches/2018-02/msg00055.html
+
+-- | Get DWARF register ID for a given GlobalReg
+dwarfGlobalRegNo :: Platform -> GlobalReg -> Word8
+dwarfGlobalRegNo p UnwindReturnReg = dwarfReturnRegNo p
+dwarfGlobalRegNo p reg = maybe 0 (dwarfRegNo p . RegReal) $ globalRegMaybe p reg
+
+-- | Generate code for setting the unwind information for a register,
+-- optimized using its known old value in the table. Note that "Sp" is
+-- special: We see it as a synonym for the CFA.
+pprSetUnwind :: Platform
+ -> GlobalReg
+ -- ^ the register to produce an unwinding table entry for
+ -> (Maybe UnwindExpr, Maybe UnwindExpr)
+ -- ^ the old and new values of the register
+ -> SDoc
+pprSetUnwind plat g (_, Nothing)
+ = pprUndefUnwind plat g
+pprSetUnwind _ Sp (Just (UwReg s _), Just (UwReg s' o')) | s == s'
+ = if o' >= 0
+ then pprByte dW_CFA_def_cfa_offset $$ pprLEBWord (fromIntegral o')
+ else pprByte dW_CFA_def_cfa_offset_sf $$ pprLEBInt o'
+pprSetUnwind plat Sp (_, Just (UwReg s' o'))
+ = if o' >= 0
+ then pprByte dW_CFA_def_cfa $$
+ pprLEBRegNo plat s' $$
+ pprLEBWord (fromIntegral o')
+ else pprByte dW_CFA_def_cfa_sf $$
+ pprLEBRegNo plat s' $$
+ pprLEBInt o'
+pprSetUnwind _ Sp (_, Just uw)
+ = pprByte dW_CFA_def_cfa_expression $$ pprUnwindExpr False uw
+pprSetUnwind plat g (_, Just (UwDeref (UwReg Sp o)))
+ | o < 0 && ((-o) `mod` platformWordSizeInBytes plat) == 0 -- expected case
+ = pprByte (dW_CFA_offset + dwarfGlobalRegNo plat g) $$
+ pprLEBWord (fromIntegral ((-o) `div` platformWordSizeInBytes plat))
+ | otherwise
+ = pprByte dW_CFA_offset_extended_sf $$
+ pprLEBRegNo plat g $$
+ pprLEBInt o
+pprSetUnwind plat g (_, Just (UwDeref uw))
+ = pprByte dW_CFA_expression $$
+ pprLEBRegNo plat g $$
+ pprUnwindExpr True uw
+pprSetUnwind plat g (_, Just (UwReg g' 0))
+ | g == g'
+ = pprByte dW_CFA_same_value $$
+ pprLEBRegNo plat g
+pprSetUnwind plat g (_, Just uw)
+ = pprByte dW_CFA_val_expression $$
+ pprLEBRegNo plat g $$
+ pprUnwindExpr True uw
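+-- For instance, when Sp stays based on the same register and only its
+-- offset changes (say from UwReg Sp 0 to UwReg Sp 16), the first Sp
+-- equation above fires and all we emit is DW_CFA_def_cfa_offset followed
+-- by the ULEB128-encoded new offset 16.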
+
+-- | Print the register number of the given 'GlobalReg' as an unsigned LEB128
+-- encoded number.
+pprLEBRegNo :: Platform -> GlobalReg -> SDoc
+pprLEBRegNo plat = pprLEBWord . fromIntegral . dwarfGlobalRegNo plat
+
+-- | Generates a DWARF expression for the given unwind expression. If
+-- @spIsCFA@ is true, we see @Sp@ as the frame base CFA where it gets
+-- mentioned.
+pprUnwindExpr :: Bool -> UnwindExpr -> SDoc
+pprUnwindExpr spIsCFA expr
+ = sdocWithPlatform $ \plat ->
+ let pprE (UwConst i)
+ | i >= 0 && i < 32 = pprByte (dW_OP_lit0 + fromIntegral i)
+ | otherwise = pprByte dW_OP_consts $$ pprLEBInt i -- lazy...
+ pprE (UwReg Sp i) | spIsCFA
+ = if i == 0
+ then pprByte dW_OP_call_frame_cfa
+ else pprE (UwPlus (UwReg Sp 0) (UwConst i))
+ pprE (UwReg g i) = pprByte (dW_OP_breg0+dwarfGlobalRegNo plat g) $$
+ pprLEBInt i
+ pprE (UwDeref u) = pprE u $$ pprByte dW_OP_deref
+ pprE (UwLabel l) = pprByte dW_OP_addr $$ pprWord (ppr l)
+ pprE (UwPlus u1 u2) = pprE u1 $$ pprE u2 $$ pprByte dW_OP_plus
+ pprE (UwMinus u1 u2) = pprE u1 $$ pprE u2 $$ pprByte dW_OP_minus
+ pprE (UwTimes u1 u2) = pprE u1 $$ pprE u2 $$ pprByte dW_OP_mul
+ in text "\t.uleb128 2f-1f" $$ -- DW_FORM_block length
+ -- computed as the difference of the following local labels 2: and 1:
+ text "1:" $$
+ pprE expr $$
+ text "2:"
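+-- For example, with @spIsCFA@ set, @UwReg Sp 8@ expands via the UwPlus
+-- case into DW_OP_call_frame_cfa, DW_OP_lit8, DW_OP_plus, and the whole
+-- expression is wrapped between the local labels 1: and 2: whose
+-- difference supplies the DW_FORM_block length.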
+
+-- | Generate code for re-setting the unwind information for a
+-- register to @undefined@
+pprUndefUnwind :: Platform -> GlobalReg -> SDoc
+pprUndefUnwind plat g = pprByte dW_CFA_undefined $$
+ pprLEBRegNo plat g
+
+
+-- | Align assembly at (machine) word boundary
+wordAlign :: SDoc
+wordAlign = sdocWithPlatform $ \plat ->
+ text "\t.align " <> case platformOS plat of
+ OSDarwin -> case platformWordSize plat of
+ PW8 -> char '3'
+ PW4 -> char '2'
+ _other -> ppr (platformWordSizeInBytes plat)
+
+-- | Assembly for a single byte of constant DWARF data
+pprByte :: Word8 -> SDoc
+pprByte x = text "\t.byte " <> ppr (fromIntegral x :: Word)
+
+-- | Assembly for a two-byte constant integer
+pprHalf :: Word16 -> SDoc
+pprHalf x = text "\t.short" <+> ppr (fromIntegral x :: Word)
+
+-- | Assembly for a constant DWARF flag
+pprFlag :: Bool -> SDoc
+pprFlag f = pprByte (if f then 0xff else 0x00)
+
+-- | Assembly for 4 bytes of dynamic DWARF data
+pprData4' :: SDoc -> SDoc
+pprData4' x = text "\t.long " <> x
+
+-- | Assembly for 4 bytes of constant DWARF data
+pprData4 :: Word -> SDoc
+pprData4 = pprData4' . ppr
+
+-- | Assembly for a DWARF word of dynamic data. This means 32 bit, as
+-- we are generating 32 bit DWARF.
+pprDwWord :: SDoc -> SDoc
+pprDwWord = pprData4'
+
+-- | Assembly for a machine word of dynamic data. Depends on the
+-- architecture we are currently generating code for.
+pprWord :: SDoc -> SDoc
+pprWord s = (<> s) . sdocWithPlatform $ \plat ->
+ case platformWordSize plat of
+ PW4 -> text "\t.long "
+ PW8 -> text "\t.quad "
+
+-- | Prints a number in "little endian base 128" format. The idea is
+-- to optimize for small numbers by stopping once all further bytes
+-- would be 0. The highest bit in every byte signals whether there
+-- are further bytes to read.
+pprLEBWord :: Word -> SDoc
+pprLEBWord x | x < 128 = pprByte (fromIntegral x)
+ | otherwise = pprByte (fromIntegral $ 128 .|. (x .&. 127)) $$
+ pprLEBWord (x `shiftR` 7)
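+-- A worked example: @pprLEBWord 624485@ stops after three bytes, emitting
+-- .byte 229, .byte 142 and .byte 38 (0xE5 0x8E 0x26); the first two bytes
+-- have their high bit set to signal that more bytes follow.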
+
+-- | Same as @pprLEBWord@, but for a signed number
+pprLEBInt :: Int -> SDoc
+pprLEBInt x | x >= -64 && x < 64
+ = pprByte (fromIntegral (x .&. 127))
+ | otherwise = pprByte (fromIntegral $ 128 .|. (x .&. 127)) $$
+ pprLEBInt (x `shiftR` 7)
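+-- In the signed variant, small negative numbers also fit in one byte:
+-- @pprLEBInt (-2)@ emits the single byte 126 (0x7E), whose sign bit within
+-- the 7-bit payload makes it decode back to -2.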
+
+-- | Generates a dynamic null-terminated string. If required the
+-- caller needs to make sure that the string is escaped properly.
+pprString' :: SDoc -> SDoc
+pprString' str = text "\t.asciz \"" <> str <> char '"'
+
+-- | Generate a string constant. We take care to escape the string.
+pprString :: String -> SDoc
+pprString str
+ = pprString' $ hcat $ map escapeChar $
+ if str `lengthIs` utf8EncodedLength str
+ then str
+ else map (chr . fromIntegral) $ BS.unpack $ bytesFS $ mkFastString str
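+-- For instance, a pure ASCII string is escaped character by character,
+-- while a string containing multi-byte characters falls back to its UTF-8
+-- bytes: "λ" (U+03BB, UTF-8 0xCE 0xBB) would come out roughly as
+-- .asciz "\316\273", each byte octal-escaped by escapeChar below.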
+
+-- | Escape a single non-unicode character
+escapeChar :: Char -> SDoc
+escapeChar '\\' = text "\\\\"
+escapeChar '\"' = text "\\\""
+escapeChar '\n' = text "\\n"
+escapeChar c
+ | isAscii c && isPrint c && c /= '?' -- prevents trigraph warnings
+ = char c
+ | otherwise
+ = char '\\' <> char (intToDigit (ch `div` 64)) <>
+ char (intToDigit ((ch `div` 8) `mod` 8)) <>
+ char (intToDigit (ch `mod` 8))
+ where ch = ord c
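+-- E.g. a tab character is neither printable ASCII nor one of the special
+-- cases above, so escapeChar '\t' renders it as the octal escape \011.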
+
+-- | Generate an offset into another section. This is tricky because
+-- this is handled differently depending on platform: Mac OS expects
+-- us to calculate the offset using assembler arithmetic. Linux expects
+-- us to just reference the target directly, and will figure out on
+-- its own that we actually need an offset. Finally, Windows has
+-- a special directive to refer to relative offsets. Fun.
+sectionOffset :: SDoc -> SDoc -> SDoc
+sectionOffset target section = sdocWithPlatform $ \plat ->
+ case platformOS plat of
+ OSDarwin -> pprDwWord (target <> char '-' <> section)
+ OSMinGW32 -> text "\t.secrel32 " <> target
+ _other -> pprDwWord target
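+-- Concretely, for a target label Lfoo and section label .debug_info this
+-- prints "\t.long Lfoo-.debug_info" on Darwin, "\t.secrel32 Lfoo" on
+-- Windows, and a plain "\t.long Lfoo" everywhere else.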
diff --git a/compiler/GHC/CmmToAsm/Format.hs b/compiler/GHC/CmmToAsm/Format.hs
new file mode 100644
index 0000000000..446c760939
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Format.hs
@@ -0,0 +1,105 @@
+-- | Formats on this architecture
+-- A Format is a combination of width and class
+--
+-- TODO: Signed vs unsigned?
+--
+-- TODO: This module is currently shared by all architectures because
+-- NCGMonad needs to know about it to make a VReg. It would be better
+-- to have architecture-specific formats, and do the overloading
+-- properly, e.g. SPARC doesn't care about FF80.
+--
+module GHC.CmmToAsm.Format (
+ Format(..),
+ intFormat,
+ floatFormat,
+ isFloatFormat,
+ cmmTypeFormat,
+ formatToWidth,
+ formatInBytes
+)
+
+where
+
+import GhcPrelude
+
+import GHC.Cmm
+import Outputable
+
+-- It looks very like the old MachRep, but it's now of purely local
+-- significance, here in the native code generator. You can change it
+-- without global consequences.
+--
+-- A major use is as an opcode qualifier; thus the opcode
+-- mov.l a b
+-- might be encoded
+-- MOV II32 a b
+-- where the Format field encodes the ".l" part.
+
+-- ToDo: it's not clear to me that we need separate signed-vs-unsigned formats
+-- here. I've removed them from the x86 version, we'll see what happens --SDM
+
+-- ToDo: quite a few occurrences of Format could usefully be replaced by Width
+
+data Format
+ = II8
+ | II16
+ | II32
+ | II64
+ | FF32
+ | FF64
+ deriving (Show, Eq)
+
+
+-- | Get the integer format of this width.
+intFormat :: Width -> Format
+intFormat width
+ = case width of
+ W8 -> II8
+ W16 -> II16
+ W32 -> II32
+ W64 -> II64
+ other -> sorry $ "The native code generator cannot " ++
+ "produce code for Format.intFormat " ++ show other
+ ++ "\n\tConsider using the llvm backend with -fllvm"
+
+
+-- | Get the float format of this width.
+floatFormat :: Width -> Format
+floatFormat width
+ = case width of
+ W32 -> FF32
+ W64 -> FF64
+
+ other -> pprPanic "Format.floatFormat" (ppr other)
+
+
+-- | Check if a format represents a floating point value.
+isFloatFormat :: Format -> Bool
+isFloatFormat format
+ = case format of
+ FF32 -> True
+ FF64 -> True
+ _ -> False
+
+
+-- | Convert a Cmm type to a Format.
+cmmTypeFormat :: CmmType -> Format
+cmmTypeFormat ty
+ | isFloatType ty = floatFormat (typeWidth ty)
+ | otherwise = intFormat (typeWidth ty)
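+-- For example (assuming the usual cmmBits/cmmFloat constructors of the
+-- Cmm type module): cmmTypeFormat (cmmBits W64) is II64, while
+-- cmmTypeFormat (cmmFloat W32) is FF32.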
+
+
+-- | Get the Width of a Format.
+formatToWidth :: Format -> Width
+formatToWidth format
+ = case format of
+ II8 -> W8
+ II16 -> W16
+ II32 -> W32
+ II64 -> W64
+ FF32 -> W32
+ FF64 -> W64
+
+
+-- | Get the size of a Format in bytes.
+formatInBytes :: Format -> Int
+formatInBytes = widthInBytes . formatToWidth
diff --git a/compiler/GHC/CmmToAsm/Instr.hs b/compiler/GHC/CmmToAsm/Instr.hs
new file mode 100644
index 0000000000..44fa9b7cc9
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Instr.hs
@@ -0,0 +1,202 @@
+
+module GHC.CmmToAsm.Instr (
+ RegUsage(..),
+ noUsage,
+ GenBasicBlock(..), blockId,
+ ListGraph(..),
+ NatCmm,
+ NatCmmDecl,
+ NatBasicBlock,
+ topInfoTable,
+ entryBlocks,
+ Instruction(..)
+)
+
+where
+
+import GhcPrelude
+
+import GHC.Platform.Reg
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Driver.Session
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Platform
+
+-- | Holds a list of source and destination registers used by a
+-- particular instruction.
+--
+-- Machine registers that are pre-allocated to stgRegs are filtered
+-- out, because they are uninteresting from a register allocation
+-- standpoint. (We wouldn't want them to end up on the free list!)
+--
+-- As far as we are concerned, the fixed registers simply don't exist
+-- (for allocation purposes, anyway).
+--
+data RegUsage
+ = RU [Reg] [Reg]
+
+-- | No regs read or written to.
+noUsage :: RegUsage
+noUsage = RU [] []
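+-- For instance, an add instruction that reads r1 and r2 and writes r3
+-- would be described, roughly, as @RU [r1, r2] [r3]@.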
+
+-- Our flavours of the Cmm types
+-- Type synonyms for Cmm populated with native code
+type NatCmm instr
+ = GenCmmGroup
+ RawCmmStatics
+ (LabelMap RawCmmStatics)
+ (ListGraph instr)
+
+type NatCmmDecl statics instr
+ = GenCmmDecl
+ statics
+ (LabelMap RawCmmStatics)
+ (ListGraph instr)
+
+
+type NatBasicBlock instr
+ = GenBasicBlock instr
+
+
+-- | Returns the info table associated with the CmmDecl's entry point,
+-- if any.
+topInfoTable :: GenCmmDecl a (LabelMap i) (ListGraph b) -> Maybe i
+topInfoTable (CmmProc infos _ _ (ListGraph (b:_)))
+ = mapLookup (blockId b) infos
+topInfoTable _
+ = Nothing
+
+-- | Return the list of BlockIds in a CmmDecl that are entry points
+-- for this proc (i.e. they may be jumped to from outside this proc).
+entryBlocks :: GenCmmDecl a (LabelMap i) (ListGraph b) -> [BlockId]
+entryBlocks (CmmProc info _ _ (ListGraph code)) = entries
+ where
+ infos = mapKeys info
+ entries = case code of
+ [] -> infos
+ BasicBlock entry _ : _ -- first block is the entry point
+ | entry `elem` infos -> infos
+ | otherwise -> entry : infos
+entryBlocks _ = []
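+-- For instance, if the first block's id is B1 and the info-table map has
+-- the single key B2, the result is [B1, B2]; had B1 carried an info table
+-- itself, it would already be among the map keys and only those keys
+-- would be returned.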
+
+-- | Common things that we can do with instructions, on all architectures.
+-- These are used by the shared parts of the native code generator,
+-- specifically the register allocators.
+--
+class Instruction instr where
+
+ -- | Get the registers that are being used by this instruction.
+ -- regUsage doesn't need to do any trickery for jumps and such.
+ -- Just state precisely the regs read and written by that insn.
+ -- The consequences of control flow transfers, as far as register
+ -- allocation goes, are taken care of by the register allocator.
+ --
+ regUsageOfInstr
+ :: Platform
+ -> instr
+ -> RegUsage
+
+
+ -- | Apply a given mapping to all the register references in this
+ -- instruction.
+ patchRegsOfInstr
+ :: instr
+ -> (Reg -> Reg)
+ -> instr
+
+
+ -- | Checks whether this instruction is a jump/branch instruction.
+ -- One that can change the flow of control in a way that the
+ -- register allocator needs to worry about.
+ isJumpishInstr
+ :: instr -> Bool
+
+
+ -- | Give the possible destinations of this jump instruction.
+ -- Must be defined for all jumpish instructions.
+ jumpDestsOfInstr
+ :: instr -> [BlockId]
+
+
+ -- | Change the destination of this jump instruction.
+ -- Used in the linear allocator when adding fixup blocks for join
+ -- points.
+ patchJumpInstr
+ :: instr
+ -> (BlockId -> BlockId)
+ -> instr
+
+
+ -- | An instruction to spill a register into a spill slot.
+ mkSpillInstr
+ :: DynFlags
+ -> Reg -- ^ the reg to spill
+ -> Int -- ^ the current stack delta
+ -> Int -- ^ spill slot to use
+ -> instr
+
+
+ -- | An instruction to reload a register from a spill slot.
+ mkLoadInstr
+ :: DynFlags
+ -> Reg -- ^ the reg to reload.
+ -> Int -- ^ the current stack delta
+ -> Int -- ^ the spill slot to use
+ -> instr
+
+ -- | See if this instruction is telling us the current C stack delta
+ takeDeltaInstr
+ :: instr
+ -> Maybe Int
+
+ -- | Check whether this instruction is some meta thing inserted into
+ -- the instruction stream for other purposes.
+ --
+ -- Not something that has to be treated as a real machine instruction
+ -- and have its registers allocated.
+ --
+ -- eg, comments, delta, ldata, etc.
+ isMetaInstr
+ :: instr
+ -> Bool
+
+
+
+ -- | Copy the value in a register to another one.
+ -- Must work for all register classes.
+ mkRegRegMoveInstr
+ :: Platform
+ -> Reg -- ^ source register
+ -> Reg -- ^ destination register
+ -> instr
+
+ -- | Take the source and destination from this reg -> reg move instruction
+ -- or Nothing if it's not one
+ takeRegRegMoveInstr
+ :: instr
+ -> Maybe (Reg, Reg)
+
+ -- | Make an unconditional jump instruction.
+ -- For architectures with branch delay slots, its ok to put
+ -- a NOP after the jump. Don't fill the delay slot with an
+ -- instruction that references regs or you'll confuse the
+ -- linear allocator.
+ mkJumpInstr
+ :: BlockId
+ -> [instr]
+
+
+ -- | Subtract an amount from the C stack pointer
+ mkStackAllocInstr
+ :: Platform
+ -> Int
+ -> [instr]
+
+ -- | Add an amount to the C stack pointer
+ mkStackDeallocInstr
+ :: Platform
+ -> Int
+ -> [instr]
diff --git a/compiler/GHC/CmmToAsm/Monad.hs b/compiler/GHC/CmmToAsm/Monad.hs
new file mode 100644
index 0000000000..c9414a2eee
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Monad.hs
@@ -0,0 +1,294 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE BangPatterns #-}
+
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-- The native code generator's monad.
+--
+-- -----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.Monad (
+ NcgImpl(..),
+ NatM_State(..), mkNatM_State,
+
+ NatM, -- instance Monad
+ initNat,
+ addImportNat,
+ addNodeBetweenNat,
+ addImmediateSuccessorNat,
+ updateCfgNat,
+ getUniqueNat,
+ mapAccumLNat,
+ setDeltaNat,
+ getDeltaNat,
+ getThisModuleNat,
+ getBlockIdNat,
+ getNewLabelNat,
+ getNewRegNat,
+ getNewRegPairNat,
+ getPicBaseMaybeNat,
+ getPicBaseNat,
+ getDynFlags,
+ getModLoc,
+ getFileId,
+ getDebugBlock,
+
+ DwarfFiles
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Reg.Target
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.CLabel ( CLabel )
+import GHC.Cmm.DebugBlock
+import FastString ( FastString )
+import UniqFM
+import UniqSupply
+import Unique ( Unique )
+import GHC.Driver.Session
+import Module
+
+import Control.Monad ( ap )
+
+import GHC.CmmToAsm.Instr
+import Outputable (SDoc, pprPanic, ppr)
+import GHC.Cmm (RawCmmDecl, RawCmmStatics)
+import GHC.CmmToAsm.CFG
+
+data NcgImpl statics instr jumpDest = NcgImpl {
+ cmmTopCodeGen :: RawCmmDecl -> NatM [NatCmmDecl statics instr],
+ generateJumpTableForInstr :: instr -> Maybe (NatCmmDecl statics instr),
+ getJumpDestBlockId :: jumpDest -> Maybe BlockId,
+ canShortcut :: instr -> Maybe jumpDest,
+ shortcutStatics :: (BlockId -> Maybe jumpDest) -> statics -> statics,
+ shortcutJump :: (BlockId -> Maybe jumpDest) -> instr -> instr,
+ pprNatCmmDecl :: NatCmmDecl statics instr -> SDoc,
+ maxSpillSlots :: Int,
+ allocatableRegs :: [RealReg],
+ ncgExpandTop :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
+ ncgAllocMoreStack :: Int -> NatCmmDecl statics instr
+ -> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)]),
+ -- ^ The list of block ids records the redirected jumps to allow us to update
+ -- the CFG.
+ ncgMakeFarBranches :: LabelMap RawCmmStatics -> [NatBasicBlock instr] -> [NatBasicBlock instr],
+ extractUnwindPoints :: [instr] -> [UnwindPoint],
+ -- ^ given the instruction sequence of a block, produce a list of
+ -- the block's 'UnwindPoint's
+ -- See Note [What is this unwinding business?] in Debug
+ -- and Note [Unwinding information in the NCG] in this module.
+ invertCondBranches :: Maybe CFG -> LabelMap RawCmmStatics -> [NatBasicBlock instr]
+ -> [NatBasicBlock instr]
+ -- ^ Turn the sequence of `jcc l1; jmp l2` into `jncc l2; <block_l1>`
+ -- when possible.
+ }
+
+data NatM_State
+ = NatM_State {
+ natm_us :: UniqSupply,
+ natm_delta :: Int,
+ natm_imports :: [(CLabel)],
+ natm_pic :: Maybe Reg,
+ natm_dflags :: DynFlags,
+ natm_this_module :: Module,
+ natm_modloc :: ModLocation,
+ natm_fileid :: DwarfFiles,
+ natm_debug_map :: LabelMap DebugBlock,
+ natm_cfg :: CFG
+ -- ^ Having a CFG with additional information is essential for some
+ -- operations. However, we can't reconstruct all information once we
+ -- have generated instructions, so instead we update the CFG as we go.
+ }
+
+type DwarfFiles = UniqFM (FastString, Int)
+
+newtype NatM result = NatM (NatM_State -> (result, NatM_State))
+ deriving (Functor)
+
+unNat :: NatM a -> NatM_State -> (a, NatM_State)
+unNat (NatM a) = a
+
+mkNatM_State :: UniqSupply -> Int -> DynFlags -> Module -> ModLocation ->
+ DwarfFiles -> LabelMap DebugBlock -> CFG -> NatM_State
+mkNatM_State us delta dflags this_mod
+ = \loc dwf dbg cfg ->
+ NatM_State
+ { natm_us = us
+ , natm_delta = delta
+ , natm_imports = []
+ , natm_pic = Nothing
+ , natm_dflags = dflags
+ , natm_this_module = this_mod
+ , natm_modloc = loc
+ , natm_fileid = dwf
+ , natm_debug_map = dbg
+ , natm_cfg = cfg
+ }
+
+initNat :: NatM_State -> NatM a -> (a, NatM_State)
+initNat init_st m
+ = case unNat m init_st of { (r,st) -> (r,st) }
+
+instance Applicative NatM where
+ pure = returnNat
+ (<*>) = ap
+
+instance Monad NatM where
+ (>>=) = thenNat
+
+instance MonadUnique NatM where
+ getUniqueSupplyM = NatM $ \st ->
+ case splitUniqSupply (natm_us st) of
+ (us1, us2) -> (us1, st {natm_us = us2})
+
+ getUniqueM = NatM $ \st ->
+ case takeUniqFromSupply (natm_us st) of
+ (uniq, us') -> (uniq, st {natm_us = us'})
+
+thenNat :: NatM a -> (a -> NatM b) -> NatM b
+thenNat expr cont
+ = NatM $ \st -> case unNat expr st of
+ (result, st') -> unNat (cont result) st'
+
+returnNat :: a -> NatM a
+returnNat result
+ = NatM $ \st -> (result, st)
+
+mapAccumLNat :: (acc -> x -> NatM (acc, y))
+ -> acc
+ -> [x]
+ -> NatM (acc, [y])
+
+mapAccumLNat _ b []
+ = return (b, [])
+mapAccumLNat f b (x:xs)
+ = do (b__2, x__2) <- f b x
+ (b__3, xs__2) <- mapAccumLNat f b__2 xs
+ return (b__3, x__2:xs__2)
+
+getUniqueNat :: NatM Unique
+getUniqueNat = NatM $ \ st ->
+ case takeUniqFromSupply $ natm_us st of
+ (uniq, us') -> (uniq, st {natm_us = us'})
+
+instance HasDynFlags NatM where
+ getDynFlags = NatM $ \ st -> (natm_dflags st, st)
+
+
+getDeltaNat :: NatM Int
+getDeltaNat = NatM $ \ st -> (natm_delta st, st)
+
+
+setDeltaNat :: Int -> NatM ()
+setDeltaNat delta = NatM $ \ st -> ((), st {natm_delta = delta})
+
+
+getThisModuleNat :: NatM Module
+getThisModuleNat = NatM $ \ st -> (natm_this_module st, st)
+
+
+addImportNat :: CLabel -> NatM ()
+addImportNat imp
+ = NatM $ \ st -> ((), st {natm_imports = imp : natm_imports st})
+
+updateCfgNat :: (CFG -> CFG) -> NatM ()
+updateCfgNat f
+ = NatM $ \ st -> let !cfg' = f (natm_cfg st)
+ in ((), st { natm_cfg = cfg'})
+
+-- | Record that we added a block between `from` and `old`.
+addNodeBetweenNat :: BlockId -> BlockId -> BlockId -> NatM ()
+addNodeBetweenNat from between to
+ = do df <- getDynFlags
+ let jmpWeight = fromIntegral . uncondWeight .
+ cfgWeightInfo $ df
+ updateCfgNat (updateCfg jmpWeight from between to)
+ where
+ -- When transforming A -> B to A -> A' -> B
+ -- A -> A' keeps the old edge info while
+ -- A' -> B gets the info for an unconditional
+ -- jump.
+ updateCfg weight from between old m
+ | Just info <- getEdgeInfo from old m
+ = addEdge from between info .
+ addWeightEdge between old weight .
+ delEdge from old $ m
+ | otherwise
+ = pprPanic "Failed to update cfg: Untracked edge" (ppr (from,to))
+
+
+-- | Place `succ` after `block` and change any edges
+-- block -> X to `succ` -> X
+addImmediateSuccessorNat :: BlockId -> BlockId -> NatM ()
+addImmediateSuccessorNat block succ
+ = updateCfgNat (addImmediateSuccessor block succ)
+
+getBlockIdNat :: NatM BlockId
+getBlockIdNat
+ = do u <- getUniqueNat
+ return (mkBlockId u)
+
+
+getNewLabelNat :: NatM CLabel
+getNewLabelNat
+ = blockLbl <$> getBlockIdNat
+
+
+getNewRegNat :: Format -> NatM Reg
+getNewRegNat rep
+ = do u <- getUniqueNat
+ dflags <- getDynFlags
+ return (RegVirtual $ targetMkVirtualReg (targetPlatform dflags) u rep)
+
+
+getNewRegPairNat :: Format -> NatM (Reg,Reg)
+getNewRegPairNat rep
+ = do u <- getUniqueNat
+ dflags <- getDynFlags
+ let vLo = targetMkVirtualReg (targetPlatform dflags) u rep
+ let lo = RegVirtual $ targetMkVirtualReg (targetPlatform dflags) u rep
+ let hi = RegVirtual $ getHiVirtualRegFromLo vLo
+ return (lo, hi)
+
+
+getPicBaseMaybeNat :: NatM (Maybe Reg)
+getPicBaseMaybeNat
+ = NatM (\state -> (natm_pic state, state))
+
+
+getPicBaseNat :: Format -> NatM Reg
+getPicBaseNat rep
+ = do mbPicBase <- getPicBaseMaybeNat
+ case mbPicBase of
+ Just picBase -> return picBase
+ Nothing
+ -> do
+ reg <- getNewRegNat rep
+ NatM (\state -> (reg, state { natm_pic = Just reg }))
+
+getModLoc :: NatM ModLocation
+getModLoc
+ = NatM $ \ st -> (natm_modloc st, st)
+
+getFileId :: FastString -> NatM Int
+getFileId f = NatM $ \st ->
+ case lookupUFM (natm_fileid st) f of
+ Just (_,n) -> (n, st)
+ Nothing -> let n = 1 + sizeUFM (natm_fileid st)
+ fids = addToUFM (natm_fileid st) f (f,n)
+ in n `seq` fids `seq` (n, st { natm_fileid = fids })
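+-- File ids are stable: querying the same FastString twice yields the same
+-- id, while a new file gets the next consecutive id (1, 2, ... when the
+-- table starts out empty).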
+
+getDebugBlock :: Label -> NatM (Maybe DebugBlock)
+getDebugBlock l = NatM $ \st -> (mapLookup l (natm_debug_map st), st)
diff --git a/compiler/GHC/CmmToAsm/PIC.hs b/compiler/GHC/CmmToAsm/PIC.hs
new file mode 100644
index 0000000000..323d93d173
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PIC.hs
@@ -0,0 +1,837 @@
+{-
+ This module handles generation of position independent code and
+ dynamic-linking related issues for the native code generator.
+
+ This depends both the architecture and OS, so we define it here
+ instead of in one of the architecture specific modules.
+
+ Things outside this module which are related to this:
+
+ + module CLabel
+ - PIC base label (pretty printed as local label 1)
+ - DynamicLinkerLabels - several kinds:
+ CodeStub, SymbolPtr, GotSymbolPtr, GotSymbolOffset
+ - labelDynamic predicate
+ + module Cmm
+ - The GlobalReg datatype has a PicBaseReg constructor
+ - The CmmLit datatype has a CmmLabelDiffOff constructor
+ + codeGen & RTS
+ - When tablesNextToCode, no absolute addresses are stored in info tables
+ any more. Instead, offsets from the info label are used.
+ - For Win32 only, SRTs might contain addresses of __imp_ symbol pointers
+ because Win32 doesn't support external references in data sections.
+ TODO: make sure this still works, it might be bitrotted
+ + NCG
+ - The cmmToCmm pass in AsmCodeGen calls cmmMakeDynamicReference for all
+ labels.
+ - nativeCodeGen calls pprImportedSymbol and pprGotDeclaration to output
+ all the necessary stuff for imported symbols.
+ - The NCG monad keeps track of a list of imported symbols.
+ - MachCodeGen invokes initializePicBase to generate code to initialize
+ the PIC base register when needed.
+ - MachCodeGen calls cmmMakeDynamicReference whenever it uses a CLabel
+ that wasn't in the original Cmm code (e.g. floating point literals).
+-}
+
+module GHC.CmmToAsm.PIC (
+ cmmMakeDynamicReference,
+ CmmMakeDynamicReferenceM(..),
+ ReferenceKind(..),
+ needImportedSymbols,
+ pprImportedSymbol,
+ pprGotDeclaration,
+
+ initializePicBase_ppc,
+ initializePicBase_x86
+)
+
+where
+
+import GhcPrelude
+
+import qualified GHC.CmmToAsm.PPC.Instr as PPC
+import qualified GHC.CmmToAsm.PPC.Regs as PPC
+import qualified GHC.CmmToAsm.X86.Instr as X86
+
+import GHC.Platform
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Monad
+
+
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm
+import GHC.Cmm.CLabel ( CLabel, ForeignLabelSource(..), pprCLabel,
+ mkDynamicLinkerLabel, DynamicLinkerLabelInfo(..),
+ dynamicLinkerLabelInfo, mkPicBaseLabel,
+ labelDynamic, externallyVisibleCLabel )
+
+import GHC.Cmm.CLabel ( mkForeignLabel )
+
+
+import BasicTypes
+import Module
+
+import Outputable
+
+import GHC.Driver.Session
+import FastString
+
+
+
+--------------------------------------------------------------------------------
+-- cmmMakeDynamicReference gets called by the cmmToCmm pass for every
+-- CmmLabel in the Cmm code. It does The Right Thing(tm) to convert the
+-- CmmLabel into a position-independent, dynamic-linking-aware reference
+-- to the thing in question.
+-- Note that this also has to be called from MachCodeGen in order to
+-- access static data like floating point literals (labels that were
+-- created after the cmmToCmm pass).
+-- The function must run in a monad that can keep track of imported symbols
+-- A function for recording an imported symbol must be passed in:
+-- - addImportCmmOpt for the CmmOptM monad
+-- - addImportNat for the NatM monad.
+
+data ReferenceKind
+ = DataReference
+ | CallReference
+ | JumpReference
+ deriving(Eq)
+
+class Monad m => CmmMakeDynamicReferenceM m where
+ addImport :: CLabel -> m ()
+ getThisModule :: m Module
+
+instance CmmMakeDynamicReferenceM NatM where
+ addImport = addImportNat
+ getThisModule = getThisModuleNat
+
+cmmMakeDynamicReference
+ :: CmmMakeDynamicReferenceM m
+ => DynFlags
+ -> ReferenceKind -- what kind of reference this is: data, call or jump
+ -> CLabel -- the label
+ -> m CmmExpr
+
+cmmMakeDynamicReference dflags referenceKind lbl
+ | Just _ <- dynamicLinkerLabelInfo lbl
+ = return $ CmmLit $ CmmLabel lbl -- already processed it, pass through
+
+ | otherwise
+ = do this_mod <- getThisModule
+ case howToAccessLabel
+ dflags
+ (platformArch $ targetPlatform dflags)
+ (platformOS $ targetPlatform dflags)
+ this_mod
+ referenceKind lbl of
+
+ AccessViaStub -> do
+ let stub = mkDynamicLinkerLabel CodeStub lbl
+ addImport stub
+ return $ CmmLit $ CmmLabel stub
+
+ AccessViaSymbolPtr -> do
+ let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl
+ addImport symbolPtr
+ return $ CmmLoad (cmmMakePicReference dflags symbolPtr) (bWord dflags)
+
+ AccessDirectly -> case referenceKind of
+ -- for data, we might have to make some calculations:
+ DataReference -> return $ cmmMakePicReference dflags lbl
+ -- all currently supported processors support
+ -- PC-relative branch and call instructions,
+ -- so just jump there if it's a call or a jump
+ _ -> return $ CmmLit $ CmmLabel lbl
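+-- As an illustration: on an ELF target built with -dynamic, a
+-- DataReference to a label from another package is classified as
+-- AccessViaSymbolPtr, so the result is a CmmLoad of the (PIC-relative)
+-- SymbolPtr label, whereas a CallReference to the same label compiled
+-- without -fPIC is still accessed directly and comes back as a plain
+-- CmmLabel literal.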
+
+
+-- -----------------------------------------------------------------------------
+-- Create a position independent reference to a label.
+-- (but do not bother with dynamic linking).
+-- We calculate the label's address by adding some (platform-dependent)
+-- offset to our base register; this offset is calculated by
+-- the function picRelative in the platform-dependent part below.
+
+cmmMakePicReference :: DynFlags -> CLabel -> CmmExpr
+cmmMakePicReference dflags lbl
+
+ -- Windows doesn't need PIC,
+ -- everything gets relocated at runtime
+ | OSMinGW32 <- platformOS $ targetPlatform dflags
+ = CmmLit $ CmmLabel lbl
+
+ | OSAIX <- platformOS $ targetPlatform dflags
+ = CmmMachOp (MO_Add W32)
+ [ CmmReg (CmmGlobal PicBaseReg)
+ , CmmLit $ picRelative dflags
+ (platformArch $ targetPlatform dflags)
+ (platformOS $ targetPlatform dflags)
+ lbl ]
+
+ -- both ABI versions default to medium code model
+ | ArchPPC_64 _ <- platformArch $ targetPlatform dflags
+ = CmmMachOp (MO_Add W32) -- code model medium
+ [ CmmReg (CmmGlobal PicBaseReg)
+ , CmmLit $ picRelative dflags
+ (platformArch $ targetPlatform dflags)
+ (platformOS $ targetPlatform dflags)
+ lbl ]
+
+ | (positionIndependent dflags || gopt Opt_ExternalDynamicRefs dflags)
+ && absoluteLabel lbl
+ = CmmMachOp (MO_Add (wordWidth dflags))
+ [ CmmReg (CmmGlobal PicBaseReg)
+ , CmmLit $ picRelative dflags
+ (platformArch $ targetPlatform dflags)
+ (platformOS $ targetPlatform dflags)
+ lbl ]
+
+ | otherwise
+ = CmmLit $ CmmLabel lbl
+
+
+absoluteLabel :: CLabel -> Bool
+absoluteLabel lbl
+ = case dynamicLinkerLabelInfo lbl of
+ Just (GotSymbolPtr, _) -> False
+ Just (GotSymbolOffset, _) -> False
+ _ -> True
+
+
+--------------------------------------------------------------------------------
+-- Knowledge about how special dynamic linker labels like symbol
+-- pointers, code stubs and GOT offsets look like is located in the
+-- module CLabel.
+
+-- We have to decide which labels need to be accessed
+-- indirectly or via a piece of stub code.
+data LabelAccessStyle
+ = AccessViaStub
+ | AccessViaSymbolPtr
+ | AccessDirectly
+
+howToAccessLabel
+ :: DynFlags -> Arch -> OS -> Module -> ReferenceKind -> CLabel -> LabelAccessStyle
+
+
+-- Windows
+-- In Windows speak, a "module" is a set of objects linked into the
+-- same Portable Executable (PE) file. (both .exe and .dll files are PEs).
+--
+-- If we're compiling a multi-module program then symbols from other modules
+-- are accessed by a symbol pointer named __imp_SYMBOL. At runtime we have the
+-- following.
+--
+-- (in the local module)
+-- __imp_SYMBOL: addr of SYMBOL
+--
+-- (in the other module)
+-- SYMBOL: the real function / data.
+--
+-- To access the function at SYMBOL from our local module, we just need to
+-- dereference the local __imp_SYMBOL.
+--
+-- If not compiling with -dynamic we assume that all our code will be linked
+-- into the same .exe file. In this case we always access symbols directly,
+-- and never use __imp_SYMBOL.
+--
+howToAccessLabel dflags _ OSMinGW32 this_mod _ lbl
+
+ -- Assume all symbols will be in the same PE, so just access them directly.
+ | not (gopt Opt_ExternalDynamicRefs dflags)
+ = AccessDirectly
+
+ -- If the target symbol is in another PE we need to access it via the
+ -- appropriate __imp_SYMBOL pointer.
+ | labelDynamic dflags this_mod lbl
+ = AccessViaSymbolPtr
+
+ -- Target symbol is in the same PE as the caller, so just access it directly.
+ | otherwise
+ = AccessDirectly
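+-- For example, with -dynamic a reference to a symbol foo defined in
+-- another PE is compiled as an indirection through the local __imp_foo
+-- pointer, while without -dynamic every symbol is assumed to live in the
+-- same PE and is referenced directly.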
+
+
+-- Mach-O (Darwin, Mac OS X)
+--
+-- Indirect access is required in the following cases:
+-- * things imported from a dynamic library
+-- * (not on x86_64) data from a different module, if we're generating PIC code
+-- It is always possible to access something indirectly,
+-- even when it's not necessary.
+--
+howToAccessLabel dflags arch OSDarwin this_mod DataReference lbl
+ -- data access to a dynamic library goes via a symbol pointer
+ | labelDynamic dflags this_mod lbl
+ = AccessViaSymbolPtr
+
+ -- when generating PIC code, all cross-module data references
+ -- must go via a symbol pointer, too, because the assembler
+ -- cannot generate code for a label difference where one
+ -- label is undefined. Doesn't apply to x86_64.
+ -- Unfortunately, we don't know whether it's cross-module,
+ -- so we do it for all externally visible labels.
+ -- This is a slight waste of time and space, but otherwise
+ -- we'd need to pass the current Module all the way in to
+ -- this function.
+ | arch /= ArchX86_64
+ , positionIndependent dflags && externallyVisibleCLabel lbl
+ = AccessViaSymbolPtr
+
+ | otherwise
+ = AccessDirectly
+
+howToAccessLabel dflags arch OSDarwin this_mod JumpReference lbl
+ -- dyld code stubs don't work for tailcalls because the
+ -- stack alignment is only right for regular calls.
+ -- Therefore, we have to go via a symbol pointer:
+ | arch == ArchX86 || arch == ArchX86_64
+ , labelDynamic dflags this_mod lbl
+ = AccessViaSymbolPtr
+
+
+howToAccessLabel dflags arch OSDarwin this_mod _ lbl
+ -- Code stubs are the usual method of choice for imported code;
+ -- not needed on x86_64 because Apple's new linker, ld64, generates
+ -- them automatically.
+ | arch /= ArchX86_64
+ , labelDynamic dflags this_mod lbl
+ = AccessViaStub
+
+ | otherwise
+ = AccessDirectly
+
+
+----------------------------------------------------------------------------
+-- AIX
+
+-- quite simple (for now)
+howToAccessLabel _dflags _arch OSAIX _this_mod kind _lbl
+ = case kind of
+ DataReference -> AccessViaSymbolPtr
+ CallReference -> AccessDirectly
+ JumpReference -> AccessDirectly
+
+-- ELF (Linux)
+--
+-- ELF tries to pretend to the main application code that dynamic linking does
+-- not exist. While this may sound convenient, it tends to mess things up in
+-- very bad ways, so we have to be careful when we generate code for a non-PIE
+-- main program (-dynamic but no -fPIC).
+--
+-- Indirect access is required for references to imported symbols
+-- from position independent code. It is also required from the main program
+-- when dynamic libraries containing Haskell code are used.
+
+howToAccessLabel _ (ArchPPC_64 _) os _ kind _
+ | osElfTarget os
+ = case kind of
+ -- ELF PPC64 (powerpc64-linux), AIX, MacOS 9, BeOS/PPC
+ DataReference -> AccessViaSymbolPtr
+ -- RTLD does not generate stubs for function descriptors
+ -- in tail calls. Create a symbol pointer and generate
+ -- the code to load the function descriptor at the call site.
+ JumpReference -> AccessViaSymbolPtr
+ -- regular calls are handled by the runtime linker
+ _ -> AccessDirectly
+
+howToAccessLabel dflags _ os _ _ _
+ -- no PIC -> the dynamic linker does everything for us;
+ -- if we don't dynamically link to Haskell code,
+ -- it actually manages to do so without messing things up.
+ | osElfTarget os
+ , not (positionIndependent dflags) &&
+ not (gopt Opt_ExternalDynamicRefs dflags)
+ = AccessDirectly
+
+howToAccessLabel dflags arch os this_mod DataReference lbl
+ | osElfTarget os
+ = case () of
+ -- A dynamic label needs to be accessed via a symbol pointer.
+ _ | labelDynamic dflags this_mod lbl
+ -> AccessViaSymbolPtr
+
+ -- For PowerPC32 -fPIC, we have to access even static data
+ -- via a symbol pointer (see below for an explanation why
+ -- PowerPC32 Linux is especially broken).
+ | arch == ArchPPC
+ , positionIndependent dflags
+ -> AccessViaSymbolPtr
+
+ | otherwise
+ -> AccessDirectly
+
+
+ -- In most cases, we have to avoid symbol stubs on ELF, for the following reasons:
+ -- on i386, the position-independent symbol stubs in the Procedure Linkage Table
+ -- require the address of the GOT to be loaded into register %ebx on entry.
+ -- The linker will take any reference to the symbol stub as a hint that
+ -- the label in question is a code label. When linking executables, this
+ -- will cause the linker to replace even data references to the label with
+ -- references to the symbol stub.
+
+ -- This leaves calling a (foreign) function from non-PIC code
+ -- (AccessDirectly, because we get an implicit symbol stub)
+ -- and calling functions from PIC code on non-i386 platforms (via a symbol stub)
+
+howToAccessLabel dflags arch os this_mod CallReference lbl
+ | osElfTarget os
+ , labelDynamic dflags this_mod lbl && not (positionIndependent dflags)
+ = AccessDirectly
+
+ | osElfTarget os
+ , arch /= ArchX86
+ , labelDynamic dflags this_mod lbl
+ , positionIndependent dflags
+ = AccessViaStub
+
+howToAccessLabel dflags _ os this_mod _ lbl
+ | osElfTarget os
+ = if labelDynamic dflags this_mod lbl
+ then AccessViaSymbolPtr
+ else AccessDirectly
+
+-- all other platforms
+howToAccessLabel dflags _ _ _ _ _
+ | not (positionIndependent dflags)
+ = AccessDirectly
+
+ | otherwise
+ = panic "howToAccessLabel: PIC not defined for this platform"
+
+
+
+-- -------------------------------------------------------------------
+-- | Says what we have to add to our 'PIC base register' in order to
+-- get the address of a label.
+
+picRelative :: DynFlags -> Arch -> OS -> CLabel -> CmmLit
+
+-- Darwin, but not x86_64:
+-- The PIC base register points to the PIC base label at the beginning
+-- of the current CmmDecl. We just have to use a label difference to
+-- get the offset.
+-- We have already made sure that all labels that are not from the current
+-- module are accessed indirectly ('as' can't calculate differences between
+-- undefined labels).
+picRelative dflags arch OSDarwin lbl
+ | arch /= ArchX86_64
+ = CmmLabelDiffOff lbl mkPicBaseLabel 0 (wordWidth dflags)
+
+-- On AIX we use an indirect local TOC anchored by 'gotLabel'.
+-- This way we use up only one global TOC entry per compilation-unit
+-- (this is quite similar to GCC's @-mminimal-toc@ compilation mode)
+picRelative dflags _ OSAIX lbl
+ = CmmLabelDiffOff lbl gotLabel 0 (wordWidth dflags)
+
+-- PowerPC Linux:
+-- The PIC base register points to our fake GOT. Use a label difference
+-- to get the offset.
+-- We have made sure that *everything* is accessed indirectly, so this
+-- is only used for offsets from the GOT to symbol pointers inside the
+-- GOT.
+picRelative dflags ArchPPC os lbl
+ | osElfTarget os
+ = CmmLabelDiffOff lbl gotLabel 0 (wordWidth dflags)
+
+
+-- Most Linux versions:
+-- The PIC base register points to the GOT. Use foo@got for symbol
+-- pointers, and foo@gotoff for everything else.
+-- Linux and Darwin on x86_64:
+-- The PIC base register is %rip, we use foo@gotpcrel for symbol pointers,
+-- and a GotSymbolOffset label for other things.
+-- For reasons of tradition, the symbol offset label is written as a plain label.
+picRelative _ arch os lbl
+ | osElfTarget os || (os == OSDarwin && arch == ArchX86_64)
+ = let result
+ | Just (SymbolPtr, lbl') <- dynamicLinkerLabelInfo lbl
+ = CmmLabel $ mkDynamicLinkerLabel GotSymbolPtr lbl'
+
+ | otherwise
+ = CmmLabel $ mkDynamicLinkerLabel GotSymbolOffset lbl
+
+ in result
+
+picRelative _ _ _ _
+ = panic "PositionIndependentCode.picRelative undefined for this platform"
+
+
+
+--------------------------------------------------------------------------------
+
+needImportedSymbols :: DynFlags -> Arch -> OS -> Bool
+needImportedSymbols dflags arch os
+ | os == OSDarwin
+ , arch /= ArchX86_64
+ = True
+
+ | os == OSAIX
+ = True
+
+ -- PowerPC Linux: -fPIC or -dynamic
+ | osElfTarget os
+ , arch == ArchPPC
+ = positionIndependent dflags || gopt Opt_ExternalDynamicRefs dflags
+
+ -- PowerPC 64 Linux: always
+ | osElfTarget os
+ , arch == ArchPPC_64 ELF_V1 || arch == ArchPPC_64 ELF_V2
+ = True
+
+ -- i386 (and others?): -dynamic but not -fPIC
+ | osElfTarget os
+ , arch /= ArchPPC_64 ELF_V1 && arch /= ArchPPC_64 ELF_V2
+ = gopt Opt_ExternalDynamicRefs dflags &&
+ not (positionIndependent dflags)
+
+ | otherwise
+ = False
+
+-- gotLabel
+-- The label used to refer to our "fake GOT" from
+-- position-independent code.
+gotLabel :: CLabel
+gotLabel
+ -- HACK: this label isn't really foreign
+ = mkForeignLabel
+ (fsLit ".LCTOC1")
+ Nothing ForeignLabelInThisPackage IsData
+
+
+
+--------------------------------------------------------------------------------
+-- We don't need to declare any offset tables.
+-- However, for PIC on x86, we need a small helper function.
+pprGotDeclaration :: DynFlags -> Arch -> OS -> SDoc
+pprGotDeclaration dflags ArchX86 OSDarwin
+ | positionIndependent dflags
+ = vcat [
+ text ".section __TEXT,__textcoal_nt,coalesced,no_toc",
+ text ".weak_definition ___i686.get_pc_thunk.ax",
+ text ".private_extern ___i686.get_pc_thunk.ax",
+ text "___i686.get_pc_thunk.ax:",
+ text "\tmovl (%esp), %eax",
+ text "\tret" ]
+
+pprGotDeclaration _ _ OSDarwin
+ = empty
+
+-- Emit XCOFF TOC section
+pprGotDeclaration _ _ OSAIX
+ = vcat $ [ text ".toc"
+ , text ".tc ghc_toc_table[TC],.LCTOC1"
+ , text ".csect ghc_toc_table[RW]"
+ -- See Note [.LCTOC1 in PPC PIC code]
+ , text ".set .LCTOC1,$+0x8000"
+ ]
+
+
+-- PPC 64 ELF v1 needs a Table Of Contents (TOC)
+pprGotDeclaration _ (ArchPPC_64 ELF_V1) _
+ = text ".section \".toc\",\"aw\""
+-- In ELF v2 we also need to tell the assembler that we want ABI
+-- version 2. This would normally be done at the top of the file
+-- right after a file directive, but I could not figure out how
+-- to do that.
+pprGotDeclaration _ (ArchPPC_64 ELF_V2) _
+ = vcat [ text ".abiversion 2",
+ text ".section \".toc\",\"aw\""
+ ]
+
+-- Emit GOT declaration
+-- Output whatever needs to be output once per .s file.
+pprGotDeclaration dflags arch os
+ | osElfTarget os
+ , arch /= ArchPPC_64 ELF_V1 && arch /= ArchPPC_64 ELF_V2
+ , not (positionIndependent dflags)
+ = empty
+
+ | osElfTarget os
+ , arch /= ArchPPC_64 ELF_V1 && arch /= ArchPPC_64 ELF_V2
+ = vcat [
+ -- See Note [.LCTOC1 in PPC PIC code]
+ text ".section \".got2\",\"aw\"",
+ text ".LCTOC1 = .+32768" ]
+
+pprGotDeclaration _ _ _
+ = panic "pprGotDeclaration: no match"
+
+
+--------------------------------------------------------------------------------
+-- On Darwin, we have to generate our own stub code for lazy binding.
+-- For each processor architecture, there are two versions, one for PIC
+-- and one for non-PIC.
+--
+
+pprImportedSymbol :: DynFlags -> Platform -> CLabel -> SDoc
+pprImportedSymbol dflags (Platform { platformMini = PlatformMini { platformMini_arch = ArchX86, platformMini_os = OSDarwin } }) importedLbl
+ | Just (CodeStub, lbl) <- dynamicLinkerLabelInfo importedLbl
+ = case positionIndependent dflags of
+ False ->
+ vcat [
+ text ".symbol_stub",
+ text "L" <> pprCLabel dflags lbl <> ptext (sLit "$stub:"),
+ text "\t.indirect_symbol" <+> pprCLabel dflags lbl,
+ text "\tjmp *L" <> pprCLabel dflags lbl
+ <> text "$lazy_ptr",
+ text "L" <> pprCLabel dflags lbl
+ <> text "$stub_binder:",
+ text "\tpushl $L" <> pprCLabel dflags lbl
+ <> text "$lazy_ptr",
+ text "\tjmp dyld_stub_binding_helper"
+ ]
+ True ->
+ vcat [
+ text ".section __TEXT,__picsymbolstub2,"
+ <> text "symbol_stubs,pure_instructions,25",
+ text "L" <> pprCLabel dflags lbl <> ptext (sLit "$stub:"),
+ text "\t.indirect_symbol" <+> pprCLabel dflags lbl,
+ text "\tcall ___i686.get_pc_thunk.ax",
+ text "1:",
+ text "\tmovl L" <> pprCLabel dflags lbl
+ <> text "$lazy_ptr-1b(%eax),%edx",
+ text "\tjmp *%edx",
+ text "L" <> pprCLabel dflags lbl
+ <> text "$stub_binder:",
+ text "\tlea L" <> pprCLabel dflags lbl
+ <> text "$lazy_ptr-1b(%eax),%eax",
+ text "\tpushl %eax",
+ text "\tjmp dyld_stub_binding_helper"
+ ]
+ $+$ vcat [ text ".section __DATA, __la_sym_ptr"
+ <> (if positionIndependent dflags then int 2 else int 3)
+ <> text ",lazy_symbol_pointers",
+ text "L" <> pprCLabel dflags lbl <> ptext (sLit "$lazy_ptr:"),
+ text "\t.indirect_symbol" <+> pprCLabel dflags lbl,
+ text "\t.long L" <> pprCLabel dflags lbl
+ <> text "$stub_binder"]
+
+ | Just (SymbolPtr, lbl) <- dynamicLinkerLabelInfo importedLbl
+ = vcat [
+ text ".non_lazy_symbol_pointer",
+ char 'L' <> pprCLabel dflags lbl <> text "$non_lazy_ptr:",
+ text "\t.indirect_symbol" <+> pprCLabel dflags lbl,
+ text "\t.long\t0"]
+
+ | otherwise
+ = empty
+
+
+pprImportedSymbol _ (Platform { platformMini = PlatformMini { platformMini_os = OSDarwin } }) _
+ = empty
+
+-- XCOFF / AIX
+--
+-- Similar to PPC64 ELF v1, there's a dedicated TOC register (r2). To
+-- work around the limitation of a global TOC we use an indirect TOC
+-- with the label `ghc_toc_table`.
+--
+-- See also GCC's `-mminimal-toc` compilation mode or
+-- http://www.ibm.com/developerworks/rational/library/overview-toc-aix/
+--
+-- NB: No DSO-support yet
+
+pprImportedSymbol dflags (Platform { platformMini = PlatformMini { platformMini_os = OSAIX } }) importedLbl
+ = case dynamicLinkerLabelInfo importedLbl of
+ Just (SymbolPtr, lbl)
+ -> vcat [
+ text "LC.." <> pprCLabel dflags lbl <> char ':',
+ text "\t.long" <+> pprCLabel dflags lbl ]
+ _ -> empty
+
+-- ELF / Linux
+--
+-- In theory, we don't need to generate any stubs or symbol pointers
+-- by hand for Linux.
+--
+-- Reality differs from this in two areas.
+--
+-- 1) If we just use a dynamically imported symbol directly in a read-only
+-- section of the main executable (as GCC does), ld generates R_*_COPY
+-- relocations, which are fundamentally incompatible with reversed info
+-- tables. Therefore, we need a table of imported addresses in a writable
+-- section.
+-- The "official" GOT mechanism (label@got) isn't intended to be used
+-- in position dependent code, so we have to create our own "fake GOT"
+-- when not Opt_PIC && WayDyn `elem` ways dflags.
+--
+-- 2) PowerPC Linux is just plain broken.
+-- While it's theoretically possible to use GOT offsets larger
+-- than 16 bit, the standard crt*.o files don't, which leads to
+-- linker errors as soon as the GOT size exceeds 16 bit.
+-- Also, the assembler doesn't support @gotoff labels.
+-- In order to be able to use a larger GOT, we have to circumvent the
+-- entire GOT mechanism and do it ourselves (this is also what GCC does).
+
+
+-- When needImportedSymbols is defined,
+-- the NCG will keep track of all DynamicLinkerLabels it uses
+-- and output each of them using pprImportedSymbol.
+
+pprImportedSymbol dflags platform@(Platform { platformMini = PlatformMini { platformMini_arch = ArchPPC_64 _ } })
+ importedLbl
+ | osElfTarget (platformOS platform)
+ = case dynamicLinkerLabelInfo importedLbl of
+ Just (SymbolPtr, lbl)
+ -> vcat [
+ text ".section \".toc\", \"aw\"",
+ text ".LC_" <> pprCLabel dflags lbl <> char ':',
+ text "\t.quad" <+> pprCLabel dflags lbl ]
+ _ -> empty
+
+pprImportedSymbol dflags platform importedLbl
+ | osElfTarget (platformOS platform)
+ = case dynamicLinkerLabelInfo importedLbl of
+ Just (SymbolPtr, lbl)
+ -> let symbolSize = case wordWidth dflags of
+ W32 -> sLit "\t.long"
+ W64 -> sLit "\t.quad"
+ _ -> panic "Unknown wordRep in pprImportedSymbol"
+
+ in vcat [
+ text ".section \".got2\", \"aw\"",
+ text ".LC_" <> pprCLabel dflags lbl <> char ':',
+ ptext symbolSize <+> pprCLabel dflags lbl ]
+
+ -- PLT code stubs are generated automatically by the dynamic linker.
+ _ -> empty
+
+pprImportedSymbol _ _ _
+ = panic "PIC.pprImportedSymbol: no match"
+
+--------------------------------------------------------------------------------
+-- Generate code to calculate the address that should be put in the
+-- PIC base register.
+-- This is called by MachCodeGen for every CmmProc that accessed the
+-- PIC base register. It adds the appropriate instructions to the
+-- top of the CmmProc.
+
+-- It is assumed that the first NatCmmDecl in the input list is a Proc
+-- and the rest are CmmDatas.
+
+-- Darwin is simple: just fetch the address of a local label.
+-- The FETCHPC pseudo-instruction is expanded to multiple instructions
+-- during pretty-printing so that we don't have to deal with the
+-- local label:
+
+-- PowerPC version:
+-- bcl 20,31,1f.
+-- 1: mflr picReg
+
+-- i386 version:
+-- call 1f
+-- 1: popl %picReg
+
+
+
+-- Get a pointer to our own fake GOT, which is defined on a per-module basis.
+-- This is exactly how GCC does it on Linux.
+
+initializePicBase_ppc
+ :: Arch -> OS -> Reg
+ -> [NatCmmDecl RawCmmStatics PPC.Instr]
+ -> NatM [NatCmmDecl RawCmmStatics PPC.Instr]
+
+initializePicBase_ppc ArchPPC os picReg
+ (CmmProc info lab live (ListGraph blocks) : statics)
+ | osElfTarget os
+ = do
+ let
+ gotOffset = PPC.ImmConstantDiff
+ (PPC.ImmCLbl gotLabel)
+ (PPC.ImmCLbl mkPicBaseLabel)
+
+ blocks' = case blocks of
+ [] -> []
+ (b:bs) -> fetchPC b : map maybeFetchPC bs
+
+ maybeFetchPC b@(BasicBlock bID _)
+ | bID `mapMember` info = fetchPC b
+ | otherwise = b
+
+ -- GCC does PIC prologs thusly:
+ -- bcl 20,31,.L1
+ -- .L1:
+ -- mflr 30
+ -- addis 30,30,.LCTOC1-.L1@ha
+ -- addi 30,30,.LCTOC1-.L1@l
+ -- TODO: below we use it over a temporary register;
+ -- it can and should be optimised by picking
+ -- the correct PIC reg.
+ fetchPC (BasicBlock bID insns) =
+ BasicBlock bID (PPC.FETCHPC picReg
+ : PPC.ADDIS picReg picReg (PPC.HA gotOffset)
+ : PPC.ADD picReg picReg
+ (PPC.RIImm (PPC.LO gotOffset))
+ : PPC.MR PPC.r30 picReg
+ : insns)
+
+ return (CmmProc info lab live (ListGraph blocks') : statics)
+
+-------------------------------------------------------------------------
+-- Load TOC into register 2
+-- PowerPC 64-bit ELF ABI 2.0 requires the address of the callee
+-- in register 12.
+-- We pass the label to FETCHTOC and create a .localentry too.
+-- TODO: Explain this better and refer to ABI spec!
+{-
+We would like to do approximately this, but spill slot allocation
+might be added before the first BasicBlock. That violates the ABI.
+
+For now we will emit the prologue code in the pretty printer,
+which is also what we do for ELF v1.
+initializePicBase_ppc (ArchPPC_64 ELF_V2) OSLinux picReg
+ (CmmProc info lab live (ListGraph (entry:blocks)) : statics)
+ = do
+ bID <-getUniqueM
+ return (CmmProc info lab live (ListGraph (b':entry:blocks))
+ : statics)
+ where BasicBlock entryID _ = entry
+ b' = BasicBlock bID [PPC.FETCHTOC picReg lab,
+ PPC.BCC PPC.ALWAYS entryID]
+-}
+
+initializePicBase_ppc _ _ _ _
+ = panic "initializePicBase_ppc: not needed"
+
+
+-- We cheat a bit here by defining a pseudo-instruction named FETCHGOT
+-- which pretty-prints as:
+-- call 1f
+-- 1: popl %picReg
+-- addl __GLOBAL_OFFSET_TABLE__+.-1b, %picReg
+-- (See PprMach.hs)
+
+initializePicBase_x86
+ :: Arch -> OS -> Reg
+ -> [NatCmmDecl (Alignment, RawCmmStatics) X86.Instr]
+ -> NatM [NatCmmDecl (Alignment, RawCmmStatics) X86.Instr]
+
+initializePicBase_x86 ArchX86 os picReg
+ (CmmProc info lab live (ListGraph blocks) : statics)
+ | osElfTarget os
+ = return (CmmProc info lab live (ListGraph blocks') : statics)
+ where blocks' = case blocks of
+ [] -> []
+ (b:bs) -> fetchGOT b : map maybeFetchGOT bs
+
+ -- we want to add a FETCHGOT instruction to the beginning of
+ -- every block that is an entry point, which corresponds to
+ -- the blocks that have entries in the info-table mapping.
+ maybeFetchGOT b@(BasicBlock bID _)
+ | bID `mapMember` info = fetchGOT b
+ | otherwise = b
+
+ fetchGOT (BasicBlock bID insns) =
+ BasicBlock bID (X86.FETCHGOT picReg : insns)
+
+initializePicBase_x86 ArchX86 OSDarwin picReg
+ (CmmProc info lab live (ListGraph (entry:blocks)) : statics)
+ = return (CmmProc info lab live (ListGraph (block':blocks)) : statics)
+
+ where BasicBlock bID insns = entry
+ block' = BasicBlock bID (X86.FETCHPC picReg : insns)
+
+initializePicBase_x86 _ _ _ _
+ = panic "initializePicBase_x86: not needed"
+
diff --git a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
new file mode 100644
index 0000000000..02319171dc
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
@@ -0,0 +1,2455 @@
+{-# LANGUAGE CPP, GADTs #-}
+
+-----------------------------------------------------------------------------
+--
+-- Generating machine code (instruction selection)
+--
+-- (c) The University of Glasgow 1996-2004
+--
+-----------------------------------------------------------------------------
+
+-- This is a big module, but, if you pay attention to
+-- (a) the sectioning, and (b) the type signatures,
+-- the structure should not be too overwhelming.
+
+module GHC.CmmToAsm.PPC.CodeGen (
+ cmmTopCodeGen,
+ generateJumpTableForInstr,
+ InstrBlock
+)
+
+where
+
+#include "HsVersions.h"
+
+-- NCG stuff:
+import GhcPrelude
+
+import GHC.Platform.Regs
+import GHC.CmmToAsm.PPC.Instr
+import GHC.CmmToAsm.PPC.Cond
+import GHC.CmmToAsm.PPC.Regs
+import GHC.CmmToAsm.CPrim
+import GHC.CmmToAsm.Monad
+ ( NatM, getNewRegNat, getNewLabelNat
+ , getBlockIdNat, getPicBaseNat, getNewRegPairNat
+ , getPicBaseMaybeNat
+ )
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.PIC
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Reg.Target
+import GHC.Platform
+
+-- Our intermediate code:
+import GHC.Cmm.BlockId
+import GHC.Cmm.Ppr ( pprExpr )
+import GHC.Cmm
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm.CLabel
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Graph
+
+-- The rest:
+import OrdList
+import Outputable
+import GHC.Driver.Session
+
+import Control.Monad ( mapAndUnzipM, when )
+import Data.Bits
+import Data.Word
+
+import BasicTypes
+import FastString
+import Util
+
+-- -----------------------------------------------------------------------------
+-- Top-level of the instruction selector
+
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+-- They are really trees of insns to facilitate fast appending, where a
+-- left-to-right traversal (pre-order?) yields the insns in the correct
+-- order.
+
+cmmTopCodeGen
+ :: RawCmmDecl
+ -> NatM [NatCmmDecl RawCmmStatics Instr]
+
+cmmTopCodeGen (CmmProc info lab live graph) = do
+ let blocks = toBlockListEntryFirst graph
+ (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
+ dflags <- getDynFlags
+ let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
+ tops = proc : concat statics
+ os = platformOS $ targetPlatform dflags
+ arch = platformArch $ targetPlatform dflags
+ case arch of
+ ArchPPC | os == OSAIX -> return tops
+ | otherwise -> do
+ picBaseMb <- getPicBaseMaybeNat
+ case picBaseMb of
+ Just picBase -> initializePicBase_ppc arch os picBase tops
+ Nothing -> return tops
+ ArchPPC_64 ELF_V1 -> fixup_entry tops
+ -- generating function descriptor is handled in
+ -- pretty printer
+ ArchPPC_64 ELF_V2 -> fixup_entry tops
+ -- generating function prologue is handled in
+ -- pretty printer
+ _ -> panic "PPC.cmmTopCodeGen: unknown arch"
+ where
+ fixup_entry (CmmProc info lab live (ListGraph (entry:blocks)) : statics)
+ = do
+ let BasicBlock bID insns = entry
+ bID' <- if lab == (blockLbl bID)
+ then newBlockId
+ else return bID
+ let b' = BasicBlock bID' insns
+ return (CmmProc info lab live (ListGraph (b':blocks)) : statics)
+    fixup_entry _ = panic "cmmTopCodeGen: Broken CmmProc"
+
+cmmTopCodeGen (CmmData sec dat) = do
+ return [CmmData sec dat] -- no translation, we just use CmmStatic
+
+basicBlockCodeGen
+ :: Block CmmNode C C
+ -> NatM ( [NatBasicBlock Instr]
+ , [NatCmmDecl RawCmmStatics Instr])
+
+basicBlockCodeGen block = do
+ let (_, nodes, tail) = blockSplit block
+ id = entryLabel block
+ stmts = blockToList nodes
+ mid_instrs <- stmtsToInstrs stmts
+ tail_instrs <- stmtToInstrs tail
+ let instrs = mid_instrs `appOL` tail_instrs
+ -- code generation may introduce new basic block boundaries, which
+ -- are indicated by the NEWBLOCK instruction. We must split up the
+ -- instruction stream into basic blocks again. Also, we extract
+ -- LDATAs here too.
+ let
+ (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs
+
+ mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
+ = ([], BasicBlock id instrs : blocks, statics)
+ mkBlocks (LDATA sec dat) (instrs,blocks,statics)
+ = (instrs, blocks, CmmData sec dat:statics)
+ mkBlocks instr (instrs,blocks,statics)
+ = (instr:instrs, blocks, statics)
+ return (BasicBlock id top : other_blocks, statics)
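+
+-- For illustration only (not part of the generator): a stream such as
+--   [i1, NEWBLOCK b2, i2, LDATA sec d, i3]
+-- folds (right to left) into top = [i1], other_blocks = [BasicBlock b2 [i2,i3]]
+-- and statics = [CmmData sec d]: the entry block keeps the instructions that
+-- precede the first NEWBLOCK, and each NEWBLOCK opens a fresh block.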
+
+stmtsToInstrs :: [CmmNode e x] -> NatM InstrBlock
+stmtsToInstrs stmts
+ = do instrss <- mapM stmtToInstrs stmts
+ return (concatOL instrss)
+
+stmtToInstrs :: CmmNode e x -> NatM InstrBlock
+stmtToInstrs stmt = do
+ dflags <- getDynFlags
+ case stmt of
+ CmmComment s -> return (unitOL (COMMENT s))
+ CmmTick {} -> return nilOL
+ CmmUnwind {} -> return nilOL
+
+ CmmAssign reg src
+ | isFloatType ty -> assignReg_FltCode format reg src
+ | target32Bit (targetPlatform dflags) &&
+ isWord64 ty -> assignReg_I64Code reg src
+ | otherwise -> assignReg_IntCode format reg src
+ where ty = cmmRegType dflags reg
+ format = cmmTypeFormat ty
+
+ CmmStore addr src
+ | isFloatType ty -> assignMem_FltCode format addr src
+ | target32Bit (targetPlatform dflags) &&
+ isWord64 ty -> assignMem_I64Code addr src
+ | otherwise -> assignMem_IntCode format addr src
+ where ty = cmmExprType dflags src
+ format = cmmTypeFormat ty
+
+ CmmUnsafeForeignCall target result_regs args
+ -> genCCall target result_regs args
+
+ CmmBranch id -> genBranch id
+ CmmCondBranch arg true false prediction -> do
+ b1 <- genCondJump true arg prediction
+ b2 <- genBranch false
+ return (b1 `appOL` b2)
+ CmmSwitch arg ids -> do dflags <- getDynFlags
+ genSwitch dflags arg ids
+ CmmCall { cml_target = arg
+ , cml_args_regs = gregs } -> do
+ dflags <- getDynFlags
+ genJump arg (jumpRegs dflags gregs)
+ _ ->
+ panic "stmtToInstrs: statement should have been cps'd away"
+
+jumpRegs :: DynFlags -> [GlobalReg] -> [Reg]
+jumpRegs dflags gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ]
+ where platform = targetPlatform dflags
+
+--------------------------------------------------------------------------------
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+-- They are really trees of insns to facilitate fast appending, where a
+-- left-to-right traversal yields the insns in the correct order.
+--
+type InstrBlock
+ = OrdList Instr
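+
+-- A small example of how 'InstrBlock's combine (illustrative only):
+--
+--   unitOL i1 `appOL` toOL [i2, i3] `snocOL` i4
+--
+-- builds a tree whose left-to-right flattening is i1, i2, i3, i4; appending is
+-- O(1), and the list is only flattened when it is finally traversed.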
+
+
+-- | Registers passed up the tree. If the stix code forces the register
+-- to live in a pre-decided machine register, it comes out as @Fixed@;
+-- otherwise, it comes out as @Any@, and the parent can decide which
+-- register to put it in.
+--
+data Register
+ = Fixed Format Reg InstrBlock
+ | Any Format (Reg -> InstrBlock)
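+
+-- For illustration (a sketch only; useRegister is a hypothetical helper, not
+-- part of this module): a parent that has already chosen a destination
+-- register @dst@ consumes either variant roughly like this:
+--
+--   useRegister dst (Fixed _ reg code) = code `snocOL` MR dst reg
+--   useRegister dst (Any _ codefn)     = codefn dst
+--
+-- i.e. an @Any@ result is generated straight into @dst@, while a @Fixed@
+-- result needs an extra register-to-register move (compare assignReg_IntCode
+-- below).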
+
+
+swizzleRegisterRep :: Register -> Format -> Register
+swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
+swizzleRegisterRep (Any _ codefn) format = Any format codefn
+
+
+-- | Grab the Reg for a CmmReg
+getRegisterReg :: Platform -> CmmReg -> Reg
+
+getRegisterReg _ (CmmLocal (LocalReg u pk))
+ = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk)
+
+getRegisterReg platform (CmmGlobal mid)
+ = case globalRegMaybe platform mid of
+ Just reg -> RegReal reg
+ Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
+ -- By this stage, the only MagicIds remaining should be the
+ -- ones which map to a real machine register on this
+ -- platform. Hence ...
+
+-- | Convert a BlockId to some CmmStatic data
+jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
+jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
+ where blockLabel = blockLbl blockid
+
+
+
+-- -----------------------------------------------------------------------------
+-- General things for putting together code sequences
+
+-- Expand CmmRegOff. ToDo: should we do it this way around, or convert
+-- CmmExprs into CmmRegOff?
+mangleIndexTree :: DynFlags -> CmmExpr -> CmmExpr
+mangleIndexTree dflags (CmmRegOff reg off)
+ = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
+ where width = typeWidth (cmmRegType dflags reg)
+
+mangleIndexTree _ _
+ = panic "PPC.CodeGen.mangleIndexTree: no match"
+
+-- -----------------------------------------------------------------------------
+-- Code gen for 64-bit arithmetic on 32-bit platforms
+
+{-
+Simple support for generating 64-bit code (ie, 64 bit values and 64
+bit assignments) on 32-bit platforms. Unlike the main code generator
+we merely shoot for generating working code as simply as possible, and
+pay little attention to code quality. Specifically, there is no
+attempt to deal cleverly with the fixed-vs-floating register
+distinction; all values are generated into (pairs of) floating
+registers, even if this would mean some redundant reg-reg moves as a
+result. Only one of the VRegUniques is returned, since it will be
+of the VRegUniqueLo form, and the upper-half VReg can be determined
+by applying getHiVRegFromLo to it.
+-}
+
+data ChildCode64 -- a.k.a "Register64"
+ = ChildCode64
+ InstrBlock -- code
+ Reg -- the lower 32-bit temporary which contains the
+ -- result; use getHiVRegFromLo to find the other
+ -- VRegUnique. Rules of this simplified insn
+ -- selection game are therefore that the returned
+ -- Reg may be modified
+
+
+-- | Compute an expression into a register, but
+-- we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getSomeReg expr = do
+ r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed _ reg code ->
+ return (reg, code)
+
+getI64Amodes :: CmmExpr -> NatM (AddrMode, AddrMode, InstrBlock)
+getI64Amodes addrTree = do
+ Amode hi_addr addr_code <- getAmode D addrTree
+ case addrOffset hi_addr 4 of
+ Just lo_addr -> return (hi_addr, lo_addr, addr_code)
+ Nothing -> do (hi_ptr, code) <- getSomeReg addrTree
+ return (AddrRegImm hi_ptr (ImmInt 0),
+ AddrRegImm hi_ptr (ImmInt 4),
+ code)
+
+
+assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
+assignMem_I64Code addrTree valueTree = do
+ (hi_addr, lo_addr, addr_code) <- getI64Amodes addrTree
+ ChildCode64 vcode rlo <- iselExpr64 valueTree
+ let
+ rhi = getHiVRegFromLo rlo
+
+ -- Big-endian store
+ mov_hi = ST II32 rhi hi_addr
+ mov_lo = ST II32 rlo lo_addr
+ return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
+
+
+assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock
+assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do
+ ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
+ let
+ r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
+ r_dst_hi = getHiVRegFromLo r_dst_lo
+ r_src_hi = getHiVRegFromLo r_src_lo
+ mov_lo = MR r_dst_lo r_src_lo
+ mov_hi = MR r_dst_hi r_src_hi
+ return (
+ vcode `snocOL` mov_lo `snocOL` mov_hi
+ )
+
+assignReg_I64Code _ _
+ = panic "assignReg_I64Code(powerpc): invalid lvalue"
+
+
+iselExpr64 :: CmmExpr -> NatM ChildCode64
+iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
+ (hi_addr, lo_addr, addr_code) <- getI64Amodes addrTree
+ (rlo, rhi) <- getNewRegPairNat II32
+ let mov_hi = LD II32 rhi hi_addr
+ mov_lo = LD II32 rlo lo_addr
+ return $ ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
+ rlo
+
+iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
+ = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))
+
+iselExpr64 (CmmLit (CmmInt i _)) = do
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ half0 = fromIntegral (fromIntegral i :: Word16)
+ half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
+ half2 = fromIntegral (fromIntegral (i `shiftR` 32) :: Word16)
+ half3 = fromIntegral (fromIntegral (i `shiftR` 48) :: Word16)
+
+ code = toOL [
+ LIS rlo (ImmInt half1),
+ OR rlo rlo (RIImm $ ImmInt half0),
+ LIS rhi (ImmInt half3),
+ OR rhi rhi (RIImm $ ImmInt half2)
+ ]
+ return (ChildCode64 code rlo)
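+-- Worked example (illustrative): for the literal 0x0123456789ABCDEF the four
+-- halves are half3 = 0x0123, half2 = 0x4567, half1 = 0x89AB, half0 = 0xCDEF,
+-- so rlo ends up holding 0x89ABCDEF and rhi 0x01234567.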
+
+iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ ChildCode64 code2 r2lo <- iselExpr64 e2
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r1hi = getHiVRegFromLo r1lo
+ r2hi = getHiVRegFromLo r2lo
+ code = code1 `appOL`
+ code2 `appOL`
+ toOL [ ADDC rlo r1lo r2lo,
+ ADDE rhi r1hi r2hi ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ ChildCode64 code2 r2lo <- iselExpr64 e2
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r1hi = getHiVRegFromLo r1lo
+ r2hi = getHiVRegFromLo r2lo
+ code = code1 `appOL`
+ code2 `appOL`
+ toOL [ SUBFC rlo r2lo (RIReg r1lo),
+ SUBFE rhi r2hi r1hi ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_UU_Conv W32 W64) [expr]) = do
+ (expr_reg,expr_code) <- getSomeReg expr
+ (rlo, rhi) <- getNewRegPairNat II32
+ let mov_hi = LI rhi (ImmInt 0)
+ mov_lo = MR rlo expr_reg
+ return $ ChildCode64 (expr_code `snocOL` mov_lo `snocOL` mov_hi)
+ rlo
+
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
+ (expr_reg,expr_code) <- getSomeReg expr
+ (rlo, rhi) <- getNewRegPairNat II32
+ let mov_hi = SRA II32 rhi expr_reg (RIImm (ImmInt 31))
+ mov_lo = MR rlo expr_reg
+ return $ ChildCode64 (expr_code `snocOL` mov_lo `snocOL` mov_hi)
+ rlo
+iselExpr64 expr
+ = pprPanic "iselExpr64(powerpc)" (pprExpr expr)
+
+
+
+getRegister :: CmmExpr -> NatM Register
+getRegister e = do dflags <- getDynFlags
+ getRegister' dflags e
+
+getRegister' :: DynFlags -> CmmExpr -> NatM Register
+
+getRegister' dflags (CmmReg (CmmGlobal PicBaseReg))
+ | OSAIX <- platformOS (targetPlatform dflags) = do
+ let code dst = toOL [ LD II32 dst tocAddr ]
+ tocAddr = AddrRegImm toc (ImmLit (text "ghc_toc_table[TC]"))
+ return (Any II32 code)
+ | target32Bit (targetPlatform dflags) = do
+ reg <- getPicBaseNat $ archWordFormat (target32Bit (targetPlatform dflags))
+ return (Fixed (archWordFormat (target32Bit (targetPlatform dflags)))
+ reg nilOL)
+ | otherwise = return (Fixed II64 toc nilOL)
+
+getRegister' dflags (CmmReg reg)
+ = return (Fixed (cmmTypeFormat (cmmRegType dflags reg))
+ (getRegisterReg (targetPlatform dflags) reg) nilOL)
+
+getRegister' dflags tree@(CmmRegOff _ _)
+ = getRegister' dflags (mangleIndexTree dflags tree)
+
+ -- for 32-bit architectures, support some 64 -> 32 bit conversions:
+ -- TO_W_(x), TO_W_(x >> 32)
+
+getRegister' dflags (CmmMachOp (MO_UU_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
+ | target32Bit (targetPlatform dflags) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister' dflags (CmmMachOp (MO_SS_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
+ | target32Bit (targetPlatform dflags) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister' dflags (CmmMachOp (MO_UU_Conv W64 W32) [x])
+ | target32Bit (targetPlatform dflags) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister' dflags (CmmMachOp (MO_SS_Conv W64 W32) [x])
+ | target32Bit (targetPlatform dflags) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister' dflags (CmmLoad mem pk)
+ | not (isWord64 pk) = do
+ let platform = targetPlatform dflags
+ Amode addr addr_code <- getAmode D mem
+ let code dst = ASSERT((targetClassOfReg platform dst == RcDouble) == isFloatType pk)
+ addr_code `snocOL` LD format dst addr
+ return (Any format code)
+ | not (target32Bit (targetPlatform dflags)) = do
+ Amode addr addr_code <- getAmode DS mem
+ let code dst = addr_code `snocOL` LD II64 dst addr
+ return (Any II64 code)
+
+ where format = cmmTypeFormat pk
+
+-- catch simple cases of zero- or sign-extended load
+getRegister' _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II32 (\dst -> addr_code `snocOL` LD II8 dst addr))
+
+getRegister' _ (CmmMachOp (MO_XX_Conv W8 W32) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II32 (\dst -> addr_code `snocOL` LD II8 dst addr))
+
+getRegister' _ (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II64 (\dst -> addr_code `snocOL` LD II8 dst addr))
+
+getRegister' _ (CmmMachOp (MO_XX_Conv W8 W64) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II64 (\dst -> addr_code `snocOL` LD II8 dst addr))
+
+-- Note: there is no Load Byte Arithmetic instruction, so no signed case here
+
+getRegister' _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II32 (\dst -> addr_code `snocOL` LD II16 dst addr))
+
+getRegister' _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II32 (\dst -> addr_code `snocOL` LA II16 dst addr))
+
+getRegister' _ (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II64 (\dst -> addr_code `snocOL` LD II16 dst addr))
+
+getRegister' _ (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II64 (\dst -> addr_code `snocOL` LA II16 dst addr))
+
+getRegister' _ (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II64 (\dst -> addr_code `snocOL` LD II32 dst addr))
+
+getRegister' _ (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad mem _]) = do
+ -- lwa is DS-form. See Note [Power instruction format]
+ Amode addr addr_code <- getAmode DS mem
+ return (Any II64 (\dst -> addr_code `snocOL` LA II32 dst addr))
+
+getRegister' dflags (CmmMachOp mop [x]) -- unary MachOps
+ = case mop of
+ MO_Not rep -> triv_ucode_int rep NOT
+
+ MO_F_Neg w -> triv_ucode_float w FNEG
+ MO_S_Neg w -> triv_ucode_int w NEG
+
+ MO_FF_Conv W64 W32 -> trivialUCode FF32 FRSP x
+ MO_FF_Conv W32 W64 -> conversionNop FF64 x
+
+ MO_FS_Conv from to -> coerceFP2Int from to x
+ MO_SF_Conv from to -> coerceInt2FP from to x
+
+ MO_SS_Conv from to
+ | from >= to -> conversionNop (intFormat to) x
+ | otherwise -> triv_ucode_int to (EXTS (intFormat from))
+
+ MO_UU_Conv from to
+ | from >= to -> conversionNop (intFormat to) x
+ | otherwise -> clearLeft from to
+
+ MO_XX_Conv _ to -> conversionNop (intFormat to) x
+
+ _ -> panic "PPC.CodeGen.getRegister: no match"
+
+ where
+ triv_ucode_int width instr = trivialUCode (intFormat width) instr x
+ triv_ucode_float width instr = trivialUCode (floatFormat width) instr x
+
+ conversionNop new_format expr
+ = do e_code <- getRegister' dflags expr
+ return (swizzleRegisterRep e_code new_format)
+
+ clearLeft from to
+ = do (src1, code1) <- getSomeReg x
+ let arch_fmt = intFormat (wordWidth dflags)
+ arch_bits = widthInBits (wordWidth dflags)
+ size = widthInBits from
+ code dst = code1 `snocOL`
+ CLRLI arch_fmt dst src1 (arch_bits - size)
+ return (Any (intFormat to) code)
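+      -- For example (illustrative): zero-extending a W8 value on a 32-bit
+      -- target clears the leftmost 32 - 8 = 24 bits, keeping only the low byte.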
+
+getRegister' _ (CmmMachOp mop [x, y]) -- dyadic PrimOps
+ = case mop of
+ MO_F_Eq _ -> condFltReg EQQ x y
+ MO_F_Ne _ -> condFltReg NE x y
+ MO_F_Gt _ -> condFltReg GTT x y
+ MO_F_Ge _ -> condFltReg GE x y
+ MO_F_Lt _ -> condFltReg LTT x y
+ MO_F_Le _ -> condFltReg LE x y
+
+ MO_Eq rep -> condIntReg EQQ rep x y
+ MO_Ne rep -> condIntReg NE rep x y
+
+ MO_S_Gt rep -> condIntReg GTT rep x y
+ MO_S_Ge rep -> condIntReg GE rep x y
+ MO_S_Lt rep -> condIntReg LTT rep x y
+ MO_S_Le rep -> condIntReg LE rep x y
+
+ MO_U_Gt rep -> condIntReg GU rep x y
+ MO_U_Ge rep -> condIntReg GEU rep x y
+ MO_U_Lt rep -> condIntReg LU rep x y
+ MO_U_Le rep -> condIntReg LEU rep x y
+
+ MO_F_Add w -> triv_float w FADD
+ MO_F_Sub w -> triv_float w FSUB
+ MO_F_Mul w -> triv_float w FMUL
+ MO_F_Quot w -> triv_float w FDIV
+
+ -- optimize addition with 32-bit immediate
+ -- (needed for PIC)
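+        -- The immediate is rebuilt from its @ha/@l halves; for example
+        -- (illustrative) an offset 0x1234ABCD has LO = 0xABCD, which is
+        -- negative as a signed 16-bit value (-0x5433), and HA = 0x1235,
+        -- so 0x12350000 + (-0x5433) = 0x1234ABCD, which is what the
+        -- ADDIS (HA imm) / ADD (LO imm) pair below computes.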
+ MO_Add W32 ->
+ case y of
+ CmmLit (CmmInt imm immrep) | Just _ <- makeImmediate W32 True imm
+ -> trivialCode W32 True ADD x (CmmLit $ CmmInt imm immrep)
+ CmmLit lit
+ -> do
+ (src, srcCode) <- getSomeReg x
+ let imm = litToImm lit
+ code dst = srcCode `appOL` toOL [
+ ADDIS dst src (HA imm),
+ ADD dst dst (RIImm (LO imm))
+ ]
+ return (Any II32 code)
+ _ -> trivialCode W32 True ADD x y
+
+ MO_Add rep -> trivialCode rep True ADD x y
+ MO_Sub rep ->
+ case y of
+ CmmLit (CmmInt imm immrep) | Just _ <- makeImmediate rep True (-imm)
+ -> trivialCode rep True ADD x (CmmLit $ CmmInt (-imm) immrep)
+ _ -> case x of
+ CmmLit (CmmInt imm _)
+ | Just _ <- makeImmediate rep True imm
+ -- subfi ('subtract from' with immediate) doesn't exist
+ -> trivialCode rep True SUBFC y x
+ _ -> trivialCodeNoImm' (intFormat rep) SUBF y x
+
+ MO_Mul rep -> shiftMulCode rep True MULL x y
+ MO_S_MulMayOflo rep -> do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ format = intFormat rep
+ code dst = code1 `appOL` code2
+ `appOL` toOL [ MULLO format dst src1 src2
+ , MFOV format dst
+ ]
+ return (Any format code)
+
+ MO_S_Quot rep -> divCode rep True x y
+ MO_U_Quot rep -> divCode rep False x y
+
+ MO_S_Rem rep -> remainder rep True x y
+ MO_U_Rem rep -> remainder rep False x y
+
+ MO_And rep -> case y of
+ (CmmLit (CmmInt imm _)) | imm == -8 || imm == -4
+ -> do
+ (src, srcCode) <- getSomeReg x
+ let clear_mask = if imm == -4 then 2 else 3
+ fmt = intFormat rep
+ code dst = srcCode
+ `appOL` unitOL (CLRRI fmt dst src clear_mask)
+ return (Any fmt code)
+ _ -> trivialCode rep False AND x y
+ MO_Or rep -> trivialCode rep False OR x y
+ MO_Xor rep -> trivialCode rep False XOR x y
+
+ MO_Shl rep -> shiftMulCode rep False SL x y
+ MO_S_Shr rep -> srCode rep True SRA x y
+ MO_U_Shr rep -> srCode rep False SR x y
+ _ -> panic "PPC.CodeGen.getRegister: no match"
+
+ where
+ triv_float :: Width -> (Format -> Reg -> Reg -> Reg -> Instr) -> NatM Register
+ triv_float width instr = trivialCodeNoImm (floatFormat width) instr x y
+
+ remainder :: Width -> Bool -> CmmExpr -> CmmExpr -> NatM Register
+ remainder rep sgn x y = do
+ let fmt = intFormat rep
+ tmp <- getNewRegNat fmt
+ code <- remainderCode rep sgn tmp x y
+ return (Any fmt code)
+
+
+getRegister' _ (CmmLit (CmmInt i rep))
+ | Just imm <- makeImmediate rep True i
+ = let
+ code dst = unitOL (LI dst imm)
+ in
+ return (Any (intFormat rep) code)
+
+getRegister' _ (CmmLit (CmmFloat f frep)) = do
+ lbl <- getNewLabelNat
+ dflags <- getDynFlags
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ Amode addr addr_code <- getAmode D dynRef
+ let format = floatFormat frep
+ code dst =
+ LDATA (Section ReadOnlyData lbl)
+ (RawCmmStatics lbl [CmmStaticLit (CmmFloat f frep)])
+ `consOL` (addr_code `snocOL` LD format dst addr)
+ return (Any format code)
+
+getRegister' dflags (CmmLit lit)
+ | target32Bit (targetPlatform dflags)
+ = let rep = cmmLitType dflags lit
+ imm = litToImm lit
+ code dst = toOL [
+ LIS dst (HA imm),
+ ADD dst dst (RIImm (LO imm))
+ ]
+ in return (Any (cmmTypeFormat rep) code)
+ | otherwise
+ = do lbl <- getNewLabelNat
+ dflags <- getDynFlags
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ Amode addr addr_code <- getAmode D dynRef
+ let rep = cmmLitType dflags lit
+ format = cmmTypeFormat rep
+ code dst =
+ LDATA (Section ReadOnlyData lbl) (RawCmmStatics lbl [CmmStaticLit lit])
+ `consOL` (addr_code `snocOL` LD format dst addr)
+ return (Any format code)
+
+getRegister' _ other = pprPanic "getRegister(ppc)" (pprExpr other)
+
+ -- extend[SU]Expr: wrap an integer expression of width `from`
+ -- in a conversion to width `to`
+extendSExpr :: Width -> Width -> CmmExpr -> CmmExpr
+extendSExpr from to x = CmmMachOp (MO_SS_Conv from to) [x]
+
+extendUExpr :: Width -> Width -> CmmExpr -> CmmExpr
+extendUExpr from to x = CmmMachOp (MO_UU_Conv from to) [x]
+
+-- -----------------------------------------------------------------------------
+-- The 'Amode' type: Memory addressing modes passed up the tree.
+
+data Amode
+ = Amode AddrMode InstrBlock
+
+{-
+Now, given a tree (the argument to a CmmLoad) that references memory,
+produce a suitable addressing mode.
+
+A Rule of the Game (tm) for Amodes: use of the addr bit must
+immediately follow use of the code part, since the code part puts
+values in registers which the addr then refers to. So you can't put
+anything in between, lest it overwrite some of those registers. If
+you need to do some other computation between the code part and use of
+the addr bit, first store the effective address from the amode in a
+temporary, then do the other computation, and then use the temporary:
+
+ code
+ LEA amode, tmp
+ ... other computation ...
+ ... (tmp) ...
+-}
+
+{- Note [Power instruction format]
+In some instructions the 16 bit offset must be a multiple of 4, i.e.
+the two least significant bits must be zero. The "Power ISA" specification
+calls these instruction formats "DS-FORM" and the instructions with
+arbitrary 16 bit offsets are "D-FORM".
+
+The Power ISA specification document can be obtained from www.power.org.
+-}
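+
+-- For example (illustrative): @ld@ (64-bit load) and @lwa@ are DS-form, so
+-- their displacement must be a multiple of 4, whereas @lwz@ is D-form and
+-- accepts any 16-bit displacement; this is why 'getAmode' takes an
+-- 'InstrForm' argument and 64-bit loads below request DS amodes.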
+data InstrForm = D | DS
+
+getAmode :: InstrForm -> CmmExpr -> NatM Amode
+getAmode inf tree@(CmmRegOff _ _)
+ = do dflags <- getDynFlags
+ getAmode inf (mangleIndexTree dflags tree)
+
+getAmode _ (CmmMachOp (MO_Sub W32) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W32 True (-i)
+ = do
+ (reg, code) <- getSomeReg x
+ return (Amode (AddrRegImm reg off) code)
+
+
+getAmode _ (CmmMachOp (MO_Add W32) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W32 True i
+ = do
+ (reg, code) <- getSomeReg x
+ return (Amode (AddrRegImm reg off) code)
+
+getAmode D (CmmMachOp (MO_Sub W64) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W64 True (-i)
+ = do
+ (reg, code) <- getSomeReg x
+ return (Amode (AddrRegImm reg off) code)
+
+
+getAmode D (CmmMachOp (MO_Add W64) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W64 True i
+ = do
+ (reg, code) <- getSomeReg x
+ return (Amode (AddrRegImm reg off) code)
+
+getAmode DS (CmmMachOp (MO_Sub W64) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W64 True (-i)
+ = do
+ (reg, code) <- getSomeReg x
+ (reg', off', code') <-
+ if i `mod` 4 == 0
+ then do return (reg, off, code)
+ else do
+ tmp <- getNewRegNat II64
+ return (tmp, ImmInt 0,
+ code `snocOL` ADD tmp reg (RIImm off))
+ return (Amode (AddrRegImm reg' off') code')
+
+getAmode DS (CmmMachOp (MO_Add W64) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W64 True i
+ = do
+ (reg, code) <- getSomeReg x
+ (reg', off', code') <-
+ if i `mod` 4 == 0
+ then do return (reg, off, code)
+ else do
+ tmp <- getNewRegNat II64
+ return (tmp, ImmInt 0,
+ code `snocOL` ADD tmp reg (RIImm off))
+ return (Amode (AddrRegImm reg' off') code')
+
+ -- optimize addition with 32-bit immediate
+ -- (needed for PIC)
+getAmode _ (CmmMachOp (MO_Add W32) [x, CmmLit lit])
+ = do
+ dflags <- getDynFlags
+ (src, srcCode) <- getSomeReg x
+ let imm = litToImm lit
+ case () of
+ _ | OSAIX <- platformOS (targetPlatform dflags)
+ , isCmmLabelType lit ->
+ -- HA16/LO16 relocations on labels not supported on AIX
+ return (Amode (AddrRegImm src imm) srcCode)
+ | otherwise -> do
+ tmp <- getNewRegNat II32
+ let code = srcCode `snocOL` ADDIS tmp src (HA imm)
+ return (Amode (AddrRegImm tmp (LO imm)) code)
+ where
+ isCmmLabelType (CmmLabel {}) = True
+ isCmmLabelType (CmmLabelOff {}) = True
+ isCmmLabelType (CmmLabelDiffOff {}) = True
+ isCmmLabelType _ = False
+
+getAmode _ (CmmLit lit)
+ = do
+ dflags <- getDynFlags
+ case platformArch $ targetPlatform dflags of
+ ArchPPC -> do
+ tmp <- getNewRegNat II32
+ let imm = litToImm lit
+ code = unitOL (LIS tmp (HA imm))
+ return (Amode (AddrRegImm tmp (LO imm)) code)
+ _ -> do -- TODO: Load from TOC,
+ -- see getRegister' _ (CmmLit lit)
+ tmp <- getNewRegNat II64
+ let imm = litToImm lit
+ code = toOL [
+ LIS tmp (HIGHESTA imm),
+ OR tmp tmp (RIImm (HIGHERA imm)),
+ SL II64 tmp tmp (RIImm (ImmInt 32)),
+ ORIS tmp tmp (HA imm)
+ ]
+ return (Amode (AddrRegImm tmp (LO imm)) code)
+
+getAmode _ (CmmMachOp (MO_Add W32) [x, y])
+ = do
+ (regX, codeX) <- getSomeReg x
+ (regY, codeY) <- getSomeReg y
+ return (Amode (AddrRegReg regX regY) (codeX `appOL` codeY))
+
+getAmode _ (CmmMachOp (MO_Add W64) [x, y])
+ = do
+ (regX, codeX) <- getSomeReg x
+ (regY, codeY) <- getSomeReg y
+ return (Amode (AddrRegReg regX regY) (codeX `appOL` codeY))
+
+getAmode _ other
+ = do
+ (reg, code) <- getSomeReg other
+ let
+ off = ImmInt 0
+ return (Amode (AddrRegImm reg off) code)
+
+
+-- The 'CondCode' type: Condition codes passed up the tree.
+data CondCode
+ = CondCode Bool Cond InstrBlock
+
+-- Set up a condition code for a conditional branch.
+
+getCondCode :: CmmExpr -> NatM CondCode
+
+-- almost the same as everywhere else - but we need to
+-- extend small integers to 32 bit or 64 bit first
+
+getCondCode (CmmMachOp mop [x, y])
+ = do
+ case mop of
+ MO_F_Eq W32 -> condFltCode EQQ x y
+ MO_F_Ne W32 -> condFltCode NE x y
+ MO_F_Gt W32 -> condFltCode GTT x y
+ MO_F_Ge W32 -> condFltCode GE x y
+ MO_F_Lt W32 -> condFltCode LTT x y
+ MO_F_Le W32 -> condFltCode LE x y
+
+ MO_F_Eq W64 -> condFltCode EQQ x y
+ MO_F_Ne W64 -> condFltCode NE x y
+ MO_F_Gt W64 -> condFltCode GTT x y
+ MO_F_Ge W64 -> condFltCode GE x y
+ MO_F_Lt W64 -> condFltCode LTT x y
+ MO_F_Le W64 -> condFltCode LE x y
+
+ MO_Eq rep -> condIntCode EQQ rep x y
+ MO_Ne rep -> condIntCode NE rep x y
+
+ MO_S_Gt rep -> condIntCode GTT rep x y
+ MO_S_Ge rep -> condIntCode GE rep x y
+ MO_S_Lt rep -> condIntCode LTT rep x y
+ MO_S_Le rep -> condIntCode LE rep x y
+
+ MO_U_Gt rep -> condIntCode GU rep x y
+ MO_U_Ge rep -> condIntCode GEU rep x y
+ MO_U_Lt rep -> condIntCode LU rep x y
+ MO_U_Le rep -> condIntCode LEU rep x y
+
+ _ -> pprPanic "getCondCode(powerpc)" (pprMachOp mop)
+
+getCondCode _ = panic "getCondCode(2)(powerpc)"
+
+
+-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
+-- passed back up the tree.
+
+condIntCode :: Cond -> Width -> CmmExpr -> CmmExpr -> NatM CondCode
+condIntCode cond width x y = do
+ dflags <- getDynFlags
+ condIntCode' (target32Bit (targetPlatform dflags)) cond width x y
+
+condIntCode' :: Bool -> Cond -> Width -> CmmExpr -> CmmExpr -> NatM CondCode
+
+-- simple code for 64-bit on 32-bit platforms
+condIntCode' True cond W64 x y
+ | condUnsigned cond
+ = do
+ ChildCode64 code_x x_lo <- iselExpr64 x
+ ChildCode64 code_y y_lo <- iselExpr64 y
+ let x_hi = getHiVRegFromLo x_lo
+ y_hi = getHiVRegFromLo y_lo
+ end_lbl <- getBlockIdNat
+ let code = code_x `appOL` code_y `appOL` toOL
+ [ CMPL II32 x_hi (RIReg y_hi)
+ , BCC NE end_lbl Nothing
+ , CMPL II32 x_lo (RIReg y_lo)
+ , BCC ALWAYS end_lbl Nothing
+
+ , NEWBLOCK end_lbl
+ ]
+ return (CondCode False cond code)
+ | otherwise
+ = do
+ ChildCode64 code_x x_lo <- iselExpr64 x
+ ChildCode64 code_y y_lo <- iselExpr64 y
+ let x_hi = getHiVRegFromLo x_lo
+ y_hi = getHiVRegFromLo y_lo
+ end_lbl <- getBlockIdNat
+ cmp_lo <- getBlockIdNat
+ let code = code_x `appOL` code_y `appOL` toOL
+ [ CMP II32 x_hi (RIReg y_hi)
+ , BCC NE end_lbl Nothing
+ , CMP II32 x_hi (RIImm (ImmInt 0))
+ , BCC LE cmp_lo Nothing
+ , CMPL II32 x_lo (RIReg y_lo)
+ , BCC ALWAYS end_lbl Nothing
+ , NEWBLOCK cmp_lo
+ , CMPL II32 y_lo (RIReg x_lo)
+ , BCC ALWAYS end_lbl Nothing
+
+ , NEWBLOCK end_lbl
+ ]
+ return (CondCode False cond code)
+
+-- optimize pointer tag checks. Operation andi. sets condition register
+-- so cmpi ..., 0 is redundant.
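+-- For example (illustrative): a pointer-tag test of the shape (x & 7) == 0
+-- needs only a single andi. r0,x,7; its record form already updates CR0 as if
+-- the result had been compared with zero, so no separate cmpi is emitted.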
+condIntCode' _ cond _ (CmmMachOp (MO_And _) [x, CmmLit (CmmInt imm rep)])
+ (CmmLit (CmmInt 0 _))
+ | not $ condUnsigned cond,
+ Just src2 <- makeImmediate rep False imm
+ = do
+ (src1, code) <- getSomeReg x
+ let code' = code `snocOL` AND r0 src1 (RIImm src2)
+ return (CondCode False cond code')
+
+condIntCode' _ cond width x (CmmLit (CmmInt y rep))
+ | Just src2 <- makeImmediate rep (not $ condUnsigned cond) y
+ = do
+ let op_len = max W32 width
+ let extend = extendSExpr width op_len
+ (src1, code) <- getSomeReg (extend x)
+ let format = intFormat op_len
+ code' = code `snocOL`
+ (if condUnsigned cond then CMPL else CMP) format src1 (RIImm src2)
+ return (CondCode False cond code')
+
+condIntCode' _ cond width x y = do
+ let op_len = max W32 width
+ let extend = if condUnsigned cond then extendUExpr width op_len
+ else extendSExpr width op_len
+ (src1, code1) <- getSomeReg (extend x)
+ (src2, code2) <- getSomeReg (extend y)
+ let format = intFormat op_len
+ code' = code1 `appOL` code2 `snocOL`
+ (if condUnsigned cond then CMPL else CMP) format src1 (RIReg src2)
+ return (CondCode False cond code')
+
+condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+condFltCode cond x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ code' = code1 `appOL` code2 `snocOL` FCMP src1 src2
+ code'' = case cond of -- twiddle CR to handle unordered case
+ GE -> code' `snocOL` CRNOR ltbit eqbit gtbit
+ LE -> code' `snocOL` CRNOR gtbit eqbit ltbit
+ _ -> code'
+ where
+ ltbit = 0 ; eqbit = 2 ; gtbit = 1
+ return (CondCode True cond code'')
+
+
+
+-- -----------------------------------------------------------------------------
+-- Generating assignments
+
+-- Assignments are really at the heart of the whole code generation
+-- business. Almost all top-level nodes of any real importance are
+-- assignments, which correspond to loads, stores, or register
+-- transfers. If we're really lucky, some of the register transfers
+-- will go away, because we can use the destination register to
+-- complete the code generation for the right hand side. This only
+-- fails when the right hand side is forced into a fixed register
+-- (e.g. the result of a call).
+
+assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+
+assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+
+assignMem_IntCode pk addr src = do
+ (srcReg, code) <- getSomeReg src
+ Amode dstAddr addr_code <- case pk of
+ II64 -> getAmode DS addr
+ _ -> getAmode D addr
+ return $ code `appOL` addr_code `snocOL` ST pk srcReg dstAddr
+
+-- dst is a reg, but src could be anything
+assignReg_IntCode _ reg src
+ = do
+ dflags <- getDynFlags
+ let dst = getRegisterReg (targetPlatform dflags) reg
+ r <- getRegister src
+ return $ case r of
+ Any _ code -> code dst
+ Fixed _ freg fcode -> fcode `snocOL` MR dst freg
+
+
+
+-- Easy, isn't it?
+assignMem_FltCode = assignMem_IntCode
+assignReg_FltCode = assignReg_IntCode
+
+
+
+genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
+
+genJump (CmmLit (CmmLabel lbl)) regs
+ = return (unitOL $ JMP lbl regs)
+
+genJump tree gregs
+ = do
+ dflags <- getDynFlags
+ genJump' tree (platformToGCP (targetPlatform dflags)) gregs
+
+genJump' :: CmmExpr -> GenCCallPlatform -> [Reg] -> NatM InstrBlock
+
+genJump' tree (GCP64ELF 1) regs
+ = do
+ (target,code) <- getSomeReg tree
+ return (code
+ `snocOL` LD II64 r11 (AddrRegImm target (ImmInt 0))
+ `snocOL` LD II64 toc (AddrRegImm target (ImmInt 8))
+ `snocOL` MTCTR r11
+ `snocOL` LD II64 r11 (AddrRegImm target (ImmInt 16))
+ `snocOL` BCTR [] Nothing regs)
+
+genJump' tree (GCP64ELF 2) regs
+ = do
+ (target,code) <- getSomeReg tree
+ return (code
+ `snocOL` MR r12 target
+ `snocOL` MTCTR r12
+ `snocOL` BCTR [] Nothing regs)
+
+genJump' tree _ regs
+ = do
+ (target,code) <- getSomeReg tree
+ return (code `snocOL` MTCTR target `snocOL` BCTR [] Nothing regs)
+
+-- -----------------------------------------------------------------------------
+-- Unconditional branches
+genBranch :: BlockId -> NatM InstrBlock
+genBranch = return . toOL . mkJumpInstr
+
+
+-- -----------------------------------------------------------------------------
+-- Conditional jumps
+
+{-
+Conditional jumps are always to local labels, so we can use branch
+instructions. We peek at the arguments to decide what kind of
+comparison to do.
+-}
+
+
+genCondJump
+ :: BlockId -- the branch target
+ -> CmmExpr -- the condition on which to branch
+ -> Maybe Bool
+ -> NatM InstrBlock
+
+genCondJump id bool prediction = do
+ CondCode _ cond code <- getCondCode bool
+ return (code `snocOL` BCC cond id prediction)
+
+
+
+-- -----------------------------------------------------------------------------
+-- Generating C calls
+
+-- Now the biggest nightmare---calls. Most of the nastiness is buried in
+-- @get_arg@, which moves the arguments to the correct registers/stack
+-- locations. Apart from that, the code is easy.
+
+genCCall :: ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+genCCall (PrimTarget MO_ReadBarrier) _ _
+ = return $ unitOL LWSYNC
+genCCall (PrimTarget MO_WriteBarrier) _ _
+ = return $ unitOL LWSYNC
+
+genCCall (PrimTarget MO_Touch) _ _
+ = return $ nilOL
+
+genCCall (PrimTarget (MO_Prefetch_Data _)) _ _
+ = return $ nilOL
+
+genCCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n]
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ fmt = intFormat width
+ reg_dst = getRegisterReg platform (CmmLocal dst)
+ (instr, n_code) <- case amop of
+ AMO_Add -> getSomeRegOrImm ADD True reg_dst
+ AMO_Sub -> case n of
+ CmmLit (CmmInt i _)
+ | Just imm <- makeImmediate width True (-i)
+ -> return (ADD reg_dst reg_dst (RIImm imm), nilOL)
+ _
+ -> do
+ (n_reg, n_code) <- getSomeReg n
+ return (SUBF reg_dst n_reg reg_dst, n_code)
+ AMO_And -> getSomeRegOrImm AND False reg_dst
+ AMO_Nand -> do (n_reg, n_code) <- getSomeReg n
+ return (NAND reg_dst reg_dst n_reg, n_code)
+ AMO_Or -> getSomeRegOrImm OR False reg_dst
+ AMO_Xor -> getSomeRegOrImm XOR False reg_dst
+ Amode addr_reg addr_code <- getAmodeIndex addr
+ lbl_retry <- getBlockIdNat
+ return $ n_code `appOL` addr_code
+ `appOL` toOL [ HWSYNC
+ , BCC ALWAYS lbl_retry Nothing
+
+ , NEWBLOCK lbl_retry
+ , LDR fmt reg_dst addr_reg
+ , instr
+ , STC fmt reg_dst addr_reg
+ , BCC NE lbl_retry (Just False)
+ , ISYNC
+ ]
+ where
+ getAmodeIndex (CmmMachOp (MO_Add _) [x, y])
+ = do
+ (regX, codeX) <- getSomeReg x
+ (regY, codeY) <- getSomeReg y
+ return (Amode (AddrRegReg regX regY) (codeX `appOL` codeY))
+ getAmodeIndex other
+ = do
+ (reg, code) <- getSomeReg other
+ return (Amode (AddrRegReg r0 reg) code) -- NB: r0 is 0 here!
+ getSomeRegOrImm op sign dst
+ = case n of
+ CmmLit (CmmInt i _) | Just imm <- makeImmediate width sign i
+ -> return (op dst dst (RIImm imm), nilOL)
+ _
+ -> do
+ (n_reg, n_code) <- getSomeReg n
+ return (op dst dst (RIReg n_reg), n_code)
+
+genCCall (PrimTarget (MO_AtomicRead width)) [dst] [addr]
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ fmt = intFormat width
+ reg_dst = getRegisterReg platform (CmmLocal dst)
+ form = if widthInBits width == 64 then DS else D
+ Amode addr_reg addr_code <- getAmode form addr
+ lbl_end <- getBlockIdNat
+ return $ addr_code `appOL` toOL [ HWSYNC
+ , LD fmt reg_dst addr_reg
+ , CMP fmt reg_dst (RIReg reg_dst)
+ , BCC NE lbl_end (Just False)
+ , BCC ALWAYS lbl_end Nothing
+ -- See Note [Seemingly useless cmp and bne]
+ , NEWBLOCK lbl_end
+ , ISYNC
+ ]
+
+-- Note [Seemingly useless cmp and bne]
+-- In Power ISA, Book II, Section 4.4.1, Instruction Synchronize Instruction
+-- the second paragraph says that isync may complete before storage accesses
+-- "associated" with a preceding instruction have been performed. The cmp
+-- operation and the following bne introduce a data and control dependency
+-- on the load instruction (See also Power ISA, Book II, Appendix B.2.3, Safe
+-- Fetch).
+-- This is also what gcc does.
+
+
+genCCall (PrimTarget (MO_AtomicWrite width)) [] [addr, val] = do
+ code <- assignMem_IntCode (intFormat width) addr val
+ return $ unitOL(HWSYNC) `appOL` code
+
+genCCall (PrimTarget (MO_Clz width)) [dst] [src]
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ reg_dst = getRegisterReg platform (CmmLocal dst)
+ if target32Bit platform && width == W64
+ then do
+ ChildCode64 code vr_lo <- iselExpr64 src
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ lbl3 <- getBlockIdNat
+ let vr_hi = getHiVRegFromLo vr_lo
+ cntlz = toOL [ CMPL II32 vr_hi (RIImm (ImmInt 0))
+ , BCC NE lbl2 Nothing
+ , BCC ALWAYS lbl1 Nothing
+
+ , NEWBLOCK lbl1
+ , CNTLZ II32 reg_dst vr_lo
+ , ADD reg_dst reg_dst (RIImm (ImmInt 32))
+ , BCC ALWAYS lbl3 Nothing
+
+ , NEWBLOCK lbl2
+ , CNTLZ II32 reg_dst vr_hi
+ , BCC ALWAYS lbl3 Nothing
+
+ , NEWBLOCK lbl3
+ ]
+ return $ code `appOL` cntlz
+ else do
+ let format = if width == W64 then II64 else II32
+ (s_reg, s_code) <- getSomeReg src
+ (pre, reg , post) <-
+ case width of
+ W64 -> return (nilOL, s_reg, nilOL)
+ W32 -> return (nilOL, s_reg, nilOL)
+ W16 -> do
+ reg_tmp <- getNewRegNat format
+ return
+ ( unitOL $ AND reg_tmp s_reg (RIImm (ImmInt 65535))
+ , reg_tmp
+ , unitOL $ ADD reg_dst reg_dst (RIImm (ImmInt (-16)))
+ )
+ W8 -> do
+ reg_tmp <- getNewRegNat format
+ return
+ ( unitOL $ AND reg_tmp s_reg (RIImm (ImmInt 255))
+ , reg_tmp
+ , unitOL $ ADD reg_dst reg_dst (RIImm (ImmInt (-24)))
+ )
+ _ -> panic "genCall: Clz wrong format"
+ let cntlz = unitOL (CNTLZ format reg_dst reg)
+ return $ s_code `appOL` pre `appOL` cntlz `appOL` post
+
+genCCall (PrimTarget (MO_Ctz width)) [dst] [src]
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ reg_dst = getRegisterReg platform (CmmLocal dst)
+ if target32Bit platform && width == W64
+ then do
+ let format = II32
+ ChildCode64 code vr_lo <- iselExpr64 src
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ lbl3 <- getBlockIdNat
+ x' <- getNewRegNat format
+ x'' <- getNewRegNat format
+ r' <- getNewRegNat format
+ cnttzlo <- cnttz format reg_dst vr_lo
+ let vr_hi = getHiVRegFromLo vr_lo
+ cnttz64 = toOL [ CMPL format vr_lo (RIImm (ImmInt 0))
+ , BCC NE lbl2 Nothing
+ , BCC ALWAYS lbl1 Nothing
+
+ , NEWBLOCK lbl1
+ , ADD x' vr_hi (RIImm (ImmInt (-1)))
+ , ANDC x'' x' vr_hi
+ , CNTLZ format r' x''
+ -- 32 + (32 - clz(x''))
+ , SUBFC reg_dst r' (RIImm (ImmInt 64))
+ , BCC ALWAYS lbl3 Nothing
+
+ , NEWBLOCK lbl2
+ ]
+ `appOL` cnttzlo `appOL`
+ toOL [ BCC ALWAYS lbl3 Nothing
+
+ , NEWBLOCK lbl3
+ ]
+ return $ code `appOL` cnttz64
+ else do
+ let format = if width == W64 then II64 else II32
+ (s_reg, s_code) <- getSomeReg src
+ (reg_ctz, pre_code) <-
+ case width of
+ W64 -> return (s_reg, nilOL)
+ W32 -> return (s_reg, nilOL)
+ W16 -> do
+ reg_tmp <- getNewRegNat format
+ return (reg_tmp, unitOL $ ORIS reg_tmp s_reg (ImmInt 1))
+ W8 -> do
+ reg_tmp <- getNewRegNat format
+ return (reg_tmp, unitOL $ OR reg_tmp s_reg (RIImm (ImmInt 256)))
+ _ -> panic "genCall: Ctz wrong format"
+ ctz_code <- cnttz format reg_dst reg_ctz
+ return $ s_code `appOL` pre_code `appOL` ctz_code
+ where
+ -- cnttz(x) = sizeof(x) - cntlz(~x & (x - 1))
+ -- see Henry S. Warren, Hacker's Delight, p 107
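+      -- Worked example (illustrative): for an 8-bit x = 0b00011000,
+      -- x - 1 = 0b00010111 and ~x & (x - 1) = 0b00000111, whose count of
+      -- leading zeros within 8 bits is 5, giving 8 - 5 = 3 trailing zeros.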
+ cnttz format dst src = do
+ let format_bits = 8 * formatInBytes format
+ x' <- getNewRegNat format
+ x'' <- getNewRegNat format
+ r' <- getNewRegNat format
+ return $ toOL [ ADD x' src (RIImm (ImmInt (-1)))
+ , ANDC x'' x' src
+ , CNTLZ format r' x''
+ , SUBFC dst r' (RIImm (ImmInt (format_bits)))
+ ]
+
+genCCall target dest_regs argsAndHints
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ case target of
+ PrimTarget (MO_S_QuotRem width) -> divOp1 platform True width
+ dest_regs argsAndHints
+ PrimTarget (MO_U_QuotRem width) -> divOp1 platform False width
+ dest_regs argsAndHints
+ PrimTarget (MO_U_QuotRem2 width) -> divOp2 platform width dest_regs
+ argsAndHints
+ PrimTarget (MO_U_Mul2 width) -> multOp2 platform width dest_regs
+ argsAndHints
+ PrimTarget (MO_Add2 _) -> add2Op platform dest_regs argsAndHints
+ PrimTarget (MO_AddWordC _) -> addcOp platform dest_regs argsAndHints
+ PrimTarget (MO_SubWordC _) -> subcOp platform dest_regs argsAndHints
+ PrimTarget (MO_AddIntC width) -> addSubCOp ADDO platform width
+ dest_regs argsAndHints
+ PrimTarget (MO_SubIntC width) -> addSubCOp SUBFO platform width
+ dest_regs argsAndHints
+ PrimTarget MO_F64_Fabs -> fabs platform dest_regs argsAndHints
+ PrimTarget MO_F32_Fabs -> fabs platform dest_regs argsAndHints
+ _ -> genCCall' dflags (platformToGCP platform)
+ target dest_regs argsAndHints
+ where divOp1 platform signed width [res_q, res_r] [arg_x, arg_y]
+ = do let reg_q = getRegisterReg platform (CmmLocal res_q)
+ reg_r = getRegisterReg platform (CmmLocal res_r)
+ remainderCode width signed reg_q arg_x arg_y
+ <*> pure reg_r
+
+ divOp1 _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments for divOp1"
+ divOp2 platform width [res_q, res_r]
+ [arg_x_high, arg_x_low, arg_y]
+ = do let reg_q = getRegisterReg platform (CmmLocal res_q)
+ reg_r = getRegisterReg platform (CmmLocal res_r)
+ fmt = intFormat width
+ half = 4 * (formatInBytes fmt)
+ (xh_reg, xh_code) <- getSomeReg arg_x_high
+ (xl_reg, xl_code) <- getSomeReg arg_x_low
+ (y_reg, y_code) <- getSomeReg arg_y
+ s <- getNewRegNat fmt
+ b <- getNewRegNat fmt
+ v <- getNewRegNat fmt
+ vn1 <- getNewRegNat fmt
+ vn0 <- getNewRegNat fmt
+ un32 <- getNewRegNat fmt
+ tmp <- getNewRegNat fmt
+ un10 <- getNewRegNat fmt
+ un1 <- getNewRegNat fmt
+ un0 <- getNewRegNat fmt
+ q1 <- getNewRegNat fmt
+ rhat <- getNewRegNat fmt
+ tmp1 <- getNewRegNat fmt
+ q0 <- getNewRegNat fmt
+ un21 <- getNewRegNat fmt
+ again1 <- getBlockIdNat
+ no1 <- getBlockIdNat
+ then1 <- getBlockIdNat
+ endif1 <- getBlockIdNat
+ again2 <- getBlockIdNat
+ no2 <- getBlockIdNat
+ then2 <- getBlockIdNat
+ endif2 <- getBlockIdNat
+ return $ y_code `appOL` xl_code `appOL` xh_code `appOL`
+ -- see Hacker's Delight p 196 Figure 9-3
+ toOL [ -- b = 2 ^ (bits_in_word / 2)
+ LI b (ImmInt 1)
+ , SL fmt b b (RIImm (ImmInt half))
+ -- s = clz(y)
+ , CNTLZ fmt s y_reg
+ -- v = y << s
+ , SL fmt v y_reg (RIReg s)
+ -- vn1 = upper half of v
+ , SR fmt vn1 v (RIImm (ImmInt half))
+ -- vn0 = lower half of v
+ , CLRLI fmt vn0 v half
+ -- un32 = (u1 << s)
+ -- | (u0 >> (bits_in_word - s))
+ , SL fmt un32 xh_reg (RIReg s)
+ , SUBFC tmp s
+ (RIImm (ImmInt (8 * formatInBytes fmt)))
+ , SR fmt tmp xl_reg (RIReg tmp)
+ , OR un32 un32 (RIReg tmp)
+ -- un10 = u0 << s
+ , SL fmt un10 xl_reg (RIReg s)
+ -- un1 = upper half of un10
+ , SR fmt un1 un10 (RIImm (ImmInt half))
+ -- un0 = lower half of un10
+ , CLRLI fmt un0 un10 half
+ -- q1 = un32/vn1
+ , DIV fmt False q1 un32 vn1
+ -- rhat = un32 - q1*vn1
+ , MULL fmt tmp q1 (RIReg vn1)
+ , SUBF rhat tmp un32
+ , BCC ALWAYS again1 Nothing
+
+ , NEWBLOCK again1
+ -- if (q1 >= b || q1*vn0 > b*rhat + un1)
+ , CMPL fmt q1 (RIReg b)
+ , BCC GEU then1 Nothing
+ , BCC ALWAYS no1 Nothing
+
+ , NEWBLOCK no1
+ , MULL fmt tmp q1 (RIReg vn0)
+ , SL fmt tmp1 rhat (RIImm (ImmInt half))
+ , ADD tmp1 tmp1 (RIReg un1)
+ , CMPL fmt tmp (RIReg tmp1)
+ , BCC LEU endif1 Nothing
+ , BCC ALWAYS then1 Nothing
+
+ , NEWBLOCK then1
+ -- q1 = q1 - 1
+ , ADD q1 q1 (RIImm (ImmInt (-1)))
+ -- rhat = rhat + vn1
+ , ADD rhat rhat (RIReg vn1)
+ -- if (rhat < b) goto again1
+ , CMPL fmt rhat (RIReg b)
+ , BCC LTT again1 Nothing
+ , BCC ALWAYS endif1 Nothing
+
+ , NEWBLOCK endif1
+ -- un21 = un32*b + un1 - q1*v
+ , SL fmt un21 un32 (RIImm (ImmInt half))
+ , ADD un21 un21 (RIReg un1)
+ , MULL fmt tmp q1 (RIReg v)
+ , SUBF un21 tmp un21
+ -- compute second quotient digit
+ -- q0 = un21/vn1
+ , DIV fmt False q0 un21 vn1
+ -- rhat = un21- q0*vn1
+ , MULL fmt tmp q0 (RIReg vn1)
+ , SUBF rhat tmp un21
+ , BCC ALWAYS again2 Nothing
+
+ , NEWBLOCK again2
+ -- if (q0>b || q0*vn0 > b*rhat + un0)
+ , CMPL fmt q0 (RIReg b)
+ , BCC GEU then2 Nothing
+ , BCC ALWAYS no2 Nothing
+
+ , NEWBLOCK no2
+ , MULL fmt tmp q0 (RIReg vn0)
+ , SL fmt tmp1 rhat (RIImm (ImmInt half))
+ , ADD tmp1 tmp1 (RIReg un0)
+ , CMPL fmt tmp (RIReg tmp1)
+ , BCC LEU endif2 Nothing
+ , BCC ALWAYS then2 Nothing
+
+ , NEWBLOCK then2
+ -- q0 = q0 - 1
+ , ADD q0 q0 (RIImm (ImmInt (-1)))
+ -- rhat = rhat + vn1
+ , ADD rhat rhat (RIReg vn1)
+ -- if (rhat<b) goto again2
+ , CMPL fmt rhat (RIReg b)
+ , BCC LTT again2 Nothing
+ , BCC ALWAYS endif2 Nothing
+
+ , NEWBLOCK endif2
+ -- compute remainder
+ -- r = (un21*b + un0 - q0*v) >> s
+ , SL fmt reg_r un21 (RIImm (ImmInt half))
+ , ADD reg_r reg_r (RIReg un0)
+ , MULL fmt tmp q0 (RIReg v)
+ , SUBF reg_r tmp reg_r
+ , SR fmt reg_r reg_r (RIReg s)
+ -- compute quotient
+ -- q = q1*b + q0
+ , SL fmt reg_q q1 (RIImm (ImmInt half))
+ , ADD reg_q reg_q (RIReg q0)
+ ]
+ divOp2 _ _ _ _
+ = panic "genCCall: Wrong number of arguments for divOp2"
+ multOp2 platform width [res_h, res_l] [arg_x, arg_y]
+ = do let reg_h = getRegisterReg platform (CmmLocal res_h)
+ reg_l = getRegisterReg platform (CmmLocal res_l)
+ fmt = intFormat width
+ (x_reg, x_code) <- getSomeReg arg_x
+ (y_reg, y_code) <- getSomeReg arg_y
+ return $ y_code `appOL` x_code
+ `appOL` toOL [ MULL fmt reg_l x_reg (RIReg y_reg)
+ , MULHU fmt reg_h x_reg y_reg
+ ]
+ multOp2 _ _ _ _
+ = panic "genCall: Wrong number of arguments for multOp2"
+ add2Op platform [res_h, res_l] [arg_x, arg_y]
+ = do let reg_h = getRegisterReg platform (CmmLocal res_h)
+ reg_l = getRegisterReg platform (CmmLocal res_l)
+ (x_reg, x_code) <- getSomeReg arg_x
+ (y_reg, y_code) <- getSomeReg arg_y
+ return $ y_code `appOL` x_code
+ `appOL` toOL [ LI reg_h (ImmInt 0)
+ , ADDC reg_l x_reg y_reg
+ , ADDZE reg_h reg_h
+ ]
+ add2Op _ _ _
+ = panic "genCCall: Wrong number of arguments/results for add2"
+
+ addcOp platform [res_r, res_c] [arg_x, arg_y]
+ = add2Op platform [res_c {-hi-}, res_r {-lo-}] [arg_x, arg_y]
+ addcOp _ _ _
+ = panic "genCCall: Wrong number of arguments/results for addc"
+
+          -- PowerPC subfc sets the carry for rT = ~(rA) + rB + 1,
+          -- which is 0 on borrow and 1 otherwise; we want 1 on borrow
+          -- and 0 otherwise, so we xor the result with 1.
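+          -- Worked example (illustrative): for 5 - 7 the subtraction
+          -- borrows, so the carry bit is 0; ADDZE leaves reg_c at 0 and
+          -- the final xor turns it into 1, signalling the borrow.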
+ subcOp platform [res_r, res_c] [arg_x, arg_y]
+ = do let reg_r = getRegisterReg platform (CmmLocal res_r)
+ reg_c = getRegisterReg platform (CmmLocal res_c)
+ (x_reg, x_code) <- getSomeReg arg_x
+ (y_reg, y_code) <- getSomeReg arg_y
+ return $ y_code `appOL` x_code
+ `appOL` toOL [ LI reg_c (ImmInt 0)
+ , SUBFC reg_r y_reg (RIReg x_reg)
+ , ADDZE reg_c reg_c
+ , XOR reg_c reg_c (RIImm (ImmInt 1))
+ ]
+ subcOp _ _ _
+ = panic "genCCall: Wrong number of arguments/results for subc"
+ addSubCOp instr platform width [res_r, res_c] [arg_x, arg_y]
+ = do let reg_r = getRegisterReg platform (CmmLocal res_r)
+ reg_c = getRegisterReg platform (CmmLocal res_c)
+ (x_reg, x_code) <- getSomeReg arg_x
+ (y_reg, y_code) <- getSomeReg arg_y
+ return $ y_code `appOL` x_code
+ `appOL` toOL [ instr reg_r y_reg x_reg,
+ -- SUBFO argument order reversed!
+ MFOV (intFormat width) reg_c
+ ]
+ addSubCOp _ _ _ _ _
+ = panic "genCall: Wrong number of arguments/results for addC"
+ fabs platform [res] [arg]
+ = do let res_r = getRegisterReg platform (CmmLocal res)
+ (arg_reg, arg_code) <- getSomeReg arg
+ return $ arg_code `snocOL` FABS res_r arg_reg
+ fabs _ _ _
+ = panic "genCall: Wrong number of arguments/results for fabs"
+
+-- TODO: replace 'Int' by an enum such as 'PPC_64ABI'
+data GenCCallPlatform = GCP32ELF | GCP64ELF !Int | GCPAIX
+
+platformToGCP :: Platform -> GenCCallPlatform
+platformToGCP platform
+ = case platformOS platform of
+ OSAIX -> GCPAIX
+ _ -> case platformArch platform of
+ ArchPPC -> GCP32ELF
+ ArchPPC_64 ELF_V1 -> GCP64ELF 1
+ ArchPPC_64 ELF_V2 -> GCP64ELF 2
+ _ -> panic "platformToGCP: Not PowerPC"
+
+
+genCCall'
+ :: DynFlags
+ -> GenCCallPlatform
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+
+{-
+ PowerPC Linux uses the System V Release 4 Calling Convention
+ for PowerPC. It is described in the
+ "System V Application Binary Interface PowerPC Processor Supplement".
+
+ PowerPC 64 Linux uses the System V Release 4 Calling Convention for
+ 64-bit PowerPC. It is specified in
+ "64-bit PowerPC ELF Application Binary Interface Supplement 1.9"
+ (PPC64 ELF v1.9).
+
+ PowerPC 64 Linux in little endian mode uses the "Power Architecture 64-Bit
+ ELF V2 ABI Specification -- OpenPOWER ABI for Linux Supplement"
+ (PPC64 ELF v2).
+
+ AIX follows the "PowerOpen ABI: Application Binary Interface Big-Endian
+ 32-Bit Hardware Implementation"
+
+ All four conventions are similar:
+ Parameters may be passed in general-purpose registers starting at r3, in
+ floating point registers starting at f1, or on the stack.
+
+ But there are substantial differences:
+ * The number of registers used for parameter passing and the exact set of
+ nonvolatile registers differs (see MachRegs.hs).
+ * On AIX and 64-bit ELF, stack space is always reserved for parameters,
+ even if they are passed in registers. The called routine may choose to
+ save parameters from registers to the corresponding space on the stack.
+ * On AIX and 64-bit ELF, a corresponding amount of GPRs is skipped when
+ a floating point parameter is passed in an FPR.
+ * SysV insists on either passing I64 arguments on the stack, or in two GPRs,
+ starting with an odd-numbered GPR. It may skip a GPR to achieve this.
+    AIX just treats an I64 like two separate I32s (high word first).
+ * I64 and FF64 arguments are 8-byte aligned on the stack for SysV, but only
+ 4-byte aligned like everything else on AIX.
+ * The SysV spec claims that FF32 is represented as FF64 on the stack. GCC on
+ PowerPC Linux does not agree, so neither do we.
+
+ According to all conventions, the parameter area should be part of the
+ caller's stack frame, allocated in the caller's prologue code (large enough
+ to hold the parameter lists for all called routines). The NCG already
+ uses the stack for register spilling, leaving 64 bytes free at the top.
+ If we need a larger parameter area than that, we increase the size
+ of the stack frame just before ccalling.
+-}
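+
+-- A small worked example (illustrative, following stackDelta below): on 64-bit
+-- ELF v2 a call passing three word-sized arguments reserves 32 bytes of
+-- linkage area plus max 64 bytes of parameter save area, i.e. 96 bytes,
+-- already a multiple of 16.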
+
+
+genCCall' dflags gcp target dest_regs args
+ = do
+ (finalStack,passArgumentsCode,usedRegs) <- passArguments
+ (zip3 args argReps argHints)
+ allArgRegs
+ (allFPArgRegs platform)
+ initialStackOffset
+ nilOL []
+
+ (labelOrExpr, reduceToFF32) <- case target of
+ ForeignTarget (CmmLit (CmmLabel lbl)) _ -> do
+ uses_pic_base_implicitly
+ return (Left lbl, False)
+ ForeignTarget expr _ -> do
+ uses_pic_base_implicitly
+ return (Right expr, False)
+ PrimTarget mop -> outOfLineMachOp mop
+
+ let codeBefore = move_sp_down finalStack `appOL` passArgumentsCode
+ codeAfter = move_sp_up finalStack `appOL` moveResult reduceToFF32
+
+ case labelOrExpr of
+ Left lbl -> do -- the linker does all the work for us
+ return ( codeBefore
+ `snocOL` BL lbl usedRegs
+ `appOL` maybeNOP -- some ABI require a NOP after BL
+ `appOL` codeAfter)
+ Right dyn -> do -- implement call through function pointer
+ (dynReg, dynCode) <- getSomeReg dyn
+ case gcp of
+ GCP64ELF 1 -> return ( dynCode
+ `appOL` codeBefore
+ `snocOL` ST spFormat toc (AddrRegImm sp (ImmInt 40))
+ `snocOL` LD II64 r11 (AddrRegImm dynReg (ImmInt 0))
+ `snocOL` LD II64 toc (AddrRegImm dynReg (ImmInt 8))
+ `snocOL` MTCTR r11
+ `snocOL` LD II64 r11 (AddrRegImm dynReg (ImmInt 16))
+ `snocOL` BCTRL usedRegs
+ `snocOL` LD spFormat toc (AddrRegImm sp (ImmInt 40))
+ `appOL` codeAfter)
+ GCP64ELF 2 -> return ( dynCode
+ `appOL` codeBefore
+ `snocOL` ST spFormat toc (AddrRegImm sp (ImmInt 24))
+ `snocOL` MR r12 dynReg
+ `snocOL` MTCTR r12
+ `snocOL` BCTRL usedRegs
+ `snocOL` LD spFormat toc (AddrRegImm sp (ImmInt 24))
+ `appOL` codeAfter)
+ GCPAIX -> return ( dynCode
+ -- AIX/XCOFF follows the PowerOPEN ABI
+ -- which is quite similar to LinuxPPC64/ELFv1
+ `appOL` codeBefore
+ `snocOL` ST spFormat toc (AddrRegImm sp (ImmInt 20))
+ `snocOL` LD II32 r11 (AddrRegImm dynReg (ImmInt 0))
+ `snocOL` LD II32 toc (AddrRegImm dynReg (ImmInt 4))
+ `snocOL` MTCTR r11
+ `snocOL` LD II32 r11 (AddrRegImm dynReg (ImmInt 8))
+ `snocOL` BCTRL usedRegs
+ `snocOL` LD spFormat toc (AddrRegImm sp (ImmInt 20))
+ `appOL` codeAfter)
+ _ -> return ( dynCode
+ `snocOL` MTCTR dynReg
+ `appOL` codeBefore
+ `snocOL` BCTRL usedRegs
+ `appOL` codeAfter)
+ where
+ platform = targetPlatform dflags
+
+ uses_pic_base_implicitly = do
+ -- See Note [implicit register in PPC PIC code]
+ -- on why we claim to use PIC register here
+ when (positionIndependent dflags && target32Bit platform) $ do
+ _ <- getPicBaseNat $ archWordFormat True
+ return ()
+
+ initialStackOffset = case gcp of
+ GCPAIX -> 24
+ GCP32ELF -> 8
+ GCP64ELF 1 -> 48
+ GCP64ELF 2 -> 32
+ _ -> panic "genCall': unknown calling convention"
+ -- size of linkage area + size of arguments, in bytes
+ stackDelta finalStack = case gcp of
+ GCPAIX ->
+ roundTo 16 $ (24 +) $ max 32 $ sum $
+ map (widthInBytes . typeWidth) argReps
+ GCP32ELF -> roundTo 16 finalStack
+ GCP64ELF 1 ->
+ roundTo 16 $ (48 +) $ max 64 $ sum $
+ map (roundTo 8 . widthInBytes . typeWidth)
+ argReps
+ GCP64ELF 2 ->
+ roundTo 16 $ (32 +) $ max 64 $ sum $
+ map (roundTo 8 . widthInBytes . typeWidth)
+ argReps
+ _ -> panic "genCall': unknown calling conv."
+
+ argReps = map (cmmExprType dflags) args
+ (argHints, _) = foreignTargetHints target
+
+ roundTo a x | x `mod` a == 0 = x
+ | otherwise = x + a - (x `mod` a)
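+        -- e.g. roundTo 16 40 == 48 and roundTo 16 48 == 48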
+
+ spFormat = if target32Bit platform then II32 else II64
+
+ -- TODO: Do not create a new stack frame if delta is too large.
+ move_sp_down finalStack
+ | delta > stackFrameHeaderSize dflags =
+ toOL [STU spFormat sp (AddrRegImm sp (ImmInt (-delta))),
+ DELTA (-delta)]
+ | otherwise = nilOL
+ where delta = stackDelta finalStack
+ move_sp_up finalStack
+ | delta > stackFrameHeaderSize dflags =
+ toOL [ADD sp sp (RIImm (ImmInt delta)),
+ DELTA 0]
+ | otherwise = nilOL
+ where delta = stackDelta finalStack
+
+ -- A NOP instruction is required after a call (bl instruction)
+        -- on AIX and 64-bit Linux.
+ -- If the call is to a function with a different TOC (r2) the
+ -- link editor replaces the NOP instruction with a load of the TOC
+ -- from the stack to restore the TOC.
+ maybeNOP = case gcp of
+ GCP32ELF -> nilOL
+ -- See Section 3.9.4 of OpenPower ABI
+ GCPAIX -> unitOL NOP
+ -- See Section 3.5.11 of PPC64 ELF v1.9
+ GCP64ELF 1 -> unitOL NOP
+ -- See Section 2.3.6 of PPC64 ELF v2
+ GCP64ELF 2 -> unitOL NOP
+ _ -> panic "maybeNOP: Unknown PowerPC 64-bit ABI"
+
+ passArguments [] _ _ stackOffset accumCode accumUsed = return (stackOffset, accumCode, accumUsed)
+ passArguments ((arg,arg_ty,_):args) gprs fprs stackOffset
+ accumCode accumUsed | isWord64 arg_ty
+ && target32Bit (targetPlatform dflags) =
+ do
+ ChildCode64 code vr_lo <- iselExpr64 arg
+ let vr_hi = getHiVRegFromLo vr_lo
+
+ case gcp of
+ GCPAIX ->
+ do let storeWord vr (gpr:_) _ = MR gpr vr
+ storeWord vr [] offset
+ = ST II32 vr (AddrRegImm sp (ImmInt offset))
+ passArguments args
+ (drop 2 gprs)
+ fprs
+ (stackOffset+8)
+ (accumCode `appOL` code
+ `snocOL` storeWord vr_hi gprs stackOffset
+ `snocOL` storeWord vr_lo (drop 1 gprs) (stackOffset+4))
+ ((take 2 gprs) ++ accumUsed)
+ GCP32ELF ->
+ do let stackOffset' = roundTo 8 stackOffset
+ stackCode = accumCode `appOL` code
+ `snocOL` ST II32 vr_hi (AddrRegImm sp (ImmInt stackOffset'))
+ `snocOL` ST II32 vr_lo (AddrRegImm sp (ImmInt (stackOffset'+4)))
+ regCode hireg loreg =
+ accumCode `appOL` code
+ `snocOL` MR hireg vr_hi
+ `snocOL` MR loreg vr_lo
+
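+                 -- SysV requires an I64 to start in an odd-numbered GPR.
+                 -- allArgRegs is r3..r10, so the next register is
+                 -- odd-numbered exactly when an even number of argument
+                 -- GPRs is still free; otherwise one GPR is skipped.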
+ case gprs of
+ hireg : loreg : regs | even (length gprs) ->
+ passArguments args regs fprs stackOffset
+ (regCode hireg loreg) (hireg : loreg : accumUsed)
+ _skipped : hireg : loreg : regs ->
+ passArguments args regs fprs stackOffset
+ (regCode hireg loreg) (hireg : loreg : accumUsed)
+ _ -> -- only one or no regs left
+ passArguments args [] fprs (stackOffset'+8)
+ stackCode accumUsed
+ GCP64ELF _ -> panic "passArguments: 32 bit code"
+
+ passArguments ((arg,rep,hint):args) gprs fprs stackOffset accumCode accumUsed
+ | reg : _ <- regs = do
+ register <- getRegister arg_pro
+ let code = case register of
+ Fixed _ freg fcode -> fcode `snocOL` MR reg freg
+ Any _ acode -> acode reg
+ stackOffsetRes = case gcp of
+ -- The PowerOpen ABI requires that we
+ -- reserve stack slots for register
+ -- parameters
+ GCPAIX -> stackOffset + stackBytes
+ -- ... the SysV ABI 32-bit doesn't.
+ GCP32ELF -> stackOffset
+ -- ... but SysV ABI 64-bit does.
+ GCP64ELF _ -> stackOffset + stackBytes
+ passArguments args
+ (drop nGprs gprs)
+ (drop nFprs fprs)
+ stackOffsetRes
+ (accumCode `appOL` code)
+ (reg : accumUsed)
+ | otherwise = do
+ (vr, code) <- getSomeReg arg_pro
+ passArguments args
+ (drop nGprs gprs)
+ (drop nFprs fprs)
+ (stackOffset' + stackBytes)
+ (accumCode `appOL` code
+ `snocOL` ST format_pro vr stackSlot)
+ accumUsed
+ where
+ arg_pro
+ | isBitsType rep = CmmMachOp (conv_op (typeWidth rep) (wordWidth dflags)) [arg]
+ | otherwise = arg
+ format_pro
+ | isBitsType rep = intFormat (wordWidth dflags)
+ | otherwise = cmmTypeFormat rep
+ conv_op = case hint of
+ SignedHint -> MO_SS_Conv
+ _ -> MO_UU_Conv
+
+ stackOffset' = case gcp of
+ GCPAIX ->
+                     -- The 32-bit PowerOpen ABI is happy with
+                     -- 32-bit alignment ...
+ stackOffset
+ GCP32ELF
+ -- ... the SysV ABI requires 8-byte
+ -- alignment for doubles.
+ | isFloatType rep && typeWidth rep == W64 ->
+ roundTo 8 stackOffset
+ | otherwise ->
+ stackOffset
+ GCP64ELF _ ->
+ -- Everything on the stack is mapped to
+ -- 8-byte aligned doublewords
+ stackOffset
+ stackOffset''
+ | isFloatType rep && typeWidth rep == W32 =
+ case gcp of
+ -- The ELF v1 ABI Section 3.2.3 requires:
+ -- "Single precision floating point values
+ -- are mapped to the second word in a single
+ -- doubleword"
+ GCP64ELF 1 -> stackOffset' + 4
+ _ -> stackOffset'
+ | otherwise = stackOffset'
+
+ stackSlot = AddrRegImm sp (ImmInt stackOffset'')
+ (nGprs, nFprs, stackBytes, regs)
+ = case gcp of
+ GCPAIX ->
+ case cmmTypeFormat rep of
+ II8 -> (1, 0, 4, gprs)
+ II16 -> (1, 0, 4, gprs)
+ II32 -> (1, 0, 4, gprs)
+ -- The PowerOpen ABI requires that we skip a
+ -- corresponding number of GPRs when we use
+ -- the FPRs.
+ --
+ -- E.g. for a `double` two GPRs are skipped,
+ -- whereas for a `float` one GPR is skipped
+ -- when parameters are assigned to
+ -- registers.
+ --
+ -- The PowerOpen ABI specification can be found at
+ -- ftp://www.sourceware.org/pub/binutils/ppc-docs/ppc-poweropen/
+ FF32 -> (1, 1, 4, fprs)
+ FF64 -> (2, 1, 8, fprs)
+ II64 -> panic "genCCall' passArguments II64"
+
+ GCP32ELF ->
+ case cmmTypeFormat rep of
+ II8 -> (1, 0, 4, gprs)
+ II16 -> (1, 0, 4, gprs)
+ II32 -> (1, 0, 4, gprs)
+ -- ... the SysV ABI doesn't.
+ FF32 -> (0, 1, 4, fprs)
+ FF64 -> (0, 1, 8, fprs)
+ II64 -> panic "genCCall' passArguments II64"
+ GCP64ELF _ ->
+ case cmmTypeFormat rep of
+ II8 -> (1, 0, 8, gprs)
+ II16 -> (1, 0, 8, gprs)
+ II32 -> (1, 0, 8, gprs)
+ II64 -> (1, 0, 8, gprs)
+ -- The ELFv1 ABI requires that we skip a
+ -- corresponding number of GPRs when we use
+ -- the FPRs.
+ FF32 -> (1, 1, 8, fprs)
+ FF64 -> (1, 1, 8, fprs)
+
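+    -- Results come back in r3 (or in r3/r4 for a 64-bit result on
+    -- 32-bit targets, high word in r3) or in f1 for floating point.
+    -- When a 32-bit float primop was lowered to the double-precision
+    -- libm routine (reduceToFF32), FRSP rounds the f1 result back to
+    -- single precision.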
+ moveResult reduceToFF32 =
+ case dest_regs of
+ [] -> nilOL
+ [dest]
+ | reduceToFF32 && isFloat32 rep -> unitOL (FRSP r_dest f1)
+ | isFloat32 rep || isFloat64 rep -> unitOL (MR r_dest f1)
+ | isWord64 rep && target32Bit (targetPlatform dflags)
+ -> toOL [MR (getHiVRegFromLo r_dest) r3,
+ MR r_dest r4]
+ | otherwise -> unitOL (MR r_dest r3)
+ where rep = cmmRegType dflags (CmmLocal dest)
+ r_dest = getRegisterReg platform (CmmLocal dest)
+ _ -> panic "genCCall' moveResult: Bad dest_regs"
+
+ outOfLineMachOp mop =
+ do
+ dflags <- getDynFlags
+ mopExpr <- cmmMakeDynamicReference dflags CallReference $
+ mkForeignLabel functionName Nothing ForeignLabelInThisPackage IsFunction
+ let mopLabelOrExpr = case mopExpr of
+ CmmLit (CmmLabel lbl) -> Left lbl
+ _ -> Right mopExpr
+ return (mopLabelOrExpr, reduce)
+ where
+ (functionName, reduce) = case mop of
+ MO_F32_Exp -> (fsLit "exp", True)
+ MO_F32_ExpM1 -> (fsLit "expm1", True)
+ MO_F32_Log -> (fsLit "log", True)
+ MO_F32_Log1P -> (fsLit "log1p", True)
+ MO_F32_Sqrt -> (fsLit "sqrt", True)
+ MO_F32_Fabs -> unsupported
+
+ MO_F32_Sin -> (fsLit "sin", True)
+ MO_F32_Cos -> (fsLit "cos", True)
+ MO_F32_Tan -> (fsLit "tan", True)
+
+ MO_F32_Asin -> (fsLit "asin", True)
+ MO_F32_Acos -> (fsLit "acos", True)
+ MO_F32_Atan -> (fsLit "atan", True)
+
+ MO_F32_Sinh -> (fsLit "sinh", True)
+ MO_F32_Cosh -> (fsLit "cosh", True)
+ MO_F32_Tanh -> (fsLit "tanh", True)
+ MO_F32_Pwr -> (fsLit "pow", True)
+
+ MO_F32_Asinh -> (fsLit "asinh", True)
+ MO_F32_Acosh -> (fsLit "acosh", True)
+ MO_F32_Atanh -> (fsLit "atanh", True)
+
+ MO_F64_Exp -> (fsLit "exp", False)
+ MO_F64_ExpM1 -> (fsLit "expm1", False)
+ MO_F64_Log -> (fsLit "log", False)
+ MO_F64_Log1P -> (fsLit "log1p", False)
+ MO_F64_Sqrt -> (fsLit "sqrt", False)
+ MO_F64_Fabs -> unsupported
+
+ MO_F64_Sin -> (fsLit "sin", False)
+ MO_F64_Cos -> (fsLit "cos", False)
+ MO_F64_Tan -> (fsLit "tan", False)
+
+ MO_F64_Asin -> (fsLit "asin", False)
+ MO_F64_Acos -> (fsLit "acos", False)
+ MO_F64_Atan -> (fsLit "atan", False)
+
+ MO_F64_Sinh -> (fsLit "sinh", False)
+ MO_F64_Cosh -> (fsLit "cosh", False)
+ MO_F64_Tanh -> (fsLit "tanh", False)
+ MO_F64_Pwr -> (fsLit "pow", False)
+
+ MO_F64_Asinh -> (fsLit "asinh", False)
+ MO_F64_Acosh -> (fsLit "acosh", False)
+ MO_F64_Atanh -> (fsLit "atanh", False)
+
+ MO_UF_Conv w -> (fsLit $ word2FloatLabel w, False)
+
+ MO_Memcpy _ -> (fsLit "memcpy", False)
+ MO_Memset _ -> (fsLit "memset", False)
+ MO_Memmove _ -> (fsLit "memmove", False)
+ MO_Memcmp _ -> (fsLit "memcmp", False)
+
+ MO_BSwap w -> (fsLit $ bSwapLabel w, False)
+ MO_BRev w -> (fsLit $ bRevLabel w, False)
+ MO_PopCnt w -> (fsLit $ popCntLabel w, False)
+ MO_Pdep w -> (fsLit $ pdepLabel w, False)
+ MO_Pext w -> (fsLit $ pextLabel w, False)
+ MO_Clz _ -> unsupported
+ MO_Ctz _ -> unsupported
+ MO_AtomicRMW {} -> unsupported
+ MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False)
+ MO_AtomicRead _ -> unsupported
+ MO_AtomicWrite _ -> unsupported
+
+ MO_S_Mul2 {} -> unsupported
+ MO_S_QuotRem {} -> unsupported
+ MO_U_QuotRem {} -> unsupported
+ MO_U_QuotRem2 {} -> unsupported
+ MO_Add2 {} -> unsupported
+ MO_AddWordC {} -> unsupported
+ MO_SubWordC {} -> unsupported
+ MO_AddIntC {} -> unsupported
+ MO_SubIntC {} -> unsupported
+ MO_U_Mul2 {} -> unsupported
+ MO_ReadBarrier -> unsupported
+ MO_WriteBarrier -> unsupported
+ MO_Touch -> unsupported
+ MO_Prefetch_Data _ -> unsupported
+        unsupported = panic ("outOfLineMachOp: " ++ show mop
+ ++ " not supported")
+
+-- -----------------------------------------------------------------------------
+-- Generating a table-branch
+
+genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
+genSwitch dflags expr targets
+ | OSAIX <- platformOS (targetPlatform dflags)
+ = do
+ (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
+ let fmt = archWordFormat $ target32Bit $ targetPlatform dflags
+ sha = if target32Bit $ targetPlatform dflags then 2 else 3
+ tmp <- getNewRegNat fmt
+ lbl <- getNewLabelNat
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ (tableReg,t_code) <- getSomeReg $ dynRef
+ let code = e_code `appOL` t_code `appOL` toOL [
+ SL fmt tmp reg (RIImm (ImmInt sha)),
+ LD fmt tmp (AddrRegReg tableReg tmp),
+ MTCTR tmp,
+ BCTR ids (Just lbl) []
+ ]
+ return code
+
+ | (positionIndependent dflags) || (not $ target32Bit $ targetPlatform dflags)
+ = do
+ (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
+ let fmt = archWordFormat $ target32Bit $ targetPlatform dflags
+ sha = if target32Bit $ targetPlatform dflags then 2 else 3
+ tmp <- getNewRegNat fmt
+ lbl <- getNewLabelNat
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ (tableReg,t_code) <- getSomeReg $ dynRef
+ let code = e_code `appOL` t_code `appOL` toOL [
+ SL fmt tmp reg (RIImm (ImmInt sha)),
+ LD fmt tmp (AddrRegReg tableReg tmp),
+ ADD tmp tmp (RIReg tableReg),
+ MTCTR tmp,
+ BCTR ids (Just lbl) []
+ ]
+ return code
+ | otherwise
+ = do
+ (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
+ let fmt = archWordFormat $ target32Bit $ targetPlatform dflags
+ sha = if target32Bit $ targetPlatform dflags then 2 else 3
+ tmp <- getNewRegNat fmt
+ lbl <- getNewLabelNat
+ let code = e_code `appOL` toOL [
+ SL fmt tmp reg (RIImm (ImmInt sha)),
+ ADDIS tmp tmp (HA (ImmCLbl lbl)),
+ LD fmt tmp (AddrRegImm tmp (LO (ImmCLbl lbl))),
+ MTCTR tmp,
+ BCTR ids (Just lbl) []
+ ]
+ return code
+ where (offset, ids) = switchTargetsToTable targets
+
+generateJumpTableForInstr :: DynFlags -> Instr
+ -> Maybe (NatCmmDecl RawCmmStatics Instr)
+generateJumpTableForInstr dflags (BCTR ids (Just lbl) _) =
+ let jumpTable
+ | (positionIndependent dflags)
+ || (not $ target32Bit $ targetPlatform dflags)
+ = map jumpTableEntryRel ids
+ | otherwise = map (jumpTableEntry dflags) ids
+ where jumpTableEntryRel Nothing
+ = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+ jumpTableEntryRel (Just blockid)
+ = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0
+ (wordWidth dflags))
+ where blockLabel = blockLbl blockid
+ in Just (CmmData (Section ReadOnlyData lbl) (RawCmmStatics lbl jumpTable))
+generateJumpTableForInstr _ _ = Nothing
+
+-- -----------------------------------------------------------------------------
+-- 'condIntReg' and 'condFltReg': condition codes into registers
+
+-- Turn those condition codes into integers now (when they appear on
+-- the right hand side of an assignment).
+
+
+
+condReg :: NatM CondCode -> NatM Register
+condReg getCond = do
+ CondCode _ cond cond_code <- getCond
+ dflags <- getDynFlags
+ let
+ code dst = cond_code
+ `appOL` negate_code
+ `appOL` toOL [
+ MFCR dst,
+ RLWINM dst dst (bit + 1) 31 31
+ ]
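+        -- MFCR copies the whole condition register into dst; RLWINM then
+        -- rotates it left by (bit+1) and masks bit 31, leaving CR0 bit
+        -- 'bit' (0 = LT, 1 = GT, 2 = EQ) as the low bit of dst.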
+
+ negate_code | do_negate = unitOL (CRNOR bit bit bit)
+ | otherwise = nilOL
+
+ (bit, do_negate) = case cond of
+ LTT -> (0, False)
+ LE -> (1, True)
+ EQQ -> (2, False)
+ GE -> (0, True)
+ GTT -> (1, False)
+
+ NE -> (2, True)
+
+ LU -> (0, False)
+ LEU -> (1, True)
+ GEU -> (0, True)
+ GU -> (1, False)
+ _ -> panic "PPC.CodeGen.codeReg: no match"
+
+ format = archWordFormat $ target32Bit $ targetPlatform dflags
+ return (Any format code)
+
+condIntReg :: Cond -> Width -> CmmExpr -> CmmExpr -> NatM Register
+condIntReg cond width x y = condReg (condIntCode cond width x y)
+condFltReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
+condFltReg cond x y = condReg (condFltCode cond x y)
+
+
+
+-- -----------------------------------------------------------------------------
+-- 'trivial*Code': deal with trivial instructions
+
+-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
+-- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions.
+-- Only look for constants on the right hand side, because that's
+-- where the generic optimizer will have put them.
+
+-- Similarly, for unary instructions, we don't have to worry about
+-- matching a literal (CmmInt) as the argument, because the generic
+-- optimizer will already have handled the constant-folding.
+
+
+
+{-
+Wolfgang's PowerPC version of The Rules:
+
+A slightly modified version of The Rules to take advantage of the fact
+that PowerPC instructions work on all registers and don't implicitly
+clobber any fixed registers.
+
+* The only expression for which getRegister returns Fixed is (CmmReg reg).
+
+* If getRegister returns Any, then the code it generates may modify only:
+ (a) fresh temporaries
+ (b) the destination register
+ It may *not* modify global registers, unless the global
+ register happens to be the destination register.
+ It may not clobber any other registers. In fact, only ccalls clobber any
+ fixed registers.
+ Also, it may not modify the counter register (used by genCCall).
+
+ Corollary: If a getRegister for a subexpression returns Fixed, you need
+ not move it to a fresh temporary before evaluating the next subexpression.
+ The Fixed register won't be modified.
+ Therefore, we don't need a counterpart for the x86's getStableReg on PPC.
+
+* SDM's First Rule is valid for PowerPC, too: subexpressions can depend on
+ the value of the destination register.
+-}
+
+trivialCode
+ :: Width
+ -> Bool
+ -> (Reg -> Reg -> RI -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+
+trivialCode rep signed instr x (CmmLit (CmmInt y _))
+ | Just imm <- makeImmediate rep signed y
+ = do
+ (src1, code1) <- getSomeReg x
+ let code dst = code1 `snocOL` instr dst src1 (RIImm imm)
+ return (Any (intFormat rep) code)
+
+trivialCode rep _ instr x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let code dst = code1 `appOL` code2 `snocOL` instr dst src1 (RIReg src2)
+ return (Any (intFormat rep) code)
+
+shiftMulCode
+ :: Width
+ -> Bool
+ -> (Format-> Reg -> Reg -> RI -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+shiftMulCode width sign instr x (CmmLit (CmmInt y _))
+ | Just imm <- makeImmediate width sign y
+ = do
+ (src1, code1) <- getSomeReg x
+ let format = intFormat width
+ let ins_fmt = intFormat (max W32 width)
+ let code dst = code1 `snocOL` instr ins_fmt dst src1 (RIImm imm)
+ return (Any format code)
+
+shiftMulCode width _ instr x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let format = intFormat width
+ let ins_fmt = intFormat (max W32 width)
+ let code dst = code1 `appOL` code2
+ `snocOL` instr ins_fmt dst src1 (RIReg src2)
+ return (Any format code)
+
+trivialCodeNoImm' :: Format -> (Reg -> Reg -> Reg -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialCodeNoImm' format instr x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let code dst = code1 `appOL` code2 `snocOL` instr dst src1 src2
+ return (Any format code)
+
+trivialCodeNoImm :: Format -> (Format -> Reg -> Reg -> Reg -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialCodeNoImm format instr x y
+ = trivialCodeNoImm' format (instr format) x y
+
+srCode :: Width -> Bool -> (Format-> Reg -> Reg -> RI -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+srCode width sgn instr x (CmmLit (CmmInt y _))
+ | Just imm <- makeImmediate width sgn y
+ = do
+ let op_len = max W32 width
+ extend = if sgn then extendSExpr else extendUExpr
+ (src1, code1) <- getSomeReg (extend width op_len x)
+ let code dst = code1 `snocOL`
+ instr (intFormat op_len) dst src1 (RIImm imm)
+ return (Any (intFormat width) code)
+
+srCode width sgn instr x y = do
+ let op_len = max W32 width
+ extend = if sgn then extendSExpr else extendUExpr
+ (src1, code1) <- getSomeReg (extend width op_len x)
+ (src2, code2) <- getSomeReg (extendUExpr width op_len y)
+ -- Note: Shift amount `y` is unsigned
+ let code dst = code1 `appOL` code2 `snocOL`
+ instr (intFormat op_len) dst src1 (RIReg src2)
+ return (Any (intFormat width) code)
+
+divCode :: Width -> Bool -> CmmExpr -> CmmExpr -> NatM Register
+divCode width sgn x y = do
+ let op_len = max W32 width
+ extend = if sgn then extendSExpr else extendUExpr
+ (src1, code1) <- getSomeReg (extend width op_len x)
+ (src2, code2) <- getSomeReg (extend width op_len y)
+ let code dst = code1 `appOL` code2 `snocOL`
+ DIV (intFormat op_len) sgn dst src1 src2
+ return (Any (intFormat width) code)
+
+
+trivialUCode :: Format
+ -> (Reg -> Reg -> Instr)
+ -> CmmExpr
+ -> NatM Register
+trivialUCode rep instr x = do
+ (src, code) <- getSomeReg x
+ let code' dst = code `snocOL` instr dst src
+ return (Any rep code')
+
+-- There is no "remainder" instruction on the PPC, so we have to do
+-- it the hard way.
+-- The "sgn" parameter is the signedness for the division instruction
+
+remainderCode :: Width -> Bool -> Reg -> CmmExpr -> CmmExpr
+ -> NatM (Reg -> InstrBlock)
+remainderCode rep sgn reg_q arg_x arg_y = do
+ let op_len = max W32 rep
+ fmt = intFormat op_len
+ extend = if sgn then extendSExpr else extendUExpr
+ (x_reg, x_code) <- getSomeReg (extend rep op_len arg_x)
+ (y_reg, y_code) <- getSomeReg (extend rep op_len arg_y)
+ return $ \reg_r -> y_code `appOL` x_code
+ `appOL` toOL [ DIV fmt sgn reg_q x_reg y_reg
+ , MULL fmt reg_r reg_q (RIReg y_reg)
+ , SUBF reg_r reg_r x_reg
+ ]
+
+
+coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
+coerceInt2FP fromRep toRep x = do
+ dflags <- getDynFlags
+ let arch = platformArch $ targetPlatform dflags
+ coerceInt2FP' arch fromRep toRep x
+
+coerceInt2FP' :: Arch -> Width -> Width -> CmmExpr -> NatM Register
+coerceInt2FP' ArchPPC fromRep toRep x = do
+ (src, code) <- getSomeReg x
+ lbl <- getNewLabelNat
+ itmp <- getNewRegNat II32
+ ftmp <- getNewRegNat FF64
+ dflags <- getDynFlags
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ Amode addr addr_code <- getAmode D dynRef
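+  -- The code below uses the classic double-precision bias trick: the
+  -- constant 0x4330000080000000 is the double 2^52 + 2^31.  Storing
+  -- 0x43300000 as the high word and (src XOR 0x80000000) as the low
+  -- word builds 2^52 + 2^31 + src (signed) on the stack; loading that
+  -- as FF64 and subtracting the constant leaves src as a double in dst.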
+ let
+ code' dst = code `appOL` maybe_exts `appOL` toOL [
+ LDATA (Section ReadOnlyData lbl) $ RawCmmStatics lbl
+ [CmmStaticLit (CmmInt 0x43300000 W32),
+ CmmStaticLit (CmmInt 0x80000000 W32)],
+ XORIS itmp src (ImmInt 0x8000),
+ ST II32 itmp (spRel dflags 3),
+ LIS itmp (ImmInt 0x4330),
+ ST II32 itmp (spRel dflags 2),
+ LD FF64 ftmp (spRel dflags 2)
+ ] `appOL` addr_code `appOL` toOL [
+ LD FF64 dst addr,
+ FSUB FF64 dst ftmp dst
+ ] `appOL` maybe_frsp dst
+
+ maybe_exts = case fromRep of
+ W8 -> unitOL $ EXTS II8 src src
+ W16 -> unitOL $ EXTS II16 src src
+ W32 -> nilOL
+ _ -> panic "PPC.CodeGen.coerceInt2FP: no match"
+
+ maybe_frsp dst
+ = case toRep of
+ W32 -> unitOL $ FRSP dst dst
+ W64 -> nilOL
+ _ -> panic "PPC.CodeGen.coerceInt2FP: no match"
+
+ return (Any (floatFormat toRep) code')
+
+-- On ELF v1 Linux we use the compiler doubleword in the stack frame as
+-- scratch space; on ELF v2 Linux the same slot is the TOC pointer
+-- doubleword. The latter is only set right before a call and restored
+-- right after return from the call, so using it here is fine.
+coerceInt2FP' (ArchPPC_64 _) fromRep toRep x = do
+ (src, code) <- getSomeReg x
+ dflags <- getDynFlags
+ let
+ code' dst = code `appOL` maybe_exts `appOL` toOL [
+ ST II64 src (spRel dflags 3),
+ LD FF64 dst (spRel dflags 3),
+ FCFID dst dst
+ ] `appOL` maybe_frsp dst
+
+ maybe_exts = case fromRep of
+ W8 -> unitOL $ EXTS II8 src src
+ W16 -> unitOL $ EXTS II16 src src
+ W32 -> unitOL $ EXTS II32 src src
+ W64 -> nilOL
+ _ -> panic "PPC.CodeGen.coerceInt2FP: no match"
+
+ maybe_frsp dst
+ = case toRep of
+ W32 -> unitOL $ FRSP dst dst
+ W64 -> nilOL
+ _ -> panic "PPC.CodeGen.coerceInt2FP: no match"
+
+ return (Any (floatFormat toRep) code')
+
+coerceInt2FP' _ _ _ _ = panic "PPC.CodeGen.coerceInt2FP: unknown arch"
+
+
+coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
+coerceFP2Int fromRep toRep x = do
+ dflags <- getDynFlags
+ let arch = platformArch $ targetPlatform dflags
+ coerceFP2Int' arch fromRep toRep x
+
+coerceFP2Int' :: Arch -> Width -> Width -> CmmExpr -> NatM Register
+coerceFP2Int' ArchPPC _ toRep x = do
+ dflags <- getDynFlags
+ -- the reps don't really matter: F*->FF64 and II32->I* are no-ops
+ (src, code) <- getSomeReg x
+ tmp <- getNewRegNat FF64
+ let
+ code' dst = code `appOL` toOL [
+ -- convert to int in FP reg
+ FCTIWZ tmp src,
+ -- store value (64bit) from FP to stack
+ ST FF64 tmp (spRel dflags 2),
+ -- read low word of value (high word is undefined)
+ LD II32 dst (spRel dflags 3)]
+ return (Any (intFormat toRep) code')
+
+coerceFP2Int' (ArchPPC_64 _) _ toRep x = do
+ dflags <- getDynFlags
+ -- the reps don't really matter: F*->FF64 and II64->I* are no-ops
+ (src, code) <- getSomeReg x
+ tmp <- getNewRegNat FF64
+ let
+ code' dst = code `appOL` toOL [
+ -- convert to int in FP reg
+ FCTIDZ tmp src,
+ -- store value (64bit) from FP to compiler word on stack
+ ST FF64 tmp (spRel dflags 3),
+ LD II64 dst (spRel dflags 3)]
+ return (Any (intFormat toRep) code')
+
+coerceFP2Int' _ _ _ _ = panic "PPC.CodeGen.coerceFP2Int: unknown arch"
+
+-- Note [.LCTOC1 in PPC PIC code]
+-- The .LCTOC1 label is defined to point 32768 bytes into the GOT table
+-- to make the most of the PPC's 16-bit displacements.
+-- Since a 16-bit signed offset is used (usually via addi/lwz instructions),
+-- the first element ends up at offset '-32768' relative to .LCTOC1.
+
+-- Note [implicit register in PPC PIC code]
+-- On PPC, calls in PIC code are generated by label, in the form:
+--      bl puts+32768@plt
+-- In this form the GHC NCG cannot see directly that r30 (PicBaseReg)
+-- is used, yet r30 is a required part of the PLT stub:
+-- puts+32768@plt:
+-- lwz r11,-30484(r30) ; offset in .LCTOC1
+-- mtctr r11
+-- bctr
diff --git a/compiler/GHC/CmmToAsm/PPC/Cond.hs b/compiler/GHC/CmmToAsm/PPC/Cond.hs
new file mode 100644
index 0000000000..e8efa30064
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/Cond.hs
@@ -0,0 +1,63 @@
+module GHC.CmmToAsm.PPC.Cond (
+ Cond(..),
+ condNegate,
+ condUnsigned,
+ condToSigned,
+ condToUnsigned,
+)
+
+where
+
+import GhcPrelude
+
+import Panic
+
+data Cond
+ = ALWAYS
+ | EQQ
+ | GE
+ | GEU
+ | GTT
+ | GU
+ | LE
+ | LEU
+ | LTT
+ | LU
+ | NE
+ deriving Eq
+
+
+condNegate :: Cond -> Cond
+condNegate ALWAYS = panic "condNegate: ALWAYS"
+condNegate EQQ = NE
+condNegate GE = LTT
+condNegate GEU = LU
+condNegate GTT = LE
+condNegate GU = LEU
+condNegate LE = GTT
+condNegate LEU = GU
+condNegate LTT = GE
+condNegate LU = GEU
+condNegate NE = EQQ
+
+-- Condition utils
+condUnsigned :: Cond -> Bool
+condUnsigned GU = True
+condUnsigned LU = True
+condUnsigned GEU = True
+condUnsigned LEU = True
+condUnsigned _ = False
+
+condToSigned :: Cond -> Cond
+condToSigned GU = GTT
+condToSigned LU = LTT
+condToSigned GEU = GE
+condToSigned LEU = LE
+condToSigned x = x
+
+condToUnsigned :: Cond -> Cond
+condToUnsigned GTT = GU
+condToUnsigned LTT = LU
+condToUnsigned GE = GEU
+condToUnsigned LE = LEU
+condToUnsigned x = x
diff --git a/compiler/GHC/CmmToAsm/PPC/Instr.hs b/compiler/GHC/CmmToAsm/PPC/Instr.hs
new file mode 100644
index 0000000000..b92a952340
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/Instr.hs
@@ -0,0 +1,713 @@
+{-# LANGUAGE CPP #-}
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-----------------------------------------------------------------------------
+--
+-- Machine-dependent assembly language
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-----------------------------------------------------------------------------
+
+#include "HsVersions.h"
+
+module GHC.CmmToAsm.PPC.Instr (
+ archWordFormat,
+ RI(..),
+ Instr(..),
+ stackFrameHeaderSize,
+ maxSpillSlots,
+ allocMoreStack,
+ makeFarBranches
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.PPC.Regs
+import GHC.CmmToAsm.PPC.Cond
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Reg.Target
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GHC.Platform.Regs
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Driver.Session
+import GHC.Cmm
+import GHC.Cmm.Info
+import FastString
+import GHC.Cmm.CLabel
+import Outputable
+import GHC.Platform
+import UniqFM (listToUFM, lookupUFM)
+import UniqSupply
+
+import Control.Monad (replicateM)
+import Data.Maybe (fromMaybe)
+
+--------------------------------------------------------------------------------
+-- Format of a PPC memory address.
+--
+archWordFormat :: Bool -> Format
+archWordFormat is32Bit
+ | is32Bit = II32
+ | otherwise = II64
+
+
+-- | Instruction instance for powerpc
+instance Instruction Instr where
+ regUsageOfInstr = ppc_regUsageOfInstr
+ patchRegsOfInstr = ppc_patchRegsOfInstr
+ isJumpishInstr = ppc_isJumpishInstr
+ jumpDestsOfInstr = ppc_jumpDestsOfInstr
+ patchJumpInstr = ppc_patchJumpInstr
+ mkSpillInstr = ppc_mkSpillInstr
+ mkLoadInstr = ppc_mkLoadInstr
+ takeDeltaInstr = ppc_takeDeltaInstr
+ isMetaInstr = ppc_isMetaInstr
+ mkRegRegMoveInstr _ = ppc_mkRegRegMoveInstr
+ takeRegRegMoveInstr = ppc_takeRegRegMoveInstr
+ mkJumpInstr = ppc_mkJumpInstr
+ mkStackAllocInstr = ppc_mkStackAllocInstr
+ mkStackDeallocInstr = ppc_mkStackDeallocInstr
+
+
+ppc_mkStackAllocInstr :: Platform -> Int -> [Instr]
+ppc_mkStackAllocInstr platform amount
+ = ppc_mkStackAllocInstr' platform (-amount)
+
+ppc_mkStackDeallocInstr :: Platform -> Int -> [Instr]
+ppc_mkStackDeallocInstr platform amount
+ = ppc_mkStackAllocInstr' platform amount
+
+ppc_mkStackAllocInstr' :: Platform -> Int -> [Instr]
+ppc_mkStackAllocInstr' platform amount
+ | fits16Bits amount
+ = [ LD fmt r0 (AddrRegImm sp zero)
+ , STU fmt r0 (AddrRegImm sp immAmount)
+ ]
+ | otherwise
+ = [ LD fmt r0 (AddrRegImm sp zero)
+ , ADDIS tmp sp (HA immAmount)
+ , ADD tmp tmp (RIImm (LO immAmount))
+ , STU fmt r0 (AddrRegReg sp tmp)
+ ]
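+    -- The back-chain word at 0(sp) is copied via r0 to the new top of
+    -- stack while STU updates sp in the same instruction, so the back
+    -- chain stays valid when the frame is grown or shrunk here.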
+ where
+ fmt = intFormat $ widthFromBytes (platformWordSizeInBytes platform)
+ zero = ImmInt 0
+ tmp = tmpReg platform
+ immAmount = ImmInt amount
+
+--
+-- See note [extra spill slots] in X86/Instr.hs
+--
+allocMoreStack
+ :: Platform
+ -> Int
+ -> NatCmmDecl statics GHC.CmmToAsm.PPC.Instr.Instr
+ -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.PPC.Instr.Instr, [(BlockId,BlockId)])
+
+allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
+allocMoreStack platform slots (CmmProc info lbl live (ListGraph code)) = do
+ let
+ infos = mapKeys info
+ entries = case code of
+ [] -> infos
+ BasicBlock entry _ : _ -- first block is the entry point
+ | entry `elem` infos -> infos
+ | otherwise -> entry : infos
+
+ uniqs <- replicateM (length entries) getUniqueM
+
+ let
+ delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
+ where x = slots * spillSlotSize -- sp delta
+
+ alloc = mkStackAllocInstr platform delta
+ dealloc = mkStackDeallocInstr platform delta
+
+ retargetList = (zip entries (map mkBlockId uniqs))
+
+ new_blockmap :: LabelMap BlockId
+ new_blockmap = mapFromList retargetList
+
+ insert_stack_insns (BasicBlock id insns)
+ | Just new_blockid <- mapLookup id new_blockmap
+ = [ BasicBlock id $ alloc ++ [BCC ALWAYS new_blockid Nothing]
+ , BasicBlock new_blockid block'
+ ]
+ | otherwise
+ = [ BasicBlock id block' ]
+ where
+ block' = foldr insert_dealloc [] insns
+
+ insert_dealloc insn r
+ -- BCTR might or might not be a non-local jump. For
+ -- "labeled-goto" we use JMP, and for "computed-goto" we
+ -- use MTCTR followed by BCTR. See 'PPC.CodeGen.genJump'.
+ = case insn of
+ JMP _ _ -> dealloc ++ (insn : r)
+ BCTR [] Nothing _ -> dealloc ++ (insn : r)
+ BCTR ids label rs -> BCTR (map (fmap retarget) ids) label rs : r
+ BCCFAR cond b p -> BCCFAR cond (retarget b) p : r
+ BCC cond b p -> BCC cond (retarget b) p : r
+ _ -> insn : r
+ -- BL and BCTRL are call-like instructions rather than
+ -- jumps, and are used only for C calls.
+
+ retarget :: BlockId -> BlockId
+ retarget b
+ = fromMaybe b (mapLookup b new_blockmap)
+
+ new_code
+ = concatMap insert_stack_insns code
+
+ -- in
+ return (CmmProc info lbl live (ListGraph new_code),retargetList)
+
+
+-- -----------------------------------------------------------------------------
+-- Machine's assembly language
+
+-- We have a few common "instructions" (nearly all the pseudo-ops) but
+-- mostly all of 'Instr' is machine-specific.
+
+-- Register or immediate
+data RI
+ = RIReg Reg
+ | RIImm Imm
+
+data Instr
+ -- comment pseudo-op
+ = COMMENT FastString
+
+ -- some static data spat out during code
+ -- generation. Will be extracted before
+ -- pretty-printing.
+ | LDATA Section RawCmmStatics
+
+ -- start a new basic block. Useful during
+ -- codegen, removed later. Preceding
+ -- instruction should be a jump, as per the
+ -- invariants for a BasicBlock (see Cmm).
+ | NEWBLOCK BlockId
+
+ -- specify current stack offset for
+ -- benefit of subsequent passes
+ | DELTA Int
+
+ -- Loads and stores.
+ | LD Format Reg AddrMode -- Load format, dst, src
+ | LDFAR Format Reg AddrMode -- Load format, dst, src 32 bit offset
+ | LDR Format Reg AddrMode -- Load and reserve format, dst, src
+ | LA Format Reg AddrMode -- Load arithmetic format, dst, src
+ | ST Format Reg AddrMode -- Store format, src, dst
+ | STFAR Format Reg AddrMode -- Store format, src, dst 32 bit offset
+ | STU Format Reg AddrMode -- Store with Update format, src, dst
+ | STC Format Reg AddrMode -- Store conditional format, src, dst
+ | LIS Reg Imm -- Load Immediate Shifted dst, src
+ | LI Reg Imm -- Load Immediate dst, src
+ | MR Reg Reg -- Move Register dst, src -- also for fmr
+
+ | CMP Format Reg RI -- format, src1, src2
+ | CMPL Format Reg RI -- format, src1, src2
+
+ | BCC Cond BlockId (Maybe Bool) -- cond, block, hint
+ | BCCFAR Cond BlockId (Maybe Bool) -- cond, block, hint
+ -- hint:
+ -- Just True: branch likely taken
+ -- Just False: branch likely not taken
+ -- Nothing: no hint
+ | JMP CLabel [Reg] -- same as branch,
+ -- but with CLabel instead of block ID
+ -- and live global registers
+ | MTCTR Reg
+ | BCTR [Maybe BlockId] (Maybe CLabel) [Reg]
+ -- with list of local destinations, and
+ -- jump table location if necessary
+ | BL CLabel [Reg] -- with list of argument regs
+ | BCTRL [Reg]
+
+ | ADD Reg Reg RI -- dst, src1, src2
+ | ADDO Reg Reg Reg -- add and set overflow
+ | ADDC Reg Reg Reg -- (carrying) dst, src1, src2
+ | ADDE Reg Reg Reg -- (extended) dst, src1, src2
+ | ADDZE Reg Reg -- (to zero extended) dst, src
+ | ADDIS Reg Reg Imm -- Add Immediate Shifted dst, src1, src2
+ | SUBF Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1
+ | SUBFO Reg Reg Reg -- subtract from and set overflow
+ | SUBFC Reg Reg RI -- (carrying) dst, src1, src2 ;
+ -- dst = src2 - src1
+ | SUBFE Reg Reg Reg -- (extended) dst, src1, src2 ;
+ -- dst = src2 - src1
+ | MULL Format Reg Reg RI
+ | MULLO Format Reg Reg Reg -- multiply and set overflow
+ | MFOV Format Reg -- move overflow bit (1|33) to register
+ -- pseudo-instruction; pretty printed as
+ -- mfxer dst
+ -- extr[w|d]i dst, dst, 1, [1|33]
+ | MULHU Format Reg Reg Reg
+ | DIV Format Bool Reg Reg Reg
+ | AND Reg Reg RI -- dst, src1, src2
+ | ANDC Reg Reg Reg -- AND with complement, dst = src1 & ~ src2
+ | NAND Reg Reg Reg -- dst, src1, src2
+ | OR Reg Reg RI -- dst, src1, src2
+ | ORIS Reg Reg Imm -- OR Immediate Shifted dst, src1, src2
+ | XOR Reg Reg RI -- dst, src1, src2
+ | XORIS Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2
+
+ | EXTS Format Reg Reg
+ | CNTLZ Format Reg Reg
+
+ | NEG Reg Reg
+ | NOT Reg Reg
+
+ | SL Format Reg Reg RI -- shift left
+ | SR Format Reg Reg RI -- shift right
+ | SRA Format Reg Reg RI -- shift right arithmetic
+
+ | RLWINM Reg Reg Int Int Int -- Rotate Left Word Immediate then AND with Mask
+ | CLRLI Format Reg Reg Int -- clear left immediate (extended mnemonic)
+ | CLRRI Format Reg Reg Int -- clear right immediate (extended mnemonic)
+
+ | FADD Format Reg Reg Reg
+ | FSUB Format Reg Reg Reg
+ | FMUL Format Reg Reg Reg
+ | FDIV Format Reg Reg Reg
+ | FABS Reg Reg -- abs is the same for single and double
+ | FNEG Reg Reg -- negate is the same for single and double prec.
+
+ | FCMP Reg Reg
+
+ | FCTIWZ Reg Reg -- convert to integer word
+ | FCTIDZ Reg Reg -- convert to integer double word
+ | FCFID Reg Reg -- convert from integer double word
+ | FRSP Reg Reg -- reduce to single precision
+ -- (but destination is a FP register)
+
+ | CRNOR Int Int Int -- condition register nor
+ | MFCR Reg -- move from condition register
+
+ | MFLR Reg -- move from link register
+ | FETCHPC Reg -- pseudo-instruction:
+ -- bcl to next insn, mflr reg
+ | HWSYNC -- heavy weight sync
+ | ISYNC -- instruction synchronize
+ | LWSYNC -- memory barrier
+ | NOP -- no operation, PowerPC 64 bit
+ -- needs this as place holder to
+ -- reload TOC pointer
+
+-- | Get the registers that are being used by this instruction.
+-- regUsage doesn't need to do any trickery for jumps and such.
+-- Just state precisely the regs read and written by that insn.
+-- The consequences of control flow transfers, as far as register
+-- allocation goes, are taken care of by the register allocator.
+--
+ppc_regUsageOfInstr :: Platform -> Instr -> RegUsage
+ppc_regUsageOfInstr platform instr
+ = case instr of
+ LD _ reg addr -> usage (regAddr addr, [reg])
+ LDFAR _ reg addr -> usage (regAddr addr, [reg])
+ LDR _ reg addr -> usage (regAddr addr, [reg])
+ LA _ reg addr -> usage (regAddr addr, [reg])
+ ST _ reg addr -> usage (reg : regAddr addr, [])
+ STFAR _ reg addr -> usage (reg : regAddr addr, [])
+ STU _ reg addr -> usage (reg : regAddr addr, [])
+ STC _ reg addr -> usage (reg : regAddr addr, [])
+ LIS reg _ -> usage ([], [reg])
+ LI reg _ -> usage ([], [reg])
+ MR reg1 reg2 -> usage ([reg2], [reg1])
+ CMP _ reg ri -> usage (reg : regRI ri,[])
+ CMPL _ reg ri -> usage (reg : regRI ri,[])
+ BCC _ _ _ -> noUsage
+ BCCFAR _ _ _ -> noUsage
+ JMP _ regs -> usage (regs, [])
+ MTCTR reg -> usage ([reg],[])
+ BCTR _ _ regs -> usage (regs, [])
+ BL _ params -> usage (params, callClobberedRegs platform)
+ BCTRL params -> usage (params, callClobberedRegs platform)
+
+ ADD reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ ADDO reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ ADDC reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ ADDE reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ ADDZE reg1 reg2 -> usage ([reg2], [reg1])
+ ADDIS reg1 reg2 _ -> usage ([reg2], [reg1])
+ SUBF reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ SUBFO reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ SUBFC reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ SUBFE reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ MULL _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ MULLO _ reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ MFOV _ reg -> usage ([], [reg])
+ MULHU _ reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ DIV _ _ reg1 reg2 reg3
+ -> usage ([reg2,reg3], [reg1])
+
+ AND reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ ANDC reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ NAND reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ OR reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ ORIS reg1 reg2 _ -> usage ([reg2], [reg1])
+ XOR reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ XORIS reg1 reg2 _ -> usage ([reg2], [reg1])
+ EXTS _ reg1 reg2 -> usage ([reg2], [reg1])
+ CNTLZ _ reg1 reg2 -> usage ([reg2], [reg1])
+ NEG reg1 reg2 -> usage ([reg2], [reg1])
+ NOT reg1 reg2 -> usage ([reg2], [reg1])
+ SL _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ SR _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ SRA _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ RLWINM reg1 reg2 _ _ _ -> usage ([reg2], [reg1])
+ CLRLI _ reg1 reg2 _ -> usage ([reg2], [reg1])
+ CLRRI _ reg1 reg2 _ -> usage ([reg2], [reg1])
+
+ FADD _ r1 r2 r3 -> usage ([r2,r3], [r1])
+ FSUB _ r1 r2 r3 -> usage ([r2,r3], [r1])
+ FMUL _ r1 r2 r3 -> usage ([r2,r3], [r1])
+ FDIV _ r1 r2 r3 -> usage ([r2,r3], [r1])
+ FABS r1 r2 -> usage ([r2], [r1])
+ FNEG r1 r2 -> usage ([r2], [r1])
+ FCMP r1 r2 -> usage ([r1,r2], [])
+ FCTIWZ r1 r2 -> usage ([r2], [r1])
+ FCTIDZ r1 r2 -> usage ([r2], [r1])
+ FCFID r1 r2 -> usage ([r2], [r1])
+ FRSP r1 r2 -> usage ([r2], [r1])
+ MFCR reg -> usage ([], [reg])
+ MFLR reg -> usage ([], [reg])
+ FETCHPC reg -> usage ([], [reg])
+ _ -> noUsage
+ where
+ usage (src, dst) = RU (filter (interesting platform) src)
+ (filter (interesting platform) dst)
+ regAddr (AddrRegReg r1 r2) = [r1, r2]
+ regAddr (AddrRegImm r1 _) = [r1]
+
+ regRI (RIReg r) = [r]
+ regRI _ = []
+
+interesting :: Platform -> Reg -> Bool
+interesting _ (RegVirtual _) = True
+interesting platform (RegReal (RealRegSingle i)) = freeReg platform i
+interesting _ (RegReal (RealRegPair{}))
+ = panic "PPC.Instr.interesting: no reg pairs on this arch"
+
+
+
+-- | Apply a given mapping to all the register references in this
+-- instruction.
+ppc_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
+ppc_patchRegsOfInstr instr env
+ = case instr of
+ LD fmt reg addr -> LD fmt (env reg) (fixAddr addr)
+ LDFAR fmt reg addr -> LDFAR fmt (env reg) (fixAddr addr)
+ LDR fmt reg addr -> LDR fmt (env reg) (fixAddr addr)
+ LA fmt reg addr -> LA fmt (env reg) (fixAddr addr)
+ ST fmt reg addr -> ST fmt (env reg) (fixAddr addr)
+ STFAR fmt reg addr -> STFAR fmt (env reg) (fixAddr addr)
+ STU fmt reg addr -> STU fmt (env reg) (fixAddr addr)
+ STC fmt reg addr -> STC fmt (env reg) (fixAddr addr)
+ LIS reg imm -> LIS (env reg) imm
+ LI reg imm -> LI (env reg) imm
+ MR reg1 reg2 -> MR (env reg1) (env reg2)
+ CMP fmt reg ri -> CMP fmt (env reg) (fixRI ri)
+ CMPL fmt reg ri -> CMPL fmt (env reg) (fixRI ri)
+ BCC cond lbl p -> BCC cond lbl p
+ BCCFAR cond lbl p -> BCCFAR cond lbl p
+ JMP l regs -> JMP l regs -- global regs will not be remapped
+ MTCTR reg -> MTCTR (env reg)
+ BCTR targets lbl rs -> BCTR targets lbl rs
+ BL imm argRegs -> BL imm argRegs -- argument regs
+ BCTRL argRegs -> BCTRL argRegs -- cannot be remapped
+ ADD reg1 reg2 ri -> ADD (env reg1) (env reg2) (fixRI ri)
+ ADDO reg1 reg2 reg3 -> ADDO (env reg1) (env reg2) (env reg3)
+ ADDC reg1 reg2 reg3 -> ADDC (env reg1) (env reg2) (env reg3)
+ ADDE reg1 reg2 reg3 -> ADDE (env reg1) (env reg2) (env reg3)
+ ADDZE reg1 reg2 -> ADDZE (env reg1) (env reg2)
+ ADDIS reg1 reg2 imm -> ADDIS (env reg1) (env reg2) imm
+ SUBF reg1 reg2 reg3 -> SUBF (env reg1) (env reg2) (env reg3)
+ SUBFO reg1 reg2 reg3 -> SUBFO (env reg1) (env reg2) (env reg3)
+ SUBFC reg1 reg2 ri -> SUBFC (env reg1) (env reg2) (fixRI ri)
+ SUBFE reg1 reg2 reg3 -> SUBFE (env reg1) (env reg2) (env reg3)
+ MULL fmt reg1 reg2 ri
+ -> MULL fmt (env reg1) (env reg2) (fixRI ri)
+ MULLO fmt reg1 reg2 reg3
+ -> MULLO fmt (env reg1) (env reg2) (env reg3)
+ MFOV fmt reg -> MFOV fmt (env reg)
+ MULHU fmt reg1 reg2 reg3
+ -> MULHU fmt (env reg1) (env reg2) (env reg3)
+ DIV fmt sgn reg1 reg2 reg3
+ -> DIV fmt sgn (env reg1) (env reg2) (env reg3)
+
+ AND reg1 reg2 ri -> AND (env reg1) (env reg2) (fixRI ri)
+ ANDC reg1 reg2 reg3 -> ANDC (env reg1) (env reg2) (env reg3)
+ NAND reg1 reg2 reg3 -> NAND (env reg1) (env reg2) (env reg3)
+ OR reg1 reg2 ri -> OR (env reg1) (env reg2) (fixRI ri)
+ ORIS reg1 reg2 imm -> ORIS (env reg1) (env reg2) imm
+ XOR reg1 reg2 ri -> XOR (env reg1) (env reg2) (fixRI ri)
+ XORIS reg1 reg2 imm -> XORIS (env reg1) (env reg2) imm
+ EXTS fmt reg1 reg2 -> EXTS fmt (env reg1) (env reg2)
+ CNTLZ fmt reg1 reg2 -> CNTLZ fmt (env reg1) (env reg2)
+ NEG reg1 reg2 -> NEG (env reg1) (env reg2)
+ NOT reg1 reg2 -> NOT (env reg1) (env reg2)
+ SL fmt reg1 reg2 ri
+ -> SL fmt (env reg1) (env reg2) (fixRI ri)
+ SR fmt reg1 reg2 ri
+ -> SR fmt (env reg1) (env reg2) (fixRI ri)
+ SRA fmt reg1 reg2 ri
+ -> SRA fmt (env reg1) (env reg2) (fixRI ri)
+ RLWINM reg1 reg2 sh mb me
+ -> RLWINM (env reg1) (env reg2) sh mb me
+ CLRLI fmt reg1 reg2 n -> CLRLI fmt (env reg1) (env reg2) n
+ CLRRI fmt reg1 reg2 n -> CLRRI fmt (env reg1) (env reg2) n
+ FADD fmt r1 r2 r3 -> FADD fmt (env r1) (env r2) (env r3)
+ FSUB fmt r1 r2 r3 -> FSUB fmt (env r1) (env r2) (env r3)
+ FMUL fmt r1 r2 r3 -> FMUL fmt (env r1) (env r2) (env r3)
+ FDIV fmt r1 r2 r3 -> FDIV fmt (env r1) (env r2) (env r3)
+ FABS r1 r2 -> FABS (env r1) (env r2)
+ FNEG r1 r2 -> FNEG (env r1) (env r2)
+ FCMP r1 r2 -> FCMP (env r1) (env r2)
+ FCTIWZ r1 r2 -> FCTIWZ (env r1) (env r2)
+ FCTIDZ r1 r2 -> FCTIDZ (env r1) (env r2)
+ FCFID r1 r2 -> FCFID (env r1) (env r2)
+ FRSP r1 r2 -> FRSP (env r1) (env r2)
+ MFCR reg -> MFCR (env reg)
+ MFLR reg -> MFLR (env reg)
+ FETCHPC reg -> FETCHPC (env reg)
+ _ -> instr
+ where
+ fixAddr (AddrRegReg r1 r2) = AddrRegReg (env r1) (env r2)
+ fixAddr (AddrRegImm r1 i) = AddrRegImm (env r1) i
+
+ fixRI (RIReg r) = RIReg (env r)
+ fixRI other = other
+
+
+--------------------------------------------------------------------------------
+-- | Checks whether this instruction is a jump/branch instruction.
+-- One that can change the flow of control in a way that the
+-- register allocator needs to worry about.
+ppc_isJumpishInstr :: Instr -> Bool
+ppc_isJumpishInstr instr
+ = case instr of
+ BCC{} -> True
+ BCCFAR{} -> True
+ BCTR{} -> True
+ BCTRL{} -> True
+ BL{} -> True
+ JMP{} -> True
+ _ -> False
+
+
+-- | Find the local destinations of this jump/branch instruction,
+-- i.e. the block ids it can transfer control to, as far as the
+-- register allocator is concerned.
+ppc_jumpDestsOfInstr :: Instr -> [BlockId]
+ppc_jumpDestsOfInstr insn
+ = case insn of
+ BCC _ id _ -> [id]
+ BCCFAR _ id _ -> [id]
+ BCTR targets _ _ -> [id | Just id <- targets]
+ _ -> []
+
+
+-- | Change the destination of this jump instruction.
+-- Used in the linear allocator when adding fixup blocks for join
+-- points.
+ppc_patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr
+ppc_patchJumpInstr insn patchF
+ = case insn of
+ BCC cc id p -> BCC cc (patchF id) p
+ BCCFAR cc id p -> BCCFAR cc (patchF id) p
+ BCTR ids lbl rs -> BCTR (map (fmap patchF) ids) lbl rs
+ _ -> insn
+
+
+-- -----------------------------------------------------------------------------
+
+-- | An instruction to spill a register into a spill slot.
+ppc_mkSpillInstr
+ :: DynFlags
+ -> Reg -- register to spill
+ -> Int -- current stack delta
+ -> Int -- spill slot to use
+ -> Instr
+
+ppc_mkSpillInstr dflags reg delta slot
+ = let platform = targetPlatform dflags
+ off = spillSlotToOffset dflags slot
+ arch = platformArch platform
+ in
+ let fmt = case targetClassOfReg platform reg of
+ RcInteger -> case arch of
+ ArchPPC -> II32
+ _ -> II64
+ RcDouble -> FF64
+ _ -> panic "PPC.Instr.mkSpillInstr: no match"
+ instr = case makeImmediate W32 True (off-delta) of
+ Just _ -> ST
+ Nothing -> STFAR -- pseudo instruction: 32 bit offsets
+
+ in instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))
+
+
+ppc_mkLoadInstr
+ :: DynFlags
+ -> Reg -- register to load
+ -> Int -- current stack delta
+ -> Int -- spill slot to use
+ -> Instr
+
+ppc_mkLoadInstr dflags reg delta slot
+ = let platform = targetPlatform dflags
+ off = spillSlotToOffset dflags slot
+ arch = platformArch platform
+ in
+ let fmt = case targetClassOfReg platform reg of
+ RcInteger -> case arch of
+ ArchPPC -> II32
+ _ -> II64
+ RcDouble -> FF64
+ _ -> panic "PPC.Instr.mkLoadInstr: no match"
+ instr = case makeImmediate W32 True (off-delta) of
+ Just _ -> LD
+ Nothing -> LDFAR -- pseudo instruction: 32 bit offsets
+
+ in instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))
+
+
+-- | The size of a minimal stackframe header including minimal
+-- parameter save area.
+stackFrameHeaderSize :: DynFlags -> Int
+stackFrameHeaderSize dflags
+ = case platformOS platform of
+ OSAIX -> 24 + 8 * 4
+ _ -> case platformArch platform of
+ -- header + parameter save area
+ ArchPPC -> 64 -- TODO: check ABI spec
+ ArchPPC_64 ELF_V1 -> 48 + 8 * 8
+ ArchPPC_64 ELF_V2 -> 32 + 8 * 8
+ _ -> panic "PPC.stackFrameHeaderSize: not defined for this OS"
+ where platform = targetPlatform dflags
+
+-- | The maximum number of bytes required to spill a register. PPC32
+-- has 32-bit GPRs and 64-bit FPRs, while PPC64 has 64-bit GPRs and
+-- 64-bit FPRs, so the maximum is 8 on all platforms, unlike x86.
+-- Note that AltiVec's vector registers are 128-bit wide, so we
+-- must not use this to spill them.
+spillSlotSize :: Int
+spillSlotSize = 8
+
+-- | The number of spill slots available without allocating more.
+maxSpillSlots :: DynFlags -> Int
+maxSpillSlots dflags
+ = ((rESERVED_C_STACK_BYTES dflags - stackFrameHeaderSize dflags)
+ `div` spillSlotSize) - 1
+-- = 0 -- useful for testing allocMoreStack
+
+-- | The number of bytes that the stack pointer should be aligned
+-- to. This is 16 both on PPC32 and PPC64 ELF (see ELF processor
+-- specific supplements).
+stackAlign :: Int
+stackAlign = 16
+
+-- | Convert a spill slot number to a *byte* offset, with no sign.
+spillSlotToOffset :: DynFlags -> Int -> Int
+spillSlotToOffset dflags slot
+ = stackFrameHeaderSize dflags + spillSlotSize * slot
+
+
+--------------------------------------------------------------------------------
+-- | See if this instruction is telling us the current C stack delta
+ppc_takeDeltaInstr
+ :: Instr
+ -> Maybe Int
+
+ppc_takeDeltaInstr instr
+ = case instr of
+ DELTA i -> Just i
+ _ -> Nothing
+
+
+ppc_isMetaInstr
+ :: Instr
+ -> Bool
+
+ppc_isMetaInstr instr
+ = case instr of
+ COMMENT{} -> True
+ LDATA{} -> True
+ NEWBLOCK{} -> True
+ DELTA{} -> True
+ _ -> False
+
+
+-- | Copy the value in a register to another one.
+-- Must work for all register classes.
+ppc_mkRegRegMoveInstr
+ :: Reg
+ -> Reg
+ -> Instr
+
+ppc_mkRegRegMoveInstr src dst
+ = MR dst src
+
+
+-- | Make an unconditional jump instruction.
+ppc_mkJumpInstr
+ :: BlockId
+ -> [Instr]
+
+ppc_mkJumpInstr id
+ = [BCC ALWAYS id Nothing]
+
+
+-- | Take the source and destination from this reg -> reg move instruction
+-- or Nothing if it's not one
+ppc_takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg)
+ppc_takeRegRegMoveInstr (MR dst src) = Just (src,dst)
+ppc_takeRegRegMoveInstr _ = Nothing
+
+-- -----------------------------------------------------------------------------
+-- Making far branches
+
+-- Conditional branches on PowerPC are limited to +-32KB; if our Procs get too
+-- big, we have to work around this limitation.
+
+makeFarBranches
+ :: LabelMap RawCmmStatics
+ -> [NatBasicBlock Instr]
+ -> [NatBasicBlock Instr]
+makeFarBranches info_env blocks
+ | last blockAddresses < nearLimit = blocks
+ | otherwise = zipWith handleBlock blockAddresses blocks
+ where
+ blockAddresses = scanl (+) 0 $ map blockLen blocks
+ blockLen (BasicBlock _ instrs) = length instrs
+
+ handleBlock addr (BasicBlock id instrs)
+ = BasicBlock id (zipWith makeFar [addr..] instrs)
+
+ makeFar _ (BCC ALWAYS tgt _) = BCC ALWAYS tgt Nothing
+ makeFar addr (BCC cond tgt p)
+ | abs (addr - targetAddr) >= nearLimit
+ = BCCFAR cond tgt p
+ | otherwise
+ = BCC cond tgt p
+ where Just targetAddr = lookupUFM blockAddressMap tgt
+ makeFar _ other = other
+
+ -- 8192 instructions are allowed; let's keep some distance, as
+ -- we have a few pseudo-insns that are pretty-printed as
+ -- multiple instructions, and it's just not worth the effort
+ -- to calculate things exactly
+ nearLimit = 7000 - mapSize info_env * maxRetInfoTableSizeW
+
+ blockAddressMap = listToUFM $ zip (map blockId blocks) blockAddresses
diff --git a/compiler/GHC/CmmToAsm/PPC/Ppr.hs b/compiler/GHC/CmmToAsm/PPC/Ppr.hs
new file mode 100644
index 0000000000..550bd618ef
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/Ppr.hs
@@ -0,0 +1,994 @@
+-----------------------------------------------------------------------------
+--
+-- Pretty-printing assembly language
+--
+-- (c) The University of Glasgow 1993-2005
+--
+-----------------------------------------------------------------------------
+
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+module GHC.CmmToAsm.PPC.Ppr (pprNatCmmDecl) where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.PPC.Regs
+import GHC.CmmToAsm.PPC.Instr
+import GHC.CmmToAsm.PPC.Cond
+import GHC.CmmToAsm.Ppr
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+import GHC.CmmToAsm.Reg.Target
+
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import GHC.Cmm.Ppr.Expr () -- For Outputable instances
+
+import Unique ( pprUniqueAlways, getUnique )
+import GHC.Platform
+import FastString
+import Outputable
+import GHC.Driver.Session
+
+import Data.Word
+import Data.Int
+import Data.Bits
+
+-- -----------------------------------------------------------------------------
+-- Printing this stuff out
+
+pprNatCmmDecl :: NatCmmDecl RawCmmStatics Instr -> SDoc
+pprNatCmmDecl (CmmData section dats) =
+ pprSectionAlign section $$ pprDatas dats
+
+pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
+ case topInfoTable proc of
+ Nothing ->
+ sdocWithPlatform $ \platform ->
+ -- special case for code without info table:
+ pprSectionAlign (Section Text lbl) $$
+ (case platformArch platform of
+ ArchPPC_64 ELF_V1 -> pprFunctionDescriptor lbl
+ ArchPPC_64 ELF_V2 -> pprFunctionPrologue lbl
+ _ -> pprLabel lbl) $$ -- blocks guaranteed not null,
+ -- so label needed
+ vcat (map (pprBasicBlock top_info) blocks)
+
+ Just (RawCmmStatics info_lbl _) ->
+ sdocWithPlatform $ \platform ->
+ pprSectionAlign (Section Text info_lbl) $$
+ (if platformHasSubsectionsViaSymbols platform
+ then ppr (mkDeadStripPreventer info_lbl) <> char ':'
+ else empty) $$
+ vcat (map (pprBasicBlock top_info) blocks) $$
+ -- above: Even the first block gets a label, because with branch-chain
+ -- elimination, it might be the target of a goto.
+ (if platformHasSubsectionsViaSymbols platform
+ then
+ -- See Note [Subsections Via Symbols] in X86/Ppr.hs
+ text "\t.long "
+ <+> ppr info_lbl
+ <+> char '-'
+ <+> ppr (mkDeadStripPreventer info_lbl)
+ else empty)
+
+pprFunctionDescriptor :: CLabel -> SDoc
+pprFunctionDescriptor lab = pprGloblDecl lab
+ $$ text "\t.section \".opd\", \"aw\""
+ $$ text "\t.align 3"
+ $$ ppr lab <> char ':'
+ $$ text "\t.quad ."
+ <> ppr lab
+ <> text ",.TOC.@tocbase,0"
+ $$ text "\t.previous"
+ $$ text "\t.type"
+ <+> ppr lab
+ <> text ", @function"
+ $$ char '.' <> ppr lab <> char ':'
+
+pprFunctionPrologue :: CLabel ->SDoc
+pprFunctionPrologue lab = pprGloblDecl lab
+ $$ text ".type "
+ <> ppr lab
+ <> text ", @function"
+ $$ ppr lab <> char ':'
+ $$ text "0:\taddis\t" <> pprReg toc
+ <> text ",12,.TOC.-0b@ha"
+ $$ text "\taddi\t" <> pprReg toc
+ <> char ',' <> pprReg toc <> text ",.TOC.-0b@l"
+ $$ text "\t.localentry\t" <> ppr lab
+ <> text ",.-" <> ppr lab
+
+pprBasicBlock :: LabelMap RawCmmStatics -> NatBasicBlock Instr -> SDoc
+pprBasicBlock info_env (BasicBlock blockid instrs)
+ = maybe_infotable $$
+ pprLabel (blockLbl blockid) $$
+ vcat (map pprInstr instrs)
+ where
+ maybe_infotable = case mapLookup blockid info_env of
+ Nothing -> empty
+ Just (RawCmmStatics info_lbl info) ->
+ pprAlignForSection Text $$
+ vcat (map pprData info) $$
+ pprLabel info_lbl
+
+
+
+pprDatas :: RawCmmStatics -> SDoc
+-- See note [emit-time elimination of static indirections] in CLabel.
+pprDatas (RawCmmStatics alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
+ | lbl == mkIndStaticInfoLabel
+ , let labelInd (CmmLabelOff l _) = Just l
+ labelInd (CmmLabel l) = Just l
+ labelInd _ = Nothing
+ , Just ind' <- labelInd ind
+ , alias `mayRedirectTo` ind'
+ = pprGloblDecl alias
+ $$ text ".equiv" <+> ppr alias <> comma <> ppr (CmmLabel ind')
+pprDatas (RawCmmStatics lbl dats) = vcat (pprLabel lbl : map pprData dats)
+
+pprData :: CmmStatic -> SDoc
+pprData (CmmString str) = pprBytes str
+pprData (CmmUninitialised bytes) = text ".space " <> int bytes
+pprData (CmmStaticLit lit) = pprDataItem lit
+
+pprGloblDecl :: CLabel -> SDoc
+pprGloblDecl lbl
+ | not (externallyVisibleCLabel lbl) = empty
+ | otherwise = text ".globl " <> ppr lbl
+
+pprTypeAndSizeDecl :: CLabel -> SDoc
+pprTypeAndSizeDecl lbl
+ = sdocWithPlatform $ \platform ->
+ if platformOS platform == OSLinux && externallyVisibleCLabel lbl
+ then text ".type " <>
+ ppr lbl <> text ", @object"
+ else empty
+
+pprLabel :: CLabel -> SDoc
+pprLabel lbl = pprGloblDecl lbl
+ $$ pprTypeAndSizeDecl lbl
+ $$ (ppr lbl <> char ':')
+
+-- -----------------------------------------------------------------------------
+-- pprInstr: print an 'Instr'
+
+instance Outputable Instr where
+ ppr instr = pprInstr instr
+
+
+pprReg :: Reg -> SDoc
+
+pprReg r
+ = case r of
+ RegReal (RealRegSingle i) -> ppr_reg_no i
+ RegReal (RealRegPair{}) -> panic "PPC.pprReg: no reg pairs on this arch"
+ RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u
+
+ where
+ ppr_reg_no :: Int -> SDoc
+ ppr_reg_no i
+ | i <= 31 = int i -- GPRs
+ | i <= 63 = int (i-32) -- FPRs
+ | otherwise = text "very naughty powerpc register"
+
+
+
+pprFormat :: Format -> SDoc
+pprFormat x
+ = ptext (case x of
+ II8 -> sLit "b"
+ II16 -> sLit "h"
+ II32 -> sLit "w"
+ II64 -> sLit "d"
+ FF32 -> sLit "fs"
+ FF64 -> sLit "fd")
+
+
+pprCond :: Cond -> SDoc
+pprCond c
+ = ptext (case c of {
+ ALWAYS -> sLit "";
+ EQQ -> sLit "eq"; NE -> sLit "ne";
+ LTT -> sLit "lt"; GE -> sLit "ge";
+ GTT -> sLit "gt"; LE -> sLit "le";
+ LU -> sLit "lt"; GEU -> sLit "ge";
+ GU -> sLit "gt"; LEU -> sLit "le"; })
+
+
+pprImm :: Imm -> SDoc
+
+pprImm (ImmInt i) = int i
+pprImm (ImmInteger i) = integer i
+pprImm (ImmCLbl l) = ppr l
+pprImm (ImmIndex l i) = ppr l <> char '+' <> int i
+pprImm (ImmLit s) = s
+
+pprImm (ImmFloat _) = text "naughty float immediate"
+pprImm (ImmDouble _) = text "naughty double immediate"
+
+pprImm (ImmConstantSum a b) = pprImm a <> char '+' <> pprImm b
+pprImm (ImmConstantDiff a b) = pprImm a <> char '-'
+ <> lparen <> pprImm b <> rparen
+
+pprImm (LO (ImmInt i)) = pprImm (LO (ImmInteger (toInteger i)))
+pprImm (LO (ImmInteger i)) = pprImm (ImmInteger (toInteger lo16))
+ where
+ lo16 = fromInteger (i .&. 0xffff) :: Int16
+
+pprImm (LO i)
+ = pprImm i <> text "@l"
+
+pprImm (HI i)
+ = pprImm i <> text "@h"
+
+pprImm (HA (ImmInt i)) = pprImm (HA (ImmInteger (toInteger i)))
+pprImm (HA (ImmInteger i)) = pprImm (ImmInteger ha16)
+ where
+ ha16 = if lo16 >= 0x8000 then hi16+1 else hi16
+ hi16 = (i `shiftR` 16)
+ lo16 = i .&. 0xffff
+
+pprImm (HA i)
+ = pprImm i <> text "@ha"
+
+pprImm (HIGHERA i)
+ = pprImm i <> text "@highera"
+
+pprImm (HIGHESTA i)
+ = pprImm i <> text "@highesta"
+
+
+pprAddr :: AddrMode -> SDoc
+pprAddr (AddrRegReg r1 r2)
+ = pprReg r1 <> char ',' <+> pprReg r2
+pprAddr (AddrRegImm r1 (ImmInt i))
+ = hcat [ int i, char '(', pprReg r1, char ')' ]
+pprAddr (AddrRegImm r1 (ImmInteger i))
+ = hcat [ integer i, char '(', pprReg r1, char ')' ]
+pprAddr (AddrRegImm r1 imm)
+ = hcat [ pprImm imm, char '(', pprReg r1, char ')' ]
+
+
+pprSectionAlign :: Section -> SDoc
+pprSectionAlign sec@(Section seg _) =
+ sdocWithPlatform $ \platform ->
+ pprSectionHeader platform sec $$
+ pprAlignForSection seg
+
+-- | Print appropriate alignment for the given section type.
+pprAlignForSection :: SectionType -> SDoc
+pprAlignForSection seg =
+ sdocWithPlatform $ \platform ->
+ let ppc64 = not $ target32Bit platform
+ in ptext $ case seg of
+ Text -> sLit ".align 2"
+ Data
+ | ppc64 -> sLit ".align 3"
+ | otherwise -> sLit ".align 2"
+ ReadOnlyData
+ | ppc64 -> sLit ".align 3"
+ | otherwise -> sLit ".align 2"
+ RelocatableReadOnlyData
+ | ppc64 -> sLit ".align 3"
+ | otherwise -> sLit ".align 2"
+ UninitialisedData
+ | ppc64 -> sLit ".align 3"
+ | otherwise -> sLit ".align 2"
+ ReadOnlyData16 -> sLit ".align 4"
+ -- TODO: This is copied from the ReadOnlyData case, but it can likely be
+ -- made more efficient.
+ CString
+ | ppc64 -> sLit ".align 3"
+ | otherwise -> sLit ".align 2"
+ OtherSection _ -> panic "PprMach.pprSectionAlign: unknown section"
+
+pprDataItem :: CmmLit -> SDoc
+pprDataItem lit
+ = sdocWithDynFlags $ \dflags ->
+ vcat (ppr_item (cmmTypeFormat $ cmmLitType dflags lit) lit dflags)
+ where
+ imm = litToImm lit
+ archPPC_64 dflags = not $ target32Bit $ targetPlatform dflags
+
+ ppr_item II8 _ _ = [text "\t.byte\t" <> pprImm imm]
+
+ ppr_item II32 _ _ = [text "\t.long\t" <> pprImm imm]
+
+ ppr_item II64 _ dflags
+ | archPPC_64 dflags = [text "\t.quad\t" <> pprImm imm]
+
+
+ ppr_item FF32 (CmmFloat r _) _
+ = let bs = floatToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item FF64 (CmmFloat r _) _
+ = let bs = doubleToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item II16 _ _ = [text "\t.short\t" <> pprImm imm]
+
+ ppr_item II64 (CmmInt x _) dflags
+ | not(archPPC_64 dflags) =
+ [text "\t.long\t"
+ <> int (fromIntegral
+ (fromIntegral (x `shiftR` 32) :: Word32)),
+ text "\t.long\t"
+ <> int (fromIntegral (fromIntegral x :: Word32))]
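+
+ -- Illustrative output (added commentary): on a 32-bit target the 64-bit
+ -- literal 0x0000000100000002 is emitted most-significant word first as
+ --
+ --     .long 1
+ --     .long 2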
+
+ ppr_item _ _ _
+ = panic "PPC.Ppr.pprDataItem: no match"
+
+
+pprInstr :: Instr -> SDoc
+
+pprInstr (COMMENT _) = empty -- nuke 'em
+{-
+pprInstr (COMMENT s) =
+ if platformOS platform == OSLinux
+ then text "# " <> ftext s
+ else text "; " <> ftext s
+-}
+pprInstr (DELTA d)
+ = pprInstr (COMMENT (mkFastString ("\tdelta = " ++ show d)))
+
+pprInstr (NEWBLOCK _)
+ = panic "PprMach.pprInstr: NEWBLOCK"
+
+pprInstr (LDATA _ _)
+ = panic "PprMach.pprInstr: LDATA"
+
+{-
+pprInstr (SPILL reg slot)
+ = hcat [
+ text "\tSPILL",
+ char '\t',
+ pprReg reg,
+ comma,
+ text "SLOT" <> parens (int slot)]
+
+pprInstr (RELOAD slot reg)
+ = hcat [
+ text "\tRELOAD",
+ char '\t',
+ text "SLOT" <> parens (int slot),
+ comma,
+ pprReg reg]
+-}
+
+pprInstr (LD fmt reg addr) = hcat [
+ char '\t',
+ text "l",
+ ptext (case fmt of
+ II8 -> sLit "bz"
+ II16 -> sLit "hz"
+ II32 -> sLit "wz"
+ II64 -> sLit "d"
+ FF32 -> sLit "fs"
+ FF64 -> sLit "fd"
+ ),
+ case addr of AddrRegImm _ _ -> empty
+ AddrRegReg _ _ -> char 'x',
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprAddr addr
+ ]
+
+pprInstr (LDFAR fmt reg (AddrRegImm source off)) =
+ sdocWithPlatform $ \platform -> vcat [
+ pprInstr (ADDIS (tmpReg platform) source (HA off)),
+ pprInstr (LD fmt reg (AddrRegImm (tmpReg platform) (LO off)))
+ ]
+pprInstr (LDFAR _ _ _) =
+ panic "PPC.Ppr.pprInstr LDFAR: no match"
+
+pprInstr (LDR fmt reg1 addr) = hcat [
+ text "\tl",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC.Ppr.Instr LDR: no match",
+ text "arx\t",
+ pprReg reg1,
+ text ", ",
+ pprAddr addr
+ ]
+
+pprInstr (LA fmt reg addr) = hcat [
+ char '\t',
+ text "l",
+ ptext (case fmt of
+ II8 -> sLit "ba"
+ II16 -> sLit "ha"
+ II32 -> sLit "wa"
+ II64 -> sLit "d"
+ FF32 -> sLit "fs"
+ FF64 -> sLit "fd"
+ ),
+ case addr of AddrRegImm _ _ -> empty
+ AddrRegReg _ _ -> char 'x',
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprAddr addr
+ ]
+pprInstr (ST fmt reg addr) = hcat [
+ char '\t',
+ text "st",
+ pprFormat fmt,
+ case addr of AddrRegImm _ _ -> empty
+ AddrRegReg _ _ -> char 'x',
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprAddr addr
+ ]
+pprInstr (STFAR fmt reg (AddrRegImm source off)) =
+ sdocWithPlatform $ \platform -> vcat [
+ pprInstr (ADDIS (tmpReg platform) source (HA off)),
+ pprInstr (ST fmt reg (AddrRegImm (tmpReg platform) (LO off)))
+ ]
+pprInstr (STFAR _ _ _) =
+ panic "PPC.Ppr.pprInstr STFAR: no match"
+pprInstr (STU fmt reg addr) = hcat [
+ char '\t',
+ text "st",
+ pprFormat fmt,
+ char 'u',
+ case addr of AddrRegImm _ _ -> empty
+ AddrRegReg _ _ -> char 'x',
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprAddr addr
+ ]
+pprInstr (STC fmt reg1 addr) = hcat [
+ text "\tst",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC.Ppr.Instr STC: no match",
+ text "cx.\t",
+ pprReg reg1,
+ text ", ",
+ pprAddr addr
+ ]
+pprInstr (LIS reg imm) = hcat [
+ char '\t',
+ text "lis",
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprImm imm
+ ]
+pprInstr (LI reg imm) = hcat [
+ char '\t',
+ text "li",
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprImm imm
+ ]
+pprInstr (MR reg1 reg2)
+ | reg1 == reg2 = empty
+ | otherwise = hcat [
+ char '\t',
+ sdocWithPlatform $ \platform ->
+ case targetClassOfReg platform reg1 of
+ RcInteger -> text "mr"
+ _ -> text "fmr",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2
+ ]
+pprInstr (CMP fmt reg ri) = hcat [
+ char '\t',
+ op,
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprRI ri
+ ]
+ where
+ op = hcat [
+ text "cmp",
+ pprFormat fmt,
+ case ri of
+ RIReg _ -> empty
+ RIImm _ -> char 'i'
+ ]
+pprInstr (CMPL fmt reg ri) = hcat [
+ char '\t',
+ op,
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprRI ri
+ ]
+ where
+ op = hcat [
+ text "cmpl",
+ pprFormat fmt,
+ case ri of
+ RIReg _ -> empty
+ RIImm _ -> char 'i'
+ ]
+pprInstr (BCC cond blockid prediction) = hcat [
+ char '\t',
+ text "b",
+ pprCond cond,
+ pprPrediction prediction,
+ char '\t',
+ ppr lbl
+ ]
+ where lbl = mkLocalBlockLabel (getUnique blockid)
+ pprPrediction p = case p of
+ Nothing -> empty
+ Just True -> char '+'
+ Just False -> char '-'
+
+pprInstr (BCCFAR cond blockid prediction) = vcat [
+ hcat [
+ text "\tb",
+ pprCond (condNegate cond),
+ neg_prediction,
+ text "\t$+8"
+ ],
+ hcat [
+ text "\tb\t",
+ ppr lbl
+ ]
+ ]
+ where lbl = mkLocalBlockLabel (getUnique blockid)
+ neg_prediction = case prediction of
+ Nothing -> empty
+ Just True -> char '-'
+ Just False -> char '+'
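+
+-- Illustrative expansion (added commentary): a far conditional branch such
+-- as "beq+ L" is emitted as a short branch over an unconditional branch,
+-- because the conditional form has a much smaller displacement range:
+--
+--     bne- $+8      -- skip the next instruction if the condition fails
+--     b    L        -- the unconditional branch can reach the distant label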
+
+pprInstr (JMP lbl _)
+ -- We never jump to ForeignLabels; if we ever do, c.f. handling for "BL"
+ | isForeignLabel lbl = panic "PPC.Ppr.pprInstr: JMP to ForeignLabel"
+ | otherwise =
+ hcat [ -- an alias for b that takes a CLabel
+ char '\t',
+ text "b",
+ char '\t',
+ ppr lbl
+ ]
+
+pprInstr (MTCTR reg) = hcat [
+ char '\t',
+ text "mtctr",
+ char '\t',
+ pprReg reg
+ ]
+pprInstr (BCTR _ _ _) = hcat [
+ char '\t',
+ text "bctr"
+ ]
+pprInstr (BL lbl _) = do
+ sdocWithPlatform $ \platform -> case platformOS platform of
+ OSAIX ->
+ -- On AIX, "printf" denotes a function-descriptor (for use
+ -- by function pointers), whereas the actual entry-code
+ -- address is denoted by the dot-prefixed ".printf" label.
+ -- Moreover, the PPC NCG only ever emits a BL instruction
+ -- for calling C ABI functions. Most of the time these calls
+ -- originate from FFI imports and have a 'ForeignLabel',
+ -- but when profiling the codegen inserts calls via
+ -- 'emitRtsCallGen' which are 'CmmLabel's even though
+ -- they'd technically be more like 'ForeignLabel's.
+ hcat [
+ text "\tbl\t.",
+ ppr lbl
+ ]
+ _ ->
+ hcat [
+ text "\tbl\t",
+ ppr lbl
+ ]
+pprInstr (BCTRL _) = hcat [
+ char '\t',
+ text "bctrl"
+ ]
+pprInstr (ADD reg1 reg2 ri) = pprLogic (sLit "add") reg1 reg2 ri
+pprInstr (ADDIS reg1 reg2 imm) = hcat [
+ char '\t',
+ text "addis",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprImm imm
+ ]
+
+pprInstr (ADDO reg1 reg2 reg3) = pprLogic (sLit "addo") reg1 reg2 (RIReg reg3)
+pprInstr (ADDC reg1 reg2 reg3) = pprLogic (sLit "addc") reg1 reg2 (RIReg reg3)
+pprInstr (ADDE reg1 reg2 reg3) = pprLogic (sLit "adde") reg1 reg2 (RIReg reg3)
+pprInstr (ADDZE reg1 reg2) = pprUnary (sLit "addze") reg1 reg2
+pprInstr (SUBF reg1 reg2 reg3) = pprLogic (sLit "subf") reg1 reg2 (RIReg reg3)
+pprInstr (SUBFO reg1 reg2 reg3) = pprLogic (sLit "subfo") reg1 reg2 (RIReg reg3)
+pprInstr (SUBFC reg1 reg2 ri) = hcat [
+ char '\t',
+ text "subf",
+ case ri of
+ RIReg _ -> empty
+ RIImm _ -> char 'i',
+ text "c\t",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprRI ri
+ ]
+pprInstr (SUBFE reg1 reg2 reg3) = pprLogic (sLit "subfe") reg1 reg2 (RIReg reg3)
+pprInstr (MULL fmt reg1 reg2 ri) = pprMul fmt reg1 reg2 ri
+pprInstr (MULLO fmt reg1 reg2 reg3) = hcat [
+ char '\t',
+ text "mull",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format",
+ text "o\t",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprReg reg3
+ ]
+pprInstr (MFOV fmt reg) = vcat [
+ hcat [
+ char '\t',
+ text "mfxer",
+ char '\t',
+ pprReg reg
+ ],
+ hcat [
+ char '\t',
+ text "extr",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format",
+ text "i\t",
+ pprReg reg,
+ text ", ",
+ pprReg reg,
+ text ", 1, ",
+ case fmt of
+ II32 -> text "1"
+ II64 -> text "33"
+ _ -> panic "PPC: illegal format"
+ ]
+ ]
+
+pprInstr (MULHU fmt reg1 reg2 reg3) = hcat [
+ char '\t',
+ text "mulh",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format",
+ text "u\t",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprReg reg3
+ ]
+
+pprInstr (DIV fmt sgn reg1 reg2 reg3) = pprDiv fmt sgn reg1 reg2 reg3
+
+ -- for some reason, "andi" doesn't exist.
+ -- we'll use "andi." instead.
+pprInstr (AND reg1 reg2 (RIImm imm)) = hcat [
+ char '\t',
+ text "andi.",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprImm imm
+ ]
+pprInstr (AND reg1 reg2 ri) = pprLogic (sLit "and") reg1 reg2 ri
+pprInstr (ANDC reg1 reg2 reg3) = pprLogic (sLit "andc") reg1 reg2 (RIReg reg3)
+pprInstr (NAND reg1 reg2 reg3) = pprLogic (sLit "nand") reg1 reg2 (RIReg reg3)
+
+pprInstr (OR reg1 reg2 ri) = pprLogic (sLit "or") reg1 reg2 ri
+pprInstr (XOR reg1 reg2 ri) = pprLogic (sLit "xor") reg1 reg2 ri
+
+pprInstr (ORIS reg1 reg2 imm) = hcat [
+ char '\t',
+ text "oris",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprImm imm
+ ]
+
+pprInstr (XORIS reg1 reg2 imm) = hcat [
+ char '\t',
+ text "xoris",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprImm imm
+ ]
+
+pprInstr (EXTS fmt reg1 reg2) = hcat [
+ char '\t',
+ text "exts",
+ pprFormat fmt,
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2
+ ]
+pprInstr (CNTLZ fmt reg1 reg2) = hcat [
+ char '\t',
+ text "cntlz",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2
+ ]
+
+pprInstr (NEG reg1 reg2) = pprUnary (sLit "neg") reg1 reg2
+pprInstr (NOT reg1 reg2) = pprUnary (sLit "not") reg1 reg2
+
+pprInstr (SR II32 reg1 reg2 (RIImm (ImmInt i))) | i < 0 || i > 31 =
+ -- Handle the case where we are asked to shift a 32 bit register by
+ -- less than zero or more than 31 bits. We convert this into a clear
+ -- of the destination register.
+ -- Fixes ticket https://gitlab.haskell.org/ghc/ghc/issues/5900
+ pprInstr (XOR reg1 reg2 (RIReg reg2))
+
+pprInstr (SL II32 reg1 reg2 (RIImm (ImmInt i))) | i < 0 || i > 31 =
+ -- As above for SR, but for left shifts.
+ -- Fixes ticket https://gitlab.haskell.org/ghc/ghc/issues/10870
+ pprInstr (XOR reg1 reg2 (RIReg reg2))
+
+pprInstr (SRA II32 reg1 reg2 (RIImm (ImmInt i))) | i > 31 =
+ -- PT: I don't know what to do for negative shift amounts:
+ -- For now just panic.
+ --
+ -- For shift amounts greater than 31, set all bits to the
+ -- value of the sign bit; this is also what sraw does.
+ pprInstr (SRA II32 reg1 reg2 (RIImm (ImmInt 31)))
+
+pprInstr (SL fmt reg1 reg2 ri) =
+ let op = case fmt of
+ II32 -> "slw"
+ II64 -> "sld"
+ _ -> panic "PPC.Ppr.pprInstr: shift illegal size"
+ in pprLogic (sLit op) reg1 reg2 (limitShiftRI fmt ri)
+
+pprInstr (SR fmt reg1 reg2 ri) =
+ let op = case fmt of
+ II32 -> "srw"
+ II64 -> "srd"
+ _ -> panic "PPC.Ppr.pprInstr: shift illegal size"
+ in pprLogic (sLit op) reg1 reg2 (limitShiftRI fmt ri)
+
+pprInstr (SRA fmt reg1 reg2 ri) =
+ let op = case fmt of
+ II32 -> "sraw"
+ II64 -> "srad"
+ _ -> panic "PPC.Ppr.pprInstr: shift illegal size"
+ in pprLogic (sLit op) reg1 reg2 (limitShiftRI fmt ri)
+
+pprInstr (RLWINM reg1 reg2 sh mb me) = hcat [
+ text "\trlwinm\t",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ int sh,
+ text ", ",
+ int mb,
+ text ", ",
+ int me
+ ]
+
+pprInstr (CLRLI fmt reg1 reg2 n) = hcat [
+ text "\tclrl",
+ pprFormat fmt,
+ text "i ",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ int n
+ ]
+pprInstr (CLRRI fmt reg1 reg2 n) = hcat [
+ text "\tclrr",
+ pprFormat fmt,
+ text "i ",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ int n
+ ]
+
+pprInstr (FADD fmt reg1 reg2 reg3) = pprBinaryF (sLit "fadd") fmt reg1 reg2 reg3
+pprInstr (FSUB fmt reg1 reg2 reg3) = pprBinaryF (sLit "fsub") fmt reg1 reg2 reg3
+pprInstr (FMUL fmt reg1 reg2 reg3) = pprBinaryF (sLit "fmul") fmt reg1 reg2 reg3
+pprInstr (FDIV fmt reg1 reg2 reg3) = pprBinaryF (sLit "fdiv") fmt reg1 reg2 reg3
+pprInstr (FABS reg1 reg2) = pprUnary (sLit "fabs") reg1 reg2
+pprInstr (FNEG reg1 reg2) = pprUnary (sLit "fneg") reg1 reg2
+
+pprInstr (FCMP reg1 reg2) = hcat [
+ char '\t',
+ text "fcmpu\t0, ",
+ -- Note: we're using fcmpu, not fcmpo
+ -- The difference is with fcmpo, compare with NaN is an invalid operation.
+ -- We don't handle invalid fp ops, so we don't care.
+ -- Moreover, we use `fcmpu 0, ...` rather than `fcmpu cr0, ...` for
+ -- better portability since some non-GNU assembler (such as
+ -- IBM's `as`) tend not to support the symbolic register name cr0.
+ -- This matches the syntax that GCC seems to emit for PPC targets.
+ pprReg reg1,
+ text ", ",
+ pprReg reg2
+ ]
+
+pprInstr (FCTIWZ reg1 reg2) = pprUnary (sLit "fctiwz") reg1 reg2
+pprInstr (FCTIDZ reg1 reg2) = pprUnary (sLit "fctidz") reg1 reg2
+pprInstr (FCFID reg1 reg2) = pprUnary (sLit "fcfid") reg1 reg2
+pprInstr (FRSP reg1 reg2) = pprUnary (sLit "frsp") reg1 reg2
+
+pprInstr (CRNOR dst src1 src2) = hcat [
+ text "\tcrnor\t",
+ int dst,
+ text ", ",
+ int src1,
+ text ", ",
+ int src2
+ ]
+
+pprInstr (MFCR reg) = hcat [
+ char '\t',
+ text "mfcr",
+ char '\t',
+ pprReg reg
+ ]
+
+pprInstr (MFLR reg) = hcat [
+ char '\t',
+ text "mflr",
+ char '\t',
+ pprReg reg
+ ]
+
+pprInstr (FETCHPC reg) = vcat [
+ text "\tbcl\t20,31,1f",
+ hcat [ text "1:\tmflr\t", pprReg reg ]
+ ]
+
+pprInstr HWSYNC = text "\tsync"
+
+pprInstr ISYNC = text "\tisync"
+
+pprInstr LWSYNC = text "\tlwsync"
+
+pprInstr NOP = text "\tnop"
+
+
+pprLogic :: PtrString -> Reg -> Reg -> RI -> SDoc
+pprLogic op reg1 reg2 ri = hcat [
+ char '\t',
+ ptext op,
+ case ri of
+ RIReg _ -> empty
+ RIImm _ -> char 'i',
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprRI ri
+ ]
+
+
+pprMul :: Format -> Reg -> Reg -> RI -> SDoc
+pprMul fmt reg1 reg2 ri = hcat [
+ char '\t',
+ text "mull",
+ case ri of
+ RIReg _ -> case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format"
+ RIImm _ -> char 'i',
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprRI ri
+ ]
+
+
+pprDiv :: Format -> Bool -> Reg -> Reg -> Reg -> SDoc
+pprDiv fmt sgn reg1 reg2 reg3 = hcat [
+ char '\t',
+ text "div",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format",
+ if sgn then empty else char 'u',
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprReg reg3
+ ]
+
+
+pprUnary :: PtrString -> Reg -> Reg -> SDoc
+pprUnary op reg1 reg2 = hcat [
+ char '\t',
+ ptext op,
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2
+ ]
+
+
+pprBinaryF :: PtrString -> Format -> Reg -> Reg -> Reg -> SDoc
+pprBinaryF op fmt reg1 reg2 reg3 = hcat [
+ char '\t',
+ ptext op,
+ pprFFormat fmt,
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprReg reg3
+ ]
+
+pprRI :: RI -> SDoc
+pprRI (RIReg r) = pprReg r
+pprRI (RIImm r) = pprImm r
+
+
+pprFFormat :: Format -> SDoc
+pprFFormat FF64 = empty
+pprFFormat FF32 = char 's'
+pprFFormat _ = panic "PPC.Ppr.pprFFormat: no match"
+
+ -- limit immediate argument for shift instruction to range 0..63
+ -- for 64 bit size and 0..31 otherwise
+limitShiftRI :: Format -> RI -> RI
+limitShiftRI II64 (RIImm (ImmInt i)) | i > 63 || i < 0 =
+ panic $ "PPC.Ppr: Shift by " ++ show i ++ " bits is not allowed."
+limitShiftRI II32 (RIImm (ImmInt i)) | i > 31 || i < 0 =
+ panic $ "PPC.Ppr: 32 bit: Shift by " ++ show i ++ " bits is not allowed."
+limitShiftRI _ x = x
diff --git a/compiler/GHC/CmmToAsm/PPC/RegInfo.hs b/compiler/GHC/CmmToAsm/PPC/RegInfo.hs
new file mode 100644
index 0000000000..a75040d703
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/RegInfo.hs
@@ -0,0 +1,80 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Machine-specific parts of the register allocator
+--
+-- (c) The University of Glasgow 1996-2004
+--
+-----------------------------------------------------------------------------
+module GHC.CmmToAsm.PPC.RegInfo (
+ JumpDest( DestBlockId ), getJumpDestBlockId,
+ canShortcut,
+ shortcutJump,
+
+ shortcutStatics
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.PPC.Instr
+
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.CLabel
+
+import Unique
+import Outputable (ppr, text, Outputable, (<>))
+
+data JumpDest = DestBlockId BlockId
+
+-- Debug Instance
+instance Outputable JumpDest where
+ ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
+
+getJumpDestBlockId :: JumpDest -> Maybe BlockId
+getJumpDestBlockId (DestBlockId bid) = Just bid
+
+canShortcut :: Instr -> Maybe JumpDest
+canShortcut _ = Nothing
+
+shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
+shortcutJump _ other = other
+
+
+-- Here because it knows about JumpDest
+shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics
+shortcutStatics fn (RawCmmStatics lbl statics)
+ = RawCmmStatics lbl $ map (shortcutStatic fn) statics
+ -- we need to get the jump tables, so apply the mapping to the entries
+ -- of a CmmData too.
+
+shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
+shortcutLabel fn lab
+ | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn blkId
+ | otherwise = lab
+
+shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
+shortcutStatic fn (CmmStaticLit (CmmLabel lab))
+ = CmmStaticLit (CmmLabel (shortcutLabel fn lab))
+shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
+ = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
+ -- slightly dodgy, we're ignoring the second label, but this
+ -- works with the way we use CmmLabelDiffOff for jump tables now.
+shortcutStatic _ other_static
+ = other_static
+
+shortBlockId
+ :: (BlockId -> Maybe JumpDest)
+ -> BlockId
+ -> CLabel
+
+shortBlockId fn blockid =
+ case fn blockid of
+ Nothing -> mkLocalBlockLabel uq
+ Just (DestBlockId blockid') -> shortBlockId fn blockid'
+ where uq = getUnique blockid
diff --git a/compiler/GHC/CmmToAsm/PPC/Regs.hs b/compiler/GHC/CmmToAsm/PPC/Regs.hs
new file mode 100644
index 0000000000..8a9a859665
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/Regs.hs
@@ -0,0 +1,333 @@
+{-# LANGUAGE CPP #-}
+
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 1994-2004
+--
+-- -----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.PPC.Regs (
+ -- squeeze functions
+ virtualRegSqueeze,
+ realRegSqueeze,
+
+ mkVirtualReg,
+ regDotColor,
+
+ -- immediates
+ Imm(..),
+ strImmLit,
+ litToImm,
+
+ -- addressing modes
+ AddrMode(..),
+ addrOffset,
+
+ -- registers
+ spRel,
+ argRegs,
+ allArgRegs,
+ callClobberedRegs,
+ allMachRegNos,
+ classOfRealReg,
+ showReg,
+
+ -- machine specific
+ allFPArgRegs,
+ fits16Bits,
+ makeImmediate,
+ fReg,
+ r0, sp, toc, r3, r4, r11, r12, r30,
+ tmpReg,
+ f1,
+
+ allocatableRegs
+
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+import GHC.CmmToAsm.Format
+
+import GHC.Cmm
+import GHC.Cmm.CLabel ( CLabel )
+import Unique
+
+import GHC.Platform.Regs
+import GHC.Driver.Session
+import Outputable
+import GHC.Platform
+
+import Data.Word ( Word8, Word16, Word32, Word64 )
+import Data.Int ( Int8, Int16, Int32, Int64 )
+
+
+-- squeeze functions for the graph allocator -----------------------------------
+
+-- | regSqueeze_class reg
+-- Calculate the maximum number of register colors that could be
+-- denied to a node of this class due to having this reg
+-- as a neighbour.
+--
+{-# INLINE virtualRegSqueeze #-}
+virtualRegSqueeze :: RegClass -> VirtualReg -> Int
+virtualRegSqueeze cls vr
+ = case cls of
+ RcInteger
+ -> case vr of
+ VirtualRegI{} -> 1
+ VirtualRegHi{} -> 1
+ _other -> 0
+
+ RcDouble
+ -> case vr of
+ VirtualRegD{} -> 1
+ VirtualRegF{} -> 0
+ _other -> 0
+
+ _other -> 0
+
+{-# INLINE realRegSqueeze #-}
+realRegSqueeze :: RegClass -> RealReg -> Int
+realRegSqueeze cls rr
+ = case cls of
+ RcInteger
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < 32 -> 1 -- first fp reg is 32
+ | otherwise -> 0
+
+ RealRegPair{} -> 0
+
+ RcDouble
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < 32 -> 0
+ | otherwise -> 1
+
+ RealRegPair{} -> 0
+
+ _other -> 0
+
+mkVirtualReg :: Unique -> Format -> VirtualReg
+mkVirtualReg u format
+ | not (isFloatFormat format) = VirtualRegI u
+ | otherwise
+ = case format of
+ FF32 -> VirtualRegD u
+ FF64 -> VirtualRegD u
+ _ -> panic "mkVirtualReg"
+
+regDotColor :: RealReg -> SDoc
+regDotColor reg
+ = case classOfRealReg reg of
+ RcInteger -> text "blue"
+ RcFloat -> text "red"
+ RcDouble -> text "green"
+
+
+
+-- immediates ------------------------------------------------------------------
+data Imm
+ = ImmInt Int
+ | ImmInteger Integer -- Sigh.
+ | ImmCLbl CLabel -- AbstractC Label (with baggage)
+ | ImmLit SDoc -- Simple string
+ | ImmIndex CLabel Int
+ | ImmFloat Rational
+ | ImmDouble Rational
+ | ImmConstantSum Imm Imm
+ | ImmConstantDiff Imm Imm
+ | LO Imm
+ | HI Imm
+ | HA Imm {- high halfword adjusted -}
+ | HIGHERA Imm
+ | HIGHESTA Imm
+
+
+strImmLit :: String -> Imm
+strImmLit s = ImmLit (text s)
+
+
+litToImm :: CmmLit -> Imm
+litToImm (CmmInt i w) = ImmInteger (narrowS w i)
+ -- narrow to the width: a CmmInt might be out of
+ -- range, but we assume that ImmInteger only contains
+ -- in-range values. A signed value should be fine here.
+litToImm (CmmFloat f W32) = ImmFloat f
+litToImm (CmmFloat f W64) = ImmDouble f
+litToImm (CmmLabel l) = ImmCLbl l
+litToImm (CmmLabelOff l off) = ImmIndex l off
+litToImm (CmmLabelDiffOff l1 l2 off _)
+ = ImmConstantSum
+ (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
+ (ImmInt off)
+litToImm _ = panic "PPC.Regs.litToImm: no match"
+
+
+-- addressing modes ------------------------------------------------------------
+
+data AddrMode
+ = AddrRegReg Reg Reg
+ | AddrRegImm Reg Imm
+
+
+addrOffset :: AddrMode -> Int -> Maybe AddrMode
+addrOffset addr off
+ = case addr of
+ AddrRegImm r (ImmInt n)
+ | fits16Bits n2 -> Just (AddrRegImm r (ImmInt n2))
+ | otherwise -> Nothing
+ where n2 = n + off
+
+ AddrRegImm r (ImmInteger n)
+ | fits16Bits n2 -> Just (AddrRegImm r (ImmInt (fromInteger n2)))
+ | otherwise -> Nothing
+ where n2 = n + toInteger off
+
+ _ -> Nothing
+
+
+-- registers -------------------------------------------------------------------
+-- @spRel@ gives us a stack relative addressing mode for volatile
+-- temporaries and for excess call arguments. @fpRel@, where
+-- applicable, is the same but for the frame pointer.
+
+spRel :: DynFlags
+ -> Int -- desired stack offset in words, positive or negative
+ -> AddrMode
+
+spRel dflags n = AddrRegImm sp (ImmInt (n * wORD_SIZE dflags))
+
+
+-- argRegs is the set of regs which are read for an n-argument call to C.
+-- For archs which pass all args on the stack (x86), this list is empty.
+-- Sparc passes up to the first 6 args in regs.
+argRegs :: RegNo -> [Reg]
+argRegs 0 = []
+argRegs 1 = map regSingle [3]
+argRegs 2 = map regSingle [3,4]
+argRegs 3 = map regSingle [3..5]
+argRegs 4 = map regSingle [3..6]
+argRegs 5 = map regSingle [3..7]
+argRegs 6 = map regSingle [3..8]
+argRegs 7 = map regSingle [3..9]
+argRegs 8 = map regSingle [3..10]
+argRegs _ = panic "MachRegs.argRegs(powerpc): don't know about >8 arguments!"
+
+
+allArgRegs :: [Reg]
+allArgRegs = map regSingle [3..10]
+
+
+-- these are the regs which we cannot assume stay alive over a C call.
+callClobberedRegs :: Platform -> [Reg]
+callClobberedRegs _platform
+ = map regSingle (0:[2..12] ++ map fReg [0..13])
+
+
+allMachRegNos :: [RegNo]
+allMachRegNos = [0..63]
+
+
+{-# INLINE classOfRealReg #-}
+classOfRealReg :: RealReg -> RegClass
+classOfRealReg (RealRegSingle i)
+ | i < 32 = RcInteger
+ | otherwise = RcDouble
+
+classOfRealReg (RealRegPair{})
+ = panic "regClass(ppr): no reg pairs on this architecture"
+
+showReg :: RegNo -> String
+showReg n
+ | n >= 0 && n <= 31 = "%r" ++ show n
+ | n >= 32 && n <= 63 = "%f" ++ show (n - 32)
+ | otherwise = "%unknown_powerpc_real_reg_" ++ show n
+
+
+
+-- machine specific ------------------------------------------------------------
+
+allFPArgRegs :: Platform -> [Reg]
+allFPArgRegs platform
+ = case platformOS platform of
+ OSAIX -> map (regSingle . fReg) [1..13]
+ _ -> case platformArch platform of
+ ArchPPC -> map (regSingle . fReg) [1..8]
+ ArchPPC_64 _ -> map (regSingle . fReg) [1..13]
+ _ -> panic "PPC.Regs.allFPArgRegs: unknown PPC Linux"
+
+fits16Bits :: Integral a => a -> Bool
+fits16Bits x = x >= -32768 && x < 32768
+
+makeImmediate :: Integral a => Width -> Bool -> a -> Maybe Imm
+makeImmediate rep signed x = fmap ImmInt (toI16 rep signed)
+ where
+ narrow W64 False = fromIntegral (fromIntegral x :: Word64)
+ narrow W32 False = fromIntegral (fromIntegral x :: Word32)
+ narrow W16 False = fromIntegral (fromIntegral x :: Word16)
+ narrow W8 False = fromIntegral (fromIntegral x :: Word8)
+ narrow W64 True = fromIntegral (fromIntegral x :: Int64)
+ narrow W32 True = fromIntegral (fromIntegral x :: Int32)
+ narrow W16 True = fromIntegral (fromIntegral x :: Int16)
+ narrow W8 True = fromIntegral (fromIntegral x :: Int8)
+ narrow _ _ = panic "PPC.Regs.narrow: no match"
+
+ narrowed = narrow rep signed
+
+ toI16 W32 True
+ | narrowed >= -32768 && narrowed < 32768 = Just narrowed
+ | otherwise = Nothing
+ toI16 W32 False
+ | narrowed >= 0 && narrowed < 65536 = Just narrowed
+ | otherwise = Nothing
+ toI16 W64 True
+ | narrowed >= -32768 && narrowed < 32768 = Just narrowed
+ | otherwise = Nothing
+ toI16 W64 False
+ | narrowed >= 0 && narrowed < 65536 = Just narrowed
+ | otherwise = Nothing
+ toI16 _ _ = Just narrowed
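+
+-- A few illustrative results (added commentary, assuming the definitions
+-- above):
+--
+--   makeImmediate W32 True  (-1)   == Just (ImmInt (-1))
+--   makeImmediate W32 True  40000  == Nothing   -- too big for a signed 16-bit field
+--   makeImmediate W32 False (-1)   == Nothing   -- narrows to 0xFFFFFFFF first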
+
+
+{-
+The PowerPC has 64 registers of interest; 32 integer registers and 32 floating
+point registers.
+-}
+
+fReg :: Int -> RegNo
+fReg x = (32 + x)
+
+r0, sp, toc, r3, r4, r11, r12, r30, f1 :: Reg
+r0 = regSingle 0
+sp = regSingle 1
+toc = regSingle 2
+r3 = regSingle 3
+r4 = regSingle 4
+r11 = regSingle 11
+r12 = regSingle 12
+r30 = regSingle 30
+f1 = regSingle $ fReg 1
+
+-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
+-- i.e., these are the regs to which we are prepared to let the
+-- register allocator map VRegs.
+allocatableRegs :: Platform -> [RealReg]
+allocatableRegs platform
+ = let isFree i = freeReg platform i
+ in map RealRegSingle $ filter isFree allMachRegNos
+
+-- temporary register for compiler use
+tmpReg :: Platform -> Reg
+tmpReg platform =
+ case platformArch platform of
+ ArchPPC -> regSingle 13
+ ArchPPC_64 _ -> regSingle 30
+ _ -> panic "PPC.Regs.tmpReg: unknown arch"
diff --git a/compiler/GHC/CmmToAsm/Ppr.hs b/compiler/GHC/CmmToAsm/Ppr.hs
new file mode 100644
index 0000000000..636d2e4e3a
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Ppr.hs
@@ -0,0 +1,275 @@
+{-# LANGUAGE MagicHash #-}
+
+-----------------------------------------------------------------------------
+--
+-- Pretty-printing assembly language
+--
+-- (c) The University of Glasgow 1993-2005
+--
+-----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.Ppr (
+ castFloatToWord8Array,
+ castDoubleToWord8Array,
+ floatToBytes,
+ doubleToBytes,
+ pprASCII,
+ pprBytes,
+ pprSectionHeader
+)
+
+where
+
+import GhcPrelude
+
+import AsmUtils
+import GHC.Cmm.CLabel
+import GHC.Cmm
+import GHC.Driver.Session
+import FastString
+import Outputable
+import GHC.Platform
+import FileCleanup
+
+import qualified Data.Array.Unsafe as U ( castSTUArray )
+import Data.Array.ST
+
+import Control.Monad.ST
+
+import Data.Word
+import Data.Bits
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as BS
+import GHC.Exts
+import GHC.Word
+import System.IO.Unsafe
+
+
+
+-- -----------------------------------------------------------------------------
+-- Converting floating-point literals to integrals for printing
+
+castFloatToWord8Array :: STUArray s Int Float -> ST s (STUArray s Int Word8)
+castFloatToWord8Array = U.castSTUArray
+
+castDoubleToWord8Array :: STUArray s Int Double -> ST s (STUArray s Int Word8)
+castDoubleToWord8Array = U.castSTUArray
+
+-- floatToBytes and doubleToBytes convert to the host's byte
+-- order. Providing that we're not cross-compiling for a
+-- target with the opposite endianness, this should work ok
+-- on all targets.
+
+-- ToDo: this stuff is very similar to the shenanigans in PprAbs,
+-- could they be merged?
+
+floatToBytes :: Float -> [Int]
+floatToBytes f
+ = runST (do
+ arr <- newArray_ ((0::Int),3)
+ writeArray arr 0 f
+ arr <- castFloatToWord8Array arr
+ i0 <- readArray arr 0
+ i1 <- readArray arr 1
+ i2 <- readArray arr 2
+ i3 <- readArray arr 3
+ return (map fromIntegral [i0,i1,i2,i3])
+ )
+
+doubleToBytes :: Double -> [Int]
+doubleToBytes d
+ = runST (do
+ arr <- newArray_ ((0::Int),7)
+ writeArray arr 0 d
+ arr <- castDoubleToWord8Array arr
+ i0 <- readArray arr 0
+ i1 <- readArray arr 1
+ i2 <- readArray arr 2
+ i3 <- readArray arr 3
+ i4 <- readArray arr 4
+ i5 <- readArray arr 5
+ i6 <- readArray arr 6
+ i7 <- readArray arr 7
+ return (map fromIntegral [i0,i1,i2,i3,i4,i5,i6,i7])
+ )
+
+-- ---------------------------------------------------------------------------
+-- Printing ASCII strings.
+--
+-- Print as a string and escape non-printable characters.
+-- This is similar to charToC in Utils.
+
+pprASCII :: ByteString -> SDoc
+pprASCII str
+ -- Transform the given literal bytestring into an escaped string and construct
+ -- the literal SDoc directly.
+ -- See #14741
+ -- and Note [Pretty print ASCII when AsmCodeGen]
+ = text $ BS.foldr (\w s -> do1 w ++ s) "" str
+ where
+ do1 :: Word8 -> String
+ do1 w | 0x09 == w = "\\t"
+ | 0x0A == w = "\\n"
+ | 0x22 == w = "\\\""
+ | 0x5C == w = "\\\\"
+ -- ASCII printable characters range
+ | w >= 0x20 && w <= 0x7E = [chr' w]
+ | otherwise = '\\' : octal w
+
+ -- we know that the Chars we create are in the ASCII range
+ -- so we bypass the check in "chr"
+ chr' :: Word8 -> Char
+ chr' (W8# w#) = C# (chr# (word2Int# w#))
+
+ octal :: Word8 -> String
+ octal w = [ chr' (ord0 + (w `unsafeShiftR` 6) .&. 0x07)
+ , chr' (ord0 + (w `unsafeShiftR` 3) .&. 0x07)
+ , chr' (ord0 + w .&. 0x07)
+ ]
+ ord0 = 0x30 -- = ord '0'
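+
+-- Illustrative example (added commentary): pprASCII "a\tb\DEL" renders as
+-- the assembler string a\tb\177: the tab becomes the two-character escape
+-- "\t", and DEL (0x7F), being outside the printable range 0x20..0x7E,
+-- becomes the octal escape "\177".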
+
+-- | Pretty print binary data.
+--
+-- Use either the ".string" directive or a ".incbin" directive.
+-- See Note [Embedding large binary blobs]
+--
+-- A NULL byte is added after the binary data.
+--
+pprBytes :: ByteString -> SDoc
+pprBytes bs = sdocWithDynFlags $ \dflags ->
+ if binBlobThreshold dflags == 0
+ || fromIntegral (BS.length bs) <= binBlobThreshold dflags
+ then text "\t.string " <> doubleQuotes (pprASCII bs)
+ else unsafePerformIO $ do
+ bFile <- newTempName dflags TFL_CurrentModule ".dat"
+ BS.writeFile bFile bs
+ return $ text "\t.incbin "
+ <> pprFilePathString bFile -- proper escape (see #16389)
+ <> text "\n\t.byte 0"
+
+{-
+Note [Embedding large binary blobs]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To embed a blob of binary data (e.g. a UTF-8 encoded string) into the generated
+code object, we have several options:
+
+ 1. Generate a ".byte" directive for each byte. This is what was done in the past
+ (see Note [Pretty print ASCII when AsmCodeGen]).
+
+ 2. Generate a single ".string"/".asciz" directive for the whole sequence of
+ bytes. Bytes in the ASCII printable range are rendered as characters and
+ other values are escaped (e.g., "\t", "\077", etc.).
+
+ 3. Create a temporary file into which we dump the binary data and generate a
+ single ".incbin" directive. The assembler will include the binary file for
+ us in the generated output object.
+
+Now the code generator uses either (2) or (3), depending on the binary blob
+size. Using (3) for small blobs adds too much overhead (see benchmark results
+in #16190), so we only do it when the size is above a threshold (500K at the
+time of writing).
+
+The threshold is configurable via the `-fbinary-blob-threshold` flag.
+
+-}
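+
+{-
+Illustrative output (added commentary, with a made-up temporary file name):
+for a small blob the generated assembly is roughly
+
+    .string "hello\n"
+
+while a blob above the threshold produces something like
+
+    .incbin "/tmp/ghc123_0/ghc_1.dat"
+    .byte 0
+-}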
+
+
+{-
+Note [Pretty print ASCII when AsmCodeGen]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Previously, when generating assembly code, we created one SDoc per byte
+of a literal bytestring with `(ptext . sLit)` and then combined them
+using `hcat`.
+
+When handling literal bytestrings with millions of bytes,
+millions of SDocs would be created and combined, leading to
+high memory usage.
+
+Now we escape the given bytestring into a String directly and construct
+the SDoc only once. This improvement decreased the
+memory allocation from 4.7GB to 1.3GB when embedding a 3MB literal
+string in source code. See #14741 for profiling results.
+-}
+
+-- ----------------------------------------------------------------------------
+-- Printing section headers.
+--
+-- If -split-sections was specified, include the suffix label, otherwise just
+-- print the section type. For Darwin, where subsections-via-symbols are
+-- used instead, only print the section type.
+--
+-- For string literals, additional flags are specified to enable merging of
+-- identical strings in the linker. With -split-sections each string also gets
+-- a unique section to allow strings from unused code to be GC'd.
+
+pprSectionHeader :: Platform -> Section -> SDoc
+pprSectionHeader platform (Section t suffix) =
+ case platformOS platform of
+ OSAIX -> pprXcoffSectionHeader t
+ OSDarwin -> pprDarwinSectionHeader t
+ OSMinGW32 -> pprGNUSectionHeader (char '$') t suffix
+ _ -> pprGNUSectionHeader (char '.') t suffix
+
+pprGNUSectionHeader :: SDoc -> SectionType -> CLabel -> SDoc
+pprGNUSectionHeader sep t suffix = sdocWithDynFlags $ \dflags ->
+ let splitSections = gopt Opt_SplitSections dflags
+ subsection | splitSections = sep <> ppr suffix
+ | otherwise = empty
+ in text ".section " <> ptext (header dflags) <> subsection <>
+ flags dflags
+ where
+ header dflags = case t of
+ Text -> sLit ".text"
+ Data -> sLit ".data"
+ ReadOnlyData | OSMinGW32 <- platformOS (targetPlatform dflags)
+ -> sLit ".rdata"
+ | otherwise -> sLit ".rodata"
+ RelocatableReadOnlyData | OSMinGW32 <- platformOS (targetPlatform dflags)
+ -- Concept does not exist on Windows,
+ -- So map these to R/O data.
+ -> sLit ".rdata$rel.ro"
+ | otherwise -> sLit ".data.rel.ro"
+ UninitialisedData -> sLit ".bss"
+ ReadOnlyData16 | OSMinGW32 <- platformOS (targetPlatform dflags)
+ -> sLit ".rdata$cst16"
+ | otherwise -> sLit ".rodata.cst16"
+ CString
+ | OSMinGW32 <- platformOS (targetPlatform dflags)
+ -> sLit ".rdata"
+ | otherwise -> sLit ".rodata.str"
+ OtherSection _ ->
+ panic "PprBase.pprGNUSectionHeader: unknown section type"
+ flags dflags = case t of
+ CString
+ | OSMinGW32 <- platformOS (targetPlatform dflags)
+ -> empty
+ | otherwise -> text ",\"aMS\"," <> sectionType "progbits" <> text ",1"
+ _ -> empty
+
+-- XCOFF doesn't support relocating label-differences, so we place all
+-- RO sections into .text[PR] sections
+pprXcoffSectionHeader :: SectionType -> SDoc
+pprXcoffSectionHeader t = text $ case t of
+ Text -> ".csect .text[PR]"
+ Data -> ".csect .data[RW]"
+ ReadOnlyData -> ".csect .text[PR] # ReadOnlyData"
+ RelocatableReadOnlyData -> ".csect .text[PR] # RelocatableReadOnlyData"
+ ReadOnlyData16 -> ".csect .text[PR] # ReadOnlyData16"
+ CString -> ".csect .text[PR] # CString"
+ UninitialisedData -> ".csect .data[BS]"
+ OtherSection _ ->
+ panic "PprBase.pprXcoffSectionHeader: unknown section type"
+
+pprDarwinSectionHeader :: SectionType -> SDoc
+pprDarwinSectionHeader t =
+ ptext $ case t of
+ Text -> sLit ".text"
+ Data -> sLit ".data"
+ ReadOnlyData -> sLit ".const"
+ RelocatableReadOnlyData -> sLit ".const_data"
+ UninitialisedData -> sLit ".data"
+ ReadOnlyData16 -> sLit ".const"
+ CString -> sLit ".section\t__TEXT,__cstring,cstring_literals"
+ OtherSection _ ->
+ panic "PprBase.pprDarwinSectionHeader: unknown section type"
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph.hs b/compiler/GHC/CmmToAsm/Reg/Graph.hs
new file mode 100644
index 0000000000..6dfe84cf95
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph.hs
@@ -0,0 +1,472 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+
+-- | Graph coloring register allocator.
+module GHC.CmmToAsm.Reg.Graph (
+ regAlloc
+) where
+import GhcPrelude
+
+import qualified GraphColor as Color
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Reg.Graph.Spill
+import GHC.CmmToAsm.Reg.Graph.SpillClean
+import GHC.CmmToAsm.Reg.Graph.SpillCost
+import GHC.CmmToAsm.Reg.Graph.Stats
+import GHC.CmmToAsm.Reg.Graph.TrivColorable
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Reg.Target
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import Bag
+import GHC.Driver.Session
+import Outputable
+import GHC.Platform
+import UniqFM
+import UniqSet
+import UniqSupply
+import Util (seqList)
+import GHC.CmmToAsm.CFG
+
+import Data.Maybe
+import Control.Monad
+
+
+-- | The maximum number of build\/spill cycles we'll allow.
+--
+-- It should only take 3 or 4 cycles for the allocator to converge.
+-- If it takes any longer than this it's probably in an infinite loop,
+-- so it's better just to bail out and report a bug.
+maxSpinCount :: Int
+maxSpinCount = 10
+
+
+-- | The top level of the graph coloring register allocator.
+regAlloc
+ :: (Outputable statics, Outputable instr, Instruction instr)
+ => DynFlags
+ -> UniqFM (UniqSet RealReg) -- ^ registers we can use for allocation
+ -> UniqSet Int -- ^ set of available spill slots.
+ -> Int -- ^ current number of spill slots
+ -> [LiveCmmDecl statics instr] -- ^ code annotated with liveness information.
+ -> Maybe CFG -- ^ CFG of basic blocks if available
+ -> UniqSM ( [NatCmmDecl statics instr]
+ , Maybe Int, [RegAllocStats statics instr] )
+ -- ^ code with registers allocated, additional stacks required
+ -- and stats for each stage of allocation
+
+regAlloc dflags regsFree slotsFree slotsCount code cfg
+ = do
+ -- TODO: the regClass function is currently hard coded to the default
+ -- target architecture. Would prefer to determine this from dflags.
+ -- There are other uses of targetRegClass later in this module.
+ let platform = targetPlatform dflags
+ triv = trivColorable platform
+ (targetVirtualRegSqueeze platform)
+ (targetRealRegSqueeze platform)
+
+ (code_final, debug_codeGraphs, slotsCount', _)
+ <- regAlloc_spin dflags 0
+ triv
+ regsFree slotsFree slotsCount [] code cfg
+
+ let needStack
+ | slotsCount == slotsCount'
+ = Nothing
+ | otherwise
+ = Just slotsCount'
+
+ return ( code_final
+ , needStack
+ , reverse debug_codeGraphs )
+
+
+-- | Perform solver iterations for the graph coloring allocator.
+--
+-- We extract a register conflict graph from the provided cmm code,
+-- and try to colour it. If that works then we use the solution rewrite
+-- the code with real hregs. If coloring doesn't work we add spill code
+-- and try to colour it again. After `maxSpinCount` iterations we give up.
+--
+regAlloc_spin
+ :: forall instr statics.
+ (Instruction instr,
+ Outputable instr,
+ Outputable statics)
+ => DynFlags
+ -> Int -- ^ Number of solver iterations we've already performed.
+ -> Color.Triv VirtualReg RegClass RealReg
+ -- ^ Function for calculating whether a register is trivially
+ -- colourable.
+ -> UniqFM (UniqSet RealReg) -- ^ Free registers that we can allocate.
+ -> UniqSet Int -- ^ Free stack slots that we can use.
+ -> Int -- ^ Number of spill slots in use
+ -> [RegAllocStats statics instr] -- ^ Current regalloc stats to add to.
+ -> [LiveCmmDecl statics instr] -- ^ Liveness annotated code to allocate.
+ -> Maybe CFG
+ -> UniqSM ( [NatCmmDecl statics instr]
+ , [RegAllocStats statics instr]
+ , Int -- Slots in use
+ , Color.Graph VirtualReg RegClass RealReg)
+
+regAlloc_spin dflags spinCount triv regsFree slotsFree slotsCount debug_codeGraphs code cfg
+ = do
+ let platform = targetPlatform dflags
+
+ -- If any of these dump flags are turned on we want to hang on to
+ -- intermediate structures in the allocator - otherwise tell the
+ -- allocator to ditch them early so we don't end up creating space leaks.
+ let dump = or
+ [ dopt Opt_D_dump_asm_regalloc_stages dflags
+ , dopt Opt_D_dump_asm_stats dflags
+ , dopt Opt_D_dump_asm_conflicts dflags ]
+
+ -- Check that we're not running off down the garden path.
+ when (spinCount > maxSpinCount)
+ $ pprPanic "regAlloc_spin: max build/spill cycle count exceeded."
+ ( text "It looks like the register allocator is stuck in an infinite loop."
+ $$ text "max cycles = " <> int maxSpinCount
+ $$ text "regsFree = " <> (hcat $ punctuate space $ map ppr
+ $ nonDetEltsUniqSet $ unionManyUniqSets
+ $ nonDetEltsUFM regsFree)
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ $$ text "slotsFree = " <> ppr (sizeUniqSet slotsFree))
+
+ -- Build the register conflict graph from the cmm code.
+ (graph :: Color.Graph VirtualReg RegClass RealReg)
+ <- {-# SCC "BuildGraph" #-} buildGraph code
+
+ -- VERY IMPORTANT:
+ -- We really do want the graph to be fully evaluated _before_ we
+ -- start coloring. If we don't do this now then when the call to
+ -- Color.colorGraph forces bits of it, the heap will be filled with
+ -- half evaluated pieces of graph and zillions of apply thunks.
+ seqGraph graph `seq` return ()
+
+ -- Build a map of the cost of spilling each instruction.
+ -- This is a lazy binding, so the map will only be computed if we
+ -- actually have to spill to the stack.
+ let spillCosts = foldl' plusSpillCostInfo zeroSpillCostInfo
+ $ map (slurpSpillCostInfo platform cfg) code
+
+ -- The function to choose regs to leave uncolored.
+ let spill = chooseSpill spillCosts
+
+ -- Record startup state in our log.
+ let stat1
+ = if spinCount == 0
+ then Just $ RegAllocStatsStart
+ { raLiveCmm = code
+ , raGraph = graph
+ , raSpillCosts = spillCosts }
+ else Nothing
+
+ -- Try and color the graph.
+ let (graph_colored, rsSpill, rmCoalesce)
+ = {-# SCC "ColorGraph" #-}
+ Color.colorGraph
+ (gopt Opt_RegsIterative dflags)
+ spinCount
+ regsFree triv spill graph
+
+ -- Rewrite registers in the code that have been coalesced.
+ let patchF reg
+ | RegVirtual vr <- reg
+ = case lookupUFM rmCoalesce vr of
+ Just vr' -> patchF (RegVirtual vr')
+ Nothing -> reg
+
+ | otherwise
+ = reg
+
+ let (code_coalesced :: [LiveCmmDecl statics instr])
+ = map (patchEraseLive patchF) code
+
+ -- Check whether we've found a coloring.
+ if isEmptyUniqSet rsSpill
+
+ -- Coloring was successful because no registers needed to be spilled.
+ then do
+ -- if -fasm-lint is turned on then validate the graph.
+ -- This checks for bugs in the graph allocator itself.
+ let graph_colored_lint =
+ if gopt Opt_DoAsmLinting dflags
+ then Color.validateGraph (text "")
+ True -- Require all nodes to be colored.
+ graph_colored
+ else graph_colored
+
+ -- Rewrite the code to use real hregs, using the colored graph.
+ let code_patched
+ = map (patchRegsFromGraph platform graph_colored_lint)
+ code_coalesced
+
+ -- Clean out unneeded SPILL/RELOAD meta instructions.
+ -- The spill code generator just spills the entire live range
+ -- of a vreg, but it might not need to be on the stack for
+ -- its entire lifetime.
+ let code_spillclean
+ = map (cleanSpills platform) code_patched
+
+ -- Strip off liveness information from the allocated code.
+ -- Also rewrite SPILL/RELOAD meta instructions into real machine
+ -- instructions along the way
+ let code_final
+ = map (stripLive dflags) code_spillclean
+
+ -- Record what happened in this stage for debugging
+ let stat
+ = RegAllocStatsColored
+ { raCode = code
+ , raGraph = graph
+ , raGraphColored = graph_colored_lint
+ , raCoalesced = rmCoalesce
+ , raCodeCoalesced = code_coalesced
+ , raPatched = code_patched
+ , raSpillClean = code_spillclean
+ , raFinal = code_final
+ , raSRMs = foldl' addSRM (0, 0, 0)
+ $ map countSRMs code_spillclean }
+
+ -- Bundle up all the register allocator statistics.
+ -- .. but make sure to drop them on the floor if they're not
+ -- needed, otherwise we'll get a space leak.
+ let statList =
+ if dump then [stat] ++ maybeToList stat1 ++ debug_codeGraphs
+ else []
+
+ -- Ensure all the statistics are evaluated, to avoid space leaks.
+ seqList statList (return ())
+
+ return ( code_final
+ , statList
+ , slotsCount
+ , graph_colored_lint)
+
+ -- Coloring was unsuccessful. We need to spill some register to the
+ -- stack, make a new graph, and try to color it again.
+ else do
+ -- if -fasm-lint is turned on then validate the graph
+ let graph_colored_lint =
+ if gopt Opt_DoAsmLinting dflags
+ then Color.validateGraph (text "")
+ False -- don't require nodes to be colored
+ graph_colored
+ else graph_colored
+
+ -- Spill uncolored regs to the stack.
+ (code_spilled, slotsFree', slotsCount', spillStats)
+ <- regSpill platform code_coalesced slotsFree slotsCount rsSpill
+
+ -- Recalculate liveness information.
+ -- NOTE: we have to reverse the SCCs here to get them back into
+ -- the reverse-dependency order required by computeLiveness.
+ -- If they're not in the correct order that function will panic.
+ code_relive <- mapM (regLiveness platform . reverseBlocksInTops)
+ code_spilled
+
+ -- Record what happened in this stage for debugging.
+ let stat =
+ RegAllocStatsSpill
+ { raCode = code
+ , raGraph = graph_colored_lint
+ , raCoalesced = rmCoalesce
+ , raSpillStats = spillStats
+ , raSpillCosts = spillCosts
+ , raSpilled = code_spilled }
+
+ -- Bundle up all the register allocator statistics.
+ -- .. but make sure to drop them on the floor if they're not
+ -- needed, otherwise we'll get a space leak.
+ let statList =
+ if dump
+ then [stat] ++ maybeToList stat1 ++ debug_codeGraphs
+ else []
+
+ -- Ensure all the statistics are evaluated, to avoid space leaks.
+ seqList statList (return ())
+
+ regAlloc_spin dflags (spinCount + 1) triv regsFree slotsFree'
+ slotsCount' statList code_relive cfg
+
+
+-- | Build a graph from the liveness and coalesce information in this code.
+buildGraph
+ :: Instruction instr
+ => [LiveCmmDecl statics instr]
+ -> UniqSM (Color.Graph VirtualReg RegClass RealReg)
+
+buildGraph code
+ = do
+ -- Slurp out the conflicts and reg->reg moves from this code.
+ let (conflictList, moveList) =
+ unzip $ map slurpConflicts code
+
+ -- Slurp out the spill/reload coalesces.
+ let moveList2 = map slurpReloadCoalesce code
+
+ -- Add the reg-reg conflicts to the graph.
+ let conflictBag = unionManyBags conflictList
+ let graph_conflict
+ = foldr graphAddConflictSet Color.initGraph conflictBag
+
+ -- Add the coalescence edges to the graph.
+ let moveBag
+ = unionBags (unionManyBags moveList2)
+ (unionManyBags moveList)
+
+ let graph_coalesce
+ = foldr graphAddCoalesce graph_conflict moveBag
+
+ return graph_coalesce
+
+
+-- | Add some conflict edges to the graph.
+-- Conflicts between virtual and real regs are recorded as exclusions.
+graphAddConflictSet
+ :: UniqSet Reg
+ -> Color.Graph VirtualReg RegClass RealReg
+ -> Color.Graph VirtualReg RegClass RealReg
+
+graphAddConflictSet set graph
+ = let virtuals = mkUniqSet
+ [ vr | RegVirtual vr <- nonDetEltsUniqSet set ]
+
+ graph1 = Color.addConflicts virtuals classOfVirtualReg graph
+
+ graph2 = foldr (\(r1, r2) -> Color.addExclusion r1 classOfVirtualReg r2)
+ graph1
+ [ (vr, rr)
+ | RegVirtual vr <- nonDetEltsUniqSet set
+ , RegReal rr <- nonDetEltsUniqSet set]
+ -- See Note [Unique Determinism and code generation]
+
+ in graph2
+
+
+-- | Add some coalescence edges to the graph.
+-- Coalescences between virtual and real regs are recorded as preferences.
+graphAddCoalesce
+ :: (Reg, Reg)
+ -> Color.Graph VirtualReg RegClass RealReg
+ -> Color.Graph VirtualReg RegClass RealReg
+
+graphAddCoalesce (r1, r2) graph
+ | RegReal rr <- r1
+ , RegVirtual vr <- r2
+ = Color.addPreference (vr, classOfVirtualReg vr) rr graph
+
+ | RegReal rr <- r2
+ , RegVirtual vr <- r1
+ = Color.addPreference (vr, classOfVirtualReg vr) rr graph
+
+ | RegVirtual vr1 <- r1
+ , RegVirtual vr2 <- r2
+ = Color.addCoalesce
+ (vr1, classOfVirtualReg vr1)
+ (vr2, classOfVirtualReg vr2)
+ graph
+
+ -- We can't coalesce two real regs, but there could well be existing
+ -- hreg,hreg moves in the input code. We'll just ignore these
+ -- for coalescing purposes.
+ | RegReal _ <- r1
+ , RegReal _ <- r2
+ = graph
+
+#if __GLASGOW_HASKELL__ <= 810
+ | otherwise
+ = panic "graphAddCoalesce"
+#endif
+
+
+-- | Patch registers in code using the reg -> reg mapping in this graph.
+patchRegsFromGraph
+ :: (Outputable statics, Outputable instr, Instruction instr)
+ => Platform -> Color.Graph VirtualReg RegClass RealReg
+ -> LiveCmmDecl statics instr -> LiveCmmDecl statics instr
+
+patchRegsFromGraph platform graph code
+ = patchEraseLive patchF code
+ where
+ -- Function to lookup the hardreg for a virtual reg from the graph.
+ patchF reg
+ -- leave real regs alone.
+ | RegReal{} <- reg
+ = reg
+
+ -- this virtual has a regular node in the graph.
+ | RegVirtual vr <- reg
+ , Just node <- Color.lookupNode graph vr
+ = case Color.nodeColor node of
+ Just color -> RegReal color
+ Nothing -> RegVirtual vr
+
+ -- no node in the graph for this virtual, bad news.
+ | otherwise
+ = pprPanic "patchRegsFromGraph: register mapping failed."
+ ( text "There is no node in the graph for register "
+ <> ppr reg
+ $$ ppr code
+ $$ Color.dotGraph
+ (\_ -> text "white")
+ (trivColorable platform
+ (targetVirtualRegSqueeze platform)
+ (targetRealRegSqueeze platform))
+ graph)
+
+
+-----
+-- for when laziness just isn't what you wanted...
+-- We need to deepSeq the whole graph before trying to colour it to avoid
+-- space leaks.
+seqGraph :: Color.Graph VirtualReg RegClass RealReg -> ()
+seqGraph graph = seqNodes (nonDetEltsUFM (Color.graphMap graph))
+ -- See Note [Unique Determinism and code generation]
+
+seqNodes :: [Color.Node VirtualReg RegClass RealReg] -> ()
+seqNodes ns
+ = case ns of
+ [] -> ()
+ (n : ns) -> seqNode n `seq` seqNodes ns
+
+seqNode :: Color.Node VirtualReg RegClass RealReg -> ()
+seqNode node
+ = seqVirtualReg (Color.nodeId node)
+ `seq` seqRegClass (Color.nodeClass node)
+ `seq` seqMaybeRealReg (Color.nodeColor node)
+ `seq` (seqVirtualRegList (nonDetEltsUniqSet (Color.nodeConflicts node)))
+ `seq` (seqRealRegList (nonDetEltsUniqSet (Color.nodeExclusions node)))
+ `seq` (seqRealRegList (Color.nodePreference node))
+ `seq` (seqVirtualRegList (nonDetEltsUniqSet (Color.nodeCoalesce node)))
+ -- It's OK to use nonDetEltsUniqSet for seq
+
+seqVirtualReg :: VirtualReg -> ()
+seqVirtualReg reg = reg `seq` ()
+
+seqRealReg :: RealReg -> ()
+seqRealReg reg = reg `seq` ()
+
+seqRegClass :: RegClass -> ()
+seqRegClass c = c `seq` ()
+
+seqMaybeRealReg :: Maybe RealReg -> ()
+seqMaybeRealReg mr
+ = case mr of
+ Nothing -> ()
+ Just r -> seqRealReg r
+
+seqVirtualRegList :: [VirtualReg] -> ()
+seqVirtualRegList rs
+ = case rs of
+ [] -> ()
+ (r : rs) -> seqVirtualReg r `seq` seqVirtualRegList rs
+
+seqRealRegList :: [RealReg] -> ()
+seqRealRegList rs
+ = case rs of
+ [] -> ()
+ (r : rs) -> seqRealReg r `seq` seqRealRegList rs
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Base.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Base.hs
new file mode 100644
index 0000000000..95fa174415
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/Base.hs
@@ -0,0 +1,163 @@
+
+-- | Utils for calculating the general worst, bound, squeese and free functions,
+--
+-- as per: "A Generalized Algorithm for Graph-Coloring Register Allocation"
+-- Michael Smith, Norman Ramsey, Glenn Holloway.
+-- PLDI 2004
+--
+-- These general versions are not used in GHC proper because they are too slow.
+-- Instead, hand written optimised versions are provided for each architecture
+-- in MachRegs*.hs
+--
+-- This code is here because we can test the architecture specific code against
+-- it.
+--
+module GHC.CmmToAsm.Reg.Graph.Base (
+ RegClass(..),
+ Reg(..),
+ RegSub(..),
+
+ worst,
+ bound,
+ squeese
+) where
+
+import GhcPrelude
+
+import UniqSet
+import UniqFM
+import Unique
+import MonadUtils (concatMapM)
+
+
+-- Some basic register classes.
+-- These aren't necessarily in 1-to-1 correspondence with the allocatable
+-- RegClasses in MachRegs.hs
+data RegClass
+ -- general purpose regs
+ = ClassG32 -- 32 bit GPRs
+ | ClassG16 -- 16 bit GPRs
+ | ClassG8 -- 8 bit GPRs
+
+ -- floating point regs
+ | ClassF64 -- 64 bit FPRs
+ deriving (Show, Eq, Enum)
+
+
+-- | A register of some class
+data Reg
+ -- a register of some class
+ = Reg RegClass Int
+
+ -- a sub-component of one of the other regs
+ | RegSub RegSub Reg
+ deriving (Show, Eq)
+
+
+-- | so we can put regs in UniqSets
+instance Uniquable Reg where
+ getUnique (Reg c i)
+ = mkRegSingleUnique
+ $ fromEnum c * 1000 + i
+
+ getUnique (RegSub s (Reg c i))
+ = mkRegSubUnique
+ $ fromEnum s * 10000 + fromEnum c * 1000 + i
+
+ getUnique (RegSub _ (RegSub _ _))
+ = error "RegArchBase.getUnique: can't have a sub-reg of a sub-reg."
+
+
+-- | A subcomponent of another register
+data RegSub
+ = SubL16 -- lowest 16 bits
+ | SubL8 -- lowest 8 bits
+ | SubL8H -- second lowest 8 bits
+ deriving (Show, Enum, Ord, Eq)
+
+
+-- | Worst case displacement
+--
+-- a node N of classN has some number of neighbors,
+-- all of which are from classC.
+--
+-- (worst neighbors classN classC) is the maximum number of potential
+-- colors for N that can be lost by coloring its neighbors.
+--
+-- This should be hand coded/cached for each particular architecture,
+-- because the compute time is very long.
+worst :: (RegClass -> UniqSet Reg)
+ -> (Reg -> UniqSet Reg)
+ -> Int -> RegClass -> RegClass -> Int
+
+worst regsOfClass regAlias neighbors classN classC
+ = let regAliasS regs = unionManyUniqSets
+ $ map regAlias
+ $ nonDetEltsUniqSet regs
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+
+ -- all the regs in classes N, C
+ regsN = regsOfClass classN
+ regsC = regsOfClass classC
+
+ -- all the non-empty subsets of regsC with at most `neighbors` elements
+ regsS = filter (\s -> sizeUniqSet s >= 1
+ && sizeUniqSet s <= neighbors)
+ $ powersetLS regsC
+
+ -- for each of the subsets of C, the regs which conflict
+ -- with possibilities for N
+ regsS_conflict
+ = map (\s -> intersectUniqSets regsN (regAliasS s)) regsS
+
+ in maximum $ map sizeUniqSet $ regsS_conflict
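+
+-- A small hedged example (added commentary): with no aliasing, i.e.
+-- regAlias r = unitUniqSet r, and classN == classC with k allocatable
+-- registers, a node with n >= 1 neighbours can lose at most min n k
+-- colours, so
+--
+--   worst regsOfClass regAlias n c c == min n k
+--
+-- The interesting cases are architectures where registers overlap, e.g.
+-- 8-bit and 32-bit x86 registers occupying the same physical register.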
+
+
+-- | For a node N of classN and neighbors of classesC
+-- (bound classN classesC) is the maximum number of potential
+-- colors for N that can be lost by coloring its neighbors.
+bound :: (RegClass -> UniqSet Reg)
+ -> (Reg -> UniqSet Reg)
+ -> RegClass -> [RegClass] -> Int
+
+bound regsOfClass regAlias classN classesC
+ = let regAliasS regs = unionManyUniqSets
+ $ map regAlias
+ $ nonDetEltsUFM regs
+ -- See Note [Unique Determinism and code generation]
+
+ regsC_aliases
+ = unionManyUniqSets
+ $ map (regAliasS . getUniqSet . regsOfClass) classesC
+
+ overlap = intersectUniqSets (regsOfClass classN) regsC_aliases
+
+ in sizeUniqSet overlap
+
+
+-- | The total squeese on a particular node with a list of neighbors.
+--
+-- A version of this should be constructed for each particular architecture,
+-- possibly including uses of bound, so that aliased registers don't get
+-- counted twice, as per the paper.
+squeese :: (RegClass -> UniqSet Reg)
+ -> (Reg -> UniqSet Reg)
+ -> RegClass -> [(Int, RegClass)] -> Int
+
+squeese regsOfClass regAlias classN countCs
+ = sum
+ $ map (\(i, classC) -> worst regsOfClass regAlias i classN classC)
+ $ countCs
+
+
+-- | powerset (for lists)
+powersetL :: [a] -> [[a]]
+powersetL = concatMapM (\x -> [[],[x]])
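+-- e.g. powersetL [1,2] == [[],[2],[1],[1,2]] (added commentary; this uses
+-- concatMapM in the list monad, so every element is independently either
+-- dropped or kept).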
+
+
+-- | powersetLS (list of sets)
+powersetLS :: Uniquable a => UniqSet a -> [UniqSet a]
+powersetLS s = map mkUniqSet $ powersetL $ nonDetEltsUniqSet s
+ -- See Note [Unique Determinism and code generation]
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs
new file mode 100644
index 0000000000..d223137dd0
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs
@@ -0,0 +1,99 @@
+-- | Register coalescing.
+module GHC.CmmToAsm.Reg.Graph.Coalesce (
+ regCoalesce,
+ slurpJoinMovs
+) where
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+
+import GHC.Cmm
+import Bag
+import Digraph
+import UniqFM
+import UniqSet
+import UniqSupply
+
+
+-- | Do register coalescing on this top level thing
+--
+-- For Reg -> Reg moves, if the first reg dies at the same time the
+-- second reg is born then the mov only serves to join live ranges.
+-- The two regs can be renamed to be the same and the move instruction
+-- safely erased.
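+--
+-- For example: given a move from v1 to v2 where v1 dies and v2 is born at
+-- that instruction, we record the pair (v1, v2), rename the lexically
+-- larger register to the smaller one, and patchEraseLive then drops the
+-- resulting self-move.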
+regCoalesce
+ :: Instruction instr
+ => [LiveCmmDecl statics instr]
+ -> UniqSM [LiveCmmDecl statics instr]
+
+regCoalesce code
+ = do
+ let joins = foldl' unionBags emptyBag
+ $ map slurpJoinMovs code
+
+ let alloc = foldl' buildAlloc emptyUFM
+ $ bagToList joins
+
+ let patched = map (patchEraseLive (sinkReg alloc)) code
+
+ return patched
+
+
+-- | Add a v1 = v2 register renaming to the map.
+-- The register with the lowest lexical name is set as the
+-- canonical version.
+buildAlloc :: UniqFM Reg -> (Reg, Reg) -> UniqFM Reg
+buildAlloc fm (r1, r2)
+ = let rmin = min r1 r2
+ rmax = max r1 r2
+ in addToUFM fm rmax rmin
+
+
+-- | Determine the canonical name for a register by following
+-- v1 = v2 renamings in this map.
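+--
+-- For example, with renamings v3 -> v2 and v2 -> v1 in the map,
+-- sinkReg resolves both v3 and v2 to the canonical register v1.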
+sinkReg :: UniqFM Reg -> Reg -> Reg
+sinkReg fm r
+ = case lookupUFM fm r of
+ Nothing -> r
+ Just r' -> sinkReg fm r'
+
+
+-- | Slurp out mov instructions that only serve to join live ranges.
+--
+-- During a mov, if the source reg dies and the destination reg is
+-- born then we can rename the two regs to the same thing and
+-- eliminate the move.
+slurpJoinMovs
+ :: Instruction instr
+ => LiveCmmDecl statics instr
+ -> Bag (Reg, Reg)
+
+slurpJoinMovs live
+ = slurpCmm emptyBag live
+ where
+ slurpCmm rs CmmData{}
+ = rs
+
+ slurpCmm rs (CmmProc _ _ _ sccs)
+ = foldl' slurpBlock rs (flattenSCCs sccs)
+
+ slurpBlock rs (BasicBlock _ instrs)
+ = foldl' slurpLI rs instrs
+
+ slurpLI rs (LiveInstr _ Nothing) = rs
+ slurpLI rs (LiveInstr instr (Just live))
+ | Just (r1, r2) <- takeRegRegMoveInstr instr
+ , elementOfUniqSet r1 $ liveDieRead live
+ , elementOfUniqSet r2 $ liveBorn live
+
+ -- only coalesce movs between two virtuals for now,
+ -- else we end up with allocatable regs in the live
+ -- regs list..
+ , isVirtualReg r1 && isVirtualReg r2
+ = consBag (r1, r2) rs
+
+ | otherwise
+ = rs
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs
new file mode 100644
index 0000000000..a0e11433f7
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs
@@ -0,0 +1,382 @@
+
+-- | When there aren't enough registers to hold all the vregs we have to spill
+-- some of those vregs to slots on the stack. This module is used to modify the
+-- code to use those slots.
+module GHC.CmmToAsm.Reg.Graph.Spill (
+ regSpill,
+ SpillStats(..),
+ accSpillSL
+) where
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+import GHC.Cmm hiding (RegSet)
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+
+import MonadUtils
+import State
+import Unique
+import UniqFM
+import UniqSet
+import UniqSupply
+import Outputable
+import GHC.Platform
+
+import Data.List
+import Data.Maybe
+import Data.IntSet (IntSet)
+import qualified Data.IntSet as IntSet
+
+
+-- | Spill all these virtual regs to stack slots.
+--
+-- Bumps the number of required stack slots if necessary.
+--
+--
+-- TODO: See if we can split some of the live ranges instead of just globally
+-- spilling the virtual reg. This might make the spill cleaner's job easier.
+--
+-- TODO: On CISCy x86 and x86_64 we don't necessarily have to add a mov instruction
+-- when making spills. If an instr is using a spilled virtual we may be able to
+-- address the spill slot directly.
+--
+regSpill
+ :: Instruction instr
+ => Platform
+ -> [LiveCmmDecl statics instr] -- ^ the code
+ -> UniqSet Int -- ^ available stack slots
+ -> Int -- ^ current number of spill slots.
+ -> UniqSet VirtualReg -- ^ the regs to spill
+ -> UniqSM
+ ([LiveCmmDecl statics instr]
+ -- code with SPILL and RELOAD meta instructions added.
+ , UniqSet Int -- left over slots
+ , Int -- slot count in use now.
+ , SpillStats ) -- stats about what happened during spilling
+
+regSpill platform code slotsFree slotCount regs
+
+ -- Not enough slots to spill these regs.
+ | sizeUniqSet slotsFree < sizeUniqSet regs
+ = -- pprTrace "Bumping slot count:" (ppr slotCount <> text " -> " <> ppr (slotCount+512)) $
+ let slotsFree' = (addListToUniqSet slotsFree [slotCount+1 .. slotCount+512])
+ in regSpill platform code slotsFree' (slotCount+512) regs
+
+ | otherwise
+ = do
+ -- Allocate a slot for each of the spilled regs.
+ let slots = take (sizeUniqSet regs) $ nonDetEltsUniqSet slotsFree
+ let regSlotMap = listToUFM
+ $ zip (nonDetEltsUniqSet regs) slots
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+
+ -- Grab the unique supply from the monad.
+ us <- getUniqueSupplyM
+
+ -- Run the spiller on all the blocks.
+ let (code', state') =
+ runState (mapM (regSpill_top platform regSlotMap) code)
+ (initSpillS us)
+
+ return ( code'
+ , minusUniqSet slotsFree (mkUniqSet slots)
+ , slotCount
+ , makeSpillStats state')
+
+
+-- | Spill some registers to stack slots in a top-level thing.
+regSpill_top
+ :: Instruction instr
+ => Platform
+ -> RegMap Int
+ -- ^ map of vregs to slots they're being spilled to.
+ -> LiveCmmDecl statics instr
+ -- ^ the top level thing.
+ -> SpillM (LiveCmmDecl statics instr)
+
+regSpill_top platform regSlotMap cmm
+ = case cmm of
+ CmmData{}
+ -> return cmm
+
+ CmmProc info label live sccs
+ | LiveInfo static firstId liveVRegsOnEntry liveSlotsOnEntry <- info
+ -> do
+ -- The liveVRegsOnEntry contains the set of vregs that are live
+ -- on entry to each basic block. If we spill one of those vregs
+ -- we remove it from that set and add the corresponding slot
+ -- number to the liveSlotsOnEntry set. The spill cleaner needs
+ -- this information to erase unneeded spill and reload instructions
+ -- after we've done a successful allocation.
+ let liveSlotsOnEntry' :: BlockMap IntSet
+ liveSlotsOnEntry'
+ = mapFoldlWithKey patchLiveSlot
+ liveSlotsOnEntry liveVRegsOnEntry
+
+ let info'
+ = LiveInfo static firstId
+ liveVRegsOnEntry
+ liveSlotsOnEntry'
+
+ -- Apply the spiller to all the basic blocks in the CmmProc.
+ sccs' <- mapM (mapSCCM (regSpill_block platform regSlotMap)) sccs
+
+ return $ CmmProc info' label live sccs'
+
+ where -- Given a BlockId and the set of registers live in it,
+ -- if registers in this block are being spilled to stack slots,
+ -- then record the fact that these slots are now live in those blocks
+ -- in the given slotmap.
+ patchLiveSlot
+ :: BlockMap IntSet -> BlockId -> RegSet -> BlockMap IntSet
+
+ patchLiveSlot slotMap blockId regsLive
+ = let
+ -- Slots that are already recorded as being live.
+ curSlotsLive = fromMaybe IntSet.empty
+ $ mapLookup blockId slotMap
+
+ moreSlotsLive = IntSet.fromList
+ $ catMaybes
+ $ map (lookupUFM regSlotMap)
+ $ nonDetEltsUniqSet regsLive
+ -- See Note [Unique Determinism and code generation]
+
+ slotMap'
+ = mapInsert blockId (IntSet.union curSlotsLive moreSlotsLive)
+ slotMap
+
+ in slotMap'
+
+
+-- | Spill some registers to stack slots in a basic block.
+regSpill_block
+ :: Instruction instr
+ => Platform
+ -> UniqFM Int -- ^ map of vregs to slots they're being spilled to.
+ -> LiveBasicBlock instr
+ -> SpillM (LiveBasicBlock instr)
+
+regSpill_block platform regSlotMap (BasicBlock i instrs)
+ = do instrss' <- mapM (regSpill_instr platform regSlotMap) instrs
+ return $ BasicBlock i (concat instrss')
+
+
+-- | Spill some registers to stack slots in a single instruction.
+-- If the instruction uses registers that need to be spilled, then it is
+-- prefixed (or postfixed) with the appropriate RELOAD or SPILL meta
+-- instructions.
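+--
+-- For example (sketch): if vreg v is spilled to SLOT(3) and an instruction
+-- reads v, the instruction is rewritten to read a fresh vreg v' and is
+-- prefixed with a RELOAD SLOT(3), v' meta instruction; an instruction that
+-- writes v gets a trailing SPILL instead.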
+regSpill_instr
+ :: Instruction instr
+ => Platform
+ -> UniqFM Int -- ^ map of vregs to slots they're being spilled to.
+ -> LiveInstr instr
+ -> SpillM [LiveInstr instr]
+
+regSpill_instr _ _ li@(LiveInstr _ Nothing)
+ = do return [li]
+
+regSpill_instr platform regSlotMap
+ (LiveInstr instr (Just _))
+ = do
+ -- work out which regs are read and written in this instr
+ let RU rlRead rlWritten = regUsageOfInstr platform instr
+
+ -- sometimes a register is listed as being read more than once,
+ -- nub this so we don't end up inserting two lots of spill code.
+ let rsRead_ = nub rlRead
+ let rsWritten_ = nub rlWritten
+
+ -- if a reg is modified, it appears in both lists, want to undo this..
+ let rsRead = rsRead_ \\ rsWritten_
+ let rsWritten = rsWritten_ \\ rsRead_
+ let rsModify = intersect rsRead_ rsWritten_
+
+ -- work out if any of the regs being used are currently being spilled.
+ let rsSpillRead = filter (\r -> elemUFM r regSlotMap) rsRead
+ let rsSpillWritten = filter (\r -> elemUFM r regSlotMap) rsWritten
+ let rsSpillModify = filter (\r -> elemUFM r regSlotMap) rsModify
+
+ -- rewrite the instr and work out spill code.
+ (instr1, prepost1) <- mapAccumLM (spillRead regSlotMap) instr rsSpillRead
+ (instr2, prepost2) <- mapAccumLM (spillWrite regSlotMap) instr1 rsSpillWritten
+ (instr3, prepost3) <- mapAccumLM (spillModify regSlotMap) instr2 rsSpillModify
+
+ let (mPrefixes, mPostfixes) = unzip (prepost1 ++ prepost2 ++ prepost3)
+ let prefixes = concat mPrefixes
+ let postfixes = concat mPostfixes
+
+ -- final code
+ let instrs' = prefixes
+ ++ [LiveInstr instr3 Nothing]
+ ++ postfixes
+
+ return $ instrs'
+
+
+-- | Add a RELOAD meta instruction to load a value for an instruction that
+-- reads from a vreg that is being spilled.
+spillRead
+ :: Instruction instr
+ => UniqFM Int
+ -> instr
+ -> Reg
+ -> SpillM (instr, ([LiveInstr instr'], [LiveInstr instr']))
+
+spillRead regSlotMap instr reg
+ | Just slot <- lookupUFM regSlotMap reg
+ = do (instr', nReg) <- patchInstr reg instr
+
+ modify $ \s -> s
+ { stateSpillSL = addToUFM_C accSpillSL (stateSpillSL s) reg (reg, 0, 1) }
+
+ return ( instr'
+ , ( [LiveInstr (RELOAD slot nReg) Nothing]
+ , []) )
+
+ | otherwise = panic "RegSpill.spillRead: no slot defined for spilled reg"
+
+
+-- | Add a SPILL meta instruction to store a value for an instruction that
+-- writes to a vreg that is being spilled.
+spillWrite
+ :: Instruction instr
+ => UniqFM Int
+ -> instr
+ -> Reg
+ -> SpillM (instr, ([LiveInstr instr'], [LiveInstr instr']))
+
+spillWrite regSlotMap instr reg
+ | Just slot <- lookupUFM regSlotMap reg
+ = do (instr', nReg) <- patchInstr reg instr
+
+ modify $ \s -> s
+ { stateSpillSL = addToUFM_C accSpillSL (stateSpillSL s) reg (reg, 1, 0) }
+
+ return ( instr'
+ , ( []
+ , [LiveInstr (SPILL nReg slot) Nothing]))
+
+ | otherwise = panic "RegSpill.spillWrite: no slot defined for spilled reg"
+
+
+-- | Add both RELOAD and SPILL meta instructions for an instruction that
+-- both reads and writes to a vreg that is being spilled.
+spillModify
+ :: Instruction instr
+ => UniqFM Int
+ -> instr
+ -> Reg
+ -> SpillM (instr, ([LiveInstr instr'], [LiveInstr instr']))
+
+spillModify regSlotMap instr reg
+ | Just slot <- lookupUFM regSlotMap reg
+ = do (instr', nReg) <- patchInstr reg instr
+
+ modify $ \s -> s
+ { stateSpillSL = addToUFM_C accSpillSL (stateSpillSL s) reg (reg, 1, 1) }
+
+ return ( instr'
+ , ( [LiveInstr (RELOAD slot nReg) Nothing]
+ , [LiveInstr (SPILL nReg slot) Nothing]))
+
+ | otherwise = panic "RegSpill.spillModify: no slot defined for spilled reg"
+
+
+-- | Rewrite uses of this virtual reg in an instr to use a different
+-- virtual reg.
+patchInstr
+ :: Instruction instr
+ => Reg -> instr -> SpillM (instr, Reg)
+
+patchInstr reg instr
+ = do nUnique <- newUnique
+
+ -- The register we're rewriting is supposed to be virtual.
+ -- If it's not then something has gone horribly wrong.
+ let nReg
+ = case reg of
+ RegVirtual vr
+ -> RegVirtual (renameVirtualReg nUnique vr)
+
+ RegReal{}
+ -> panic "RegAlloc.Graph.Spill.patchIntr: not patching real reg"
+
+ let instr' = patchReg1 reg nReg instr
+ return (instr', nReg)
+
+
+patchReg1
+ :: Instruction instr
+ => Reg -> Reg -> instr -> instr
+
+patchReg1 old new instr
+ = let patchF r
+ | r == old = new
+ | otherwise = r
+ in patchRegsOfInstr instr patchF
+
+
+-- Spiller monad --------------------------------------------------------------
+-- | State monad for the spill code generator.
+type SpillM a
+ = State SpillS a
+
+-- | Spill code generator state.
+data SpillS
+ = SpillS
+ { -- | Unique supply for generating fresh vregs.
+ stateUS :: UniqSupply
+
+ -- | Spilled vreg vs the number of times it was stored, loaded.
+ , stateSpillSL :: UniqFM (Reg, Int, Int) }
+
+
+-- | Create a new spiller state.
+initSpillS :: UniqSupply -> SpillS
+initSpillS uniqueSupply
+ = SpillS
+ { stateUS = uniqueSupply
+ , stateSpillSL = emptyUFM }
+
+
+-- | Allocate a new unique in the spiller monad.
+newUnique :: SpillM Unique
+newUnique
+ = do us <- gets stateUS
+ case takeUniqFromSupply us of
+ (uniq, us')
+ -> do modify $ \s -> s { stateUS = us' }
+ return uniq
+
+
+-- | Add a spill/reload count to a stats record for a register.
+accSpillSL :: (Reg, Int, Int) -> (Reg, Int, Int) -> (Reg, Int, Int)
+accSpillSL (r1, s1, l1) (_, s2, l2)
+ = (r1, s1 + s2, l1 + l2)
+
+
+-- Spiller stats --------------------------------------------------------------
+-- | Spiller statistics.
+-- Tells us what registers were spilled.
+data SpillStats
+ = SpillStats
+ { spillStoreLoad :: UniqFM (Reg, Int, Int) }
+
+
+-- | Extract spiller statistics from the spiller state.
+makeSpillStats :: SpillS -> SpillStats
+makeSpillStats s
+ = SpillStats
+ { spillStoreLoad = stateSpillSL s }
+
+
+instance Outputable SpillStats where
+ ppr stats
+ = pprUFM (spillStoreLoad stats)
+ (vcat . map (\(r, s, l) -> ppr r <+> int s <+> int l))
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs b/compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs
new file mode 100644
index 0000000000..6d14c7194b
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs
@@ -0,0 +1,616 @@
+{-# LANGUAGE CPP #-}
+
+-- | Clean out unneeded spill\/reload instructions.
+--
+-- Handling of join points
+-- ~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- B1: B2:
+-- ... ...
+-- RELOAD SLOT(0), %r1 RELOAD SLOT(0), %r1
+-- ... A ... ... B ...
+-- jump B3 jump B3
+--
+-- B3: ... C ...
+-- RELOAD SLOT(0), %r1
+-- ...
+--
+-- The Plan
+-- ~~~~~~~~
+-- As long as %r1 hasn't been written to in A, B or C then we don't need
+-- the reload in B3.
+--
+-- What we really care about here is that on the entry to B3, %r1 will
+-- always have the same value that is in SLOT(0) (ie, %r1 is _valid_)
+--
+-- This also works if the reloads in B1\/B2 were spills instead, because
+-- spilling %r1 to a slot makes that slot have the same value as %r1.
+--
+module GHC.CmmToAsm.Reg.Graph.SpillClean (
+ cleanSpills
+) where
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import UniqSet
+import UniqFM
+import Unique
+import State
+import Outputable
+import GHC.Platform
+import GHC.Cmm.Dataflow.Collections
+
+import Data.List
+import Data.Maybe
+import Data.IntSet (IntSet)
+import qualified Data.IntSet as IntSet
+
+
+-- | The identification number of a spill slot.
+-- A value is stored in a spill slot when we don't have a free
+-- register to hold it.
+type Slot = Int
+
+
+-- | Clean out unneeded spill\/reloads from this top level thing.
+cleanSpills
+ :: Instruction instr
+ => Platform
+ -> LiveCmmDecl statics instr
+ -> LiveCmmDecl statics instr
+
+cleanSpills platform cmm
+ = evalState (cleanSpin platform 0 cmm) initCleanS
+
+
+-- | Do one pass of cleaning.
+cleanSpin
+ :: Instruction instr
+ => Platform
+ -> Int -- ^ Iteration number for the cleaner.
+ -> LiveCmmDecl statics instr -- ^ Liveness annotated code to clean.
+ -> CleanM (LiveCmmDecl statics instr)
+
+cleanSpin platform spinCount code
+ = do
+ -- Initialise count of cleaned spill and reload instructions.
+ modify $ \s -> s
+ { sCleanedSpillsAcc = 0
+ , sCleanedReloadsAcc = 0
+ , sReloadedBy = emptyUFM }
+
+ code_forward <- mapBlockTopM (cleanBlockForward platform) code
+ code_backward <- cleanTopBackward code_forward
+
+ -- During the cleaning of each block we collected information about
+ -- what regs were valid across each jump. Based on this, work out
+ -- whether it will be safe to erase reloads after join points for
+ -- the next pass.
+ collateJoinPoints
+
+ -- Remember how many spill and reload instructions we cleaned in this pass.
+ spills <- gets sCleanedSpillsAcc
+ reloads <- gets sCleanedReloadsAcc
+ modify $ \s -> s
+ { sCleanedCount = (spills, reloads) : sCleanedCount s }
+
+ -- If nothing was cleaned in this pass or the last one
+ -- then we're done and it's time to bail out.
+ cleanedCount <- gets sCleanedCount
+ if take 2 cleanedCount == [(0, 0), (0, 0)]
+ then return code
+
+ -- otherwise go around again
+ else cleanSpin platform (spinCount + 1) code_backward
+
+
+-------------------------------------------------------------------------------
+-- | Clean out unneeded reload instructions,
+-- while walking forward over the code.
+cleanBlockForward
+ :: Instruction instr
+ => Platform
+ -> LiveBasicBlock instr
+ -> CleanM (LiveBasicBlock instr)
+
+cleanBlockForward platform (BasicBlock blockId instrs)
+ = do
+ -- See if we have a valid association for the entry to this block.
+ jumpValid <- gets sJumpValid
+ let assoc = case lookupUFM jumpValid blockId of
+ Just assoc -> assoc
+ Nothing -> emptyAssoc
+
+ instrs_reload <- cleanForward platform blockId assoc [] instrs
+ return $ BasicBlock blockId instrs_reload
+
+
+
+-- | Clean out unneeded reload instructions.
+--
+-- Walking forwards across the code
+-- On a reload, if we know a reg already has the same value as a slot
+-- then we don't need to do the reload.
+--
+cleanForward
+ :: Instruction instr
+ => Platform
+ -> BlockId -- ^ the block that we're currently in
+ -> Assoc Store -- ^ two store locations are associated if
+ -- they have the same value
+ -> [LiveInstr instr] -- ^ acc
+ -> [LiveInstr instr] -- ^ instrs to clean (in backwards order)
+ -> CleanM [LiveInstr instr] -- ^ cleaned instrs (in forward order)
+
+cleanForward _ _ _ acc []
+ = return acc
+
+-- Rewrite live range joins via spill slots to just a spill and a reg-reg move
+-- hopefully the spill will also be cleaned in the next pass
+cleanForward platform blockId assoc acc (li1 : li2 : instrs)
+
+ | LiveInstr (SPILL reg1 slot1) _ <- li1
+ , LiveInstr (RELOAD slot2 reg2) _ <- li2
+ , slot1 == slot2
+ = do
+ modify $ \s -> s { sCleanedReloadsAcc = sCleanedReloadsAcc s + 1 }
+ cleanForward platform blockId assoc acc
+ $ li1 : LiveInstr (mkRegRegMoveInstr platform reg1 reg2) Nothing
+ : instrs
+
+cleanForward platform blockId assoc acc (li@(LiveInstr i1 _) : instrs)
+ | Just (r1, r2) <- takeRegRegMoveInstr i1
+ = if r1 == r2
+ -- Erase any left over nop reg reg moves while we're here
+ -- this will also catch any nop moves that the previous case
+ -- happens to add.
+ then cleanForward platform blockId assoc acc instrs
+
+ -- If r1 has the same value as some slots and we copy r1 to r2,
+ -- then r2 is now associated with those slots instead
+ else do let assoc' = addAssoc (SReg r1) (SReg r2)
+ $ delAssoc (SReg r2)
+ $ assoc
+
+ cleanForward platform blockId assoc' (li : acc) instrs
+
+
+cleanForward platform blockId assoc acc (li : instrs)
+
+ -- Update association due to the spill.
+ | LiveInstr (SPILL reg slot) _ <- li
+ = let assoc' = addAssoc (SReg reg) (SSlot slot)
+ $ delAssoc (SSlot slot)
+ $ assoc
+ in cleanForward platform blockId assoc' (li : acc) instrs
+
+ -- Clean a reload instr.
+ | LiveInstr (RELOAD{}) _ <- li
+ = do (assoc', mli) <- cleanReload platform blockId assoc li
+ case mli of
+ Nothing -> cleanForward platform blockId assoc' acc
+ instrs
+
+ Just li' -> cleanForward platform blockId assoc' (li' : acc)
+ instrs
+
+ -- Remember the association over a jump.
+ | LiveInstr instr _ <- li
+ , targets <- jumpDestsOfInstr instr
+ , not $ null targets
+ = do mapM_ (accJumpValid assoc) targets
+ cleanForward platform blockId assoc (li : acc) instrs
+
+ -- Writing to a reg changes its value.
+ | LiveInstr instr _ <- li
+ , RU _ written <- regUsageOfInstr platform instr
+ = let assoc' = foldr delAssoc assoc (map SReg $ nub written)
+ in cleanForward platform blockId assoc' (li : acc) instrs
+
+
+
+-- | Try and rewrite a reload instruction to something more pleasing
+cleanReload
+ :: Instruction instr
+ => Platform
+ -> BlockId
+ -> Assoc Store
+ -> LiveInstr instr
+ -> CleanM (Assoc Store, Maybe (LiveInstr instr))
+
+cleanReload platform blockId assoc li@(LiveInstr (RELOAD slot reg) _)
+
+ -- If the reg we're reloading already has the same value as the slot
+ -- then we can erase the instruction outright.
+ | elemAssoc (SSlot slot) (SReg reg) assoc
+ = do modify $ \s -> s { sCleanedReloadsAcc = sCleanedReloadsAcc s + 1 }
+ return (assoc, Nothing)
+
+ -- If we can find another reg with the same value as this slot then
+ -- do a move instead of a reload.
+ | Just reg2 <- findRegOfSlot assoc slot
+ = do modify $ \s -> s { sCleanedReloadsAcc = sCleanedReloadsAcc s + 1 }
+
+ let assoc' = addAssoc (SReg reg) (SReg reg2)
+ $ delAssoc (SReg reg)
+ $ assoc
+
+ return ( assoc'
+ , Just $ LiveInstr (mkRegRegMoveInstr platform reg2 reg) Nothing)
+
+ -- Gotta keep this instr.
+ | otherwise
+ = do -- Update the association.
+ let assoc'
+ = addAssoc (SReg reg) (SSlot slot)
+ -- doing the reload makes reg and slot the same value
+ $ delAssoc (SReg reg)
+ -- reg value changes on reload
+ $ assoc
+
+ -- Remember that this block reloads from this slot.
+ accBlockReloadsSlot blockId slot
+
+ return (assoc', Just li)
+
+cleanReload _ _ _ _
+ = panic "RegSpillClean.cleanReload: unhandled instr"
+
+
+-------------------------------------------------------------------------------
+-- | Clean out unneeded spill instructions,
+-- while walking backwards over the code.
+--
+-- If there are no reloads from a slot between one spill to it and the next,
+-- then the value written by the first spill is never read and we don't need
+-- that spill.
+--
+-- SPILL r0 -> s1
+-- RELOAD s1 -> r2
+-- SPILL r3 -> s1 <--- don't need this spill
+-- SPILL r4 -> s1
+-- RELOAD s1 -> r5
+--
+-- Maintain a set of
+-- "slots which were spilled to but not reloaded from yet"
+--
+-- Walking backwards across the code:
+-- a) On a reload from a slot, remove it from the set.
+--
+-- b) On a spill to a slot
+-- If the slot is in the set then we can erase the spill,
+-- because it won't be reloaded from until after the next spill.
+--
+-- otherwise
+-- keep the spill and add the slot to the set
+--
+-- TODO: This is mostly inter-block
+-- we should really be updating the noReloads set as we cross jumps also.
+--
+-- TODO: generate noReloads from liveSlotsOnEntry
+--
+cleanTopBackward
+ :: Instruction instr
+ => LiveCmmDecl statics instr
+ -> CleanM (LiveCmmDecl statics instr)
+
+cleanTopBackward cmm
+ = case cmm of
+ CmmData{}
+ -> return cmm
+
+ CmmProc info label live sccs
+ | LiveInfo _ _ _ liveSlotsOnEntry <- info
+ -> do sccs' <- mapM (mapSCCM (cleanBlockBackward liveSlotsOnEntry)) sccs
+ return $ CmmProc info label live sccs'
+
+
+cleanBlockBackward
+ :: Instruction instr
+ => BlockMap IntSet
+ -> LiveBasicBlock instr
+ -> CleanM (LiveBasicBlock instr)
+
+cleanBlockBackward liveSlotsOnEntry (BasicBlock blockId instrs)
+ = do instrs_spill <- cleanBackward liveSlotsOnEntry emptyUniqSet [] instrs
+ return $ BasicBlock blockId instrs_spill
+
+
+
+cleanBackward
+ :: Instruction instr
+ => BlockMap IntSet -- ^ Slots live on entry to each block
+ -> UniqSet Int -- ^ Slots that have been spilled, but not reloaded from
+ -> [LiveInstr instr] -- ^ acc
+ -> [LiveInstr instr] -- ^ Instrs to clean (in forwards order)
+ -> CleanM [LiveInstr instr] -- ^ Cleaned instrs (in backwards order)
+
+cleanBackward liveSlotsOnEntry noReloads acc lis
+ = do reloadedBy <- gets sReloadedBy
+ cleanBackward' liveSlotsOnEntry reloadedBy noReloads acc lis
+
+
+cleanBackward'
+ :: Instruction instr
+ => BlockMap IntSet
+ -> UniqFM [BlockId]
+ -> UniqSet Int
+ -> [LiveInstr instr]
+ -> [LiveInstr instr]
+ -> State CleanS [LiveInstr instr]
+
+cleanBackward' _ _ _ acc []
+ = return acc
+
+cleanBackward' liveSlotsOnEntry reloadedBy noReloads acc (li : instrs)
+
+ -- If nothing ever reloads from this slot then we don't need the spill.
+ | LiveInstr (SPILL _ slot) _ <- li
+ , Nothing <- lookupUFM reloadedBy (SSlot slot)
+ = do modify $ \s -> s { sCleanedSpillsAcc = sCleanedSpillsAcc s + 1 }
+ cleanBackward liveSlotsOnEntry noReloads acc instrs
+
+ | LiveInstr (SPILL _ slot) _ <- li
+ = if elementOfUniqSet slot noReloads
+
+ -- We can erase this spill because the slot won't be read until
+ -- after the next one
+ then do
+ modify $ \s -> s { sCleanedSpillsAcc = sCleanedSpillsAcc s + 1 }
+ cleanBackward liveSlotsOnEntry noReloads acc instrs
+
+ else do
+ -- This slot is being spilled to, but we haven't seen any reloads yet.
+ let noReloads' = addOneToUniqSet noReloads slot
+ cleanBackward liveSlotsOnEntry noReloads' (li : acc) instrs
+
+ -- if we reload from a slot then it's no longer unused
+ | LiveInstr (RELOAD slot _) _ <- li
+ , noReloads' <- delOneFromUniqSet noReloads slot
+ = cleanBackward liveSlotsOnEntry noReloads' (li : acc) instrs
+
+ -- If a slot is live in a jump target then assume it's reloaded there.
+ --
+ -- TODO: A real dataflow analysis would do a better job here.
+ -- If the target block _ever_ used the slot then we assume
+ -- it always does, but if those reloads are cleaned the slot
+ -- liveness map doesn't get updated.
+ | LiveInstr instr _ <- li
+ , targets <- jumpDestsOfInstr instr
+ = do
+ let slotsReloadedByTargets
+ = IntSet.unions
+ $ catMaybes
+ $ map (flip mapLookup liveSlotsOnEntry)
+ $ targets
+
+ let noReloads'
+ = foldl' delOneFromUniqSet noReloads
+ $ IntSet.toList slotsReloadedByTargets
+
+ cleanBackward liveSlotsOnEntry noReloads' (li : acc) instrs
+
+#if __GLASGOW_HASKELL__ <= 810
+ -- some other instruction
+ | otherwise
+ = cleanBackward liveSlotsOnEntry noReloads (li : acc) instrs
+#endif
+
+
+-- | Combine the associations from all the inward control flow edges.
+--
+collateJoinPoints :: CleanM ()
+collateJoinPoints
+ = modify $ \s -> s
+ { sJumpValid = mapUFM intersects (sJumpValidAcc s)
+ , sJumpValidAcc = emptyUFM }
+
+intersects :: [Assoc Store] -> Assoc Store
+intersects [] = emptyAssoc
+intersects assocs = foldl1' intersectAssoc assocs
+
+
+-- | See if we have a reg with the same value as this slot in the association table.
+findRegOfSlot :: Assoc Store -> Int -> Maybe Reg
+findRegOfSlot assoc slot
+ | close <- closeAssoc (SSlot slot) assoc
+ , Just (SReg reg) <- find isStoreReg $ nonDetEltsUniqSet close
+ -- See Note [Unique Determinism and code generation]
+ = Just reg
+
+ | otherwise
+ = Nothing
+
+
+-------------------------------------------------------------------------------
+-- | Cleaner monad.
+type CleanM
+ = State CleanS
+
+-- | Cleaner state.
+data CleanS
+ = CleanS
+ { -- | Regs which are valid at the start of each block.
+ sJumpValid :: UniqFM (Assoc Store)
+
+ -- | Collecting up what regs were valid across each jump.
+ -- in the next pass we can collate these and write the results
+ -- to sJumpValid.
+ , sJumpValidAcc :: UniqFM [Assoc Store]
+
+ -- | Map of (slot -> blocks which reload from this slot)
+ -- used to decide whether a slot spilled to will ever be
+ -- reloaded from on this path.
+ , sReloadedBy :: UniqFM [BlockId]
+
+ -- | Spills and reloads cleaned each pass (latest at front)
+ , sCleanedCount :: [(Int, Int)]
+
+ -- | Spills and reloads that have been cleaned in this pass so far.
+ , sCleanedSpillsAcc :: Int
+ , sCleanedReloadsAcc :: Int }
+
+
+-- | Construct the initial cleaner state.
+initCleanS :: CleanS
+initCleanS
+ = CleanS
+ { sJumpValid = emptyUFM
+ , sJumpValidAcc = emptyUFM
+
+ , sReloadedBy = emptyUFM
+
+ , sCleanedCount = []
+
+ , sCleanedSpillsAcc = 0
+ , sCleanedReloadsAcc = 0 }
+
+
+-- | Remember the associations before a jump.
+accJumpValid :: Assoc Store -> BlockId -> CleanM ()
+accJumpValid assocs target
+ = modify $ \s -> s {
+ sJumpValidAcc = addToUFM_C (++)
+ (sJumpValidAcc s)
+ target
+ [assocs] }
+
+
+accBlockReloadsSlot :: BlockId -> Slot -> CleanM ()
+accBlockReloadsSlot blockId slot
+ = modify $ \s -> s {
+ sReloadedBy = addToUFM_C (++)
+ (sReloadedBy s)
+ (SSlot slot)
+ [blockId] }
+
+
+-------------------------------------------------------------------------------
+-- A store location can be a stack slot or a register
+data Store
+ = SSlot Int
+ | SReg Reg
+
+
+-- | Check if this is a reg store.
+isStoreReg :: Store -> Bool
+isStoreReg ss
+ = case ss of
+ SSlot _ -> False
+ SReg _ -> True
+
+
+-- Spill cleaning is only done once all virtuals have been allocated to realRegs
+instance Uniquable Store where
+ getUnique (SReg r)
+ | RegReal (RealRegSingle i) <- r
+ = mkRegSingleUnique i
+
+ | RegReal (RealRegPair r1 r2) <- r
+ = mkRegPairUnique (r1 * 65535 + r2)
+
+ | otherwise
+ = error $ "RegSpillClean.getUnique: found virtual reg during spill clean,"
+ ++ "only real regs expected."
+
+ getUnique (SSlot i) = mkRegSubUnique i -- [SLPJ] I hope "SubUnique" is ok
+
+
+instance Outputable Store where
+ ppr (SSlot i) = text "slot" <> int i
+ ppr (SReg r) = ppr r
+
+
+-------------------------------------------------------------------------------
+-- Association graphs.
+-- In the spill cleaner, two store locations are associated if they are known
+-- to hold the same value.
+--
+type Assoc a = UniqFM (UniqSet a)
+
+-- | An empty association
+emptyAssoc :: Assoc a
+emptyAssoc = emptyUFM
+
+
+-- | Add an association between these two things.
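+-- The association is symmetric: for example, after
+-- addAssoc (SReg r) (SSlot 0) emptyAssoc, both
+-- elemAssoc (SReg r) (SSlot 0) and elemAssoc (SSlot 0) (SReg r) hold.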
+addAssoc :: Uniquable a
+ => a -> a -> Assoc a -> Assoc a
+
+addAssoc a b m
+ = let m1 = addToUFM_C unionUniqSets m a (unitUniqSet b)
+ m2 = addToUFM_C unionUniqSets m1 b (unitUniqSet a)
+ in m2
+
+
+-- | Delete all associations to a node.
+delAssoc :: (Uniquable a)
+ => a -> Assoc a -> Assoc a
+
+delAssoc a m
+ | Just aSet <- lookupUFM m a
+ , m1 <- delFromUFM m a
+ = nonDetFoldUniqSet (\x m -> delAssoc1 x a m) m1 aSet
+ -- It's OK to use nonDetFoldUniqSet here because deletion is commutative
+
+ | otherwise = m
+
+
+-- | Delete a single association edge (a -> b).
+delAssoc1 :: Uniquable a
+ => a -> a -> Assoc a -> Assoc a
+
+delAssoc1 a b m
+ | Just aSet <- lookupUFM m a
+ = addToUFM m a (delOneFromUniqSet aSet b)
+
+ | otherwise = m
+
+
+-- | Check if these two things are associated.
+elemAssoc :: (Uniquable a)
+ => a -> a -> Assoc a -> Bool
+
+elemAssoc a b m
+ = elementOfUniqSet b (closeAssoc a m)
+
+
+-- | Find the refl. trans. closure of the association from this point.
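+--
+-- For example, after associating SReg r1 with SSlot 0 and SSlot 0 with
+-- SReg r2, closeAssoc (SReg r1) contains all three stores: r1, slot 0
+-- and r2.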
+closeAssoc :: (Uniquable a)
+ => a -> Assoc a -> UniqSet a
+
+closeAssoc a assoc
+ = closeAssoc' assoc emptyUniqSet (unitUniqSet a)
+ where
+ closeAssoc' assoc visited toVisit
+ = case nonDetEltsUniqSet toVisit of
+ -- See Note [Unique Determinism and code generation]
+
+ -- nothing else to visit, we're done
+ [] -> visited
+
+ (x:_)
+ -- we've already seen this node
+ | elementOfUniqSet x visited
+ -> closeAssoc' assoc visited (delOneFromUniqSet toVisit x)
+
+ -- haven't seen this node before,
+ -- remember to visit all its neighbors
+ | otherwise
+ -> let neighbors
+ = case lookupUFM assoc x of
+ Nothing -> emptyUniqSet
+ Just set -> set
+
+ in closeAssoc' assoc
+ (addOneToUniqSet visited x)
+ (unionUniqSets toVisit neighbors)
+
+-- | Intersect two associations.
+intersectAssoc :: Assoc a -> Assoc a -> Assoc a
+intersectAssoc a b
+ = intersectUFM_C (intersectUniqSets) a b
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs b/compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs
new file mode 100644
index 0000000000..e3e456e98d
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs
@@ -0,0 +1,317 @@
+{-# LANGUAGE ScopedTypeVariables, GADTs, BangPatterns #-}
+module GHC.CmmToAsm.Reg.Graph.SpillCost (
+ SpillCostRecord,
+ plusSpillCostRecord,
+ pprSpillCostRecord,
+
+ SpillCostInfo,
+ zeroSpillCostInfo,
+ plusSpillCostInfo,
+
+ slurpSpillCostInfo,
+ chooseSpill,
+
+ lifeMapFromSpillCostInfo
+) where
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GraphBase
+
+import GHC.Cmm.Dataflow.Collections (mapLookup)
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm
+import UniqFM
+import UniqSet
+import Digraph (flattenSCCs)
+import Outputable
+import GHC.Platform
+import State
+import GHC.CmmToAsm.CFG
+
+import Data.List (nub, minimumBy)
+import Data.Maybe
+import Control.Monad (join)
+
+
+-- | Records the expected cost to spill some register.
+type SpillCostRecord
+ = ( VirtualReg -- register name
+ , Int -- number of writes to this reg
+ , Int -- number of reads from this reg
+ , Int) -- number of instrs this reg was live on entry to
+
+
+-- | Map of `SpillCostRecord`
+type SpillCostInfo
+ = UniqFM SpillCostRecord
+
+type SpillCostState = State (UniqFM SpillCostRecord) ()
+
+-- | An empty map of spill costs.
+zeroSpillCostInfo :: SpillCostInfo
+zeroSpillCostInfo = emptyUFM
+
+
+-- | Add two spill cost infos.
+plusSpillCostInfo :: SpillCostInfo -> SpillCostInfo -> SpillCostInfo
+plusSpillCostInfo sc1 sc2
+ = plusUFM_C plusSpillCostRecord sc1 sc2
+
+
+-- | Add two spill cost records.
+plusSpillCostRecord :: SpillCostRecord -> SpillCostRecord -> SpillCostRecord
+plusSpillCostRecord (r1, a1, b1, c1) (r2, a2, b2, c2)
+ | r1 == r2 = (r1, a1 + a2, b1 + b2, c1 + c2)
+ | otherwise = error "RegSpillCost.plusRegInt: regs don't match"
+
+
+-- | Slurp out information used for determining spill costs.
+--
+-- For each vreg, the number of times it was written to, read from,
+-- and the number of instructions it was live on entry to (lifetime)
+--
+slurpSpillCostInfo :: forall instr statics. (Outputable instr, Instruction instr)
+ => Platform
+ -> Maybe CFG
+ -> LiveCmmDecl statics instr
+ -> SpillCostInfo
+
+slurpSpillCostInfo platform cfg cmm
+ = execState (countCmm cmm) zeroSpillCostInfo
+ where
+ countCmm CmmData{} = return ()
+ countCmm (CmmProc info _ _ sccs)
+ = mapM_ (countBlock info freqMap)
+ $ flattenSCCs sccs
+ where
+ LiveInfo _ entries _ _ = info
+ freqMap = (fst . mkGlobalWeights (head entries)) <$> cfg
+
+ -- Lookup the regs that are live on entry to this block in
+ -- the info table from the CmmProc.
+ countBlock info freqMap (BasicBlock blockId instrs)
+ | LiveInfo _ _ blockLive _ <- info
+ , Just rsLiveEntry <- mapLookup blockId blockLive
+ , rsLiveEntry_virt <- takeVirtuals rsLiveEntry
+ = countLIs (ceiling $ blockFreq freqMap blockId) rsLiveEntry_virt instrs
+
+ | otherwise
+ = error "RegAlloc.SpillCost.slurpSpillCostInfo: bad block"
+
+
+ countLIs :: Int -> UniqSet VirtualReg -> [LiveInstr instr] -> SpillCostState
+ countLIs _ _ []
+ = return ()
+
+ -- Skip over comment and delta pseudo instrs.
+ countLIs scale rsLive (LiveInstr instr Nothing : lis)
+ | isMetaInstr instr
+ = countLIs scale rsLive lis
+
+ | otherwise
+ = pprPanic "RegSpillCost.slurpSpillCostInfo"
+ $ text "no liveness information on instruction " <> ppr instr
+
+ countLIs scale rsLiveEntry (LiveInstr instr (Just live) : lis)
+ = do
+ -- Increment the lifetime counts for regs live on entry to this instr.
+ mapM_ incLifetime $ nonDetEltsUniqSet rsLiveEntry
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+
+ -- Increment counts for what regs were read/written from.
+ let (RU read written) = regUsageOfInstr platform instr
+ mapM_ (incUses scale) $ catMaybes $ map takeVirtualReg $ nub read
+ mapM_ (incDefs scale) $ catMaybes $ map takeVirtualReg $ nub written
+
+ -- Compute liveness for entry to next instruction.
+ let liveDieRead_virt = takeVirtuals (liveDieRead live)
+ let liveDieWrite_virt = takeVirtuals (liveDieWrite live)
+ let liveBorn_virt = takeVirtuals (liveBorn live)
+
+ let rsLiveAcross
+ = rsLiveEntry `minusUniqSet` liveDieRead_virt
+
+ let rsLiveNext
+ = (rsLiveAcross `unionUniqSets` liveBorn_virt)
+ `minusUniqSet` liveDieWrite_virt
+
+ countLIs scale rsLiveNext lis
+
+ incDefs count reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, count, 0, 0)
+ incUses count reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, 0, count, 0)
+ incLifetime reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, 0, 0, 1)
+
+ blockFreq :: Maybe (LabelMap Double) -> Label -> Double
+ blockFreq freqs bid
+ | Just freq <- join (mapLookup bid <$> freqs)
+ = max 1.0 (10000 * freq)
+ | otherwise
+ = 1.0 -- Only if no cfg given
+
+-- | Take all the virtual registers from this set.
+takeVirtuals :: UniqSet Reg -> UniqSet VirtualReg
+takeVirtuals set = mkUniqSet
+ [ vr | RegVirtual vr <- nonDetEltsUniqSet set ]
+ -- See Note [Unique Determinism and code generation]
+
+
+-- | Choose a node to spill from this graph
+chooseSpill
+ :: SpillCostInfo
+ -> Graph VirtualReg RegClass RealReg
+ -> VirtualReg
+
+chooseSpill info graph
+ = let cost = spillCost_length info graph
+ node = minimumBy (\n1 n2 -> compare (cost $ nodeId n1) (cost $ nodeId n2))
+ $ nonDetEltsUFM $ graphMap graph
+ -- See Note [Unique Determinism and code generation]
+
+ in nodeId node
+
+
+-------------------------------------------------------------------------------
+-- | Chaitin's spill cost function is:
+--
+--      cost(v) =   sum over u in uses(v) of  loadCost  * freq(u)
+--                + sum over d in defs(v) of  storeCost * freq(d)
+--
+-- There are no loops in our code at the moment, so we can set the freq's to 1.
+--
+-- If we don't have live range splitting then Chaitin's function performs badly
+-- if we have lots of nested live ranges and very few registers.
+--
+-- v1 v2 v3
+-- def v1 .
+-- use v1 .
+-- def v2 . .
+-- def v3 . . .
+-- use v1 . . .
+-- use v3 . . .
+-- use v2 . .
+-- use v1 .
+--
+-- defs uses degree cost
+-- v1: 1 3 3 1.5
+-- v2: 1 2 3 1.0
+-- v3: 1 1 3 0.666
+--
+-- v3 has the lowest cost, but if we only have 2 hardregs and we insert
+-- spill code for v3 then this isn't going to improve the colorability of
+-- the graph.
+--
+-- When compiling SHA1, which has very long basic blocks and some vregs
+-- with very long live ranges, the allocator seems to try to spill from
+-- the inside out and eventually runs out of stack slots.
+--
+-- Without live range splitting, it's better to spill from the outside
+-- in, so set the cost of very long live ranges to zero.
+--
+
+-- spillCost_chaitin
+-- :: SpillCostInfo
+-- -> Graph VirtualReg RegClass RealReg
+-- -> VirtualReg
+-- -> Float
+
+-- spillCost_chaitin info graph reg
+-- -- Spilling a live range that only lives for 1 instruction
+-- -- isn't going to help us at all - and we definitely want to avoid
+-- -- trying to re-spill previously inserted spill code.
+-- | lifetime <= 1 = 1/0
+
+-- -- It's unlikely that we'll find a reg for a live range this long
+-- -- better to spill it straight up and not risk trying to keep it around
+-- -- and have to go through the build/color cycle again.
+
+-- -- To facilitate this we scale down the spill cost of long ranges.
+-- -- This makes sure long ranges are still spilled first.
+-- -- But this way spill cost remains relevant for long live
+-- -- ranges.
+-- | lifetime >= 128
+-- = (spillCost / conflicts) / 10.0
+
+
+-- -- Otherwise revert to chaitin's regular cost function.
+-- | otherwise = (spillCost / conflicts)
+-- where
+-- !spillCost = fromIntegral (uses + defs) :: Float
+-- conflicts = fromIntegral (nodeDegree classOfVirtualReg graph reg)
+-- (_, defs, uses, lifetime)
+-- = fromMaybe (reg, 0, 0, 0) $ lookupUFM info reg
+
+
+-- Just spill the longest live range.
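+--
+-- For example, a vreg with a recorded lifetime of 200 gets cost 1/200, so
+-- chooseSpill (which picks the minimum-cost node) will spill it before a
+-- vreg with a lifetime of 10 (cost 1/10).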
+spillCost_length
+ :: SpillCostInfo
+ -> Graph VirtualReg RegClass RealReg
+ -> VirtualReg
+ -> Float
+
+spillCost_length info _ reg
+ | lifetime <= 1 = 1/0
+ | otherwise = 1 / fromIntegral lifetime
+ where (_, _, _, lifetime)
+ = fromMaybe (reg, 0, 0, 0)
+ $ lookupUFM info reg
+
+
+-- | Extract a map of register lifetimes from a `SpillCostInfo`.
+lifeMapFromSpillCostInfo :: SpillCostInfo -> UniqFM (VirtualReg, Int)
+lifeMapFromSpillCostInfo info
+ = listToUFM
+ $ map (\(r, _, _, life) -> (r, (r, life)))
+ $ nonDetEltsUFM info
+ -- See Note [Unique Determinism and code generation]
+
+
+-- | Determine the degree of this node, that is, the number of its
+-- neighbors that have the same register class.
+nodeDegree
+ :: (VirtualReg -> RegClass)
+ -> Graph VirtualReg RegClass RealReg
+ -> VirtualReg
+ -> Int
+
+nodeDegree classOfVirtualReg graph reg
+ | Just node <- lookupUFM (graphMap graph) reg
+
+ , virtConflicts
+ <- length
+ $ filter (\r -> classOfVirtualReg r == classOfVirtualReg reg)
+ $ nonDetEltsUniqSet
+ -- See Note [Unique Determinism and code generation]
+ $ nodeConflicts node
+
+ = virtConflicts + sizeUniqSet (nodeExclusions node)
+
+ | otherwise
+ = 0
+
+
+-- | Show a spill cost record, including the degree from the graph
+-- and final calculated spill cost.
+pprSpillCostRecord
+ :: (VirtualReg -> RegClass)
+ -> (Reg -> SDoc)
+ -> Graph VirtualReg RegClass RealReg
+ -> SpillCostRecord
+ -> SDoc
+
+pprSpillCostRecord regClass pprReg graph (reg, uses, defs, life)
+ = hsep
+ [ pprReg (RegVirtual reg)
+ , ppr uses
+ , ppr defs
+ , ppr life
+ , ppr $ nodeDegree regClass graph reg
+ , text $ show $ (fromIntegral (uses + defs)
+ / fromIntegral (nodeDegree regClass graph reg) :: Float) ]
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs
new file mode 100644
index 0000000000..05d2e814af
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs
@@ -0,0 +1,346 @@
+{-# LANGUAGE BangPatterns, CPP #-}
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-- | Carries interesting info for debugging / profiling of the
+-- graph coloring register allocator.
+module GHC.CmmToAsm.Reg.Graph.Stats (
+ RegAllocStats (..),
+
+ pprStats,
+ pprStatsSpills,
+ pprStatsLifetimes,
+ pprStatsConflict,
+ pprStatsLifeConflict,
+
+ countSRMs, addSRM
+) where
+
+import GhcPrelude
+
+import qualified GraphColor as Color
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Reg.Graph.Spill
+import GHC.CmmToAsm.Reg.Graph.SpillCost
+import GHC.CmmToAsm.Reg.Graph.TrivColorable
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Reg.Target
+
+import Outputable
+import UniqFM
+import UniqSet
+import State
+
+-- | Holds interesting statistics from the register allocator.
+data RegAllocStats statics instr
+
+ -- Information about the initial conflict graph.
+ = RegAllocStatsStart
+ { -- | Initial code, with liveness.
+ raLiveCmm :: [LiveCmmDecl statics instr]
+
+ -- | The initial, uncolored graph.
+ , raGraph :: Color.Graph VirtualReg RegClass RealReg
+
+ -- | Information to help choose which regs to spill.
+ , raSpillCosts :: SpillCostInfo }
+
+
+ -- Information about an intermediate graph.
+ -- This is one that we couldn't color, so we had to insert spill code
+ -- into the instruction stream.
+ | RegAllocStatsSpill
+ { -- | Code we tried to allocate registers for.
+ raCode :: [LiveCmmDecl statics instr]
+
+ -- | Partially colored graph.
+ , raGraph :: Color.Graph VirtualReg RegClass RealReg
+
+ -- | The regs that were coalesced.
+ , raCoalesced :: UniqFM VirtualReg
+
+ -- | Spiller stats.
+ , raSpillStats :: SpillStats
+
+ -- | Number of instructions each reg lives for.
+ , raSpillCosts :: SpillCostInfo
+
+ -- | Code with spill instructions added.
+ , raSpilled :: [LiveCmmDecl statics instr] }
+
+
+ -- a successful coloring
+ | RegAllocStatsColored
+ { -- | Code we tried to allocate registers for.
+ raCode :: [LiveCmmDecl statics instr]
+
+ -- | Uncolored graph.
+ , raGraph :: Color.Graph VirtualReg RegClass RealReg
+
+ -- | Coalesced and colored graph.
+ , raGraphColored :: Color.Graph VirtualReg RegClass RealReg
+
+ -- | Regs that were coalesced.
+ , raCoalesced :: UniqFM VirtualReg
+
+ -- | Code with coalescings applied.
+ , raCodeCoalesced :: [LiveCmmDecl statics instr]
+
+ -- | Code with vregs replaced by hregs.
+ , raPatched :: [LiveCmmDecl statics instr]
+
+ -- | Code with unneeded spill\/reloads cleaned out.
+ , raSpillClean :: [LiveCmmDecl statics instr]
+
+ -- | Final code.
+ , raFinal :: [NatCmmDecl statics instr]
+
+ -- | Spill\/reload\/reg-reg moves present in this code.
+ , raSRMs :: (Int, Int, Int) }
+
+
+instance (Outputable statics, Outputable instr)
+ => Outputable (RegAllocStats statics instr) where
+
+ ppr (s@RegAllocStatsStart{}) = sdocWithPlatform $ \platform ->
+ text "# Start"
+ $$ text "# Native code with liveness information."
+ $$ ppr (raLiveCmm s)
+ $$ text ""
+ $$ text "# Initial register conflict graph."
+ $$ Color.dotGraph
+ (targetRegDotColor platform)
+ (trivColorable platform
+ (targetVirtualRegSqueeze platform)
+ (targetRealRegSqueeze platform))
+ (raGraph s)
+
+
+ ppr (s@RegAllocStatsSpill{}) =
+ text "# Spill"
+
+ $$ text "# Code with liveness information."
+ $$ ppr (raCode s)
+ $$ text ""
+
+ $$ (if (not $ isNullUFM $ raCoalesced s)
+ then text "# Registers coalesced."
+ $$ pprUFMWithKeys (raCoalesced s) (vcat . map ppr)
+ $$ text ""
+ else empty)
+
+ $$ text "# Spills inserted."
+ $$ ppr (raSpillStats s)
+ $$ text ""
+
+ $$ text "# Code with spills inserted."
+ $$ ppr (raSpilled s)
+
+
+ ppr (s@RegAllocStatsColored { raSRMs = (spills, reloads, moves) })
+ = sdocWithPlatform $ \platform ->
+ text "# Colored"
+
+ $$ text "# Code with liveness information."
+ $$ ppr (raCode s)
+ $$ text ""
+
+ $$ text "# Register conflict graph (colored)."
+ $$ Color.dotGraph
+ (targetRegDotColor platform)
+ (trivColorable platform
+ (targetVirtualRegSqueeze platform)
+ (targetRealRegSqueeze platform))
+ (raGraphColored s)
+ $$ text ""
+
+ $$ (if (not $ isNullUFM $ raCoalesced s)
+ then text "# Registers coalesced."
+ $$ pprUFMWithKeys (raCoalesced s) (vcat . map ppr)
+ $$ text ""
+ else empty)
+
+ $$ text "# Native code after coalescings applied."
+ $$ ppr (raCodeCoalesced s)
+ $$ text ""
+
+ $$ text "# Native code after register allocation."
+ $$ ppr (raPatched s)
+ $$ text ""
+
+ $$ text "# Clean out unneeded spill/reloads."
+ $$ ppr (raSpillClean s)
+ $$ text ""
+
+ $$ text "# Final code, after rewriting spill/rewrite pseudo instrs."
+ $$ ppr (raFinal s)
+ $$ text ""
+ $$ text "# Score:"
+ $$ (text "# spills inserted: " <> int spills)
+ $$ (text "# reloads inserted: " <> int reloads)
+ $$ (text "# reg-reg moves remaining: " <> int moves)
+ $$ text ""
+
+
+-- | Do all the different analyses on this list of RegAllocStats
+pprStats
+ :: [RegAllocStats statics instr]
+ -> Color.Graph VirtualReg RegClass RealReg
+ -> SDoc
+
+pprStats stats graph
+ = let outSpills = pprStatsSpills stats
+ outLife = pprStatsLifetimes stats
+ outConflict = pprStatsConflict stats
+ outScatter = pprStatsLifeConflict stats graph
+
+ in vcat [outSpills, outLife, outConflict, outScatter]
+
+
+-- | Dump a table of how many spill loads \/ stores were inserted in total.
+pprStatsSpills
+ :: [RegAllocStats statics instr] -> SDoc
+
+pprStatsSpills stats
+ = let
+ finals = [ s | s@RegAllocStatsColored{} <- stats]
+
+ -- sum up how many stores\/loads\/reg-reg-moves were left in the code
+ total = foldl' addSRM (0, 0, 0)
+ $ map raSRMs finals
+
+ in ( text "-- spills-added-total"
+ $$ text "-- (stores, loads, reg_reg_moves_remaining)"
+ $$ ppr total
+ $$ text "")
+
+
+-- | Dump a table of how long vregs tend to live for in the initial code.
+pprStatsLifetimes
+ :: [RegAllocStats statics instr] -> SDoc
+
+pprStatsLifetimes stats
+ = let info = foldl' plusSpillCostInfo zeroSpillCostInfo
+ [ raSpillCosts s
+ | s@RegAllocStatsStart{} <- stats ]
+
+ lifeBins = binLifetimeCount $ lifeMapFromSpillCostInfo info
+
+ in ( text "-- vreg-population-lifetimes"
+ $$ text "-- (instruction_count, number_of_vregs_that_lived_that_long)"
+ $$ pprUFM lifeBins (vcat . map ppr)
+ $$ text "\n")
+
+
+binLifetimeCount :: UniqFM (VirtualReg, Int) -> UniqFM (Int, Int)
+binLifetimeCount fm
+ = let lifes = map (\l -> (l, (l, 1)))
+ $ map snd
+ $ nonDetEltsUFM fm
+ -- See Note [Unique Determinism and code generation]
+
+ in addListToUFM_C
+ (\(l1, c1) (_, c2) -> (l1, c1 + c2))
+ emptyUFM
+ lifes
+
+
+-- | Dump a table of how many conflicts vregs tend to have in the initial code.
+pprStatsConflict
+ :: [RegAllocStats statics instr] -> SDoc
+
+pprStatsConflict stats
+ = let confMap = foldl' (plusUFM_C (\(c1, n1) (_, n2) -> (c1, n1 + n2)))
+ emptyUFM
+ $ map Color.slurpNodeConflictCount
+ [ raGraph s | s@RegAllocStatsStart{} <- stats ]
+
+ in ( text "-- vreg-conflicts"
+ $$ text "-- (conflict_count, number_of_vregs_that_had_that_many_conflicts)"
+ $$ pprUFM confMap (vcat . map ppr)
+ $$ text "\n")
+
+
+-- | For every vreg, dump how many conflicts it has, and its lifetime.
+-- Good for making a scatter plot.
+pprStatsLifeConflict
+ :: [RegAllocStats statics instr]
+ -> Color.Graph VirtualReg RegClass RealReg -- ^ global register conflict graph
+ -> SDoc
+
+pprStatsLifeConflict stats graph
+ = let lifeMap = lifeMapFromSpillCostInfo
+ $ foldl' plusSpillCostInfo zeroSpillCostInfo
+ $ [ raSpillCosts s | s@RegAllocStatsStart{} <- stats ]
+
+ scatter = map (\r -> let lifetime = case lookupUFM lifeMap r of
+ Just (_, l) -> l
+ Nothing -> 0
+ Just node = Color.lookupNode graph r
+ in parens $ hcat $ punctuate (text ", ")
+ [ doubleQuotes $ ppr $ Color.nodeId node
+ , ppr $ sizeUniqSet (Color.nodeConflicts node)
+ , ppr $ lifetime ])
+ $ map Color.nodeId
+ $ nonDetEltsUFM
+ -- See Note [Unique Determinism and code generation]
+ $ Color.graphMap graph
+
+ in ( text "-- vreg-conflict-lifetime"
+ $$ text "-- (vreg, vreg_conflicts, vreg_lifetime)"
+ $$ (vcat scatter)
+ $$ text "\n")
+
+
+-- | Count spill/reload/reg-reg moves.
+-- Lets us see how well the register allocator has done.
+countSRMs
+ :: Instruction instr
+ => LiveCmmDecl statics instr -> (Int, Int, Int)
+
+countSRMs cmm
+ = execState (mapBlockTopM countSRM_block cmm) (0, 0, 0)
+
+
+countSRM_block
+ :: Instruction instr
+ => GenBasicBlock (LiveInstr instr)
+ -> State (Int, Int, Int) (GenBasicBlock (LiveInstr instr))
+
+countSRM_block (BasicBlock i instrs)
+ = do instrs' <- mapM countSRM_instr instrs
+ return $ BasicBlock i instrs'
+
+
+countSRM_instr
+ :: Instruction instr
+ => LiveInstr instr -> State (Int, Int, Int) (LiveInstr instr)
+
+countSRM_instr li
+ | LiveInstr SPILL{} _ <- li
+ = do modify $ \(s, r, m) -> (s + 1, r, m)
+ return li
+
+ | LiveInstr RELOAD{} _ <- li
+ = do modify $ \(s, r, m) -> (s, r + 1, m)
+ return li
+
+ | LiveInstr instr _ <- li
+ , Just _ <- takeRegRegMoveInstr instr
+ = do modify $ \(s, r, m) -> (s, r, m + 1)
+ return li
+
+ | otherwise
+ = return li
+
+
+-- sigh..
+addSRM :: (Int, Int, Int) -> (Int, Int, Int) -> (Int, Int, Int)
+addSRM (s1, r1, m1) (s2, r2, m2)
+ = let !s = s1 + s2
+ !r = r1 + r2
+ !m = m1 + m2
+ in (s, r, m)
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
new file mode 100644
index 0000000000..ec7c5ad13e
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
@@ -0,0 +1,274 @@
+{-# LANGUAGE CPP #-}
+
+module GHC.CmmToAsm.Reg.Graph.TrivColorable (
+ trivColorable,
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GraphBase
+
+import UniqSet
+import GHC.Platform
+import Panic
+
+-- trivColorable ---------------------------------------------------------------
+
+-- trivColorable function for the graph coloring allocator
+--
+-- This gets hammered by scanGraph during register allocation,
+-- so needs to be fairly efficient.
+--
+-- NOTE: This only works for architectures with just RcInteger and RcDouble
+-- (which are disjoint) ie. x86, x86_64 and ppc
+--
+-- The number of allocatable regs is hard coded in here so we can do
+-- a fast comparison in trivColorable.
+--
+-- It's ok if these numbers are _less_ than the actual number of free
+-- regs, but they can't be more or the register conflict
+-- graph won't color.
+--
+-- If the graph doesn't color then the allocator will panic, but it won't
+-- generate bad object code or anything nasty like that.
+--
+-- There is an allocatableRegsInClass :: RegClass -> Int, but doing
+-- the unboxing is too slow for us here.
+-- TODO: Is that still true? Could we use allocatableRegsInClass
+-- without losing performance now?
+--
+-- Look at includes/stg/MachRegs.h to get the numbers.
+--
+
+
+-- Disjoint registers ----------------------------------------------------------
+--
+-- The definition has been unfolded into individual cases for speed.
+-- Each architecture has a different register setup, so we use a
+-- different regSqueeze function for each.
+--
+accSqueeze
+ :: Int
+ -> Int
+ -> (reg -> Int)
+ -> UniqSet reg
+ -> Int
+
+accSqueeze count maxCount squeeze us = acc count (nonDetEltsUniqSet us)
+ -- See Note [Unique Determinism and code generation]
+ where acc count [] = count
+ acc count _ | count >= maxCount = count
+ acc count (r:rs) = acc (count + squeeze r) rs
+
+{- Note [accSqueeze]
+~~~~~~~~~~~~~~~~~~~~
+BL 2007/09
+Doing a nice fold over the UniqSet makes trivColorable use
+32% of total compile time and 42% of total alloc when compiling SHA1.hs from darcs.
+Therefore the UniqFM is made non-abstract and we use a custom fold.
+
+MS 2010/04
+When converting UniqFM to use Data.IntMap, the fold cannot use UniqFM internal
+representation any more. But it is imperative that accSqueeze stops
+the folding if the count gets greater than or equal to maxCount. We thus convert
+the UniqFM to a (lazy) list, do the fold and stop early if necessary, which was
+the most efficient variant tried. Benchmark compiling 10-times SHA1.hs follows.
+(original = previous implementation, folding = fold of the whole UFM,
+ lazyFold = the current implementation,
+ hackFold = using internal representation of Data.IntMap)
+
+ original folding hackFold lazyFold
+ -O -fasm (used everywhere) 31.509s 30.387s 30.791s 30.603s
+ 100.00% 96.44% 97.72% 97.12%
+ -fregs-graph 67.938s 74.875s 62.673s 64.679s
+ 100.00% 110.21% 92.25% 95.20%
+ -fregs-iterative 89.761s 143.913s 81.075s 86.912s
+ 100.00% 160.33% 90.32% 96.83%
+ -fnew-codegen 38.225s 37.142s 37.551s 37.119s
+ 100.00% 97.17% 98.24% 97.11%
+ -fnew-codegen -fregs-graph 91.786s 91.51s 87.368s 86.88s
+ 100.00% 99.70% 95.19% 94.65%
+ -fnew-codegen -fregs-iterative 206.72s 343.632s 194.694s 208.677s
+ 100.00% 166.23% 94.18% 100.95%
+-}
+
+trivColorable
+ :: Platform
+ -> (RegClass -> VirtualReg -> Int)
+ -> (RegClass -> RealReg -> Int)
+ -> Triv VirtualReg RegClass RealReg
+
+trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts exclusions
+ | let cALLOCATABLE_REGS_INTEGER
+ = (case platformArch platform of
+ ArchX86 -> 3
+ ArchX86_64 -> 5
+ ArchPPC -> 16
+ ArchSPARC -> 14
+ ArchSPARC64 -> panic "trivColorable ArchSPARC64"
+ ArchPPC_64 _ -> 15
+ ArchARM _ _ _ -> panic "trivColorable ArchARM"
+ ArchARM64 -> panic "trivColorable ArchARM64"
+ ArchAlpha -> panic "trivColorable ArchAlpha"
+ ArchMipseb -> panic "trivColorable ArchMipseb"
+ ArchMipsel -> panic "trivColorable ArchMipsel"
+ ArchS390X -> panic "trivColorable ArchS390X"
+ ArchJavaScript-> panic "trivColorable ArchJavaScript"
+ ArchUnknown -> panic "trivColorable ArchUnknown")
+ , count2 <- accSqueeze 0 cALLOCATABLE_REGS_INTEGER
+ (virtualRegSqueeze RcInteger)
+ conflicts
+
+ , count3 <- accSqueeze count2 cALLOCATABLE_REGS_INTEGER
+ (realRegSqueeze RcInteger)
+ exclusions
+
+ = count3 < cALLOCATABLE_REGS_INTEGER
+
+trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions
+ | let cALLOCATABLE_REGS_FLOAT
+ = (case platformArch platform of
+ -- On x86_64 and x86, RcFloat and RcDouble
+ -- use the same registers,
+ -- so we only use RcDouble to represent the
+ -- register allocation problem on those types.
+ ArchX86 -> 0
+ ArchX86_64 -> 0
+ ArchPPC -> 0
+ ArchSPARC -> 22
+ ArchSPARC64 -> panic "trivColorable ArchSPARC64"
+ ArchPPC_64 _ -> 0
+ ArchARM _ _ _ -> panic "trivColorable ArchARM"
+ ArchARM64 -> panic "trivColorable ArchARM64"
+ ArchAlpha -> panic "trivColorable ArchAlpha"
+ ArchMipseb -> panic "trivColorable ArchMipseb"
+ ArchMipsel -> panic "trivColorable ArchMipsel"
+ ArchS390X -> panic "trivColorable ArchS390X"
+ ArchJavaScript-> panic "trivColorable ArchJavaScript"
+ ArchUnknown -> panic "trivColorable ArchUnknown")
+ , count2 <- accSqueeze 0 cALLOCATABLE_REGS_FLOAT
+ (virtualRegSqueeze RcFloat)
+ conflicts
+
+ , count3 <- accSqueeze count2 cALLOCATABLE_REGS_FLOAT
+ (realRegSqueeze RcFloat)
+ exclusions
+
+ = count3 < cALLOCATABLE_REGS_FLOAT
+
+trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
+ | let cALLOCATABLE_REGS_DOUBLE
+ = (case platformArch platform of
+ ArchX86 -> 8
+ -- in x86 32bit mode sse2 there are only
+ -- 8 XMM registers xmm0 ... xmm7
+ ArchX86_64 -> 10
+ -- in x86_64 there are 16 XMM registers
+ -- xmm0 .. xmm15, here 10 is a
+                 -- "don't need to solve conflicts" count that
+ -- was chosen at some point in the past.
+ ArchPPC -> 26
+ ArchSPARC -> 11
+ ArchSPARC64 -> panic "trivColorable ArchSPARC64"
+ ArchPPC_64 _ -> 20
+ ArchARM _ _ _ -> panic "trivColorable ArchARM"
+ ArchARM64 -> panic "trivColorable ArchARM64"
+ ArchAlpha -> panic "trivColorable ArchAlpha"
+ ArchMipseb -> panic "trivColorable ArchMipseb"
+ ArchMipsel -> panic "trivColorable ArchMipsel"
+ ArchS390X -> panic "trivColorable ArchS390X"
+ ArchJavaScript-> panic "trivColorable ArchJavaScript"
+ ArchUnknown -> panic "trivColorable ArchUnknown")
+ , count2 <- accSqueeze 0 cALLOCATABLE_REGS_DOUBLE
+ (virtualRegSqueeze RcDouble)
+ conflicts
+
+ , count3 <- accSqueeze count2 cALLOCATABLE_REGS_DOUBLE
+ (realRegSqueeze RcDouble)
+ exclusions
+
+ = count3 < cALLOCATABLE_REGS_DOUBLE
+
+
+
+
+-- Specification Code ----------------------------------------------------------
+--
+-- The trivColorable function for each particular architecture should
+-- implement the following function, but faster.
+--
+
+{-
+trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool
+trivColorable classN conflicts exclusions
+ = let
+
+ acc :: Reg -> (Int, Int) -> (Int, Int)
+ acc r (cd, cf)
+ = case regClass r of
+ RcInteger -> (cd+1, cf)
+ RcFloat -> (cd, cf+1)
+ _ -> panic "Regs.trivColorable: reg class not handled"
+
+ tmp = nonDetFoldUFM acc (0, 0) conflicts
+ (countInt, countFloat) = nonDetFoldUFM acc tmp exclusions
+
+ squeese = worst countInt classN RcInteger
+ + worst countFloat classN RcFloat
+
+ in squeese < allocatableRegsInClass classN
+
+-- | Worst case displacement
+-- node N of classN has n neighbors of class C.
+--
+-- We currently only have RcInteger and RcDouble, which don't conflict at all.
+-- This is a bit boring compared to what's in RegArchX86.
+--
+worst :: Int -> RegClass -> RegClass -> Int
+worst n classN classC
+ = case classN of
+ RcInteger
+ -> case classC of
+ RcInteger -> min n (allocatableRegsInClass RcInteger)
+ RcFloat -> 0
+
+ RcDouble
+ -> case classC of
+ RcFloat -> min n (allocatableRegsInClass RcFloat)
+ RcInteger -> 0
+
+-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
+-- i.e., these are the regs for which we are prepared to allow the
+-- register allocator to attempt to map VRegs to.
+allocatableRegs :: [RegNo]
+allocatableRegs
+ = let isFree i = freeReg i
+ in filter isFree allMachRegNos
+
+
+-- | The number of regs in each class.
+-- We go via top level CAFs to ensure that we're not recomputing
+-- the length of these lists each time the fn is called.
+allocatableRegsInClass :: RegClass -> Int
+allocatableRegsInClass cls
+ = case cls of
+ RcInteger -> allocatableRegsInteger
+        RcFloat         -> allocatableRegsFloat
+
+allocatableRegsInteger :: Int
+allocatableRegsInteger
+ = length $ filter (\r -> regClass r == RcInteger)
+ $ map RealReg allocatableRegs
+
+allocatableRegsFloat :: Int
+allocatableRegsFloat
+        = length $ filter (\r -> regClass r == RcFloat)
+ $ map RealReg allocatableRegs
+-}
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/X86.hs b/compiler/GHC/CmmToAsm/Reg/Graph/X86.hs
new file mode 100644
index 0000000000..0d4c56ba21
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/X86.hs
@@ -0,0 +1,161 @@
+
+-- | A description of the register set of the X86.
+--
+-- This isn't used directly in GHC proper.
+--
+-- See GHC.CmmToAsm.Reg.Graph.Base for the reference specification.
+-- See GHC.CmmToAsm.Reg.Graph.TrivColorable for the trivColorable function
+-- actually used in GHC.
+--
+module GHC.CmmToAsm.Reg.Graph.X86 (
+ classOfReg,
+ regsOfClass,
+ regName,
+ regAlias,
+ worst,
+ squeese,
+) where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Graph.Base (Reg(..), RegSub(..), RegClass(..))
+import UniqSet
+
+import qualified Data.Array as A
+
+
+-- | Determine the class of a register
+classOfReg :: Reg -> RegClass
+classOfReg reg
+ = case reg of
+ Reg c _ -> c
+
+ RegSub SubL16 _ -> ClassG16
+ RegSub SubL8 _ -> ClassG8
+ RegSub SubL8H _ -> ClassG8
+
+
+-- | Determine all the regs that make up a certain class.
+regsOfClass :: RegClass -> UniqSet Reg
+regsOfClass c
+ = case c of
+ ClassG32
+ -> mkUniqSet [ Reg ClassG32 i
+ | i <- [0..7] ]
+
+ ClassG16
+ -> mkUniqSet [ RegSub SubL16 (Reg ClassG32 i)
+ | i <- [0..7] ]
+
+ ClassG8
+ -> unionUniqSets
+ (mkUniqSet [ RegSub SubL8 (Reg ClassG32 i) | i <- [0..3] ])
+ (mkUniqSet [ RegSub SubL8H (Reg ClassG32 i) | i <- [0..3] ])
+
+ ClassF64
+ -> mkUniqSet [ Reg ClassF64 i
+ | i <- [0..5] ]
+
+
+-- | Determine the common name of a reg
+-- returns Nothing if this reg is not part of the machine.
+regName :: Reg -> Maybe String
+regName reg
+ = case reg of
+ Reg ClassG32 i
+ | i <= 7 ->
+ let names = A.listArray (0,8)
+ [ "eax", "ebx", "ecx", "edx"
+ , "ebp", "esi", "edi", "esp" ]
+ in Just $ names A.! i
+
+ RegSub SubL16 (Reg ClassG32 i)
+ | i <= 7 ->
+ let names = A.listArray (0,8)
+ [ "ax", "bx", "cx", "dx"
+ , "bp", "si", "di", "sp"]
+ in Just $ names A.! i
+
+ RegSub SubL8 (Reg ClassG32 i)
+ | i <= 3 ->
+ let names = A.listArray (0,4) [ "al", "bl", "cl", "dl"]
+ in Just $ names A.! i
+
+ RegSub SubL8H (Reg ClassG32 i)
+ | i <= 3 ->
+ let names = A.listArray (0,4) [ "ah", "bh", "ch", "dh"]
+ in Just $ names A.! i
+
+ _ -> Nothing
+
+
+-- | Which regs alias what other regs.
+regAlias :: Reg -> UniqSet Reg
+regAlias reg
+ = case reg of
+
+ -- 32 bit regs alias all of the subregs
+ Reg ClassG32 i
+
+          -- for eax, ebx, ecx, edx
+ | i <= 3
+ -> mkUniqSet
+ $ [ Reg ClassG32 i, RegSub SubL16 reg
+ , RegSub SubL8 reg, RegSub SubL8H reg ]
+
+ -- for esi, edi, esp, ebp
+ | 4 <= i && i <= 7
+ -> mkUniqSet
+ $ [ Reg ClassG32 i, RegSub SubL16 reg ]
+
+ -- 16 bit subregs alias the whole reg
+ RegSub SubL16 r@(Reg ClassG32 _)
+ -> regAlias r
+
+ -- 8 bit subregs alias the 32 and 16, but not the other 8 bit subreg
+ RegSub SubL8 r@(Reg ClassG32 _)
+ -> mkUniqSet $ [ r, RegSub SubL16 r, RegSub SubL8 r ]
+
+ RegSub SubL8H r@(Reg ClassG32 _)
+ -> mkUniqSet $ [ r, RegSub SubL16 r, RegSub SubL8H r ]
+
+ -- fp
+ Reg ClassF64 _
+ -> unitUniqSet reg
+
+ _ -> error "regAlias: invalid register"
+
+
+-- | Optimised versions of GHC.CmmToAsm.Reg.Graph.Base.{worst, squeese} specific to x86
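+--
+-- As an illustration (not from the original source): a ClassG8 node with
+-- three neighbours of class ClassG32 can be denied at most
+-- min (3*2) 8 = 6 of its eight byte registers, because each 32-bit
+-- register aliases two 8-bit registers; this is the ClassG8/ClassG32 case
+-- in 'worst' below.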
+worst :: Int -> RegClass -> RegClass -> Int
+worst n classN classC
+ = case classN of
+ ClassG32
+ -> case classC of
+ ClassG32 -> min n 8
+ ClassG16 -> min n 8
+ ClassG8 -> min n 4
+ ClassF64 -> 0
+
+ ClassG16
+ -> case classC of
+ ClassG32 -> min n 8
+ ClassG16 -> min n 8
+ ClassG8 -> min n 4
+ ClassF64 -> 0
+
+ ClassG8
+ -> case classC of
+ ClassG32 -> min (n*2) 8
+ ClassG16 -> min (n*2) 8
+ ClassG8 -> min n 8
+ ClassF64 -> 0
+
+ ClassF64
+ -> case classC of
+ ClassF64 -> min n 6
+ _ -> 0
+
+squeese :: RegClass -> [(Int, RegClass)] -> Int
+squeese classN countCs
+ = sum (map (\(i, classC) -> worst i classN classC) countCs)
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear.hs b/compiler/GHC/CmmToAsm/Reg/Linear.hs
new file mode 100644
index 0000000000..9b263889d8
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear.hs
@@ -0,0 +1,920 @@
+{-# LANGUAGE BangPatterns, CPP, ScopedTypeVariables #-}
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-----------------------------------------------------------------------------
+--
+-- The register allocator
+--
+-- (c) The University of Glasgow 2004
+--
+-----------------------------------------------------------------------------
+
+{-
+The algorithm is roughly:
+
+ 1) Compute strongly connected components of the basic block list.
+
+ 2) Compute liveness (mapping from pseudo register to
+ point(s) of death?).
+
+ 3) Walk instructions in each basic block. We keep track of
+ (a) Free real registers (a bitmap?)
+ (b) Current assignment of temporaries to machine registers and/or
+ spill slots (call this the "assignment").
+ (c) Partial mapping from basic block ids to a virt-to-loc mapping.
+ When we first encounter a branch to a basic block,
+ we fill in its entry in this table with the current mapping.
+
+ For each instruction:
+ (a) For each temporary *read* by the instruction:
+ If the temporary does not have a real register allocation:
+ - Allocate a real register from the free list. If
+ the list is empty:
+ - Find a temporary to spill. Pick one that is
+ not used in this instruction (ToDo: not
+ used for a while...)
+ - generate a spill instruction
+ - If the temporary was previously spilled,
+ generate an instruction to read the temp from its spill loc.
+ (optimisation: if we can see that a real register is going to
+ be used soon, then don't use it for allocation).
+
+ (b) For each real register clobbered by this instruction:
+ If a temporary resides in it,
+ If the temporary is live after this instruction,
+ Move the temporary to another (non-clobbered & free) reg,
+ or spill it to memory. Mark the temporary as residing
+ in both memory and a register if it was spilled (it might
+ need to be read by this instruction).
+
+ (ToDo: this is wrong for jump instructions?)
+
+ We do this after step (a), because if we start with
+ movq v1, %rsi
+ which is an instruction that clobbers %rsi, if v1 currently resides
+ in %rsi we want to get
+ movq %rsi, %freereg
+ movq %rsi, %rsi -- will disappear
+ instead of
+ movq %rsi, %freereg
+ movq %freereg, %rsi
+
+ (c) Update the current assignment
+
+ (d) If the instruction is a branch:
+ if the destination block already has a register assignment,
+ Generate a new block with fixup code and redirect the
+ jump to the new block.
+ else,
+ Update the block id->assignment mapping with the current
+ assignment.
+
+ (e) Delete all register assignments for temps which are read
+ (only) and die here. Update the free register list.
+
+ (f) Mark all registers clobbered by this instruction as not free,
+ and mark temporaries which have been spilled due to clobbering
+      as in memory (step (a) marks them as in both mem & reg).
+
+ (g) For each temporary *written* by this instruction:
+ Allocate a real register as for (b), spilling something
+ else if necessary.
+ - except when updating the assignment, drop any memory
+ locations that the temporary was previously in, since
+ they will be no longer valid after this instruction.
+
+ (h) Delete all register assignments for temps which are
+ written and die here (there should rarely be any). Update
+ the free register list.
+
+ (i) Rewrite the instruction with the new mapping.
+
+ (j) For each spilled reg known to be now dead, re-add its stack slot
+ to the free list.
+
+-}
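+
+-- A much-simplified sketch of the per-block walk described in step 3 above.
+-- This is illustrative only and not part of this module; the real allocator
+-- threads the corresponding state through the RegM monad and also collects
+-- fixup blocks and spill statistics along the way.
+{-
+import Data.List (mapAccumL)
+
+-- 's' stands for the state of steps 3(a) and 3(b): the free real registers
+-- together with the current assignment of temporaries to registers and
+-- spill slots. 'step' performs the per-instruction work of steps (a)..(j).
+allocBlock :: (s -> instr -> (s, instr)) -> s -> [instr] -> (s, [instr])
+allocBlock step = mapAccumL step
+-}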
+
+module GHC.CmmToAsm.Reg.Linear (
+ regAlloc,
+ module GHC.CmmToAsm.Reg.Linear.Base,
+ module GHC.CmmToAsm.Reg.Linear.Stats
+ ) where
+
+#include "HsVersions.h"
+
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Linear.State
+import GHC.CmmToAsm.Reg.Linear.Base
+import GHC.CmmToAsm.Reg.Linear.StackMap
+import GHC.CmmToAsm.Reg.Linear.FreeRegs
+import GHC.CmmToAsm.Reg.Linear.Stats
+import GHC.CmmToAsm.Reg.Linear.JoinToTargets
+import qualified GHC.CmmToAsm.Reg.Linear.PPC as PPC
+import qualified GHC.CmmToAsm.Reg.Linear.SPARC as SPARC
+import qualified GHC.CmmToAsm.Reg.Linear.X86 as X86
+import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64
+import GHC.CmmToAsm.Reg.Target
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm hiding (RegSet)
+
+import Digraph
+import GHC.Driver.Session
+import Unique
+import UniqSet
+import UniqFM
+import UniqSupply
+import Outputable
+import GHC.Platform
+
+import Data.Maybe
+import Data.List
+import Control.Monad
+
+-- -----------------------------------------------------------------------------
+-- Top level of the register allocator
+
+-- Allocate registers
+regAlloc
+ :: (Outputable instr, Instruction instr)
+ => DynFlags
+ -> LiveCmmDecl statics instr
+ -> UniqSM ( NatCmmDecl statics instr
+ , Maybe Int -- number of extra stack slots required,
+ -- beyond maxSpillSlots
+ , Maybe RegAllocStats
+ )
+
+regAlloc _ (CmmData sec d)
+ = return
+ ( CmmData sec d
+ , Nothing
+ , Nothing )
+
+regAlloc _ (CmmProc (LiveInfo info _ _ _) lbl live [])
+ = return ( CmmProc info lbl live (ListGraph [])
+ , Nothing
+ , Nothing )
+
+regAlloc dflags (CmmProc static lbl live sccs)
+ | LiveInfo info entry_ids@(first_id:_) block_live _ <- static
+ = do
+ -- do register allocation on each component.
+ (final_blocks, stats, stack_use)
+ <- linearRegAlloc dflags entry_ids block_live sccs
+
+ -- make sure the block that was first in the input list
+ -- stays at the front of the output
+ let ((first':_), rest')
+ = partition ((== first_id) . blockId) final_blocks
+
+ let max_spill_slots = maxSpillSlots dflags
+ extra_stack
+ | stack_use > max_spill_slots
+ = Just (stack_use - max_spill_slots)
+ | otherwise
+ = Nothing
+
+ return ( CmmProc info lbl live (ListGraph (first' : rest'))
+ , extra_stack
+ , Just stats)
+
+-- bogus. to make non-exhaustive match warning go away.
+regAlloc _ (CmmProc _ _ _ _)
+ = panic "RegAllocLinear.regAlloc: no match"
+
+
+-- -----------------------------------------------------------------------------
+-- Linear sweep to allocate registers
+
+
+-- | Do register allocation on some basic blocks.
+-- But be careful to allocate a block in an SCC only if it has
+-- an entry in the block map or it is the first block.
+--
+linearRegAlloc
+ :: (Outputable instr, Instruction instr)
+ => DynFlags
+ -> [BlockId] -- ^ entry points
+ -> BlockMap RegSet
+ -- ^ live regs on entry to each basic block
+ -> [SCC (LiveBasicBlock instr)]
+ -- ^ instructions annotated with "deaths"
+ -> UniqSM ([NatBasicBlock instr], RegAllocStats, Int)
+
+linearRegAlloc dflags entry_ids block_live sccs
+ = case platformArch platform of
+ ArchX86 -> go $ (frInitFreeRegs platform :: X86.FreeRegs)
+ ArchX86_64 -> go $ (frInitFreeRegs platform :: X86_64.FreeRegs)
+ ArchS390X -> panic "linearRegAlloc ArchS390X"
+ ArchSPARC -> go $ (frInitFreeRegs platform :: SPARC.FreeRegs)
+ ArchSPARC64 -> panic "linearRegAlloc ArchSPARC64"
+ ArchPPC -> go $ (frInitFreeRegs platform :: PPC.FreeRegs)
+ ArchARM _ _ _ -> panic "linearRegAlloc ArchARM"
+ ArchARM64 -> panic "linearRegAlloc ArchARM64"
+ ArchPPC_64 _ -> go $ (frInitFreeRegs platform :: PPC.FreeRegs)
+ ArchAlpha -> panic "linearRegAlloc ArchAlpha"
+ ArchMipseb -> panic "linearRegAlloc ArchMipseb"
+ ArchMipsel -> panic "linearRegAlloc ArchMipsel"
+ ArchJavaScript -> panic "linearRegAlloc ArchJavaScript"
+ ArchUnknown -> panic "linearRegAlloc ArchUnknown"
+ where
+ go f = linearRegAlloc' dflags f entry_ids block_live sccs
+ platform = targetPlatform dflags
+
+linearRegAlloc'
+ :: (FR freeRegs, Outputable instr, Instruction instr)
+ => DynFlags
+ -> freeRegs
+ -> [BlockId] -- ^ entry points
+ -> BlockMap RegSet -- ^ live regs on entry to each basic block
+ -> [SCC (LiveBasicBlock instr)] -- ^ instructions annotated with "deaths"
+ -> UniqSM ([NatBasicBlock instr], RegAllocStats, Int)
+
+linearRegAlloc' dflags initFreeRegs entry_ids block_live sccs
+ = do us <- getUniqueSupplyM
+ let (_, stack, stats, blocks) =
+ runR dflags mapEmpty initFreeRegs emptyRegMap (emptyStackMap dflags) us
+ $ linearRA_SCCs entry_ids block_live [] sccs
+ return (blocks, stats, getStackUse stack)
+
+
+linearRA_SCCs :: (FR freeRegs, Instruction instr, Outputable instr)
+ => [BlockId]
+ -> BlockMap RegSet
+ -> [NatBasicBlock instr]
+ -> [SCC (LiveBasicBlock instr)]
+ -> RegM freeRegs [NatBasicBlock instr]
+
+linearRA_SCCs _ _ blocksAcc []
+ = return $ reverse blocksAcc
+
+linearRA_SCCs entry_ids block_live blocksAcc (AcyclicSCC block : sccs)
+ = do blocks' <- processBlock block_live block
+ linearRA_SCCs entry_ids block_live
+ ((reverse blocks') ++ blocksAcc)
+ sccs
+
+linearRA_SCCs entry_ids block_live blocksAcc (CyclicSCC blocks : sccs)
+ = do
+ blockss' <- process entry_ids block_live blocks [] (return []) False
+ linearRA_SCCs entry_ids block_live
+ (reverse (concat blockss') ++ blocksAcc)
+ sccs
+
+{- from John Dias's patch 2008/10/16:
+ The linear-scan allocator sometimes allocates a block
+ before allocating one of its predecessors, which could lead to
+ inconsistent allocations. Make it so a block is only allocated
+ if a predecessor has set the "incoming" assignments for the block, or
+ if it's the procedure's entry block.
+
+ BL 2009/02: Careful. If the assignment for a block doesn't get set for
+ some reason then this function will loop. We should probably do some
+ more sanity checking to guard against this eventuality.
+-}
+
+process :: (FR freeRegs, Instruction instr, Outputable instr)
+ => [BlockId]
+ -> BlockMap RegSet
+ -> [GenBasicBlock (LiveInstr instr)]
+ -> [GenBasicBlock (LiveInstr instr)]
+ -> [[NatBasicBlock instr]]
+ -> Bool
+ -> RegM freeRegs [[NatBasicBlock instr]]
+
+process _ _ [] [] accum _
+ = return $ reverse accum
+
+process entry_ids block_live [] next_round accum madeProgress
+ | not madeProgress
+
+   {- BUGS: There are so many unreachable blocks in the code that the warnings are overwhelming.
+ pprTrace "RegAlloc.Linear.Main.process: no progress made, bailing out."
+ ( text "Unreachable blocks:"
+ $$ vcat (map ppr next_round)) -}
+ = return $ reverse accum
+
+ | otherwise
+ = process entry_ids block_live
+ next_round [] accum False
+
+process entry_ids block_live (b@(BasicBlock id _) : blocks)
+ next_round accum madeProgress
+ = do
+ block_assig <- getBlockAssigR
+
+ if isJust (mapLookup id block_assig)
+ || id `elem` entry_ids
+ then do
+ b' <- processBlock block_live b
+ process entry_ids block_live blocks
+ next_round (b' : accum) True
+
+ else process entry_ids block_live blocks
+ (b : next_round) accum madeProgress
+
+
+-- | Do register allocation on this basic block
+--
+processBlock
+ :: (FR freeRegs, Outputable instr, Instruction instr)
+ => BlockMap RegSet -- ^ live regs on entry to each basic block
+ -> LiveBasicBlock instr -- ^ block to do register allocation on
+ -> RegM freeRegs [NatBasicBlock instr] -- ^ block with registers allocated
+
+processBlock block_live (BasicBlock id instrs)
+ = do initBlock id block_live
+ (instrs', fixups)
+ <- linearRA block_live [] [] id instrs
+ return $ BasicBlock id instrs' : fixups
+
+
+-- | Load the freeregs and current reg assignment into the RegM state
+-- for the basic block with this BlockId.
+initBlock :: FR freeRegs
+ => BlockId -> BlockMap RegSet -> RegM freeRegs ()
+initBlock id block_live
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ block_assig <- getBlockAssigR
+ case mapLookup id block_assig of
+ -- no prior info about this block: we must consider
+ -- any fixed regs to be allocated, but we can ignore
+ -- virtual regs (presumably this is part of a loop,
+ -- and we'll iterate again). The assignment begins
+ -- empty.
+ Nothing
+ -> do -- pprTrace "initFreeRegs" (text $ show initFreeRegs) (return ())
+ case mapLookup id block_live of
+ Nothing ->
+ setFreeRegsR (frInitFreeRegs platform)
+ Just live ->
+ setFreeRegsR $ foldl' (flip $ frAllocateReg platform) (frInitFreeRegs platform)
+ [ r | RegReal r <- nonDetEltsUniqSet live ]
+ -- See Note [Unique Determinism and code generation]
+ setAssigR emptyRegMap
+
+ -- load info about register assignments leading into this block.
+ Just (freeregs, assig)
+ -> do setFreeRegsR freeregs
+ setAssigR assig
+
+
+-- | Do allocation for a sequence of instructions.
+linearRA
+ :: (FR freeRegs, Outputable instr, Instruction instr)
+ => BlockMap RegSet -- ^ map of what vregs are live on entry to each block.
+ -> [instr] -- ^ accumulator for instructions already processed.
+ -> [NatBasicBlock instr] -- ^ accumulator for blocks of fixup code.
+ -> BlockId -- ^ id of the current block, for debugging.
+ -> [LiveInstr instr] -- ^ liveness annotated instructions in this block.
+
+ -> RegM freeRegs
+ ( [instr] -- instructions after register allocation
+ , [NatBasicBlock instr]) -- fresh blocks of fixup code.
+
+
+linearRA _ accInstr accFixup _ []
+ = return
+ ( reverse accInstr -- instrs need to be returned in the correct order.
+ , accFixup) -- it doesn't matter what order the fixup blocks are returned in.
+
+
+linearRA block_live accInstr accFixups id (instr:instrs)
+ = do
+ (accInstr', new_fixups) <- raInsn block_live accInstr id instr
+
+ linearRA block_live accInstr' (new_fixups ++ accFixups) id instrs
+
+
+-- | Do allocation for a single instruction.
+raInsn
+ :: (FR freeRegs, Outputable instr, Instruction instr)
+        => BlockMap RegSet      -- ^ map of what vregs are live on entry to each block.
+ -> [instr] -- ^ accumulator for instructions already processed.
+ -> BlockId -- ^ the id of the current block, for debugging
+ -> LiveInstr instr -- ^ the instr to have its regs allocated, with liveness info.
+ -> RegM freeRegs
+ ( [instr] -- new instructions
+ , [NatBasicBlock instr]) -- extra fixup blocks
+
+raInsn _ new_instrs _ (LiveInstr ii Nothing)
+ | Just n <- takeDeltaInstr ii
+ = do setDeltaR n
+ return (new_instrs, [])
+
+raInsn _ new_instrs _ (LiveInstr ii@(Instr i) Nothing)
+ | isMetaInstr ii
+ = return (i : new_instrs, [])
+
+
+raInsn block_live new_instrs id (LiveInstr (Instr instr) (Just live))
+ = do
+ assig <- getAssigR
+
+ -- If we have a reg->reg move between virtual registers, where the
+ -- src register is not live after this instruction, and the dst
+ -- register does not already have an assignment,
+ -- and the source register is assigned to a register, not to a spill slot,
+ -- then we can eliminate the instruction.
+ -- (we can't eliminate it if the source register is on the stack, because
+ -- we do not want to use one spill slot for different virtual registers)
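+        --
+        -- For example (illustrative): for a move  v1 -> v2  where v1 dies at
+        -- this instruction, v2 has no assignment yet, and v1 currently lives
+        -- in a real register, we just remap v2 to v1's location in the
+        -- assignment and emit no code for the move at all.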
+ case takeRegRegMoveInstr instr of
+ Just (src,dst) | src `elementOfUniqSet` (liveDieRead live),
+ isVirtualReg dst,
+ not (dst `elemUFM` assig),
+ isRealReg src || isInReg src assig -> do
+ case src of
+ (RegReal rr) -> setAssigR (addToUFM assig dst (InReg rr))
+ -- if src is a fixed reg, then we just map dest to this
+ -- reg in the assignment. src must be an allocatable reg,
+ -- otherwise it wouldn't be in r_dying.
+ _virt -> case lookupUFM assig src of
+ Nothing -> panic "raInsn"
+ Just loc ->
+ setAssigR (addToUFM (delFromUFM assig src) dst loc)
+
+ -- we have eliminated this instruction
+ {-
+ freeregs <- getFreeRegsR
+ assig <- getAssigR
+ pprTrace "raInsn" (text "ELIMINATED: " <> docToSDoc (pprInstr instr)
+ $$ ppr r_dying <+> ppr w_dying $$ text (show freeregs) $$ ppr assig) $ do
+ -}
+ return (new_instrs, [])
+
+ _ -> genRaInsn block_live new_instrs id instr
+ (nonDetEltsUniqSet $ liveDieRead live)
+ (nonDetEltsUniqSet $ liveDieWrite live)
+ -- See Note [Unique Determinism and code generation]
+
+raInsn _ _ _ instr
+ = pprPanic "raInsn" (text "no match for:" <> ppr instr)
+
+-- ToDo: what can we do about
+--
+-- R1 = x
+-- jump I64[x] // [R1]
+--
+-- where x is mapped to the same reg as R1. We want to coalesce x and
+-- R1, but the register allocator doesn't know whether x will be
+-- assigned to again later, in which case x and R1 should be in
+-- different registers. Right now we assume the worst, and the
+-- assignment to R1 will clobber x, so we'll spill x into another reg,
+-- generating another reg->reg move.
+
+
+isInReg :: Reg -> RegMap Loc -> Bool
+isInReg src assig | Just (InReg _) <- lookupUFM assig src = True
+ | otherwise = False
+
+
+genRaInsn :: (FR freeRegs, Instruction instr, Outputable instr)
+ => BlockMap RegSet
+ -> [instr]
+ -> BlockId
+ -> instr
+ -> [Reg]
+ -> [Reg]
+ -> RegM freeRegs ([instr], [NatBasicBlock instr])
+
+genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ case regUsageOfInstr platform instr of { RU read written ->
+ do
+ let real_written = [ rr | (RegReal rr) <- written ]
+ let virt_written = [ vr | (RegVirtual vr) <- written ]
+
+ -- we don't need to do anything with real registers that are
+ -- only read by this instr. (the list is typically ~2 elements,
+ -- so using nub isn't a problem).
+ let virt_read = nub [ vr | (RegVirtual vr) <- read ]
+
+ -- debugging
+{- freeregs <- getFreeRegsR
+ assig <- getAssigR
+ pprDebugAndThen (defaultDynFlags Settings{ sTargetPlatform=platform } undefined) trace "genRaInsn"
+ (ppr instr
+ $$ text "r_dying = " <+> ppr r_dying
+ $$ text "w_dying = " <+> ppr w_dying
+ $$ text "virt_read = " <+> ppr virt_read
+ $$ text "virt_written = " <+> ppr virt_written
+ $$ text "freeregs = " <+> text (show freeregs)
+ $$ text "assig = " <+> ppr assig)
+ $ do
+-}
+
+ -- (a), (b) allocate real regs for all regs read by this instruction.
+ (r_spills, r_allocd) <-
+ allocateRegsAndSpill True{-reading-} virt_read [] [] virt_read
+
+ -- (c) save any temporaries which will be clobbered by this instruction
+ clobber_saves <- saveClobberedTemps real_written r_dying
+
+ -- (d) Update block map for new destinations
+ -- NB. do this before removing dead regs from the assignment, because
+ -- these dead regs might in fact be live in the jump targets (they're
+ -- only dead in the code that follows in the current basic block).
+ (fixup_blocks, adjusted_instr)
+ <- joinToTargets block_live block_id instr
+
+ -- Debugging - show places where the reg alloc inserted
+ -- assignment fixup blocks.
+ -- when (not $ null fixup_blocks) $
+ -- pprTrace "fixup_blocks" (ppr fixup_blocks) (return ())
+
+ -- (e) Delete all register assignments for temps which are read
+ -- (only) and die here. Update the free register list.
+ releaseRegs r_dying
+
+ -- (f) Mark regs which are clobbered as unallocatable
+ clobberRegs real_written
+
+ -- (g) Allocate registers for temporaries *written* (only)
+ (w_spills, w_allocd) <-
+ allocateRegsAndSpill False{-writing-} virt_written [] [] virt_written
+
+ -- (h) Release registers for temps which are written here and not
+ -- used again.
+ releaseRegs w_dying
+
+ let
+ -- (i) Patch the instruction
+ patch_map
+ = listToUFM
+ [ (t, RegReal r)
+ | (t, r) <- zip virt_read r_allocd
+ ++ zip virt_written w_allocd ]
+
+ patched_instr
+ = patchRegsOfInstr adjusted_instr patchLookup
+
+ patchLookup x
+ = case lookupUFM patch_map x of
+ Nothing -> x
+ Just y -> y
+
+
+ -- (j) free up stack slots for dead spilled regs
+ -- TODO (can't be bothered right now)
+
+ -- erase reg->reg moves where the source and destination are the same.
+ -- If the src temp didn't die in this instr but happened to be allocated
+ -- to the same real reg as the destination, then we can erase the move anyway.
+ let squashed_instr = case takeRegRegMoveInstr patched_instr of
+ Just (src, dst)
+ | src == dst -> []
+ _ -> [patched_instr]
+
+ let code = squashed_instr ++ w_spills ++ reverse r_spills
+ ++ clobber_saves ++ new_instrs
+
+-- pprTrace "patched-code" ((vcat $ map (docToSDoc . pprInstr) code)) $ do
+--             pprTrace "patched-fixup" ((ppr fixup_blocks)) $ do
+
+ return (code, fixup_blocks)
+
+ }
+
+-- -----------------------------------------------------------------------------
+-- releaseRegs
+
+releaseRegs :: FR freeRegs => [Reg] -> RegM freeRegs ()
+releaseRegs regs = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ assig <- getAssigR
+ free <- getFreeRegsR
+ let loop assig !free [] = do setAssigR assig; setFreeRegsR free; return ()
+ loop assig !free (RegReal rr : rs) = loop assig (frReleaseReg platform rr free) rs
+ loop assig !free (r:rs) =
+ case lookupUFM assig r of
+ Just (InBoth real _) -> loop (delFromUFM assig r)
+ (frReleaseReg platform real free) rs
+ Just (InReg real) -> loop (delFromUFM assig r)
+ (frReleaseReg platform real free) rs
+ _ -> loop (delFromUFM assig r) free rs
+ loop assig free regs
+
+
+-- -----------------------------------------------------------------------------
+-- Clobber real registers
+
+-- For each temp in a register that is going to be clobbered:
+-- - if the temp dies after this instruction, do nothing
+-- - otherwise, put it somewhere safe (another reg if possible,
+-- otherwise spill and record InBoth in the assignment).
+-- - for allocateRegs on the temps *read*,
+-- - clobbered regs are allocatable.
+--
+-- for allocateRegs on the temps *written*,
+-- - clobbered regs are not allocatable.
+--
+
+saveClobberedTemps
+ :: (Instruction instr, FR freeRegs)
+ => [RealReg] -- real registers clobbered by this instruction
+ -> [Reg] -- registers which are no longer live after this insn
+ -> RegM freeRegs [instr] -- return: instructions to spill any temps that will
+ -- be clobbered.
+
+saveClobberedTemps [] _
+ = return []
+
+saveClobberedTemps clobbered dying
+ = do
+ assig <- getAssigR
+ let to_spill
+ = [ (temp,reg)
+ | (temp, InReg reg) <- nonDetUFMToList assig
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ , any (realRegsAlias reg) clobbered
+ , temp `notElem` map getUnique dying ]
+
+ (instrs,assig') <- clobber assig [] to_spill
+ setAssigR assig'
+ return instrs
+
+ where
+ clobber assig instrs []
+ = return (instrs, assig)
+
+ clobber assig instrs ((temp, reg) : rest)
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ freeRegs <- getFreeRegsR
+ let regclass = targetClassOfRealReg platform reg
+ freeRegs_thisClass = frGetFreeRegs platform regclass freeRegs
+
+ case filter (`notElem` clobbered) freeRegs_thisClass of
+
+ -- (1) we have a free reg of the right class that isn't
+ -- clobbered by this instruction; use it to save the
+ -- clobbered value.
+ (my_reg : _) -> do
+ setFreeRegsR (frAllocateReg platform my_reg freeRegs)
+
+ let new_assign = addToUFM assig temp (InReg my_reg)
+ let instr = mkRegRegMoveInstr platform
+ (RegReal reg) (RegReal my_reg)
+
+ clobber new_assign (instr : instrs) rest
+
+ -- (2) no free registers: spill the value
+ [] -> do
+ (spill, slot) <- spillR (RegReal reg) temp
+
+ -- record why this reg was spilled for profiling
+ recordSpill (SpillClobber temp)
+
+ let new_assign = addToUFM assig temp (InBoth reg slot)
+
+ clobber new_assign (spill : instrs) rest
+
+
+
+-- | Mark all these real regs as allocated,
+-- and kick out their vreg assignments.
+--
+clobberRegs :: FR freeRegs => [RealReg] -> RegM freeRegs ()
+clobberRegs []
+ = return ()
+
+clobberRegs clobbered
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ freeregs <- getFreeRegsR
+ setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs clobbered
+
+ assig <- getAssigR
+ setAssigR $! clobber assig (nonDetUFMToList assig)
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+
+ where
+ -- if the temp was InReg and clobbered, then we will have
+ -- saved it in saveClobberedTemps above. So the only case
+ -- we have to worry about here is InBoth. Note that this
+ -- also catches temps which were loaded up during allocation
+ -- of read registers, not just those saved in saveClobberedTemps.
+
+ clobber assig []
+ = assig
+
+ clobber assig ((temp, InBoth reg slot) : rest)
+ | any (realRegsAlias reg) clobbered
+ = clobber (addToUFM assig temp (InMem slot)) rest
+
+ clobber assig (_:rest)
+ = clobber assig rest
+
+-- -----------------------------------------------------------------------------
+-- allocateRegsAndSpill
+
+-- Why are we performing a spill?
+data SpillLoc = ReadMem StackSlot -- reading from register only in memory
+ | WriteNew -- writing to a new variable
+ | WriteMem -- writing to register only in memory
+-- Note that ReadNew is not valid, since you don't want to be reading
+-- from an uninitialized register. We also don't need the location of
+-- the register in memory, since that will be invalidated by the write.
+-- Technically, we could coalesce WriteNew and WriteMem into a single
+-- entry as well. -- EZY
+
+-- This function does several things:
+-- For each temporary referred to by this instruction,
+-- we allocate a real register (spilling another temporary if necessary).
+-- We load the temporary up from memory if necessary.
+-- We also update the register assignment in the process, and
+-- the list of free registers and free stack slots.
+
+allocateRegsAndSpill
+ :: (FR freeRegs, Outputable instr, Instruction instr)
+ => Bool -- True <=> reading (load up spilled regs)
+ -> [VirtualReg] -- don't push these out
+ -> [instr] -- spill insns
+ -> [RealReg] -- real registers allocated (accum.)
+ -> [VirtualReg] -- temps to allocate
+ -> RegM freeRegs ( [instr] , [RealReg])
+
+allocateRegsAndSpill _ _ spills alloc []
+ = return (spills, reverse alloc)
+
+allocateRegsAndSpill reading keep spills alloc (r:rs)
+ = do assig <- getAssigR
+ let doSpill = allocRegsAndSpill_spill reading keep spills alloc r rs assig
+ case lookupUFM assig r of
+ -- case (1a): already in a register
+ Just (InReg my_reg) ->
+ allocateRegsAndSpill reading keep spills (my_reg:alloc) rs
+
+ -- case (1b): already in a register (and memory)
+ -- NB1. if we're writing this register, update its assignment to be
+ -- InReg, because the memory value is no longer valid.
+ -- NB2. This is why we must process written registers here, even if they
+ -- are also read by the same instruction.
+ Just (InBoth my_reg _)
+ -> do when (not reading) (setAssigR (addToUFM assig r (InReg my_reg)))
+ allocateRegsAndSpill reading keep spills (my_reg:alloc) rs
+
+ -- Not already in a register, so we need to find a free one...
+ Just (InMem slot) | reading -> doSpill (ReadMem slot)
+ | otherwise -> doSpill WriteMem
+ Nothing | reading ->
+ pprPanic "allocateRegsAndSpill: Cannot read from uninitialized register" (ppr r)
+ -- NOTE: if the input to the NCG contains some
+ -- unreachable blocks with junk code, this panic
+ -- might be triggered. Make sure you only feed
+ -- sensible code into the NCG. In GHC.Cmm.Pipeline we
+ -- call removeUnreachableBlocks at the end for this
+ -- reason.
+
+ | otherwise -> doSpill WriteNew
+
+
+-- reading is redundant with reason, but we keep it around because it's
+-- convenient and it maintains the recursive structure of the allocator. -- EZY
+allocRegsAndSpill_spill :: (FR freeRegs, Instruction instr, Outputable instr)
+ => Bool
+ -> [VirtualReg]
+ -> [instr]
+ -> [RealReg]
+ -> VirtualReg
+ -> [VirtualReg]
+ -> UniqFM Loc
+ -> SpillLoc
+ -> RegM freeRegs ([instr], [RealReg])
+allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ freeRegs <- getFreeRegsR
+ let freeRegs_thisClass = frGetFreeRegs platform (classOfVirtualReg r) freeRegs
+
+ case freeRegs_thisClass of
+
+ -- case (2): we have a free register
+ (my_reg : _) ->
+ do spills' <- loadTemp r spill_loc my_reg spills
+
+ setAssigR (addToUFM assig r $! newLocation spill_loc my_reg)
+ setFreeRegsR $ frAllocateReg platform my_reg freeRegs
+
+ allocateRegsAndSpill reading keep spills' (my_reg : alloc) rs
+
+
+ -- case (3): we need to push something out to free up a register
+ [] ->
+ do let inRegOrBoth (InReg _) = True
+ inRegOrBoth (InBoth _ _) = True
+ inRegOrBoth _ = False
+ let candidates' =
+ flip delListFromUFM keep $
+ filterUFM inRegOrBoth $
+ assig
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ let candidates = nonDetUFMToList candidates'
+
+ -- the vregs we could kick out that are already in a slot
+ let candidates_inBoth
+ = [ (temp, reg, mem)
+ | (temp, InBoth reg mem) <- candidates
+ , targetClassOfRealReg platform reg == classOfVirtualReg r ]
+
+ -- the vregs we could kick out that are only in a reg
+ -- this would require writing the reg to a new slot before using it.
+ let candidates_inReg
+ = [ (temp, reg)
+ | (temp, InReg reg) <- candidates
+ , targetClassOfRealReg platform reg == classOfVirtualReg r ]
+
+ let result
+
+ -- we have a temporary that is in both register and mem,
+ -- just free up its register for use.
+ | (temp, my_reg, slot) : _ <- candidates_inBoth
+ = do spills' <- loadTemp r spill_loc my_reg spills
+ let assig1 = addToUFM assig temp (InMem slot)
+ let assig2 = addToUFM assig1 r $! newLocation spill_loc my_reg
+
+ setAssigR assig2
+ allocateRegsAndSpill reading keep spills' (my_reg:alloc) rs
+
+ -- otherwise, we need to spill a temporary that currently
+ -- resides in a register.
+ | (temp_to_push_out, (my_reg :: RealReg)) : _
+ <- candidates_inReg
+ = do
+ (spill_insn, slot) <- spillR (RegReal my_reg) temp_to_push_out
+ let spill_store = (if reading then id else reverse)
+ [ -- COMMENT (fsLit "spill alloc")
+ spill_insn ]
+
+ -- record that this temp was spilled
+ recordSpill (SpillAlloc temp_to_push_out)
+
+ -- update the register assignment
+ let assig1 = addToUFM assig temp_to_push_out (InMem slot)
+ let assig2 = addToUFM assig1 r $! newLocation spill_loc my_reg
+ setAssigR assig2
+
+ -- if need be, load up a spilled temp into the reg we've just freed up.
+ spills' <- loadTemp r spill_loc my_reg spills
+
+ allocateRegsAndSpill reading keep
+ (spill_store ++ spills')
+ (my_reg:alloc) rs
+
+
+ -- there wasn't anything to spill, so we're screwed.
+ | otherwise
+ = pprPanic ("RegAllocLinear.allocRegsAndSpill: no spill candidates\n")
+ $ vcat
+ [ text "allocating vreg: " <> text (show r)
+ , text "assignment: " <> ppr assig
+ , text "freeRegs: " <> text (show freeRegs)
+ , text "initFreeRegs: " <> text (show (frInitFreeRegs platform `asTypeOf` freeRegs)) ]
+
+ result
+
+
+-- | Calculate a new location after a register has been loaded.
+newLocation :: SpillLoc -> RealReg -> Loc
+-- if the tmp was read from a slot, then now its in a reg as well
+newLocation (ReadMem slot) my_reg = InBoth my_reg slot
+-- writes will always result in only the register being available
+newLocation _ my_reg = InReg my_reg
+
+-- | Load up a spilled temporary if we need to (read from memory).
+loadTemp
+ :: (Instruction instr)
+ => VirtualReg -- the temp being loaded
+ -> SpillLoc -- the current location of this temp
+ -> RealReg -- the hreg to load the temp into
+ -> [instr]
+ -> RegM freeRegs [instr]
+
+loadTemp vreg (ReadMem slot) hreg spills
+ = do
+ insn <- loadR (RegReal hreg) slot
+ recordSpill (SpillLoad $ getUnique vreg)
+ return $ {- COMMENT (fsLit "spill load") : -} insn : spills
+
+loadTemp _ _ _ spills =
+ return spills
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs b/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs
new file mode 100644
index 0000000000..43dbab843b
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs
@@ -0,0 +1,141 @@
+
+-- | Put common type definitions here to break recursive module dependencies.
+
+module GHC.CmmToAsm.Reg.Linear.Base (
+ BlockAssignment,
+
+ Loc(..),
+ regsOfLoc,
+
+ -- for stats
+ SpillReason(..),
+ RegAllocStats(..),
+
+ -- the allocator monad
+ RA_State(..),
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Linear.StackMap
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.Platform.Reg
+
+import GHC.Driver.Session
+import Outputable
+import Unique
+import UniqFM
+import UniqSupply
+import GHC.Cmm.BlockId
+
+
+-- | Used to store the register assignment on entry to a basic block.
+-- We use this to handle join points, where multiple branch instructions
+-- target a particular label. We have to insert fixup code to make
+-- the register assignments from the different sources match up.
+--
+type BlockAssignment freeRegs
+ = BlockMap (freeRegs, RegMap Loc)
+
+
+-- | Where a vreg is currently stored
+-- A temporary can be marked as living in both a register and memory
+-- (InBoth), for example if it was recently loaded from a spill location.
+-- This makes it cheap to spill (no save instruction required), but we
+-- have to be careful to turn this into InReg if the value in the
+-- register is changed.
+
+-- This is also useful when a temporary is about to be clobbered. We
+-- save it in a spill location, but mark it as InBoth because the current
+-- instruction might still want to read it.
+--
+data Loc
+ -- | vreg is in a register
+ = InReg !RealReg
+
+ -- | vreg is held in a stack slot
+ | InMem {-# UNPACK #-} !StackSlot
+
+
+ -- | vreg is held in both a register and a stack slot
+ | InBoth !RealReg
+ {-# UNPACK #-} !StackSlot
+ deriving (Eq, Show, Ord)
+
+instance Outputable Loc where
+ ppr l = text (show l)
+
+
+-- | Get the reg numbers stored in this Loc.
+regsOfLoc :: Loc -> [RealReg]
+regsOfLoc (InReg r) = [r]
+regsOfLoc (InBoth r _) = [r]
+regsOfLoc (InMem _) = []
+
+
+-- | Reasons why instructions might be inserted by the spiller.
+-- Used when generating stats for -ddrop-asm-stats.
+--
+data SpillReason
+ -- | vreg was spilled to a slot so we could use its
+ -- current hreg for another vreg
+ = SpillAlloc !Unique
+
+ -- | vreg was moved because its hreg was clobbered
+ | SpillClobber !Unique
+
+ -- | vreg was loaded from a spill slot
+ | SpillLoad !Unique
+
+ -- | reg-reg move inserted during join to targets
+ | SpillJoinRR !Unique
+
+ -- | reg-mem move inserted during join to targets
+ | SpillJoinRM !Unique
+
+
+-- | Used to carry interesting stats out of the register allocator.
+data RegAllocStats
+ = RegAllocStats
+ { ra_spillInstrs :: UniqFM [Int]
+ , ra_fixupList :: [(BlockId,BlockId,BlockId)]
+ -- ^ (from,fixup,to) : We inserted fixup code between from and to
+ }
+
+
+-- | The register allocator state
+data RA_State freeRegs
+ = RA_State
+
+ {
+ -- | the current mapping from basic blocks to
+ -- the register assignments at the beginning of that block.
+ ra_blockassig :: BlockAssignment freeRegs
+
+ -- | free machine registers
+ , ra_freeregs :: !freeRegs
+
+ -- | assignment of temps to locations
+ , ra_assig :: RegMap Loc
+
+ -- | current stack delta
+ , ra_delta :: Int
+
+ -- | free stack slots for spilling
+ , ra_stack :: StackMap
+
+ -- | unique supply for generating names for join point fixup blocks.
+ , ra_us :: UniqSupply
+
+ -- | Record why things were spilled, for -ddrop-asm-stats.
+ -- Just keep a list here instead of a map of regs -> reasons.
+ -- We don't want to slow down the allocator if we're not going to emit the stats.
+ , ra_spills :: [SpillReason]
+ , ra_DynFlags :: DynFlags
+
+ -- | (from,fixup,to) : We inserted fixup code between from and to
+ , ra_fixups :: [(BlockId,BlockId,BlockId)] }
+
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
new file mode 100644
index 0000000000..0d72d8b6e9
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
@@ -0,0 +1,89 @@
+{-# LANGUAGE CPP #-}
+
+module GHC.CmmToAsm.Reg.Linear.FreeRegs (
+ FR(..),
+ maxSpillSlots
+)
+
+#include "HsVersions.h"
+
+where
+
+import GhcPrelude
+
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+
+import GHC.Driver.Session
+import Panic
+import GHC.Platform
+
+-- -----------------------------------------------------------------------------
+-- The free register set
+-- This needs to be *efficient*
+-- Here's an inefficient 'executable specification' of the FreeRegs data type:
+--
+-- type FreeRegs = [RegNo]
+-- noFreeRegs = 0
+-- releaseReg n f = if n `elem` f then f else (n : f)
+-- initFreeRegs = allocatableRegs
+-- getFreeRegs cls f = filter ( (==cls) . regClass . RealReg ) f
+-- allocateReg f r = filter (/= r) f
+
+import qualified GHC.CmmToAsm.Reg.Linear.PPC as PPC
+import qualified GHC.CmmToAsm.Reg.Linear.SPARC as SPARC
+import qualified GHC.CmmToAsm.Reg.Linear.X86 as X86
+import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64
+
+import qualified GHC.CmmToAsm.PPC.Instr as PPC.Instr
+import qualified GHC.CmmToAsm.SPARC.Instr as SPARC.Instr
+import qualified GHC.CmmToAsm.X86.Instr as X86.Instr
+
+class Show freeRegs => FR freeRegs where
+ frAllocateReg :: Platform -> RealReg -> freeRegs -> freeRegs
+ frGetFreeRegs :: Platform -> RegClass -> freeRegs -> [RealReg]
+ frInitFreeRegs :: Platform -> freeRegs
+ frReleaseReg :: Platform -> RealReg -> freeRegs -> freeRegs
+
+instance FR X86.FreeRegs where
+ frAllocateReg = \_ -> X86.allocateReg
+ frGetFreeRegs = X86.getFreeRegs
+ frInitFreeRegs = X86.initFreeRegs
+ frReleaseReg = \_ -> X86.releaseReg
+
+instance FR X86_64.FreeRegs where
+ frAllocateReg = \_ -> X86_64.allocateReg
+ frGetFreeRegs = X86_64.getFreeRegs
+ frInitFreeRegs = X86_64.initFreeRegs
+ frReleaseReg = \_ -> X86_64.releaseReg
+
+instance FR PPC.FreeRegs where
+ frAllocateReg = \_ -> PPC.allocateReg
+ frGetFreeRegs = \_ -> PPC.getFreeRegs
+ frInitFreeRegs = PPC.initFreeRegs
+ frReleaseReg = \_ -> PPC.releaseReg
+
+instance FR SPARC.FreeRegs where
+ frAllocateReg = SPARC.allocateReg
+ frGetFreeRegs = \_ -> SPARC.getFreeRegs
+ frInitFreeRegs = SPARC.initFreeRegs
+ frReleaseReg = SPARC.releaseReg
+
+maxSpillSlots :: DynFlags -> Int
+maxSpillSlots dflags
+ = case platformArch (targetPlatform dflags) of
+ ArchX86 -> X86.Instr.maxSpillSlots dflags
+ ArchX86_64 -> X86.Instr.maxSpillSlots dflags
+ ArchPPC -> PPC.Instr.maxSpillSlots dflags
+ ArchS390X -> panic "maxSpillSlots ArchS390X"
+ ArchSPARC -> SPARC.Instr.maxSpillSlots dflags
+ ArchSPARC64 -> panic "maxSpillSlots ArchSPARC64"
+ ArchARM _ _ _ -> panic "maxSpillSlots ArchARM"
+ ArchARM64 -> panic "maxSpillSlots ArchARM64"
+ ArchPPC_64 _ -> PPC.Instr.maxSpillSlots dflags
+ ArchAlpha -> panic "maxSpillSlots ArchAlpha"
+ ArchMipseb -> panic "maxSpillSlots ArchMipseb"
+ ArchMipsel -> panic "maxSpillSlots ArchMipsel"
+ ArchJavaScript-> panic "maxSpillSlots ArchJavaScript"
+ ArchUnknown -> panic "maxSpillSlots ArchUnknown"
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
new file mode 100644
index 0000000000..b4ad1b948c
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
@@ -0,0 +1,378 @@
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-- | Handles joining of a jump instruction to its targets.
+
+-- The first time we encounter a jump to a particular basic block, we
+-- record the assignment of temporaries. The next time we encounter a
+-- jump to the same block, we compare our current assignment to the
+-- stored one. They might be different if spilling has occurred in one
+-- branch; so some fixup code will be required to match up the assignments.
+--
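+-- For example (illustrative): suppose block L is first reached from a branch
+-- where vreg v sits in register r1, and a later branch reaches L with v
+-- spilled to stack slot s. The second branch cannot jump to L directly;
+-- instead it is redirected through a fresh fixup block that reloads v from
+-- s into r1 and then jumps on to L, so that L always sees v in r1.
+--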
+module GHC.CmmToAsm.Reg.Linear.JoinToTargets (joinToTargets) where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Linear.State
+import GHC.CmmToAsm.Reg.Linear.Base
+import GHC.CmmToAsm.Reg.Linear.FreeRegs
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import Digraph
+import GHC.Driver.Session
+import Outputable
+import Unique
+import UniqFM
+import UniqSet
+
+-- | For a jump instruction at the end of a block, generate fixup code so its
+-- vregs are in the correct regs for its destination.
+--
+joinToTargets
+ :: (FR freeRegs, Instruction instr, Outputable instr)
+ => BlockMap RegSet -- ^ maps the unique of the blockid to the set of vregs
+ -- that are known to be live on the entry to each block.
+
+ -> BlockId -- ^ id of the current block
+ -> instr -- ^ branch instr on the end of the source block.
+
+ -> RegM freeRegs ([NatBasicBlock instr] -- fresh blocks of fixup code.
+ , instr) -- the original branch
+ -- instruction, but maybe
+ -- patched to jump
+ -- to a fixup block first.
+
+joinToTargets block_live id instr
+
+ -- we only need to worry about jump instructions.
+ | not $ isJumpishInstr instr
+ = return ([], instr)
+
+ | otherwise
+ = joinToTargets' block_live [] id instr (jumpDestsOfInstr instr)
+
+-----
+joinToTargets'
+ :: (FR freeRegs, Instruction instr, Outputable instr)
+ => BlockMap RegSet -- ^ maps the unique of the blockid to the set of vregs
+ -- that are known to be live on the entry to each block.
+
+ -> [NatBasicBlock instr] -- ^ acc blocks of fixup code.
+
+ -> BlockId -- ^ id of the current block
+ -> instr -- ^ branch instr on the end of the source block.
+
+ -> [BlockId] -- ^ branch destinations still to consider.
+
+ -> RegM freeRegs ([NatBasicBlock instr], instr)
+
+-- no more targets to consider. all done.
+joinToTargets' _ new_blocks _ instr []
+ = return (new_blocks, instr)
+
+-- handle a branch target.
+joinToTargets' block_live new_blocks block_id instr (dest:dests)
+ = do
+ -- get the map of where the vregs are stored on entry to each basic block.
+ block_assig <- getBlockAssigR
+
+ -- get the assignment on entry to the branch instruction.
+ assig <- getAssigR
+
+ -- adjust the current assignment to remove any vregs that are not live
+ -- on entry to the destination block.
+ let Just live_set = mapLookup dest block_live
+ let still_live uniq _ = uniq `elemUniqSet_Directly` live_set
+ let adjusted_assig = filterUFM_Directly still_live assig
+
+ -- and free up those registers which are now free.
+ let to_free =
+ [ r | (reg, loc) <- nonDetUFMToList assig
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ , not (elemUniqSet_Directly reg live_set)
+ , r <- regsOfLoc loc ]
+
+ case mapLookup dest block_assig of
+ Nothing
+ -> joinToTargets_first
+ block_live new_blocks block_id instr dest dests
+ block_assig adjusted_assig to_free
+
+ Just (_, dest_assig)
+ -> joinToTargets_again
+ block_live new_blocks block_id instr dest dests
+ adjusted_assig dest_assig
+
+
+-- this is the first time we jumped to this block.
+joinToTargets_first :: (FR freeRegs, Instruction instr, Outputable instr)
+ => BlockMap RegSet
+ -> [NatBasicBlock instr]
+ -> BlockId
+ -> instr
+ -> BlockId
+ -> [BlockId]
+ -> BlockAssignment freeRegs
+ -> RegMap Loc
+ -> [RealReg]
+ -> RegM freeRegs ([NatBasicBlock instr], instr)
+joinToTargets_first block_live new_blocks block_id instr dest dests
+ block_assig src_assig
+ to_free
+
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ -- free up the regs that are not live on entry to this block.
+ freeregs <- getFreeRegsR
+ let freeregs' = foldl' (flip $ frReleaseReg platform) freeregs to_free
+
+ -- remember the current assignment on entry to this block.
+ setBlockAssigR (mapInsert dest (freeregs', src_assig) block_assig)
+
+ joinToTargets' block_live new_blocks block_id instr dests
+
+
+-- we've jumped to this block before
+joinToTargets_again :: (Instruction instr, FR freeRegs, Outputable instr)
+ => BlockMap RegSet
+ -> [NatBasicBlock instr]
+ -> BlockId
+ -> instr
+ -> BlockId
+ -> [BlockId]
+ -> UniqFM Loc
+ -> UniqFM Loc
+ -> RegM freeRegs ([NatBasicBlock instr], instr)
+joinToTargets_again
+ block_live new_blocks block_id instr dest dests
+ src_assig dest_assig
+
+ -- the assignments already match, no problem.
+ | nonDetUFMToList dest_assig == nonDetUFMToList src_assig
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ = joinToTargets' block_live new_blocks block_id instr dests
+
+ -- assignments don't match, need fixup code
+ | otherwise
+ = do
+
+ -- make a graph of what things need to be moved where.
+ let graph = makeRegMovementGraph src_assig dest_assig
+
+ -- look for cycles in the graph. This can happen if regs need to be swapped.
+ -- Note that we depend on the fact that this function does a
+ -- bottom up traversal of the tree-like portions of the graph.
+ --
+ -- eg, if we have
+ -- R1 -> R2 -> R3
+ --
+ -- ie move value in R1 to R2 and value in R2 to R3.
+ --
+ -- We need to do the R2 -> R3 move before R1 -> R2.
+ --
+ let sccs = stronglyConnCompFromEdgedVerticesOrdR graph
+
+ -- debugging
+ {-
+ pprTrace
+ ("joinToTargets: making fixup code")
+ (vcat [ text " in block: " <> ppr block_id
+ , text " jmp instruction: " <> ppr instr
+ , text " src assignment: " <> ppr src_assig
+ , text " dest assignment: " <> ppr dest_assig
+ , text " movement graph: " <> ppr graph
+ , text " sccs of graph: " <> ppr sccs
+ , text ""])
+ (return ())
+ -}
+ delta <- getDeltaR
+ fixUpInstrs_ <- mapM (handleComponent delta instr) sccs
+ let fixUpInstrs = concat fixUpInstrs_
+
+ -- make a new basic block containing the fixup code.
+        -- At the end of the current block we will jump to the fixup one,
+ -- then that will jump to our original destination.
+ fixup_block_id <- mkBlockId <$> getUniqueR
+ let block = BasicBlock fixup_block_id
+ $ fixUpInstrs ++ mkJumpInstr dest
+
+ -- if we didn't need any fixups, then don't include the block
+ case fixUpInstrs of
+ [] -> joinToTargets' block_live new_blocks block_id instr dests
+
+ -- patch the original branch instruction so it goes to our
+ -- fixup block instead.
+ _ -> let instr' = patchJumpInstr instr
+ (\bid -> if bid == dest
+ then fixup_block_id
+ else bid) -- no change!
+
+ in do
+ {- --debugging
+ pprTrace "FixUpEdge info:"
+ (
+ text "inBlock:" <> ppr block_id $$
+ text "instr:" <> ppr instr $$
+ text "instr':" <> ppr instr' $$
+ text "fixup_block_id':" <>
+ ppr fixup_block_id $$
+ text "dest:" <> ppr dest
+ ) (return ())
+ -}
+ recordFixupBlock block_id fixup_block_id dest
+ joinToTargets' block_live (block : new_blocks)
+ block_id instr' dests
+
+
+-- | Construct a graph of register\/spill movements.
+--
+-- Cyclic components seem to occur only very rarely.
+--
+-- We cut some corners by not handling memory-to-memory moves.
+-- This shouldn't happen because every temporary gets its own stack slot.
+--
+makeRegMovementGraph :: RegMap Loc -> RegMap Loc -> [Node Loc Unique]
+makeRegMovementGraph adjusted_assig dest_assig
+ = [ node | (vreg, src) <- nonDetUFMToList adjusted_assig
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ -- source reg might not be needed at the dest:
+ , Just loc <- [lookupUFM_Directly dest_assig vreg]
+ , node <- expandNode vreg src loc ]
+
+
+-- | Expand out the destination, so InBoth destinations turn into
+-- a combination of InReg and InMem.
+
+-- The InBoth handling is a little tricky here. If the destination is
+-- InBoth, then we must ensure that the value ends up in both locations.
+-- An InBoth destination must conflict with an InReg or InMem source, so
+-- we expand an InBoth destination as necessary.
+--
+-- An InBoth source is slightly different: we only care about the register
+-- that the source value is in, so that we can move it to the destinations.
+--
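+-- For example (illustrative): a node whose source is (InReg r1) and whose
+-- destination is (InBoth r2 s), with r1 distinct from r2, expands into a
+-- single node carrying the two destinations [InReg r2, InMem s].
+--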
+expandNode
+ :: a
+ -> Loc -- ^ source of move
+ -> Loc -- ^ destination of move
+ -> [Node Loc a ]
+
+expandNode vreg loc@(InReg src) (InBoth dst mem)
+ | src == dst = [DigraphNode vreg loc [InMem mem]]
+ | otherwise = [DigraphNode vreg loc [InReg dst, InMem mem]]
+
+expandNode vreg loc@(InMem src) (InBoth dst mem)
+ | src == mem = [DigraphNode vreg loc [InReg dst]]
+ | otherwise = [DigraphNode vreg loc [InReg dst, InMem mem]]
+
+expandNode _ (InBoth _ src) (InMem dst)
+ | src == dst = [] -- guaranteed to be true
+
+expandNode _ (InBoth src _) (InReg dst)
+ | src == dst = []
+
+expandNode vreg (InBoth src _) dst
+ = expandNode vreg (InReg src) dst
+
+expandNode vreg src dst
+ | src == dst = []
+ | otherwise = [DigraphNode vreg src [dst]]
+
+
+-- | Generate fixup code for a particular component in the move graph
+-- This component tells us what values need to be moved to what
+-- destinations. We have eliminated any possibility of single-node
+-- cycles in expandNode above.
+--
+handleComponent
+ :: Instruction instr
+ => Int -> instr -> SCC (Node Loc Unique)
+ -> RegM freeRegs [instr]
+
+-- If the graph is acyclic then we won't get the swapping problem below.
+-- In this case we can just do the moves directly, and avoid having to
+-- go via a spill slot.
+--
+handleComponent delta _ (AcyclicSCC (DigraphNode vreg src dsts))
+ = mapM (makeMove delta vreg src) dsts
+
+
+-- Handle some cyclic moves.
+-- This can happen if we have two regs that need to be swapped.
+-- eg:
+-- vreg source loc dest loc
+-- (vreg1, InReg r1, [InReg r2])
+-- (vreg2, InReg r2, [InReg r1])
+--
+--      To avoid needing a temp register, we just spill all the source regs, then
+--      reload them into their destination regs.
+--
+-- Note that we can not have cycles that involve memory locations as
+--      sources or as a single destination, because memory locations (stack slots)
+-- are allocated exclusively for a virtual register and therefore can not
+-- require a fixup.
+--
+handleComponent delta instr
+ (CyclicSCC ((DigraphNode vreg (InReg sreg) ((InReg dreg: _))) : rest))
+ -- dest list may have more than one element, if the reg is also InMem.
+ = do
+ -- spill the source into its slot
+ (instrSpill, slot)
+ <- spillR (RegReal sreg) vreg
+
+ -- reload into destination reg
+ instrLoad <- loadR (RegReal dreg) slot
+
+ remainingFixUps <- mapM (handleComponent delta instr)
+ (stronglyConnCompFromEdgedVerticesOrdR rest)
+
+ -- make sure to do all the reloads after all the spills,
+ -- so we don't end up clobbering the source values.
+ return ([instrSpill] ++ concat remainingFixUps ++ [instrLoad])
+
+handleComponent _ _ (CyclicSCC _)
+ = panic "Register Allocator: handleComponent cyclic"
+
+
+-- | Move a vreg between these two locations.
+--
+makeMove
+ :: Instruction instr
+ => Int -- ^ current C stack delta.
+ -> Unique -- ^ unique of the vreg that we're moving.
+ -> Loc -- ^ source location.
+ -> Loc -- ^ destination location.
+ -> RegM freeRegs instr -- ^ move instruction.
+
+makeMove delta vreg src dst
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ case (src, dst) of
+ (InReg s, InReg d) ->
+ do recordSpill (SpillJoinRR vreg)
+ return $ mkRegRegMoveInstr platform (RegReal s) (RegReal d)
+ (InMem s, InReg d) ->
+ do recordSpill (SpillJoinRM vreg)
+ return $ mkLoadInstr dflags (RegReal d) delta s
+ (InReg s, InMem d) ->
+ do recordSpill (SpillJoinRM vreg)
+ return $ mkSpillInstr dflags (RegReal s) delta d
+ _ ->
+ -- we don't handle memory to memory moves.
+ -- they shouldn't happen because we don't share
+ -- stack slots between vregs.
+ panic ("makeMove " ++ show vreg ++ " (" ++ show src ++ ") ("
+ ++ show dst ++ ")"
+ ++ " we don't handle mem->mem moves.")
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs b/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs
new file mode 100644
index 0000000000..ce0a187647
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs
@@ -0,0 +1,60 @@
+-- | Free regs map for PowerPC
+module GHC.CmmToAsm.Reg.Linear.PPC where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.PPC.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import Outputable
+import GHC.Platform
+
+import Data.Word
+import Data.Bits
+
+-- The PowerPC has 32 integer and 32 floating point registers.
+-- This is 32-bit PowerPC, so a Word64 would be inefficient - two Word32s are
+-- much better.
+-- Note that when getFreeRegs scans for free registers, it starts at register
+-- 31 and counts down. This is a hack for the PowerPC - the higher-numbered
+-- registers are callee-saves, while the lower regs are caller-saves, so it
+-- makes sense to start at the high end.
+-- Apart from that, the code does nothing PowerPC-specific, so feel free to
+-- reuse it for your favourite platform (if you have 64 registers but only
+-- 32-bit words).
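+--
+-- As a small illustration (writing the words in hex): releasing reg 3 sets
+-- bit 3 of the first word, while releasing reg 35 sets bit 3 of the second
+-- word, since regs numbered above 31 are tracked in the second bitmap:
+--
+--     releaseReg (RealRegSingle 3)  noFreeRegs  ==>  FreeRegs 0x00000008 0x0
+--     releaseReg (RealRegSingle 35) noFreeRegs  ==>  FreeRegs 0x0 0x00000008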
+
+data FreeRegs = FreeRegs !Word32 !Word32
+ deriving( Show ) -- The Show is used in an ASSERT
+
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0 0
+
+releaseReg :: RealReg -> FreeRegs -> FreeRegs
+releaseReg (RealRegSingle r) (FreeRegs g f)
+ | r > 31 = FreeRegs g (f .|. (1 `shiftL` (r - 32)))
+ | otherwise = FreeRegs (g .|. (1 `shiftL` r)) f
+
+releaseReg _ _
+ = panic "RegAlloc.Linear.PPC.releaseReg: bad reg"
+
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform)
+
+getFreeRegs :: RegClass -> FreeRegs -> [RealReg] -- lazily
+getFreeRegs cls (FreeRegs g f)
+ | RcDouble <- cls = go f (0x80000000) 63
+ | RcInteger <- cls = go g (0x80000000) 31
+ | otherwise = pprPanic "RegAllocLinear.getFreeRegs: Bad register class" (ppr cls)
+ where
+ go _ 0 _ = []
+ go x m i | x .&. m /= 0 = RealRegSingle i : (go x (m `shiftR` 1) $! i-1)
+ | otherwise = go x (m `shiftR` 1) $! i-1
+
+allocateReg :: RealReg -> FreeRegs -> FreeRegs
+allocateReg (RealRegSingle r) (FreeRegs g f)
+ | r > 31 = FreeRegs g (f .&. complement (1 `shiftL` (r - 32)))
+ | otherwise = FreeRegs (g .&. complement (1 `shiftL` r)) f
+
+allocateReg _ _
+ = panic "RegAlloc.Linear.PPC.allocateReg: bad reg"
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs b/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs
new file mode 100644
index 0000000000..7fa85f0913
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs
@@ -0,0 +1,188 @@
+{-# LANGUAGE CPP #-}
+
+-- | Free regs map for SPARC
+module GHC.CmmToAsm.Reg.Linear.SPARC where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GHC.Platform.Regs
+import Outputable
+import GHC.Platform
+
+import Data.Word
+import Data.Bits
+
+
+--------------------------------------------------------------------------------
+-- SPARC is like PPC, except for twinning of floating point regs.
+-- When we allocate a double reg we must take an even numbered
+-- float reg, as well as the one after it.
+
+
+-- Holds bitmaps showing what registers are currently allocated.
+-- The float and double reg bitmaps overlap, but we only alloc
+-- float regs into the float map, and double regs into the double map.
+--
+-- Free regs have a bit set in the corresponding bitmap.
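+--
+-- As an illustrative example: allocating a double held in the register pair
+-- (34, 35) clears bits 2 and 3 in the float bitmap and bit 2 in the double
+-- bitmap, because the double aliases the two float regs it is built from.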
+--
+data FreeRegs
+ = FreeRegs
+ !Word32 -- int reg bitmap regs 0..31
+ !Word32 -- float reg bitmap regs 32..63
+ !Word32 -- double reg bitmap regs 32..63
+
+instance Show FreeRegs where
+ show = showFreeRegs
+
+-- | A reg map where no regs are free to be allocated.
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0 0 0
+
+
+-- | The initial set of free regs.
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform
+ = foldl' (flip $ releaseReg platform) noFreeRegs allocatableRegs
+
+
+-- | Get all the free registers of this class.
+getFreeRegs :: RegClass -> FreeRegs -> [RealReg] -- lazily
+getFreeRegs cls (FreeRegs g f d)
+ | RcInteger <- cls = map RealRegSingle $ go 1 g 1 0
+ | RcFloat <- cls = map RealRegSingle $ go 1 f 1 32
+ | RcDouble <- cls = map (\i -> RealRegPair i (i+1)) $ go 2 d 1 32
+#if __GLASGOW_HASKELL__ <= 810
+ | otherwise = pprPanic "RegAllocLinear.getFreeRegs: Bad register class " (ppr cls)
+#endif
+ where
+ go _ _ 0 _
+ = []
+
+ go step bitmap mask ix
+ | bitmap .&. mask /= 0
+ = ix : (go step bitmap (mask `shiftL` step) $! ix + step)
+
+ | otherwise
+ = go step bitmap (mask `shiftL` step) $! ix + step
+
+
+-- | Grab a register.
+allocateReg :: Platform -> RealReg -> FreeRegs -> FreeRegs
+allocateReg platform
+ reg@(RealRegSingle r)
+ (FreeRegs g f d)
+
+ -- can't allocate pinned regs
+ | not $ freeReg platform r
+ = pprPanic "SPARC.FreeRegs.allocateReg: not allocating pinned reg" (ppr reg)
+
+ -- a general purpose reg
+ | r <= 31
+ = let mask = complement (bitMask r)
+ in FreeRegs
+ (g .&. mask)
+ f
+ d
+
+ -- a float reg
+ | r >= 32, r <= 63
+ = let mask = complement (bitMask (r - 32))
+
+ -- the mask of the double this FP reg aliases
+ maskLow = if r `mod` 2 == 0
+ then complement (bitMask (r - 32))
+ else complement (bitMask (r - 32 - 1))
+ in FreeRegs
+ g
+ (f .&. mask)
+ (d .&. maskLow)
+
+ | otherwise
+ = pprPanic "SPARC.FreeRegs.allocateReg: not allocating bad reg" (ppr reg)
+
+allocateReg _
+ reg@(RealRegPair r1 r2)
+ (FreeRegs g f d)
+
+ | r1 >= 32, r1 <= 63, r1 `mod` 2 == 0
+ , r2 >= 32, r2 <= 63
+ = let mask1 = complement (bitMask (r1 - 32))
+ mask2 = complement (bitMask (r2 - 32))
+ in
+ FreeRegs
+ g
+ ((f .&. mask1) .&. mask2)
+ (d .&. mask1)
+
+ | otherwise
+ = pprPanic "SPARC.FreeRegs.allocateReg: not allocating bad reg" (ppr reg)
+
+
+
+-- | Release a register from allocation.
+-- The register liveness information says that most regs die after a C call,
+-- but we still don't want to allocate to some of them.
+--
+releaseReg :: Platform -> RealReg -> FreeRegs -> FreeRegs
+releaseReg platform
+ reg@(RealRegSingle r)
+ regs@(FreeRegs g f d)
+
+ -- don't release pinned reg
+ | not $ freeReg platform r
+ = regs
+
+ -- a general purpose reg
+ | r <= 31
+ = let mask = bitMask r
+ in FreeRegs (g .|. mask) f d
+
+ -- a float reg
+ | r >= 32, r <= 63
+ = let mask = bitMask (r - 32)
+
+ -- the mask of the double this FP reg aliases
+ maskLow = if r `mod` 2 == 0
+ then bitMask (r - 32)
+ else bitMask (r - 32 - 1)
+ in FreeRegs
+ g
+ (f .|. mask)
+ (d .|. maskLow)
+
+ | otherwise
+ = pprPanic "SPARC.FreeRegs.releaseReg: not releasing bad reg" (ppr reg)
+
+releaseReg _
+ reg@(RealRegPair r1 r2)
+ (FreeRegs g f d)
+
+ | r1 >= 32, r1 <= 63, r1 `mod` 2 == 0
+ , r2 >= 32, r2 <= 63
+ = let mask1 = bitMask (r1 - 32)
+ mask2 = bitMask (r2 - 32)
+ in
+ FreeRegs
+ g
+ ((f .|. mask1) .|. mask2)
+ (d .|. mask1)
+
+ | otherwise
+ = pprPanic "SPARC.FreeRegs.releaseReg: not releasing bad reg" (ppr reg)
+
+
+
+bitMask :: Int -> Word32
+bitMask n = 1 `shiftL` n
+
+
+showFreeRegs :: FreeRegs -> String
+showFreeRegs regs
+ = "FreeRegs\n"
+ ++ " integer: " ++ (show $ getFreeRegs RcInteger regs) ++ "\n"
+ ++ " float: " ++ (show $ getFreeRegs RcFloat regs) ++ "\n"
+ ++ " double: " ++ (show $ getFreeRegs RcDouble regs) ++ "\n"
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs b/compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs
new file mode 100644
index 0000000000..630b101fc7
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs
@@ -0,0 +1,61 @@
+
+-- | The assignment of virtual registers to stack slots
+
+-- We have lots of stack slots. Memory-to-memory moves are a pain on most
+-- architectures. Therefore, we avoid having to generate memory-to-memory moves
+-- by simply giving every virtual register its own stack slot.
+
+-- The StackMap keeps track of virtual register to stack slot
+-- associations and of which stack slots are still free. Once a stack slot
+-- has been associated with a virtual register it is never "freed" or removed
+-- from the StackMap again; it remains associated until we are done with the
+-- current CmmProc.
+--
+module GHC.CmmToAsm.Reg.Linear.StackMap (
+ StackSlot,
+ StackMap(..),
+ emptyStackMap,
+ getStackSlotFor,
+ getStackUse
+)
+
+where
+
+import GhcPrelude
+
+import GHC.Driver.Session
+import UniqFM
+import Unique
+
+
+-- | Identifier for a stack slot.
+type StackSlot = Int
+
+data StackMap
+ = StackMap
+ { -- | The next free stack slot to allocate.
+ stackMapNextFreeSlot :: !Int
+
+ -- | Assignment of vregs to stack slots.
+ , stackMapAssignment :: UniqFM StackSlot }
+
+
+-- | An empty stack map, with all slots available.
+emptyStackMap :: DynFlags -> StackMap
+emptyStackMap _ = StackMap 0 emptyUFM
+
+
+-- | If this vreg unique already has a stack assignment then return the slot number,
+-- otherwise allocate a new slot, and update the map.
+--
+getStackSlotFor :: StackMap -> Unique -> (StackMap, Int)
+
+getStackSlotFor fs@(StackMap _ reserved) reg
+ | Just slot <- lookupUFM reserved reg = (fs, slot)
+
+getStackSlotFor (StackMap freeSlot reserved) reg =
+ (StackMap (freeSlot+1) (addToUFM reserved reg freeSlot), freeSlot)
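+
+-- For example (a sketch, with u some vreg unique and dflags in scope): the
+-- first lookup for u allocates slot 0, and a second lookup for the same
+-- unique returns slot 0 again without consuming a fresh slot:
+--
+--     let (m1, s1) = getStackSlotFor (emptyStackMap dflags) u   -- s1 == 0
+--         (m2, s2) = getStackSlotFor m1 u                       -- s2 == 0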
+
+-- | Return the number of stack slots that were allocated
+getStackUse :: StackMap -> Int
+getStackUse (StackMap freeSlot _) = freeSlot
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
new file mode 100644
index 0000000000..a167cc7e00
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
@@ -0,0 +1,184 @@
+{-# LANGUAGE CPP, PatternSynonyms, DeriveFunctor #-}
+
+#if !defined(GHC_LOADED_INTO_GHCI)
+{-# LANGUAGE UnboxedTuples #-}
+#endif
+
+-- | State monad for the linear register allocator.
+
+-- Here we keep all the state that the register allocator keeps track
+-- of as it walks the instructions in a basic block.
+
+module GHC.CmmToAsm.Reg.Linear.State (
+ RA_State(..),
+ RegM,
+ runR,
+
+ spillR,
+ loadR,
+
+ getFreeRegsR,
+ setFreeRegsR,
+
+ getAssigR,
+ setAssigR,
+
+ getBlockAssigR,
+ setBlockAssigR,
+
+ setDeltaR,
+ getDeltaR,
+
+ getUniqueR,
+
+ recordSpill,
+ recordFixupBlock
+)
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Linear.Stats
+import GHC.CmmToAsm.Reg.Linear.StackMap
+import GHC.CmmToAsm.Reg.Linear.Base
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+import GHC.Cmm.BlockId
+
+import GHC.Driver.Session
+import Unique
+import UniqSupply
+
+import Control.Monad (ap)
+
+-- Avoids using unboxed tuples when loading into GHCi
+#if !defined(GHC_LOADED_INTO_GHCI)
+
+type RA_Result freeRegs a = (# RA_State freeRegs, a #)
+
+pattern RA_Result :: a -> b -> (# a, b #)
+pattern RA_Result a b = (# a, b #)
+{-# COMPLETE RA_Result #-}
+#else
+
+data RA_Result freeRegs a = RA_Result {-# UNPACK #-} !(RA_State freeRegs) !a
+ deriving (Functor)
+
+#endif
+
+-- | The register allocator monad type.
+newtype RegM freeRegs a
+ = RegM { unReg :: RA_State freeRegs -> RA_Result freeRegs a }
+ deriving (Functor)
+
+instance Applicative (RegM freeRegs) where
+ pure a = RegM $ \s -> RA_Result s a
+ (<*>) = ap
+
+instance Monad (RegM freeRegs) where
+ m >>= k = RegM $ \s -> case unReg m s of { RA_Result s a -> unReg (k a) s }
+
+instance HasDynFlags (RegM a) where
+ getDynFlags = RegM $ \s -> RA_Result s (ra_DynFlags s)
+
+
+-- | Run a computation in the RegM register allocator monad.
+runR :: DynFlags
+ -> BlockAssignment freeRegs
+ -> freeRegs
+ -> RegMap Loc
+ -> StackMap
+ -> UniqSupply
+ -> RegM freeRegs a
+ -> (BlockAssignment freeRegs, StackMap, RegAllocStats, a)
+
+runR dflags block_assig freeregs assig stack us thing =
+ case unReg thing
+ (RA_State
+ { ra_blockassig = block_assig
+ , ra_freeregs = freeregs
+ , ra_assig = assig
+ , ra_delta = 0{-???-}
+ , ra_stack = stack
+ , ra_us = us
+ , ra_spills = []
+ , ra_DynFlags = dflags
+ , ra_fixups = [] })
+ of
+ RA_Result state returned_thing
+ -> (ra_blockassig state, ra_stack state, makeRAStats state, returned_thing)
+
+
+-- | Make register allocator stats from its final state.
+makeRAStats :: RA_State freeRegs -> RegAllocStats
+makeRAStats state
+ = RegAllocStats
+ { ra_spillInstrs = binSpillReasons (ra_spills state)
+ , ra_fixupList = ra_fixups state }
+
+
+spillR :: Instruction instr
+ => Reg -> Unique -> RegM freeRegs (instr, Int)
+
+spillR reg temp = RegM $ \ s@RA_State{ra_delta=delta, ra_stack=stack0} ->
+ let dflags = ra_DynFlags s
+ (stack1,slot) = getStackSlotFor stack0 temp
+ instr = mkSpillInstr dflags reg delta slot
+ in
+ RA_Result s{ra_stack=stack1} (instr,slot)
+
+
+loadR :: Instruction instr
+ => Reg -> Int -> RegM freeRegs instr
+
+loadR reg slot = RegM $ \ s@RA_State{ra_delta=delta} ->
+ let dflags = ra_DynFlags s
+ in RA_Result s (mkLoadInstr dflags reg delta slot)
+
+getFreeRegsR :: RegM freeRegs freeRegs
+getFreeRegsR = RegM $ \ s@RA_State{ra_freeregs = freeregs} ->
+ RA_Result s freeregs
+
+setFreeRegsR :: freeRegs -> RegM freeRegs ()
+setFreeRegsR regs = RegM $ \ s ->
+ RA_Result s{ra_freeregs = regs} ()
+
+getAssigR :: RegM freeRegs (RegMap Loc)
+getAssigR = RegM $ \ s@RA_State{ra_assig = assig} ->
+ RA_Result s assig
+
+setAssigR :: RegMap Loc -> RegM freeRegs ()
+setAssigR assig = RegM $ \ s ->
+ RA_Result s{ra_assig=assig} ()
+
+getBlockAssigR :: RegM freeRegs (BlockAssignment freeRegs)
+getBlockAssigR = RegM $ \ s@RA_State{ra_blockassig = assig} ->
+ RA_Result s assig
+
+setBlockAssigR :: BlockAssignment freeRegs -> RegM freeRegs ()
+setBlockAssigR assig = RegM $ \ s ->
+ RA_Result s{ra_blockassig = assig} ()
+
+setDeltaR :: Int -> RegM freeRegs ()
+setDeltaR n = RegM $ \ s ->
+ RA_Result s{ra_delta = n} ()
+
+getDeltaR :: RegM freeRegs Int
+getDeltaR = RegM $ \s -> RA_Result s (ra_delta s)
+
+getUniqueR :: RegM freeRegs Unique
+getUniqueR = RegM $ \s ->
+ case takeUniqFromSupply (ra_us s) of
+ (uniq, us) -> RA_Result s{ra_us = us} uniq
+
+
+-- | Record that a spill instruction was inserted, for profiling.
+recordSpill :: SpillReason -> RegM freeRegs ()
+recordSpill spill
+ = RegM $ \s -> RA_Result (s { ra_spills = spill : ra_spills s }) ()
+
+-- | Record a created fixup block
+recordFixupBlock :: BlockId -> BlockId -> BlockId -> RegM freeRegs ()
+recordFixupBlock from between to
+ = RegM $ \s -> RA_Result (s { ra_fixups = (from,between,to) : ra_fixups s }) ()
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs b/compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs
new file mode 100644
index 0000000000..1176b220a3
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs
@@ -0,0 +1,87 @@
+module GHC.CmmToAsm.Reg.Linear.Stats (
+ binSpillReasons,
+ countRegRegMovesNat,
+ pprStats
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Linear.Base
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+
+import UniqFM
+import Outputable
+
+import State
+
+-- | Build a map of how many times each reg was alloced, clobbered, loaded etc.
+binSpillReasons
+ :: [SpillReason] -> UniqFM [Int]
+
+binSpillReasons reasons
+ = addListToUFM_C
+ (zipWith (+))
+ emptyUFM
+ (map (\reason -> case reason of
+ SpillAlloc r -> (r, [1, 0, 0, 0, 0])
+ SpillClobber r -> (r, [0, 1, 0, 0, 0])
+ SpillLoad r -> (r, [0, 0, 1, 0, 0])
+ SpillJoinRR r -> (r, [0, 0, 0, 1, 0])
+ SpillJoinRM r -> (r, [0, 0, 0, 0, 1])) reasons)
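+
+-- For instance (illustrative): one SpillAlloc r and two SpillLoad r events
+-- bin to r -> [1, 0, 2, 0, 0], where the vector is
+-- (allocs, clobbers, loads, joinRR, joinRM).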
+
+
+-- | Count reg-reg moves remaining in this code.
+countRegRegMovesNat
+ :: Instruction instr
+ => NatCmmDecl statics instr -> Int
+
+countRegRegMovesNat cmm
+ = execState (mapGenBlockTopM countBlock cmm) 0
+ where
+ countBlock b@(BasicBlock _ instrs)
+ = do mapM_ countInstr instrs
+ return b
+
+ countInstr instr
+ | Just _ <- takeRegRegMoveInstr instr
+ = do modify (+ 1)
+ return instr
+
+ | otherwise
+ = return instr
+
+
+-- | Pretty print some RegAllocStats
+pprStats
+ :: Instruction instr
+ => [NatCmmDecl statics instr] -> [RegAllocStats] -> SDoc
+
+pprStats code statss
+ = let -- sum up all the instrs inserted by the spiller
+ spills = foldl' (plusUFM_C (zipWith (+)))
+ emptyUFM
+ $ map ra_spillInstrs statss
+
+ spillTotals = foldl' (zipWith (+))
+ [0, 0, 0, 0, 0]
+ $ nonDetEltsUFM spills
+ -- See Note [Unique Determinism and code generation]
+
+ -- count how many reg-reg-moves remain in the code
+ moves = sum $ map countRegRegMovesNat code
+
+ pprSpill (reg, spills)
+ = parens $ (hcat $ punctuate (text ", ") (doubleQuotes (ppr reg) : map ppr spills))
+
+ in ( text "-- spills-added-total"
+ $$ text "-- (allocs, clobbers, loads, joinRR, joinRM, reg_reg_moves_remaining)"
+ $$ (parens $ (hcat $ punctuate (text ", ") (map ppr spillTotals ++ [ppr moves])))
+ $$ text ""
+ $$ text "-- spills-added"
+ $$ text "-- (reg_name, allocs, clobbers, loads, joinRR, joinRM)"
+ $$ (pprUFMWithKeys spills (vcat . map pprSpill))
+ $$ text "")
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs b/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs
new file mode 100644
index 0000000000..ce103bd6b2
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs
@@ -0,0 +1,52 @@
+
+-- | Free regs map for i386
+module GHC.CmmToAsm.Reg.Linear.X86 where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import Panic
+import GHC.Platform
+
+import Data.Word
+import Data.Bits
+
+newtype FreeRegs = FreeRegs Word32
+ deriving Show
+
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0
+
+releaseReg :: RealReg -> FreeRegs -> FreeRegs
+releaseReg (RealRegSingle n) (FreeRegs f)
+ = FreeRegs (f .|. (1 `shiftL` n))
+
+releaseReg _ _
+ = panic "RegAlloc.Linear.X86.FreeRegs.releaseReg: no reg"
+
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform
+ = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform)
+
+getFreeRegs :: Platform -> RegClass -> FreeRegs -> [RealReg] -- lazily
+getFreeRegs platform cls (FreeRegs f) = go f 0
+
+ where go 0 _ = []
+ go n m
+ | n .&. 1 /= 0 && classOfRealReg platform (RealRegSingle m) == cls
+ = RealRegSingle m : (go (n `shiftR` 1) $! (m+1))
+
+ | otherwise
+ = go (n `shiftR` 1) $! (m+1)
+ -- ToDo: there's no point looking through all the integer registers
+ -- in order to find a floating-point one.
+
+allocateReg :: RealReg -> FreeRegs -> FreeRegs
+allocateReg (RealRegSingle r) (FreeRegs f)
+ = FreeRegs (f .&. complement (1 `shiftL` r))
+
+allocateReg _ _
+ = panic "RegAlloc.Linear.X86.FreeRegs.allocateReg: no reg"
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs b/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs
new file mode 100644
index 0000000000..322ddd6bdd
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs
@@ -0,0 +1,53 @@
+
+-- | Free regs map for x86_64
+module GHC.CmmToAsm.Reg.Linear.X86_64 where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import Panic
+import GHC.Platform
+
+import Data.Word
+import Data.Bits
+
+newtype FreeRegs = FreeRegs Word64
+ deriving Show
+
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0
+
+releaseReg :: RealReg -> FreeRegs -> FreeRegs
+releaseReg (RealRegSingle n) (FreeRegs f)
+ = FreeRegs (f .|. (1 `shiftL` n))
+
+releaseReg _ _
+ = panic "RegAlloc.Linear.X86_64.FreeRegs.releaseReg: no reg"
+
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform
+ = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform)
+
+getFreeRegs :: Platform -> RegClass -> FreeRegs -> [RealReg] -- lazily
+getFreeRegs platform cls (FreeRegs f) = go f 0
+
+ where go 0 _ = []
+ go n m
+ | n .&. 1 /= 0 && classOfRealReg platform (RealRegSingle m) == cls
+ = RealRegSingle m : (go (n `shiftR` 1) $! (m+1))
+
+ | otherwise
+ = go (n `shiftR` 1) $! (m+1)
+ -- ToDo: there's no point looking through all the integer registers
+ -- in order to find a floating-point one.
+
+allocateReg :: RealReg -> FreeRegs -> FreeRegs
+allocateReg (RealRegSingle r) (FreeRegs f)
+ = FreeRegs (f .&. complement (1 `shiftL` r))
+
+allocateReg _ _
+ = panic "RegAlloc.Linear.X86_64.FreeRegs.allocateReg: no reg"
+
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Liveness.hs b/compiler/GHC/CmmToAsm/Reg/Liveness.hs
new file mode 100644
index 0000000000..03b8123f93
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Liveness.hs
@@ -0,0 +1,1025 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeFamilies #-}
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-----------------------------------------------------------------------------
+--
+-- The register liveness determinator
+--
+-- (c) The University of Glasgow 2004-2013
+--
+-----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.Reg.Liveness (
+ RegSet,
+ RegMap, emptyRegMap,
+ BlockMap, mapEmpty,
+ LiveCmmDecl,
+ InstrSR (..),
+ LiveInstr (..),
+ Liveness (..),
+ LiveInfo (..),
+ LiveBasicBlock,
+
+ mapBlockTop, mapBlockTopM, mapSCCM,
+ mapGenBlockTop, mapGenBlockTopM,
+ stripLive,
+ stripLiveBlock,
+ slurpConflicts,
+ slurpReloadCoalesce,
+ eraseDeltasLive,
+ patchEraseLive,
+ patchRegsLiveInstr,
+ reverseBlocksInTops,
+ regLiveness,
+ cmmTopLiveness
+ ) where
+import GhcPrelude
+
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Instr
+
+import GHC.Cmm.BlockId
+import GHC.CmmToAsm.CFG
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm hiding (RegSet, emptyRegSet)
+
+import Digraph
+import GHC.Driver.Session
+import MonadUtils
+import Outputable
+import GHC.Platform
+import UniqSet
+import UniqFM
+import UniqSupply
+import Bag
+import State
+
+import Data.List
+import Data.Maybe
+import Data.IntSet (IntSet)
+
+-----------------------------------------------------------------------------
+type RegSet = UniqSet Reg
+
+type RegMap a = UniqFM a
+
+emptyRegMap :: UniqFM a
+emptyRegMap = emptyUFM
+
+emptyRegSet :: RegSet
+emptyRegSet = emptyUniqSet
+
+type BlockMap a = LabelMap a
+
+
+-- | A top level thing which carries liveness information.
+type LiveCmmDecl statics instr
+ = GenCmmDecl
+ statics
+ LiveInfo
+ [SCC (LiveBasicBlock instr)]
+
+
+-- | The register allocator also wants to use SPILL/RELOAD meta instructions,
+-- so we'll keep those here.
+data InstrSR instr
+ -- | A real machine instruction
+ = Instr instr
+
+ -- | spill this reg to a stack slot
+ | SPILL Reg Int
+
+ -- | reload this reg from a stack slot
+ | RELOAD Int Reg
+
+instance Instruction instr => Instruction (InstrSR instr) where
+ regUsageOfInstr platform i
+ = case i of
+ Instr instr -> regUsageOfInstr platform instr
+ SPILL reg _ -> RU [reg] []
+ RELOAD _ reg -> RU [] [reg]
+
+ patchRegsOfInstr i f
+ = case i of
+ Instr instr -> Instr (patchRegsOfInstr instr f)
+ SPILL reg slot -> SPILL (f reg) slot
+ RELOAD slot reg -> RELOAD slot (f reg)
+
+ isJumpishInstr i
+ = case i of
+ Instr instr -> isJumpishInstr instr
+ _ -> False
+
+ jumpDestsOfInstr i
+ = case i of
+ Instr instr -> jumpDestsOfInstr instr
+ _ -> []
+
+ patchJumpInstr i f
+ = case i of
+ Instr instr -> Instr (patchJumpInstr instr f)
+ _ -> i
+
+ mkSpillInstr = error "mkSpillInstr[InstrSR]: Not making SPILL meta-instr"
+ mkLoadInstr = error "mkLoadInstr[InstrSR]: Not making LOAD meta-instr"
+
+ takeDeltaInstr i
+ = case i of
+ Instr instr -> takeDeltaInstr instr
+ _ -> Nothing
+
+ isMetaInstr i
+ = case i of
+ Instr instr -> isMetaInstr instr
+ _ -> False
+
+ mkRegRegMoveInstr platform r1 r2
+ = Instr (mkRegRegMoveInstr platform r1 r2)
+
+ takeRegRegMoveInstr i
+ = case i of
+ Instr instr -> takeRegRegMoveInstr instr
+ _ -> Nothing
+
+ mkJumpInstr target = map Instr (mkJumpInstr target)
+
+ mkStackAllocInstr platform amount =
+ Instr <$> mkStackAllocInstr platform amount
+
+ mkStackDeallocInstr platform amount =
+ Instr <$> mkStackDeallocInstr platform amount
+
+
+-- | An instruction with liveness information.
+data LiveInstr instr
+ = LiveInstr (InstrSR instr) (Maybe Liveness)
+
+-- | Liveness information.
+-- The regs which die are ones which are no longer live in the *next* instruction
+-- in this sequence.
+-- (NB. if the instruction is a jump, these registers might still be live
+-- at the jump target(s) - you have to check the liveness at the destination
+-- block to find out).
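+--
+-- For example (illustrative): for an instruction that reads r2 for the last
+-- time and writes r3 for the first (and only) time, with r3 never used
+-- afterwards, we would record liveBorn = {r3}, liveDieRead = {r2} and
+-- liveDieWrite = {r3}.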
+
+data Liveness
+ = Liveness
+ { liveBorn :: RegSet -- ^ registers born in this instruction (written to for first time).
+ , liveDieRead :: RegSet -- ^ registers that died because they were read for the last time.
+ , liveDieWrite :: RegSet } -- ^ registers that died because they were clobbered by something.
+
+
+-- | Stash regs live on entry to each basic block in the info part of the cmm code.
+data LiveInfo
+ = LiveInfo
+ (LabelMap RawCmmStatics) -- cmm info table static stuff
+ [BlockId] -- entry points (first one is the
+ -- entry point for the proc).
+ (BlockMap RegSet) -- argument locals live on entry to this block
+ (BlockMap IntSet) -- stack slots live on entry to this block
+
+
+-- | A basic block with liveness information.
+type LiveBasicBlock instr
+ = GenBasicBlock (LiveInstr instr)
+
+
+instance Outputable instr
+ => Outputable (InstrSR instr) where
+
+ ppr (Instr realInstr)
+ = ppr realInstr
+
+ ppr (SPILL reg slot)
+ = hcat [
+ text "\tSPILL",
+ char ' ',
+ ppr reg,
+ comma,
+ text "SLOT" <> parens (int slot)]
+
+ ppr (RELOAD slot reg)
+ = hcat [
+ text "\tRELOAD",
+ char ' ',
+ text "SLOT" <> parens (int slot),
+ comma,
+ ppr reg]
+
+instance Outputable instr
+ => Outputable (LiveInstr instr) where
+
+ ppr (LiveInstr instr Nothing)
+ = ppr instr
+
+ ppr (LiveInstr instr (Just live))
+ = ppr instr
+ $$ (nest 8
+ $ vcat
+ [ pprRegs (text "# born: ") (liveBorn live)
+ , pprRegs (text "# r_dying: ") (liveDieRead live)
+ , pprRegs (text "# w_dying: ") (liveDieWrite live) ]
+ $+$ space)
+
+ where pprRegs :: SDoc -> RegSet -> SDoc
+ pprRegs name regs
+ | isEmptyUniqSet regs = empty
+ | otherwise = name <>
+ (pprUFM (getUniqSet regs) (hcat . punctuate space . map ppr))
+
+instance Outputable LiveInfo where
+ ppr (LiveInfo mb_static entryIds liveVRegsOnEntry liveSlotsOnEntry)
+ = (ppr mb_static)
+ $$ text "# entryIds = " <> ppr entryIds
+ $$ text "# liveVRegsOnEntry = " <> ppr liveVRegsOnEntry
+ $$ text "# liveSlotsOnEntry = " <> text (show liveSlotsOnEntry)
+
+
+
+-- | map a function across all the basic blocks in this code
+--
+mapBlockTop
+ :: (LiveBasicBlock instr -> LiveBasicBlock instr)
+ -> LiveCmmDecl statics instr -> LiveCmmDecl statics instr
+
+mapBlockTop f cmm
+ = evalState (mapBlockTopM (\x -> return $ f x) cmm) ()
+
+
+-- | map a function across all the basic blocks in this code (monadic version)
+--
+mapBlockTopM
+ :: Monad m
+ => (LiveBasicBlock instr -> m (LiveBasicBlock instr))
+ -> LiveCmmDecl statics instr -> m (LiveCmmDecl statics instr)
+
+mapBlockTopM _ cmm@(CmmData{})
+ = return cmm
+
+mapBlockTopM f (CmmProc header label live sccs)
+ = do sccs' <- mapM (mapSCCM f) sccs
+ return $ CmmProc header label live sccs'
+
+mapSCCM :: Monad m => (a -> m b) -> SCC a -> m (SCC b)
+mapSCCM f (AcyclicSCC x)
+ = do x' <- f x
+ return $ AcyclicSCC x'
+
+mapSCCM f (CyclicSCC xs)
+ = do xs' <- mapM f xs
+ return $ CyclicSCC xs'
+
+
+-- map a function across all the basic blocks in this code
+mapGenBlockTop
+ :: (GenBasicBlock i -> GenBasicBlock i)
+ -> (GenCmmDecl d h (ListGraph i) -> GenCmmDecl d h (ListGraph i))
+
+mapGenBlockTop f cmm
+ = evalState (mapGenBlockTopM (\x -> return $ f x) cmm) ()
+
+
+-- | map a function across all the basic blocks in this code (monadic version)
+mapGenBlockTopM
+ :: Monad m
+ => (GenBasicBlock i -> m (GenBasicBlock i))
+ -> (GenCmmDecl d h (ListGraph i) -> m (GenCmmDecl d h (ListGraph i)))
+
+mapGenBlockTopM _ cmm@(CmmData{})
+ = return cmm
+
+mapGenBlockTopM f (CmmProc header label live (ListGraph blocks))
+ = do blocks' <- mapM f blocks
+ return $ CmmProc header label live (ListGraph blocks')
+
+
+-- | Slurp out the list of register conflicts and reg-reg moves from this top level thing.
+-- Slurping of conflicts and moves is wrapped up together so we don't have
+-- to make two passes over the same code when we want to build the graph.
+--
+slurpConflicts
+ :: Instruction instr
+ => LiveCmmDecl statics instr
+ -> (Bag (UniqSet Reg), Bag (Reg, Reg))
+
+slurpConflicts live
+ = slurpCmm (emptyBag, emptyBag) live
+
+ where slurpCmm rs CmmData{} = rs
+ slurpCmm rs (CmmProc info _ _ sccs)
+ = foldl' (slurpSCC info) rs sccs
+
+ slurpSCC info rs (AcyclicSCC b)
+ = slurpBlock info rs b
+
+ slurpSCC info rs (CyclicSCC bs)
+ = foldl' (slurpBlock info) rs bs
+
+ slurpBlock info rs (BasicBlock blockId instrs)
+ | LiveInfo _ _ blockLive _ <- info
+ , Just rsLiveEntry <- mapLookup blockId blockLive
+ , (conflicts, moves) <- slurpLIs rsLiveEntry rs instrs
+ = (consBag rsLiveEntry conflicts, moves)
+
+ | otherwise
+ = panic "Liveness.slurpConflicts: bad block"
+
+ slurpLIs rsLive (conflicts, moves) []
+ = (consBag rsLive conflicts, moves)
+
+ slurpLIs rsLive rs (LiveInstr _ Nothing : lis)
+ = slurpLIs rsLive rs lis
+
+ slurpLIs rsLiveEntry (conflicts, moves) (LiveInstr instr (Just live) : lis)
+ = let
+ -- regs that die because they are read for the last time at the start of an instruction
+ -- are not live across it.
+ rsLiveAcross = rsLiveEntry `minusUniqSet` (liveDieRead live)
+
+ -- regs live on entry to the next instruction.
+ -- be careful of orphans, make sure to delete dying regs _after_ unioning
+ -- in the ones that are born here.
+ rsLiveNext = (rsLiveAcross `unionUniqSets` (liveBorn live))
+ `minusUniqSet` (liveDieWrite live)
+
+ -- orphan vregs are the ones that die in the same instruction they are born in.
+ -- these are likely to be results that are never used, but we still
+ -- need to assign a hreg to them..
+ rsOrphans = intersectUniqSets
+ (liveBorn live)
+ (unionUniqSets (liveDieWrite live) (liveDieRead live))
+
+ --
+ rsConflicts = unionUniqSets rsLiveNext rsOrphans
+
+ in case takeRegRegMoveInstr instr of
+ Just rr -> slurpLIs rsLiveNext
+ ( consBag rsConflicts conflicts
+ , consBag rr moves) lis
+
+ Nothing -> slurpLIs rsLiveNext
+ ( consBag rsConflicts conflicts
+ , moves) lis
+
+
+-- | For spill\/reloads
+--
+-- SPILL v1, slot1
+-- ...
+-- RELOAD slot1, v2
+--
+-- If we can arrange that v1 and v2 are allocated to the same hreg it's more likely
+-- the spill\/reload instrs can be cleaned and replaced by a nop reg-reg move.
+--
+--
+slurpReloadCoalesce
+ :: forall statics instr. Instruction instr
+ => LiveCmmDecl statics instr
+ -> Bag (Reg, Reg)
+
+slurpReloadCoalesce live
+ = slurpCmm emptyBag live
+
+ where
+ slurpCmm :: Bag (Reg, Reg)
+ -> GenCmmDecl t t1 [SCC (LiveBasicBlock instr)]
+ -> Bag (Reg, Reg)
+ slurpCmm cs CmmData{} = cs
+ slurpCmm cs (CmmProc _ _ _ sccs)
+ = slurpComp cs (flattenSCCs sccs)
+
+ slurpComp :: Bag (Reg, Reg)
+ -> [LiveBasicBlock instr]
+ -> Bag (Reg, Reg)
+ slurpComp cs blocks
+ = let (moveBags, _) = runState (slurpCompM blocks) emptyUFM
+ in unionManyBags (cs : moveBags)
+
+ slurpCompM :: [LiveBasicBlock instr]
+ -> State (UniqFM [UniqFM Reg]) [Bag (Reg, Reg)]
+ slurpCompM blocks
+ = do -- run the analysis once to record the mapping across jumps.
+ mapM_ (slurpBlock False) blocks
+
+ -- run it a second time while using the information from the last pass.
+ -- We /could/ run this many more times to deal with graphical control
+ -- flow and propagating info across multiple jumps, but it's probably
+ -- not worth the trouble.
+ mapM (slurpBlock True) blocks
+
+ slurpBlock :: Bool -> LiveBasicBlock instr
+ -> State (UniqFM [UniqFM Reg]) (Bag (Reg, Reg))
+ slurpBlock propagate (BasicBlock blockId instrs)
+ = do -- grab the slot map for entry to this block
+ slotMap <- if propagate
+ then getSlotMap blockId
+ else return emptyUFM
+
+ (_, mMoves) <- mapAccumLM slurpLI slotMap instrs
+ return $ listToBag $ catMaybes mMoves
+
+ slurpLI :: UniqFM Reg -- current slotMap
+ -> LiveInstr instr
+ -> State (UniqFM [UniqFM Reg]) -- blockId -> [slot -> reg]
+ -- for tracking slotMaps across jumps
+
+ ( UniqFM Reg -- new slotMap
+ , Maybe (Reg, Reg)) -- maybe a new coalesce edge
+
+ slurpLI slotMap li
+
+ -- remember what reg was stored into the slot
+ | LiveInstr (SPILL reg slot) _ <- li
+ , slotMap' <- addToUFM slotMap slot reg
+ = return (slotMap', Nothing)
+
+ -- add an edge between this reg and the last one stored into the slot
+ | LiveInstr (RELOAD slot reg) _ <- li
+ = case lookupUFM slotMap slot of
+ Just reg2
+ | reg /= reg2 -> return (slotMap, Just (reg, reg2))
+ | otherwise -> return (slotMap, Nothing)
+
+ Nothing -> return (slotMap, Nothing)
+
+ -- if we hit a jump, remember the current slotMap
+ | LiveInstr (Instr instr) _ <- li
+ , targets <- jumpDestsOfInstr instr
+ , not $ null targets
+ = do mapM_ (accSlotMap slotMap) targets
+ return (slotMap, Nothing)
+
+ | otherwise
+ = return (slotMap, Nothing)
+
+ -- record a slotmap for an in edge to this block
+ accSlotMap slotMap blockId
+ = modify (\s -> addToUFM_C (++) s blockId [slotMap])
+
+ -- work out the slot map on entry to this block
+ -- if we have slot maps for multiple in-edges then we need to merge them.
+ getSlotMap blockId
+ = do map <- get
+ let slotMaps = fromMaybe [] (lookupUFM map blockId)
+ return $ foldr mergeSlotMaps emptyUFM slotMaps
+
+ mergeSlotMaps :: UniqFM Reg -> UniqFM Reg -> UniqFM Reg
+ mergeSlotMaps map1 map2
+ = listToUFM
+ $ [ (k, r1)
+ | (k, r1) <- nonDetUFMToList map1
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ , case lookupUFM map2 k of
+ Nothing -> False
+ Just r2 -> r1 == r2 ]
+
+
+-- | Strip away liveness information, yielding NatCmmDecl
+stripLive
+ :: (Outputable statics, Outputable instr, Instruction instr)
+ => DynFlags
+ -> LiveCmmDecl statics instr
+ -> NatCmmDecl statics instr
+
+stripLive dflags live
+ = stripCmm live
+
+ where stripCmm :: (Outputable statics, Outputable instr, Instruction instr)
+ => LiveCmmDecl statics instr -> NatCmmDecl statics instr
+ stripCmm (CmmData sec ds) = CmmData sec ds
+ stripCmm (CmmProc (LiveInfo info (first_id:_) _ _) label live sccs)
+ = let final_blocks = flattenSCCs sccs
+
+ -- make sure the block that was first in the input list
+ -- stays at the front of the output. This is the entry point
+ -- of the proc, and it needs to come first.
+ ((first':_), rest')
+ = partition ((== first_id) . blockId) final_blocks
+
+ in CmmProc info label live
+ (ListGraph $ map (stripLiveBlock dflags) $ first' : rest')
+
+ -- If the proc has blocks but we don't know what the first one was, then we're dead.
+ stripCmm proc
+ = pprPanic "RegAlloc.Liveness.stripLive: no first_id on proc" (ppr proc)
+
+-- | Strip away liveness information from a basic block,
+-- and make real spill instructions out of SPILL, RELOAD pseudos along the way.
+
+stripLiveBlock
+ :: Instruction instr
+ => DynFlags
+ -> LiveBasicBlock instr
+ -> NatBasicBlock instr
+
+stripLiveBlock dflags (BasicBlock i lis)
+ = BasicBlock i instrs'
+
+ where (instrs', _)
+ = runState (spillNat [] lis) 0
+
+ spillNat acc []
+ = return (reverse acc)
+
+ spillNat acc (LiveInstr (SPILL reg slot) _ : instrs)
+ = do delta <- get
+ spillNat (mkSpillInstr dflags reg delta slot : acc) instrs
+
+ spillNat acc (LiveInstr (RELOAD slot reg) _ : instrs)
+ = do delta <- get
+ spillNat (mkLoadInstr dflags reg delta slot : acc) instrs
+
+ spillNat acc (LiveInstr (Instr instr) _ : instrs)
+ | Just i <- takeDeltaInstr instr
+ = do put i
+ spillNat acc instrs
+
+ spillNat acc (LiveInstr (Instr instr) _ : instrs)
+ = spillNat (instr : acc) instrs
+
+
+-- | Erase Delta instructions.
+
+eraseDeltasLive
+ :: Instruction instr
+ => LiveCmmDecl statics instr
+ -> LiveCmmDecl statics instr
+
+eraseDeltasLive cmm
+ = mapBlockTop eraseBlock cmm
+ where
+ eraseBlock (BasicBlock id lis)
+ = BasicBlock id
+ $ filter (\(LiveInstr i _) -> not $ isJust $ takeDeltaInstr i)
+ $ lis
+
+
+-- | Patch the registers in this code according to this register mapping.
+-- also erase reg -> reg moves when the reg is the same.
+-- also erase reg -> reg moves when the destination dies in this instr.
+patchEraseLive
+ :: Instruction instr
+ => (Reg -> Reg)
+ -> LiveCmmDecl statics instr -> LiveCmmDecl statics instr
+
+patchEraseLive patchF cmm
+ = patchCmm cmm
+ where
+ patchCmm cmm@CmmData{} = cmm
+
+ patchCmm (CmmProc info label live sccs)
+ | LiveInfo static id blockMap mLiveSlots <- info
+ = let
+ patchRegSet set = mkUniqSet $ map patchF $ nonDetEltsUFM set
+ -- See Note [Unique Determinism and code generation]
+ blockMap' = mapMap (patchRegSet . getUniqSet) blockMap
+
+ info' = LiveInfo static id blockMap' mLiveSlots
+ in CmmProc info' label live $ map patchSCC sccs
+
+ patchSCC (AcyclicSCC b) = AcyclicSCC (patchBlock b)
+ patchSCC (CyclicSCC bs) = CyclicSCC (map patchBlock bs)
+
+ patchBlock (BasicBlock id lis)
+ = BasicBlock id $ patchInstrs lis
+
+ patchInstrs [] = []
+ patchInstrs (li : lis)
+
+ | LiveInstr i (Just live) <- li'
+ , Just (r1, r2) <- takeRegRegMoveInstr i
+ , eatMe r1 r2 live
+ = patchInstrs lis
+
+ | otherwise
+ = li' : patchInstrs lis
+
+ where li' = patchRegsLiveInstr patchF li
+
+ eatMe r1 r2 live
+ -- source and destination regs are the same
+ | r1 == r2 = True
+
+ -- destination reg is never used
+ | elementOfUniqSet r2 (liveBorn live)
+ , elementOfUniqSet r2 (liveDieRead live) || elementOfUniqSet r2 (liveDieWrite live)
+ = True
+
+ | otherwise = False
+
+
+-- | Patch registers in this LiveInstr, including the liveness information.
+--
+patchRegsLiveInstr
+ :: Instruction instr
+ => (Reg -> Reg)
+ -> LiveInstr instr -> LiveInstr instr
+
+patchRegsLiveInstr patchF li
+ = case li of
+ LiveInstr instr Nothing
+ -> LiveInstr (patchRegsOfInstr instr patchF) Nothing
+
+ LiveInstr instr (Just live)
+ -> LiveInstr
+ (patchRegsOfInstr instr patchF)
+ (Just live
+ { -- WARNING: have to go via lists here because patchF changes the uniq in the Reg
+ liveBorn = mapUniqSet patchF $ liveBorn live
+ , liveDieRead = mapUniqSet patchF $ liveDieRead live
+ , liveDieWrite = mapUniqSet patchF $ liveDieWrite live })
+ -- See Note [Unique Determinism and code generation]
+
+
+--------------------------------------------------------------------------------
+-- | Convert a NatCmmDecl to a LiveCmmDecl, with liveness information
+
+cmmTopLiveness
+ :: (Outputable instr, Instruction instr)
+ => Maybe CFG -> Platform
+ -> NatCmmDecl statics instr
+ -> UniqSM (LiveCmmDecl statics instr)
+cmmTopLiveness cfg platform cmm
+ = regLiveness platform $ natCmmTopToLive cfg cmm
+
+natCmmTopToLive
+ :: (Instruction instr, Outputable instr)
+ => Maybe CFG -> NatCmmDecl statics instr
+ -> LiveCmmDecl statics instr
+
+natCmmTopToLive _ (CmmData i d)
+ = CmmData i d
+
+natCmmTopToLive _ (CmmProc info lbl live (ListGraph []))
+ = CmmProc (LiveInfo info [] mapEmpty mapEmpty) lbl live []
+
+natCmmTopToLive mCfg proc@(CmmProc info lbl live (ListGraph blocks@(first : _)))
+ = CmmProc (LiveInfo info' (first_id : entry_ids) mapEmpty mapEmpty)
+ lbl live sccsLive
+ where
+ first_id = blockId first
+ all_entry_ids = entryBlocks proc
+ sccs = sccBlocks blocks all_entry_ids mCfg
+ sccsLive = map (fmap (\(BasicBlock l instrs) ->
+ BasicBlock l (map (\i -> LiveInstr (Instr i) Nothing) instrs)))
+ $ sccs
+
+ entry_ids = filter (reachable_node) .
+ filter (/= first_id) $ all_entry_ids
+ info' = mapFilterWithKey (\node _ -> reachable_node node) info
+ reachable_node
+ | Just cfg <- mCfg
+ = hasNode cfg
+ | otherwise
+ = const True
+
+--
+-- Compute the liveness graph of the set of basic blocks. Important:
+-- we also discard any unreachable code here, starting from the entry
+-- points (the first block in the list, and any blocks with info
+-- tables). Unreachable code arises when code blocks are orphaned in
+-- earlier optimisation passes, and may confuse the register allocator
+-- by referring to registers that are not initialised. It's easy to
+-- discard the unreachable code as part of the SCC pass, so that's
+-- exactly what we do. (#7574)
+--
+sccBlocks
+ :: forall instr . Instruction instr
+ => [NatBasicBlock instr]
+ -> [BlockId]
+ -> Maybe CFG
+ -> [SCC (NatBasicBlock instr)]
+
+sccBlocks blocks entries mcfg = map (fmap node_payload) sccs
+ where
+ nodes :: [ Node BlockId (NatBasicBlock instr) ]
+ nodes = [ DigraphNode block id (getOutEdges instrs)
+ | block@(BasicBlock id instrs) <- blocks ]
+
+ g1 = graphFromEdgedVerticesUniq nodes
+
+ reachable :: LabelSet
+ reachable
+ | Just cfg <- mcfg
+ -- Our CFG only contains reachable nodes by construction at this point.
+ = setFromList $ getCfgNodes cfg
+ | otherwise
+ = setFromList $ [ node_key node | node <- reachablesG g1 roots ]
+
+ g2 = graphFromEdgedVerticesUniq [ node | node <- nodes
+ , node_key node
+ `setMember` reachable ]
+
+ sccs = stronglyConnCompG g2
+
+ getOutEdges :: Instruction instr => [instr] -> [BlockId]
+ getOutEdges instrs = concatMap jumpDestsOfInstr instrs
+
+ -- This is truly ugly, but I don't see a good alternative.
+ -- Digraph just has the wrong API. We want to identify nodes
+ -- by their keys (BlockId), but Digraph requires the whole
+ -- node: (NatBasicBlock, BlockId, [BlockId]). This takes
+ -- advantage of the fact that Digraph only looks at the key,
+ -- even though it asks for the whole triple.
+ roots = [DigraphNode (panic "sccBlocks") b (panic "sccBlocks")
+ | b <- entries ]
+
+--------------------------------------------------------------------------------
+-- Annotate code with register liveness information
+--
+
+regLiveness
+ :: (Outputable instr, Instruction instr)
+ => Platform
+ -> LiveCmmDecl statics instr
+ -> UniqSM (LiveCmmDecl statics instr)
+
+regLiveness _ (CmmData i d)
+ = return $ CmmData i d
+
+regLiveness _ (CmmProc info lbl live [])
+ | LiveInfo static mFirst _ _ <- info
+ = return $ CmmProc
+ (LiveInfo static mFirst mapEmpty mapEmpty)
+ lbl live []
+
+regLiveness platform (CmmProc info lbl live sccs)
+ | LiveInfo static mFirst _ liveSlotsOnEntry <- info
+ = let (ann_sccs, block_live) = computeLiveness platform sccs
+
+ in return $ CmmProc (LiveInfo static mFirst block_live liveSlotsOnEntry)
+ lbl live ann_sccs
+
+
+-- -----------------------------------------------------------------------------
+-- | Check ordering of Blocks
+-- The computeLiveness function requires SCCs to be in reverse
+-- dependent order. If they're not the liveness information will be
+-- wrong, and we'll get a bad allocation. Better to check for this
+-- precondition explicitly or some other poor sucker will waste a
+-- day staring at bad assembly code..
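+--
+-- For example (a sketch): the ordering [A, B] passes the check only if A
+-- jumps to nothing but A itself, while B may jump to A or B; that is,
+-- control only flows from later blocks in the list to earlier ones.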
+--
+checkIsReverseDependent
+ :: Instruction instr
+ => [SCC (LiveBasicBlock instr)] -- ^ SCCs of blocks that we're about to run the liveness determinator on.
+ -> Maybe BlockId -- ^ The first BlockId that fails the test (if any)
+
+checkIsReverseDependent sccs'
+ = go emptyUniqSet sccs'
+
+ where go _ []
+ = Nothing
+
+ go blocksSeen (AcyclicSCC block : sccs)
+ = let dests = slurpJumpDestsOfBlock block
+ blocksSeen' = unionUniqSets blocksSeen $ mkUniqSet [blockId block]
+ badDests = dests `minusUniqSet` blocksSeen'
+ in case nonDetEltsUniqSet badDests of
+ -- See Note [Unique Determinism and code generation]
+ [] -> go blocksSeen' sccs
+ bad : _ -> Just bad
+
+ go blocksSeen (CyclicSCC blocks : sccs)
+ = let dests = unionManyUniqSets $ map slurpJumpDestsOfBlock blocks
+ blocksSeen' = unionUniqSets blocksSeen $ mkUniqSet $ map blockId blocks
+ badDests = dests `minusUniqSet` blocksSeen'
+ in case nonDetEltsUniqSet badDests of
+ -- See Note [Unique Determinism and code generation]
+ [] -> go blocksSeen' sccs
+ bad : _ -> Just bad
+
+ slurpJumpDestsOfBlock (BasicBlock _ instrs)
+ = unionManyUniqSets
+ $ map (mkUniqSet . jumpDestsOfInstr)
+ [ i | LiveInstr i _ <- instrs]
+
+
+-- | If we've already computed liveness info for this code we have to reverse
+-- the SCCs in each top to get them back into the right order so we can do it again.
+reverseBlocksInTops :: LiveCmmDecl statics instr -> LiveCmmDecl statics instr
+reverseBlocksInTops top
+ = case top of
+ CmmData{} -> top
+ CmmProc info lbl live sccs -> CmmProc info lbl live (reverse sccs)
+
+
+-- | Computing liveness
+--
+-- On entry, the SCCs must be in "reverse" order: later blocks may transfer
+-- control to earlier ones only, else `panic`.
+--
+-- The SCCs returned are in the *opposite* order, which is exactly what we
+-- want for the next pass.
+--
+computeLiveness
+ :: (Outputable instr, Instruction instr)
+ => Platform
+ -> [SCC (LiveBasicBlock instr)]
+ -> ([SCC (LiveBasicBlock instr)], -- instructions annotated with list of registers
+ -- which are "dead after this instruction".
+ BlockMap RegSet) -- blocks annotated with set of live registers
+ -- on entry to the block.
+
+computeLiveness platform sccs
+ = case checkIsReverseDependent sccs of
+ Nothing -> livenessSCCs platform mapEmpty [] sccs
+ Just bad -> pprPanic "RegAlloc.Liveness.computeLiveness"
+ (vcat [ text "SCCs aren't in reverse dependent order"
+ , text "bad blockId" <+> ppr bad
+ , ppr sccs])
+
+livenessSCCs
+ :: Instruction instr
+ => Platform
+ -> BlockMap RegSet
+ -> [SCC (LiveBasicBlock instr)] -- accum
+ -> [SCC (LiveBasicBlock instr)]
+ -> ( [SCC (LiveBasicBlock instr)]
+ , BlockMap RegSet)
+
+livenessSCCs _ blockmap done []
+ = (done, blockmap)
+
+livenessSCCs platform blockmap done (AcyclicSCC block : sccs)
+ = let (blockmap', block') = livenessBlock platform blockmap block
+ in livenessSCCs platform blockmap' (AcyclicSCC block' : done) sccs
+
+livenessSCCs platform blockmap done
+ (CyclicSCC blocks : sccs) =
+ livenessSCCs platform blockmap' (CyclicSCC blocks':done) sccs
+ where (blockmap', blocks')
+ = iterateUntilUnchanged linearLiveness equalBlockMaps
+ blockmap blocks
+
+ iterateUntilUnchanged
+ :: (a -> b -> (a,c)) -> (a -> a -> Bool)
+ -> a -> b
+ -> (a,c)
+
+ iterateUntilUnchanged f eq a b
+ = head $
+ concatMap tail $
+ groupBy (\(a1, _) (a2, _) -> eq a1 a2) $
+ iterate (\(a, _) -> f a b) $
+ (a, panic "RegLiveness.livenessSCCs")
+
+
+ linearLiveness
+ :: Instruction instr
+ => BlockMap RegSet -> [LiveBasicBlock instr]
+ -> (BlockMap RegSet, [LiveBasicBlock instr])
+
+ linearLiveness = mapAccumL (livenessBlock platform)
+
+ -- probably the least efficient way to compare two
+ -- BlockMaps for equality.
+ equalBlockMaps a b
+ = a' == b'
+ where a' = map f $ mapToList a
+ b' = map f $ mapToList b
+ f (key,elt) = (key, nonDetEltsUniqSet elt)
+ -- See Note [Unique Determinism and code generation]
+
+
+
+-- | Annotate a basic block with register liveness information.
+--
+livenessBlock
+ :: Instruction instr
+ => Platform
+ -> BlockMap RegSet
+ -> LiveBasicBlock instr
+ -> (BlockMap RegSet, LiveBasicBlock instr)
+
+livenessBlock platform blockmap (BasicBlock block_id instrs)
+ = let
+ (regsLiveOnEntry, instrs1)
+ = livenessBack platform emptyUniqSet blockmap [] (reverse instrs)
+ blockmap' = mapInsert block_id regsLiveOnEntry blockmap
+
+ instrs2 = livenessForward platform regsLiveOnEntry instrs1
+
+ output = BasicBlock block_id instrs2
+
+ in ( blockmap', output)
+
+-- | Calculate liveness going forwards,
+-- filling in when regs are born
+
+livenessForward
+ :: Instruction instr
+ => Platform
+ -> RegSet -- regs live on this instr
+ -> [LiveInstr instr] -> [LiveInstr instr]
+
+livenessForward _ _ [] = []
+livenessForward platform rsLiveEntry (li@(LiveInstr instr mLive) : lis)
+ | Just live <- mLive
+ = let
+ RU _ written = regUsageOfInstr platform instr
+ -- Regs that are written to but weren't live on entry to this instruction
+ -- are recorded as being born here.
+ rsBorn = mkUniqSet
+ $ filter (\r -> not $ elementOfUniqSet r rsLiveEntry) written
+
+ rsLiveNext = (rsLiveEntry `unionUniqSets` rsBorn)
+ `minusUniqSet` (liveDieRead live)
+ `minusUniqSet` (liveDieWrite live)
+
+ in LiveInstr instr (Just live { liveBorn = rsBorn })
+ : livenessForward platform rsLiveNext lis
+
+ | otherwise
+ = li : livenessForward platform rsLiveEntry lis
+
+
+-- | Calculate liveness going backwards,
+-- filling in when regs die, and what regs are live across each instruction
+
+livenessBack
+ :: Instruction instr
+ => Platform
+ -> RegSet -- regs live on this instr
+ -> BlockMap RegSet -- regs live on entry to other BBs
+ -> [LiveInstr instr] -- instructions (accum)
+ -> [LiveInstr instr] -- instructions
+ -> (RegSet, [LiveInstr instr])
+
+livenessBack _ liveregs _ done [] = (liveregs, done)
+
+livenessBack platform liveregs blockmap acc (instr : instrs)
+ = let (liveregs', instr') = liveness1 platform liveregs blockmap instr
+ in livenessBack platform liveregs' blockmap (instr' : acc) instrs
+
+
+-- don't bother tagging comments or deltas with liveness
+liveness1
+ :: Instruction instr
+ => Platform
+ -> RegSet
+ -> BlockMap RegSet
+ -> LiveInstr instr
+ -> (RegSet, LiveInstr instr)
+
+liveness1 _ liveregs _ (LiveInstr instr _)
+ | isMetaInstr instr
+ = (liveregs, LiveInstr instr Nothing)
+
+liveness1 platform liveregs blockmap (LiveInstr instr _)
+
+ | not_a_branch
+ = (liveregs1, LiveInstr instr
+ (Just $ Liveness
+ { liveBorn = emptyUniqSet
+ , liveDieRead = mkUniqSet r_dying
+ , liveDieWrite = mkUniqSet w_dying }))
+
+ | otherwise
+ = (liveregs_br, LiveInstr instr
+ (Just $ Liveness
+ { liveBorn = emptyUniqSet
+ , liveDieRead = mkUniqSet r_dying_br
+ , liveDieWrite = mkUniqSet w_dying }))
+
+ where
+ !(RU read written) = regUsageOfInstr platform instr
+
+ -- registers that were written here are dead going backwards.
+ -- registers that were read here are live going backwards.
+ liveregs1 = (liveregs `delListFromUniqSet` written)
+ `addListToUniqSet` read
+
+ -- registers that are not live beyond this point, are recorded
+ -- as dying here.
+ r_dying = [ reg | reg <- read, reg `notElem` written,
+ not (elementOfUniqSet reg liveregs) ]
+
+ w_dying = [ reg | reg <- written,
+ not (elementOfUniqSet reg liveregs) ]
+
+ -- union in the live regs from all the jump destinations of this
+ -- instruction.
+ targets = jumpDestsOfInstr instr -- where we go from here
+ not_a_branch = null targets
+
+ targetLiveRegs target
+ = case mapLookup target blockmap of
+ Just ra -> ra
+ Nothing -> emptyRegSet
+
+ live_from_branch = unionManyUniqSets (map targetLiveRegs targets)
+
+ liveregs_br = liveregs1 `unionUniqSets` live_from_branch
+
+ -- registers that are live only in the branch targets should
+ -- be listed as dying here.
+ live_branch_only = live_from_branch `minusUniqSet` liveregs
+ r_dying_br = nonDetEltsUniqSet (mkUniqSet r_dying `unionUniqSets`
+ live_branch_only)
+ -- See Note [Unique Determinism and code generation]
diff --git a/compiler/GHC/CmmToAsm/Reg/Target.hs b/compiler/GHC/CmmToAsm/Reg/Target.hs
new file mode 100644
index 0000000000..a45d70c826
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Target.hs
@@ -0,0 +1,135 @@
+{-# LANGUAGE CPP #-}
+-- | Hard wired things related to registers.
+-- This module prevents the native code generator from being able to
+-- emit code for non-host architectures.
+--
+-- TODO: Do a better job of the overloading, and eliminate this module.
+-- We'd probably do better with a Register type class, and hook this to
+-- Instruction somehow.
+--
+-- TODO: We should also make arch specific versions of RegAlloc.Graph.TrivColorable
+module GHC.CmmToAsm.Reg.Target (
+ targetVirtualRegSqueeze,
+ targetRealRegSqueeze,
+ targetClassOfRealReg,
+ targetMkVirtualReg,
+ targetRegDotColor,
+ targetClassOfReg
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+import GHC.CmmToAsm.Format
+
+import Outputable
+import Unique
+import GHC.Platform
+
+import qualified GHC.CmmToAsm.X86.Regs as X86
+import qualified GHC.CmmToAsm.X86.RegInfo as X86
+import qualified GHC.CmmToAsm.PPC.Regs as PPC
+import qualified GHC.CmmToAsm.SPARC.Regs as SPARC
+
+targetVirtualRegSqueeze :: Platform -> RegClass -> VirtualReg -> Int
+targetVirtualRegSqueeze platform
+ = case platformArch platform of
+ ArchX86 -> X86.virtualRegSqueeze
+ ArchX86_64 -> X86.virtualRegSqueeze
+ ArchPPC -> PPC.virtualRegSqueeze
+ ArchS390X -> panic "targetVirtualRegSqueeze ArchS390X"
+ ArchSPARC -> SPARC.virtualRegSqueeze
+ ArchSPARC64 -> panic "targetVirtualRegSqueeze ArchSPARC64"
+ ArchPPC_64 _ -> PPC.virtualRegSqueeze
+ ArchARM _ _ _ -> panic "targetVirtualRegSqueeze ArchARM"
+ ArchARM64 -> panic "targetVirtualRegSqueeze ArchARM64"
+ ArchAlpha -> panic "targetVirtualRegSqueeze ArchAlpha"
+ ArchMipseb -> panic "targetVirtualRegSqueeze ArchMipseb"
+ ArchMipsel -> panic "targetVirtualRegSqueeze ArchMipsel"
+ ArchJavaScript-> panic "targetVirtualRegSqueeze ArchJavaScript"
+ ArchUnknown -> panic "targetVirtualRegSqueeze ArchUnknown"
+
+
+targetRealRegSqueeze :: Platform -> RegClass -> RealReg -> Int
+targetRealRegSqueeze platform
+ = case platformArch platform of
+ ArchX86 -> X86.realRegSqueeze
+ ArchX86_64 -> X86.realRegSqueeze
+ ArchPPC -> PPC.realRegSqueeze
+ ArchS390X -> panic "targetRealRegSqueeze ArchS390X"
+ ArchSPARC -> SPARC.realRegSqueeze
+ ArchSPARC64 -> panic "targetRealRegSqueeze ArchSPARC64"
+ ArchPPC_64 _ -> PPC.realRegSqueeze
+ ArchARM _ _ _ -> panic "targetRealRegSqueeze ArchARM"
+ ArchARM64 -> panic "targetRealRegSqueeze ArchARM64"
+ ArchAlpha -> panic "targetRealRegSqueeze ArchAlpha"
+ ArchMipseb -> panic "targetRealRegSqueeze ArchMipseb"
+ ArchMipsel -> panic "targetRealRegSqueeze ArchMipsel"
+ ArchJavaScript-> panic "targetRealRegSqueeze ArchJavaScript"
+ ArchUnknown -> panic "targetRealRegSqueeze ArchUnknown"
+
+targetClassOfRealReg :: Platform -> RealReg -> RegClass
+targetClassOfRealReg platform
+ = case platformArch platform of
+ ArchX86 -> X86.classOfRealReg platform
+ ArchX86_64 -> X86.classOfRealReg platform
+ ArchPPC -> PPC.classOfRealReg
+ ArchS390X -> panic "targetClassOfRealReg ArchS390X"
+ ArchSPARC -> SPARC.classOfRealReg
+ ArchSPARC64 -> panic "targetClassOfRealReg ArchSPARC64"
+ ArchPPC_64 _ -> PPC.classOfRealReg
+ ArchARM _ _ _ -> panic "targetClassOfRealReg ArchARM"
+ ArchARM64 -> panic "targetClassOfRealReg ArchARM64"
+ ArchAlpha -> panic "targetClassOfRealReg ArchAlpha"
+ ArchMipseb -> panic "targetClassOfRealReg ArchMipseb"
+ ArchMipsel -> panic "targetClassOfRealReg ArchMipsel"
+ ArchJavaScript-> panic "targetClassOfRealReg ArchJavaScript"
+ ArchUnknown -> panic "targetClassOfRealReg ArchUnknown"
+
+targetMkVirtualReg :: Platform -> Unique -> Format -> VirtualReg
+targetMkVirtualReg platform
+ = case platformArch platform of
+ ArchX86 -> X86.mkVirtualReg
+ ArchX86_64 -> X86.mkVirtualReg
+ ArchPPC -> PPC.mkVirtualReg
+ ArchS390X -> panic "targetMkVirtualReg ArchS390X"
+ ArchSPARC -> SPARC.mkVirtualReg
+ ArchSPARC64 -> panic "targetMkVirtualReg ArchSPARC64"
+ ArchPPC_64 _ -> PPC.mkVirtualReg
+ ArchARM _ _ _ -> panic "targetMkVirtualReg ArchARM"
+ ArchARM64 -> panic "targetMkVirtualReg ArchARM64"
+ ArchAlpha -> panic "targetMkVirtualReg ArchAlpha"
+ ArchMipseb -> panic "targetMkVirtualReg ArchMipseb"
+ ArchMipsel -> panic "targetMkVirtualReg ArchMipsel"
+ ArchJavaScript-> panic "targetMkVirtualReg ArchJavaScript"
+ ArchUnknown -> panic "targetMkVirtualReg ArchUnknown"
+
+targetRegDotColor :: Platform -> RealReg -> SDoc
+targetRegDotColor platform
+ = case platformArch platform of
+ ArchX86 -> X86.regDotColor platform
+ ArchX86_64 -> X86.regDotColor platform
+ ArchPPC -> PPC.regDotColor
+ ArchS390X -> panic "targetRegDotColor ArchS390X"
+ ArchSPARC -> SPARC.regDotColor
+ ArchSPARC64 -> panic "targetRegDotColor ArchSPARC64"
+ ArchPPC_64 _ -> PPC.regDotColor
+ ArchARM _ _ _ -> panic "targetRegDotColor ArchARM"
+ ArchARM64 -> panic "targetRegDotColor ArchARM64"
+ ArchAlpha -> panic "targetRegDotColor ArchAlpha"
+ ArchMipseb -> panic "targetRegDotColor ArchMipseb"
+ ArchMipsel -> panic "targetRegDotColor ArchMipsel"
+ ArchJavaScript-> panic "targetRegDotColor ArchJavaScript"
+ ArchUnknown -> panic "targetRegDotColor ArchUnknown"
+
+
+targetClassOfReg :: Platform -> Reg -> RegClass
+targetClassOfReg platform reg
+ = case reg of
+ RegVirtual vr -> classOfVirtualReg vr
+ RegReal rr -> targetClassOfRealReg platform rr
diff --git a/compiler/GHC/CmmToAsm/SPARC/AddrMode.hs b/compiler/GHC/CmmToAsm/SPARC/AddrMode.hs
new file mode 100644
index 0000000000..6cc660bba9
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/AddrMode.hs
@@ -0,0 +1,44 @@
+
+module GHC.CmmToAsm.SPARC.AddrMode (
+ AddrMode(..),
+ addrOffset
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.Base
+import GHC.Platform.Reg
+
+-- addressing modes ------------------------------------------------------------
+
+-- | Represents a memory address in an instruction.
+-- Being a RISC machine, SPARC has very regular addressing modes.
+--
+data AddrMode
+ = AddrRegReg Reg Reg -- addr = r1 + r2
+ | AddrRegImm Reg Imm -- addr = r1 + imm
+
+
+-- | Add an integer offset to the address in an AddrMode.
+--
+addrOffset :: AddrMode -> Int -> Maybe AddrMode
+addrOffset addr off
+ = case addr of
+ AddrRegImm r (ImmInt n)
+ | fits13Bits n2 -> Just (AddrRegImm r (ImmInt n2))
+ | otherwise -> Nothing
+ where n2 = n + off
+
+ AddrRegImm r (ImmInteger n)
+ | fits13Bits n2 -> Just (AddrRegImm r (ImmInt (fromInteger n2)))
+ | otherwise -> Nothing
+ where n2 = n + toInteger off
+
+ AddrRegReg r (RegReal (RealRegSingle 0))
+ | fits13Bits off -> Just (AddrRegImm r (ImmInt off))
+ | otherwise -> Nothing
+
+ _ -> Nothing
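+
+-- A worked example (not part of the original source): for a register plus a
+-- small immediate we can simply fold the offset in,
+--      addrOffset (AddrRegImm r (ImmInt 100)) 8 == Just (AddrRegImm r (ImmInt 108))
+-- whereas for a register-plus-register mode there is nowhere to put the extra
+-- offset, so the result is Nothing, unless the second register is %g0 (always
+-- zero), in which case we can switch to an immediate mode.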
diff --git a/compiler/GHC/CmmToAsm/SPARC/Base.hs b/compiler/GHC/CmmToAsm/SPARC/Base.hs
new file mode 100644
index 0000000000..86a897dacb
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Base.hs
@@ -0,0 +1,77 @@
+
+-- | Bits and pieces at the bottom of the module dependency tree.
+-- We also import the required constants here, so we know what we're using.
+--
+-- In the interests of cross-compilation, we want to free ourselves
+-- from the autoconf-generated modules like main/Constants.
+
+module GHC.CmmToAsm.SPARC.Base (
+ wordLength,
+ wordLengthInBits,
+ spillAreaLength,
+ spillSlotSize,
+ extraStackArgsHere,
+ fits13Bits,
+ is32BitInteger,
+ largeOffsetError
+)
+
+where
+
+import GhcPrelude
+
+import GHC.Driver.Session
+import Panic
+
+import Data.Int
+
+
+-- On 32 bit SPARC, pointers are 32 bits.
+wordLength :: Int
+wordLength = 4
+
+wordLengthInBits :: Int
+wordLengthInBits
+ = wordLength * 8
+
+-- Size of the available spill area
+spillAreaLength :: DynFlags -> Int
+spillAreaLength
+ = rESERVED_C_STACK_BYTES
+
+-- | We need 8 bytes because our largest registers are 64 bit.
+spillSlotSize :: Int
+spillSlotSize = 8
+
+
+-- | We (allegedly) put the first six C-call arguments in registers;
+-- where do we start putting the rest of them?
+extraStackArgsHere :: Int
+extraStackArgsHere = 23
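+
+-- (23 words * 4 bytes per word = byte offset 92; that is, overflow arguments
+-- start at [%sp+92], matching the calling convention described in CodeGen.hs.)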
+
+
+{-# SPECIALIZE fits13Bits :: Int -> Bool, Integer -> Bool #-}
+-- | Check whether an offset is representable with 13 bits.
+fits13Bits :: Integral a => a -> Bool
+fits13Bits x = x >= -4096 && x < 4096
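+-- For example: fits13Bits 4095 and fits13Bits (-4096) are True, while
+-- fits13Bits 4096 and fits13Bits (-4097) are False.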
+
+-- | Check whether an integer will fit in 32 bits.
+-- A CmmInt is intended to be truncated to the appropriate
+-- number of bits, so here we truncate it to Int64. This is
+-- important because e.g. -1 as a CmmInt might be either
+-- -1 or 18446744073709551615.
+--
+is32BitInteger :: Integer -> Bool
+is32BitInteger i
+ = i64 <= 0x7fffffff && i64 >= -0x80000000
+ where i64 = fromIntegral i :: Int64
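+-- For example, is32BitInteger 0xffffffffffffffff is True (the Int64 truncation
+-- gives -1), while is32BitInteger 0x80000000 is False (2^31 is just outside
+-- the signed 32-bit range).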
+
+
+-- | Sadness.
+largeOffsetError :: (Show a) => a -> b
+largeOffsetError i
+ = panic ("ERROR: SPARC native-code generator cannot handle large offset ("
+ ++ show i ++ ");\nprobably because of large constant data structures;" ++
+ "\nworkaround: use -fllvm on this module.\n")
+
+
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen.hs
new file mode 100644
index 0000000000..2580ea4014
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen.hs
@@ -0,0 +1,700 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Generating machine code (instruction selection)
+--
+-- (c) The University of Glasgow 1996-2013
+--
+-----------------------------------------------------------------------------
+
+{-# LANGUAGE GADTs #-}
+module GHC.CmmToAsm.SPARC.CodeGen (
+ cmmTopCodeGen,
+ generateJumpTableForInstr,
+ InstrBlock
+)
+
+where
+
+#include "HsVersions.h"
+
+-- NCG stuff:
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.SPARC.CodeGen.Sanity
+import GHC.CmmToAsm.SPARC.CodeGen.Amode
+import GHC.CmmToAsm.SPARC.CodeGen.CondCode
+import GHC.CmmToAsm.SPARC.CodeGen.Gen64
+import GHC.CmmToAsm.SPARC.CodeGen.Gen32
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Stack
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Monad ( NatM, getNewRegNat, getNewLabelNat )
+
+-- Our intermediate code:
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Graph
+import GHC.CmmToAsm.PIC
+import GHC.Platform.Reg
+import GHC.Cmm.CLabel
+import GHC.CmmToAsm.CPrim
+
+-- The rest:
+import BasicTypes
+import GHC.Driver.Session
+import FastString
+import OrdList
+import Outputable
+import GHC.Platform
+
+import Control.Monad ( mapAndUnzipM )
+
+-- | Top level code generation
+cmmTopCodeGen :: RawCmmDecl
+ -> NatM [NatCmmDecl RawCmmStatics Instr]
+
+cmmTopCodeGen (CmmProc info lab live graph)
+ = do let blocks = toBlockListEntryFirst graph
+ (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
+
+ let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
+ let tops = proc : concat statics
+
+ return tops
+
+cmmTopCodeGen (CmmData sec dat) = do
+ return [CmmData sec dat] -- no translation, we just use CmmStatic
+
+
+-- | Do code generation on a single block of CMM code.
+-- Code generation may introduce new basic block boundaries, which
+-- are indicated by the NEWBLOCK instruction. We must split up the
+-- instruction stream into basic blocks again. We also extract
+-- LDATAs here.
+basicBlockCodeGen :: CmmBlock
+ -> NatM ( [NatBasicBlock Instr]
+ , [NatCmmDecl RawCmmStatics Instr])
+
+basicBlockCodeGen block = do
+ let (_, nodes, tail) = blockSplit block
+ id = entryLabel block
+ stmts = blockToList nodes
+ mid_instrs <- stmtsToInstrs stmts
+ tail_instrs <- stmtToInstrs tail
+ let instrs = mid_instrs `appOL` tail_instrs
+ let
+ (top,other_blocks,statics)
+ = foldrOL mkBlocks ([],[],[]) instrs
+
+ mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
+ = ([], BasicBlock id instrs : blocks, statics)
+
+ mkBlocks (LDATA sec dat) (instrs,blocks,statics)
+ = (instrs, blocks, CmmData sec dat:statics)
+
+ mkBlocks instr (instrs,blocks,statics)
+ = (instr:instrs, blocks, statics)
+
+ -- do intra-block sanity checking
+ blocksChecked
+ = map (checkBlock block)
+ $ BasicBlock id top : other_blocks
+
+ return (blocksChecked, statics)
+
+
+-- | Convert some Cmm statements to SPARC instructions.
+stmtsToInstrs :: [CmmNode e x] -> NatM InstrBlock
+stmtsToInstrs stmts
+ = do instrss <- mapM stmtToInstrs stmts
+ return (concatOL instrss)
+
+
+stmtToInstrs :: CmmNode e x -> NatM InstrBlock
+stmtToInstrs stmt = do
+ dflags <- getDynFlags
+ case stmt of
+ CmmComment s -> return (unitOL (COMMENT s))
+ CmmTick {} -> return nilOL
+ CmmUnwind {} -> return nilOL
+
+ CmmAssign reg src
+ | isFloatType ty -> assignReg_FltCode format reg src
+ | isWord64 ty -> assignReg_I64Code reg src
+ | otherwise -> assignReg_IntCode format reg src
+ where ty = cmmRegType dflags reg
+ format = cmmTypeFormat ty
+
+ CmmStore addr src
+ | isFloatType ty -> assignMem_FltCode format addr src
+ | isWord64 ty -> assignMem_I64Code addr src
+ | otherwise -> assignMem_IntCode format addr src
+ where ty = cmmExprType dflags src
+ format = cmmTypeFormat ty
+
+ CmmUnsafeForeignCall target result_regs args
+ -> genCCall target result_regs args
+
+ CmmBranch id -> genBranch id
+ CmmCondBranch arg true false _ -> do
+ b1 <- genCondJump true arg
+ b2 <- genBranch false
+ return (b1 `appOL` b2)
+ CmmSwitch arg ids -> do dflags <- getDynFlags
+ genSwitch dflags arg ids
+ CmmCall { cml_target = arg } -> genJump arg
+
+ _
+ -> panic "stmtToInstrs: statement should have been cps'd away"
+
+
+{-
+Now, given a tree (the argument to a CmmLoad) that references memory,
+produce a suitable addressing mode.
+
+A Rule of the Game (tm) for Amodes: use of the addr bit must
+immediately follow use of the code part, since the code part puts
+values in registers which the addr then refers to. So you can't put
+anything in between, lest it overwrite some of those registers. If
+you need to do some other computation between the code part and use of
+the addr bit, first store the effective address from the amode in a
+temporary, then do the other computation, and then use the temporary:
+
+ code
+ LEA amode, tmp
+ ... other computation ...
+ ... (tmp) ...
+-}
+
+
+
+-- | Convert a BlockId to some CmmStatic data
+jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
+jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
+ where blockLabel = blockLbl blockid
+
+
+
+-- -----------------------------------------------------------------------------
+-- Generating assignments
+
+-- Assignments are really at the heart of the whole code generation
+-- business. Almost all top-level nodes of any real importance are
+-- assignments, which correspond to loads, stores, or register
+-- transfers. If we're really lucky, some of the register transfers
+-- will go away, because we can use the destination register to
+-- complete the code generation for the right hand side. This only
+-- fails when the right hand side is forced into a fixed register
+-- (e.g. the result of a call).
+
+assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignMem_IntCode pk addr src = do
+ (srcReg, code) <- getSomeReg src
+ Amode dstAddr addr_code <- getAmode addr
+ return $ code `appOL` addr_code `snocOL` ST pk srcReg dstAddr
+
+
+assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+assignReg_IntCode _ reg src = do
+ dflags <- getDynFlags
+ r <- getRegister src
+ let dst = getRegisterReg (targetPlatform dflags) reg
+ return $ case r of
+ Any _ code -> code dst
+ Fixed _ freg fcode -> fcode `snocOL` OR False g0 (RIReg freg) dst
+
+
+
+-- Floating point assignment to memory
+assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignMem_FltCode pk addr src = do
+ dflags <- getDynFlags
+ Amode dst__2 code1 <- getAmode addr
+ (src__2, code2) <- getSomeReg src
+ tmp1 <- getNewRegNat pk
+ let
+ pk__2 = cmmExprType dflags src
+ code__2 = code1 `appOL` code2 `appOL`
+ if formatToWidth pk == typeWidth pk__2
+ then unitOL (ST pk src__2 dst__2)
+ else toOL [ FxTOy (cmmTypeFormat pk__2) pk src__2 tmp1
+ , ST pk tmp1 dst__2]
+ return code__2
+
+-- Floating point assignment to a register/temporary
+assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+assignReg_FltCode pk dstCmmReg srcCmmExpr = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ srcRegister <- getRegister srcCmmExpr
+ let dstReg = getRegisterReg platform dstCmmReg
+
+ return $ case srcRegister of
+ Any _ code -> code dstReg
+ Fixed _ srcFixedReg srcCode -> srcCode `snocOL` FMOV pk srcFixedReg dstReg
+
+
+
+
+genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock
+
+genJump (CmmLit (CmmLabel lbl))
+ = return (toOL [CALL (Left target) 0 True, NOP])
+ where
+ target = ImmCLbl lbl
+
+genJump tree
+ = do
+ (target, code) <- getSomeReg tree
+ return (code `snocOL` JMP (AddrRegReg target g0) `snocOL` NOP)
+
+-- -----------------------------------------------------------------------------
+-- Unconditional branches
+
+genBranch :: BlockId -> NatM InstrBlock
+genBranch = return . toOL . mkJumpInstr
+
+
+-- -----------------------------------------------------------------------------
+-- Conditional jumps
+
+{-
+Conditional jumps are always to local labels, so we can use branch
+instructions. We peek at the arguments to decide what kind of
+comparison to do.
+
+SPARC: First, we have to ensure that the condition codes are set
+according to the supplied comparison operation. We generate slightly
+different code for floating point comparisons, because a floating
+point operation cannot directly precede a @BF@. We assume the worst
+and fill that slot with a @NOP@.
+
+SPARC: Do not fill the delay slots here; you will confuse the register
+allocator.
+-}
+
+
+genCondJump
+ :: BlockId -- the branch target
+ -> CmmExpr -- the condition on which to branch
+ -> NatM InstrBlock
+
+
+
+genCondJump bid bool = do
+ CondCode is_float cond code <- getCondCode bool
+ return (
+ code `appOL`
+ toOL (
+ if is_float
+ then [NOP, BF cond False bid, NOP]
+ else [BI cond False bid, NOP]
+ )
+ )
+
+
+
+-- -----------------------------------------------------------------------------
+-- Generating a table-branch
+
+genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
+genSwitch dflags expr targets
+ | positionIndependent dflags
+ = error "MachCodeGen: sparc genSwitch PIC not finished\n"
+
+ | otherwise
+ = do (e_reg, e_code) <- getSomeReg (cmmOffset dflags expr offset)
+
+ base_reg <- getNewRegNat II32
+ offset_reg <- getNewRegNat II32
+ dst <- getNewRegNat II32
+
+ label <- getNewLabelNat
+
+ return $ e_code `appOL`
+ toOL
+ [ -- load base of jump table
+ SETHI (HI (ImmCLbl label)) base_reg
+ , OR False base_reg (RIImm $ LO $ ImmCLbl label) base_reg
+
+ -- the addrs in the table are 32 bits wide..
+ , SLL e_reg (RIImm $ ImmInt 2) offset_reg
+
+ -- load and jump to the destination
+ , LD II32 (AddrRegReg base_reg offset_reg) dst
+ , JMP_TBL (AddrRegImm dst (ImmInt 0)) ids label
+ , NOP ]
+ where (offset, ids) = switchTargetsToTable targets
+
+generateJumpTableForInstr :: DynFlags -> Instr
+ -> Maybe (NatCmmDecl RawCmmStatics Instr)
+generateJumpTableForInstr dflags (JMP_TBL _ ids label) =
+ let jumpTable = map (jumpTableEntry dflags) ids
+ in Just (CmmData (Section ReadOnlyData label) (RawCmmStatics label jumpTable))
+generateJumpTableForInstr _ _ = Nothing
+
+
+
+-- -----------------------------------------------------------------------------
+-- Generating C calls
+
+{-
+ Now the biggest nightmare---calls. Most of the nastiness is buried in
+ @get_arg@, which moves the arguments to the correct registers/stack
+ locations. Apart from that, the code is easy.
+
+ The SPARC calling convention is an absolute
+ nightmare. The first 6x32 bits of arguments are mapped into
+ %o0 through %o5, and the remaining arguments are dumped to the
+ stack, beginning at [%sp+92]. (Note that %o6 == %sp.)
+
+ If we have to put args on the stack, move %o6==%sp down by
+ the number of words to go on the stack, to ensure there's enough space.
+
+ According to Fraser and Hanson's lcc book, page 478, fig 17.2,
+ 16 words above the stack pointer is a word for the address of
+ a structure return value. I use this as a temporary location
+ for moving values from float to int regs. Certainly it isn't
+ safe to put anything in the 16 words starting at %sp, since
+ this area can get trashed at any time due to window overflows
+ caused by signal handlers.
+
+ A final complication (if the above isn't enough) is that
+ we can't blithely calculate the arguments one by one into
+ %o0 .. %o5. Consider the following nested calls:
+
+ fff a (fff b c)
+
+ Naive code moves a into %o0, and (fff b c) into %o1. Unfortunately
+ the inner call will itself use %o0, which trashes the value put there
+ in preparation for the outer call. Upshot: we need to calculate the
+ args into temporary regs, and move those to arg regs or onto the
+ stack only immediately prior to the call proper. Sigh.
+-}
+
+genCCall
+ :: ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+
+
+
+-- On SPARC under TSO (Total Store Ordering), writes earlier in the instruction
+-- stream are guaranteed to become visible before later writes (unlike on PowerPC).
+-- Ref: Section 8.4 of the SPARC V9 Architecture manual.
+--
+-- In the SPARC case we don't need a barrier.
+--
+genCCall (PrimTarget MO_ReadBarrier) _ _
+ = return $ nilOL
+genCCall (PrimTarget MO_WriteBarrier) _ _
+ = return $ nilOL
+
+genCCall (PrimTarget (MO_Prefetch_Data _)) _ _
+ = return $ nilOL
+
+genCCall target dest_regs args
+ = do -- work out the arguments, and assign them to integer regs
+ argcode_and_vregs <- mapM arg_to_int_vregs args
+ let (argcodes, vregss) = unzip argcode_and_vregs
+ let vregs = concat vregss
+
+ let n_argRegs = length allArgRegs
+ let n_argRegs_used = min (length vregs) n_argRegs
+
+
+ -- deal with static vs dynamic call targets
+ callinsns <- case target of
+ ForeignTarget (CmmLit (CmmLabel lbl)) _ ->
+ return (unitOL (CALL (Left (litToImm (CmmLabel lbl))) n_argRegs_used False))
+
+ ForeignTarget expr _
+ -> do (dyn_c, dyn_rs) <- arg_to_int_vregs expr
+ let dyn_r = case dyn_rs of
+ [dyn_r'] -> dyn_r'
+ _ -> panic "SPARC.CodeGen.genCCall: arg_to_int"
+ return (dyn_c `snocOL` CALL (Right dyn_r) n_argRegs_used False)
+
+ PrimTarget mop
+ -> do res <- outOfLineMachOp mop
+ lblOrMopExpr <- case res of
+ Left lbl -> do
+ return (unitOL (CALL (Left (litToImm (CmmLabel lbl))) n_argRegs_used False))
+
+ Right mopExpr -> do
+ (dyn_c, dyn_rs) <- arg_to_int_vregs mopExpr
+ let dyn_r = case dyn_rs of
+ [dyn_r'] -> dyn_r'
+ _ -> panic "SPARC.CodeGen.genCCall: arg_to_int"
+ return (dyn_c `snocOL` CALL (Right dyn_r) n_argRegs_used False)
+
+ return lblOrMopExpr
+
+ let argcode = concatOL argcodes
+
+ let (move_sp_down, move_sp_up)
+ = let diff = length vregs - n_argRegs
+ nn = if odd diff then diff + 1 else diff -- keep 8-byte alignment
+ in if nn <= 0
+ then (nilOL, nilOL)
+ else (unitOL (moveSp (-1*nn)), unitOL (moveSp (1*nn)))
+
+ let transfer_code
+ = toOL (move_final vregs allArgRegs extraStackArgsHere)
+
+ dflags <- getDynFlags
+ return
+ $ argcode `appOL`
+ move_sp_down `appOL`
+ transfer_code `appOL`
+ callinsns `appOL`
+ unitOL NOP `appOL`
+ move_sp_up `appOL`
+ assign_code (targetPlatform dflags) dest_regs
+
+
+-- | Generate code to calculate an argument, and move it into one
+-- or two integer vregs.
+arg_to_int_vregs :: CmmExpr -> NatM (OrdList Instr, [Reg])
+arg_to_int_vregs arg = do dflags <- getDynFlags
+ arg_to_int_vregs' dflags arg
+
+arg_to_int_vregs' :: DynFlags -> CmmExpr -> NatM (OrdList Instr, [Reg])
+arg_to_int_vregs' dflags arg
+
+ -- If the expr produces a 64 bit int, then we can just use iselExpr64
+ | isWord64 (cmmExprType dflags arg)
+ = do (ChildCode64 code r_lo) <- iselExpr64 arg
+ let r_hi = getHiVRegFromLo r_lo
+ return (code, [r_hi, r_lo])
+
+ | otherwise
+ = do (src, code) <- getSomeReg arg
+ let pk = cmmExprType dflags arg
+
+ case cmmTypeFormat pk of
+
+ -- Load a 64 bit float return value into two integer regs.
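+                 -- (Each 32-bit half is bounced through the scratch word at
+                 -- spRel 16, the struct-return slot 16 words above %sp noted
+                 -- in the calling-convention comment above, because SPARC has
+                 -- no direct FP-to-integer register move.)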
+ FF64 -> do
+ v1 <- getNewRegNat II32
+ v2 <- getNewRegNat II32
+
+ let code2 =
+ code `snocOL`
+ FMOV FF64 src f0 `snocOL`
+ ST FF32 f0 (spRel 16) `snocOL`
+ LD II32 (spRel 16) v1 `snocOL`
+ ST FF32 f1 (spRel 16) `snocOL`
+ LD II32 (spRel 16) v2
+
+ return (code2, [v1,v2])
+
+ -- Load a 32 bit float return value into an integer reg
+ FF32 -> do
+ v1 <- getNewRegNat II32
+
+ let code2 =
+ code `snocOL`
+ ST FF32 src (spRel 16) `snocOL`
+ LD II32 (spRel 16) v1
+
+ return (code2, [v1])
+
+ -- Move an integer return value into its destination reg.
+ _ -> do
+ v1 <- getNewRegNat II32
+
+ let code2 =
+ code `snocOL`
+ OR False g0 (RIReg src) v1
+
+ return (code2, [v1])
+
+
+-- | Move args from the integer vregs into which they have been
+-- marshalled, into %o0 .. %o5, and the rest onto the stack.
+--
+move_final :: [Reg] -> [Reg] -> Int -> [Instr]
+
+-- all args done
+move_final [] _ _
+ = []
+
+-- out of aregs; move to stack
+move_final (v:vs) [] offset
+ = ST II32 v (spRel offset)
+ : move_final vs [] (offset+1)
+
+-- move into an arg (%o[0..5]) reg
+move_final (v:vs) (a:az) offset
+ = OR False g0 (RIReg v) a
+ : move_final vs az offset
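+
+-- A sketch (not in the original source), assuming allArgRegs is %o0..%o5 and
+-- spRel counts in 4-byte words: with eight argument vregs v1..v8,
+--      move_final [v1..v8] allArgRegs extraStackArgsHere
+-- copies v1..v6 into %o0..%o5 and stores v7 and v8 at spRel 23 and spRel 24,
+-- i.e. [%sp+92] and [%sp+96].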
+
+
+-- | Assign results returned from the call into their
+-- destination regs.
+--
+assign_code :: Platform -> [LocalReg] -> OrdList Instr
+
+assign_code _ [] = nilOL
+
+assign_code platform [dest]
+ = let rep = localRegType dest
+ width = typeWidth rep
+ r_dest = getRegisterReg platform (CmmLocal dest)
+
+ result
+ | isFloatType rep
+ , W32 <- width
+ = unitOL $ FMOV FF32 (regSingle $ fReg 0) r_dest
+
+ | isFloatType rep
+ , W64 <- width
+ = unitOL $ FMOV FF64 (regSingle $ fReg 0) r_dest
+
+ | not $ isFloatType rep
+ , W32 <- width
+ = unitOL $ mkRegRegMoveInstr platform (regSingle $ oReg 0) r_dest
+
+ | not $ isFloatType rep
+ , W64 <- width
+ , r_dest_hi <- getHiVRegFromLo r_dest
+ = toOL [ mkRegRegMoveInstr platform (regSingle $ oReg 0) r_dest_hi
+ , mkRegRegMoveInstr platform (regSingle $ oReg 1) r_dest]
+
+ | otherwise
+ = panic "SPARC.CodeGen.GenCCall: no match"
+
+ in result
+
+assign_code _ _
+ = panic "SPARC.CodeGen.GenCCall: no match"
+
+
+
+-- | Generate a call to implement an out-of-line floating point operation
+outOfLineMachOp
+ :: CallishMachOp
+ -> NatM (Either CLabel CmmExpr)
+
+outOfLineMachOp mop
+ = do let functionName
+ = outOfLineMachOp_table mop
+
+ dflags <- getDynFlags
+ mopExpr <- cmmMakeDynamicReference dflags CallReference
+ $ mkForeignLabel functionName Nothing ForeignLabelInExternalPackage IsFunction
+
+ let mopLabelOrExpr
+ = case mopExpr of
+ CmmLit (CmmLabel lbl) -> Left lbl
+ _ -> Right mopExpr
+
+ return mopLabelOrExpr
+
+
+-- | Decide what C function to use to implement a CallishMachOp
+--
+outOfLineMachOp_table
+ :: CallishMachOp
+ -> FastString
+
+outOfLineMachOp_table mop
+ = case mop of
+ MO_F32_Exp -> fsLit "expf"
+ MO_F32_ExpM1 -> fsLit "expm1f"
+ MO_F32_Log -> fsLit "logf"
+ MO_F32_Log1P -> fsLit "log1pf"
+ MO_F32_Sqrt -> fsLit "sqrtf"
+ MO_F32_Fabs -> unsupported
+ MO_F32_Pwr -> fsLit "powf"
+
+ MO_F32_Sin -> fsLit "sinf"
+ MO_F32_Cos -> fsLit "cosf"
+ MO_F32_Tan -> fsLit "tanf"
+
+ MO_F32_Asin -> fsLit "asinf"
+ MO_F32_Acos -> fsLit "acosf"
+ MO_F32_Atan -> fsLit "atanf"
+
+ MO_F32_Sinh -> fsLit "sinhf"
+ MO_F32_Cosh -> fsLit "coshf"
+ MO_F32_Tanh -> fsLit "tanhf"
+
+ MO_F32_Asinh -> fsLit "asinhf"
+ MO_F32_Acosh -> fsLit "acoshf"
+ MO_F32_Atanh -> fsLit "atanhf"
+
+ MO_F64_Exp -> fsLit "exp"
+ MO_F64_ExpM1 -> fsLit "expm1"
+ MO_F64_Log -> fsLit "log"
+ MO_F64_Log1P -> fsLit "log1p"
+ MO_F64_Sqrt -> fsLit "sqrt"
+ MO_F64_Fabs -> unsupported
+ MO_F64_Pwr -> fsLit "pow"
+
+ MO_F64_Sin -> fsLit "sin"
+ MO_F64_Cos -> fsLit "cos"
+ MO_F64_Tan -> fsLit "tan"
+
+ MO_F64_Asin -> fsLit "asin"
+ MO_F64_Acos -> fsLit "acos"
+ MO_F64_Atan -> fsLit "atan"
+
+ MO_F64_Sinh -> fsLit "sinh"
+ MO_F64_Cosh -> fsLit "cosh"
+ MO_F64_Tanh -> fsLit "tanh"
+
+ MO_F64_Asinh -> fsLit "asinh"
+ MO_F64_Acosh -> fsLit "acosh"
+ MO_F64_Atanh -> fsLit "atanh"
+
+ MO_UF_Conv w -> fsLit $ word2FloatLabel w
+
+ MO_Memcpy _ -> fsLit "memcpy"
+ MO_Memset _ -> fsLit "memset"
+ MO_Memmove _ -> fsLit "memmove"
+ MO_Memcmp _ -> fsLit "memcmp"
+
+ MO_BSwap w -> fsLit $ bSwapLabel w
+ MO_BRev w -> fsLit $ bRevLabel w
+ MO_PopCnt w -> fsLit $ popCntLabel w
+ MO_Pdep w -> fsLit $ pdepLabel w
+ MO_Pext w -> fsLit $ pextLabel w
+ MO_Clz w -> fsLit $ clzLabel w
+ MO_Ctz w -> fsLit $ ctzLabel w
+ MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
+ MO_Cmpxchg w -> fsLit $ cmpxchgLabel w
+ MO_AtomicRead w -> fsLit $ atomicReadLabel w
+ MO_AtomicWrite w -> fsLit $ atomicWriteLabel w
+
+ MO_S_Mul2 {} -> unsupported
+ MO_S_QuotRem {} -> unsupported
+ MO_U_QuotRem {} -> unsupported
+ MO_U_QuotRem2 {} -> unsupported
+ MO_Add2 {} -> unsupported
+ MO_AddWordC {} -> unsupported
+ MO_SubWordC {} -> unsupported
+ MO_AddIntC {} -> unsupported
+ MO_SubIntC {} -> unsupported
+ MO_U_Mul2 {} -> unsupported
+ MO_ReadBarrier -> unsupported
+ MO_WriteBarrier -> unsupported
+ MO_Touch -> unsupported
+ (MO_Prefetch_Data _) -> unsupported
+ where unsupported = panic ("outOfLineCmmOp: " ++ show mop
+ ++ " not supported here")
+
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs
new file mode 100644
index 0000000000..d6c9d7b360
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs
@@ -0,0 +1,74 @@
+module GHC.CmmToAsm.SPARC.CodeGen.Amode (
+ getAmode
+)
+
+where
+
+import GhcPrelude
+
+import {-# SOURCE #-} GHC.CmmToAsm.SPARC.CodeGen.Gen32
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Format
+
+import GHC.Cmm
+
+import OrdList
+
+
+-- | Generate code to reference a memory address.
+getAmode
+ :: CmmExpr -- ^ expr producing an address
+ -> NatM Amode
+
+getAmode tree@(CmmRegOff _ _)
+ = do dflags <- getDynFlags
+ getAmode (mangleIndexTree dflags tree)
+
+getAmode (CmmMachOp (MO_Sub _) [x, CmmLit (CmmInt i _)])
+ | fits13Bits (-i)
+ = do
+ (reg, code) <- getSomeReg x
+ let
+ off = ImmInt (-(fromInteger i))
+ return (Amode (AddrRegImm reg off) code)
+
+
+getAmode (CmmMachOp (MO_Add _) [x, CmmLit (CmmInt i _)])
+ | fits13Bits i
+ = do
+ (reg, code) <- getSomeReg x
+ let
+ off = ImmInt (fromInteger i)
+ return (Amode (AddrRegImm reg off) code)
+
+getAmode (CmmMachOp (MO_Add _) [x, y])
+ = do
+ (regX, codeX) <- getSomeReg x
+ (regY, codeY) <- getSomeReg y
+ let
+ code = codeX `appOL` codeY
+ return (Amode (AddrRegReg regX regY) code)
+
+getAmode (CmmLit lit)
+ = do
+ let imm__2 = litToImm lit
+ tmp1 <- getNewRegNat II32
+ tmp2 <- getNewRegNat II32
+
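+       -- SETHI materialises the high 22 bits of the constant and the OR fills
+       -- in the low 10 bits; SPARC has no single 32-bit load-immediate instruction.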
+ let code = toOL [ SETHI (HI imm__2) tmp1
+ , OR False tmp1 (RIImm (LO imm__2)) tmp2]
+
+ return (Amode (AddrRegReg tmp2 g0) code)
+
+getAmode other
+ = do
+ (reg, code) <- getSomeReg other
+ let
+ off = ImmInt 0
+ return (Amode (AddrRegImm reg off) code)
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs
new file mode 100644
index 0000000000..cf249303e4
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs
@@ -0,0 +1,119 @@
+module GHC.CmmToAsm.SPARC.CodeGen.Base (
+ InstrBlock,
+ CondCode(..),
+ ChildCode64(..),
+ Amode(..),
+
+ Register(..),
+ setFormatOfRegister,
+
+ getRegisterReg,
+ mangleIndexTree
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Cond
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+
+import GHC.Platform.Regs
+import GHC.Driver.Session
+import GHC.Cmm
+import GHC.Cmm.Ppr.Expr () -- For Outputable instances
+import GHC.Platform
+
+import Outputable
+import OrdList
+
+--------------------------------------------------------------------------------
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+-- They are really trees of insns to facilitate fast appending, where a
+-- left-to-right traversal yields the insns in the correct order.
+--
+type InstrBlock
+ = OrdList Instr
+
+
+-- | Condition codes passed up the tree.
+--
+data CondCode
+ = CondCode Bool Cond InstrBlock
+
+
+-- | a.k.a "Register64"
+-- Reg is the lower 32-bit temporary which contains the result.
+-- Use getHiVRegFromLo to find the other VRegUnique.
+--
+--      The rule of this simplified insn selection game is therefore that
+--      the returned Reg may be modified.
+--
+data ChildCode64
+ = ChildCode64
+ InstrBlock
+ Reg
+
+
+-- | Holds code that references a memory address.
+data Amode
+ = Amode
+ -- the AddrMode we can use in the instruction
+ -- that does the real load\/store.
+ AddrMode
+
+ -- other setup code we have to run first before we can use the
+ -- above AddrMode.
+ InstrBlock
+
+
+
+--------------------------------------------------------------------------------
+-- | Code to produce a result into a register.
+-- If the result must go in a specific register, it comes out as Fixed.
+-- Otherwise, the parent can decide which register to put it in.
+--
+data Register
+ = Fixed Format Reg InstrBlock
+ | Any Format (Reg -> InstrBlock)
+
+
+-- | Change the format field in a Register.
+setFormatOfRegister
+ :: Register -> Format -> Register
+
+setFormatOfRegister reg format
+ = case reg of
+ Fixed _ reg code -> Fixed format reg code
+ Any _ codefn -> Any format codefn
+
+
+--------------------------------------------------------------------------------
+-- | Grab the Reg for a CmmReg
+getRegisterReg :: Platform -> CmmReg -> Reg
+
+getRegisterReg _ (CmmLocal (LocalReg u pk))
+ = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk)
+
+getRegisterReg platform (CmmGlobal mid)
+ = case globalRegMaybe platform mid of
+ Just reg -> RegReal reg
+ Nothing -> pprPanic
+ "SPARC.CodeGen.Base.getRegisterReg: global is in memory"
+ (ppr $ CmmGlobal mid)
+
+
+-- Expand CmmRegOff. ToDo: should we do it this way around, or convert
+-- CmmExprs into CmmRegOff?
+mangleIndexTree :: DynFlags -> CmmExpr -> CmmExpr
+
+mangleIndexTree dflags (CmmRegOff reg off)
+ = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
+ where width = typeWidth (cmmRegType dflags reg)
+
+mangleIndexTree _ _
+ = panic "SPARC.CodeGen.Base.mangleIndexTree: no match"
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs
new file mode 100644
index 0000000000..e501d799f2
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs
@@ -0,0 +1,110 @@
+module GHC.CmmToAsm.SPARC.CodeGen.CondCode (
+ getCondCode,
+ condIntCode,
+ condFltCode
+)
+
+where
+
+import GhcPrelude
+
+import {-# SOURCE #-} GHC.CmmToAsm.SPARC.CodeGen.Gen32
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Cond
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Format
+
+import GHC.Cmm
+
+import OrdList
+import Outputable
+
+
+getCondCode :: CmmExpr -> NatM CondCode
+getCondCode (CmmMachOp mop [x, y])
+ =
+ case mop of
+ MO_F_Eq W32 -> condFltCode EQQ x y
+ MO_F_Ne W32 -> condFltCode NE x y
+ MO_F_Gt W32 -> condFltCode GTT x y
+ MO_F_Ge W32 -> condFltCode GE x y
+ MO_F_Lt W32 -> condFltCode LTT x y
+ MO_F_Le W32 -> condFltCode LE x y
+
+ MO_F_Eq W64 -> condFltCode EQQ x y
+ MO_F_Ne W64 -> condFltCode NE x y
+ MO_F_Gt W64 -> condFltCode GTT x y
+ MO_F_Ge W64 -> condFltCode GE x y
+ MO_F_Lt W64 -> condFltCode LTT x y
+ MO_F_Le W64 -> condFltCode LE x y
+
+ MO_Eq _ -> condIntCode EQQ x y
+ MO_Ne _ -> condIntCode NE x y
+
+ MO_S_Gt _ -> condIntCode GTT x y
+ MO_S_Ge _ -> condIntCode GE x y
+ MO_S_Lt _ -> condIntCode LTT x y
+ MO_S_Le _ -> condIntCode LE x y
+
+ MO_U_Gt _ -> condIntCode GU x y
+ MO_U_Ge _ -> condIntCode GEU x y
+ MO_U_Lt _ -> condIntCode LU x y
+ MO_U_Le _ -> condIntCode LEU x y
+
+ _ -> pprPanic "SPARC.CodeGen.CondCode.getCondCode" (ppr (CmmMachOp mop [x,y]))
+
+getCondCode other = pprPanic "SPARC.CodeGen.CondCode.getCondCode" (ppr other)
+
+
+
+
+
+-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
+-- passed back up the tree.
+
+condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+condIntCode cond x (CmmLit (CmmInt y _))
+ | fits13Bits y
+ = do
+ (src1, code) <- getSomeReg x
+ let
+ src2 = ImmInt (fromInteger y)
+ code' = code `snocOL` SUB False True src1 (RIImm src2) g0
+ return (CondCode False cond code')
+
+condIntCode cond x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ code__2 = code1 `appOL` code2 `snocOL`
+ SUB False True src1 (RIReg src2) g0
+ return (CondCode False cond code__2)
+
+
+condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+condFltCode cond x y = do
+ dflags <- getDynFlags
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ tmp <- getNewRegNat FF64
+ let
+ promote x = FxTOy FF32 FF64 x tmp
+
+ pk1 = cmmExprType dflags x
+ pk2 = cmmExprType dflags y
+
+ code__2 =
+ if pk1 `cmmEqType` pk2 then
+ code1 `appOL` code2 `snocOL`
+ FCMP True (cmmTypeFormat pk1) src1 src2
+ else if typeWidth pk1 == W32 then
+ code1 `snocOL` promote src1 `appOL` code2 `snocOL`
+ FCMP True FF64 tmp src2
+ else
+ code1 `appOL` code2 `snocOL` promote src2 `snocOL`
+ FCMP True FF64 src1 tmp
+ return (CondCode True cond code__2)
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs
new file mode 100644
index 0000000000..77732cf70c
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs
@@ -0,0 +1,156 @@
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-- | Expand out synthetic instructions into single machine instrs.
+module GHC.CmmToAsm.SPARC.CodeGen.Expand (
+ expandTop
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Format
+import GHC.Cmm
+
+
+import Outputable
+import OrdList
+
+-- | Expand out synthetic instructions in this top level thing
+expandTop :: NatCmmDecl RawCmmStatics Instr -> NatCmmDecl RawCmmStatics Instr
+expandTop top@(CmmData{})
+ = top
+
+expandTop (CmmProc info lbl live (ListGraph blocks))
+ = CmmProc info lbl live (ListGraph $ map expandBlock blocks)
+
+
+-- | Expand out synthetic instructions in this block
+expandBlock :: NatBasicBlock Instr -> NatBasicBlock Instr
+
+expandBlock (BasicBlock label instrs)
+ = let instrs_ol = expandBlockInstrs instrs
+ instrs' = fromOL instrs_ol
+ in BasicBlock label instrs'
+
+
+-- | Expand out some instructions
+expandBlockInstrs :: [Instr] -> OrdList Instr
+expandBlockInstrs [] = nilOL
+
+expandBlockInstrs (ii:is)
+ = let ii_doubleRegs = remapRegPair ii
+ is_misaligned = expandMisalignedDoubles ii_doubleRegs
+
+ in is_misaligned `appOL` expandBlockInstrs is
+
+
+
+-- | In the SPARC instruction set the FP register pairs that are used
+-- to hold 64 bit floats are referred to by just the first reg
+-- of the pair. Remap our internal reg pairs to the appropriate reg.
+--
+-- For example:
+-- ldd [%l1], (%f0 | %f1)
+--
+-- gets mapped to
+--          ldd [%l1], %f0
+--
+remapRegPair :: Instr -> Instr
+remapRegPair instr
+ = let patchF reg
+ = case reg of
+ RegReal (RealRegSingle _)
+ -> reg
+
+ RegReal (RealRegPair r1 r2)
+
+ -- sanity checking
+ | r1 >= 32
+ , r1 <= 63
+ , r1 `mod` 2 == 0
+ , r2 == r1 + 1
+ -> RegReal (RealRegSingle r1)
+
+ | otherwise
+ -> pprPanic "SPARC.CodeGen.Expand: not remapping dodgy looking reg pair " (ppr reg)
+
+ RegVirtual _
+ -> pprPanic "SPARC.CodeGen.Expand: not remapping virtual reg " (ppr reg)
+
+ in patchRegsOfInstr instr patchF
+
+
+
+
+-- Expand out 64 bit load/stores into individual instructions to handle
+-- possible double alignment problems.
+--
+-- TODO: It'd be better to use a scratch reg instead of the add/sub thing.
+-- We might be able to do this faster if we use the UA2007 instr set
+-- instead of restricting ourselves to SPARC V9.
+--
+expandMisalignedDoubles :: Instr -> OrdList Instr
+expandMisalignedDoubles instr
+
+ -- Translate to:
+ -- add g1,g2,g1
+ -- ld [g1],%fn
+ -- ld [g1+4],%f(n+1)
+ -- sub g1,g2,g1 -- to restore g1
+ | LD FF64 (AddrRegReg r1 r2) fReg <- instr
+ = toOL [ ADD False False r1 (RIReg r2) r1
+ , LD FF32 (AddrRegReg r1 g0) fReg
+ , LD FF32 (AddrRegImm r1 (ImmInt 4)) (fRegHi fReg)
+ , SUB False False r1 (RIReg r2) r1 ]
+
+ -- Translate to
+ -- ld [addr],%fn
+ -- ld [addr+4],%f(n+1)
+ | LD FF64 addr fReg <- instr
+ = let Just addr' = addrOffset addr 4
+ in toOL [ LD FF32 addr fReg
+ , LD FF32 addr' (fRegHi fReg) ]
+
+ -- Translate to:
+ -- add g1,g2,g1
+ -- st %fn,[g1]
+ -- st %f(n+1),[g1+4]
+ -- sub g1,g2,g1 -- to restore g1
+ | ST FF64 fReg (AddrRegReg r1 r2) <- instr
+ = toOL [ ADD False False r1 (RIReg r2) r1
+ , ST FF32 fReg (AddrRegReg r1 g0)
+ , ST FF32 (fRegHi fReg) (AddrRegImm r1 (ImmInt 4))
+ , SUB False False r1 (RIReg r2) r1 ]
+
+ -- Translate to
+        --      st  %fn,[addr]
+        --      st  %f(n+1),[addr+4]
+ | ST FF64 fReg addr <- instr
+ = let Just addr' = addrOffset addr 4
+ in toOL [ ST FF32 fReg addr
+ , ST FF32 (fRegHi fReg) addr' ]
+
+ -- some other instr
+ | otherwise
+ = unitOL instr
+
+
+
+-- | The high partner for this float reg.
+fRegHi :: Reg -> Reg
+fRegHi (RegReal (RealRegSingle r1))
+ | r1 >= 32
+ , r1 <= 63
+ , r1 `mod` 2 == 0
+ = (RegReal $ RealRegSingle (r1 + 1))
+
+-- Can't take high partner for non-low reg.
+fRegHi reg
+ = pprPanic "SPARC.CodeGen.Expand: can't take fRegHi from " (ppr reg)
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs
new file mode 100644
index 0000000000..ee67bd4a9d
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs
@@ -0,0 +1,692 @@
+-- | Evaluation of 32 bit values.
+module GHC.CmmToAsm.SPARC.CodeGen.Gen32 (
+ getSomeReg,
+ getRegister
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.CodeGen.CondCode
+import GHC.CmmToAsm.SPARC.CodeGen.Amode
+import GHC.CmmToAsm.SPARC.CodeGen.Gen64
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.SPARC.Stack
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Cond
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+
+import GHC.Cmm
+
+import Control.Monad (liftM)
+import GHC.Driver.Session
+import OrdList
+import Outputable
+
+-- | The dual to getAnyReg: compute an expression into a register, but
+-- we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getSomeReg expr = do
+ r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed _ reg code ->
+ return (reg, code)
+
+
+
+-- | Make code to evaluate a 32 bit expression.
+--
+getRegister :: CmmExpr -> NatM Register
+
+getRegister (CmmReg reg)
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ return (Fixed (cmmTypeFormat (cmmRegType dflags reg))
+ (getRegisterReg platform reg) nilOL)
+
+getRegister tree@(CmmRegOff _ _)
+ = do dflags <- getDynFlags
+ getRegister (mangleIndexTree dflags tree)
+
+getRegister (CmmMachOp (MO_UU_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister (CmmMachOp (MO_SS_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister (CmmMachOp (MO_UU_Conv W64 W32) [x]) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister (CmmMachOp (MO_SS_Conv W64 W32) [x]) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+
+-- Load a literal float into a float register.
+-- The actual literal is stored in a new data area, and we load it
+-- at runtime.
+getRegister (CmmLit (CmmFloat f W32)) = do
+
+ -- a label for the new data area
+ lbl <- getNewLabelNat
+ tmp <- getNewRegNat II32
+
+ let code dst = toOL [
+ -- the data area
+ LDATA (Section ReadOnlyData lbl) $ RawCmmStatics lbl
+ [CmmStaticLit (CmmFloat f W32)],
+
+ -- load the literal
+ SETHI (HI (ImmCLbl lbl)) tmp,
+ LD II32 (AddrRegImm tmp (LO (ImmCLbl lbl))) dst]
+
+ return (Any FF32 code)
+
+getRegister (CmmLit (CmmFloat d W64)) = do
+ lbl <- getNewLabelNat
+ tmp <- getNewRegNat II32
+ let code dst = toOL [
+ LDATA (Section ReadOnlyData lbl) $ RawCmmStatics lbl
+ [CmmStaticLit (CmmFloat d W64)],
+ SETHI (HI (ImmCLbl lbl)) tmp,
+ LD II64 (AddrRegImm tmp (LO (ImmCLbl lbl))) dst]
+ return (Any FF64 code)
+
+
+-- Unary machine ops
+getRegister (CmmMachOp mop [x])
+ = case mop of
+ -- Floating point negation -------------------------
+ MO_F_Neg W32 -> trivialUFCode FF32 (FNEG FF32) x
+ MO_F_Neg W64 -> trivialUFCode FF64 (FNEG FF64) x
+
+
+ -- Integer negation --------------------------------
+ MO_S_Neg rep -> trivialUCode (intFormat rep) (SUB False False g0) x
+ MO_Not rep -> trivialUCode (intFormat rep) (XNOR False g0) x
+
+
+ -- Float word size conversion ----------------------
+ MO_FF_Conv W64 W32 -> coerceDbl2Flt x
+ MO_FF_Conv W32 W64 -> coerceFlt2Dbl x
+
+
+ -- Float <-> Signed Int conversion -----------------
+ MO_FS_Conv from to -> coerceFP2Int from to x
+ MO_SF_Conv from to -> coerceInt2FP from to x
+
+
+ -- Unsigned integer word size conversions ----------
+
+ -- If it's the same size, then nothing needs to be done.
+ MO_UU_Conv from to
+ | from == to -> conversionNop (intFormat to) x
+
+ -- To narrow an unsigned word, mask out the high bits to simulate what would
+ -- happen if we copied the value into a smaller register.
+ MO_UU_Conv W16 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8))
+ MO_UU_Conv W32 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8))
+
+          -- For narrowing 32 bit to 16 bit, don't use a literal mask value as in the
+          -- W16->W8 case, because the only way we can load it is via SETHI, which needs 2 ops.
+ -- Do some shifts to chop out the high bits instead.
+ MO_UU_Conv W32 W16
+ -> do tmpReg <- getNewRegNat II32
+ (xReg, xCode) <- getSomeReg x
+ let code dst
+ = xCode
+ `appOL` toOL
+ [ SLL xReg (RIImm $ ImmInt 16) tmpReg
+ , SRL tmpReg (RIImm $ ImmInt 16) dst]
+
+ return $ Any II32 code
+
+ -- trivialCode W16 (AND False) x (CmmLit (CmmInt 65535 W16))
+
+ -- To widen an unsigned word we don't have to do anything.
+ -- Just leave it in the same register and mark the result as the new size.
+ MO_UU_Conv W8 W16 -> conversionNop (intFormat W16) x
+ MO_UU_Conv W8 W32 -> conversionNop (intFormat W32) x
+ MO_UU_Conv W16 W32 -> conversionNop (intFormat W32) x
+
+
+ -- Signed integer word size conversions ------------
+
+ -- Mask out high bits when narrowing them
+ MO_SS_Conv W16 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8))
+ MO_SS_Conv W32 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8))
+ MO_SS_Conv W32 W16 -> trivialCode W16 (AND False) x (CmmLit (CmmInt 65535 W16))
+
+ -- Sign extend signed words when widening them.
+ MO_SS_Conv W8 W16 -> integerExtend W8 W16 x
+ MO_SS_Conv W8 W32 -> integerExtend W8 W32 x
+ MO_SS_Conv W16 W32 -> integerExtend W16 W32 x
+
+ _ -> panic ("Unknown unary mach op: " ++ show mop)
+
+
+-- Binary machine ops
+getRegister (CmmMachOp mop [x, y])
+ = case mop of
+ MO_Eq _ -> condIntReg EQQ x y
+ MO_Ne _ -> condIntReg NE x y
+
+ MO_S_Gt _ -> condIntReg GTT x y
+ MO_S_Ge _ -> condIntReg GE x y
+ MO_S_Lt _ -> condIntReg LTT x y
+ MO_S_Le _ -> condIntReg LE x y
+
+ MO_U_Gt W32 -> condIntReg GU x y
+ MO_U_Ge W32 -> condIntReg GEU x y
+ MO_U_Lt W32 -> condIntReg LU x y
+ MO_U_Le W32 -> condIntReg LEU x y
+
+ MO_U_Gt W16 -> condIntReg GU x y
+ MO_U_Ge W16 -> condIntReg GEU x y
+ MO_U_Lt W16 -> condIntReg LU x y
+ MO_U_Le W16 -> condIntReg LEU x y
+
+ MO_Add W32 -> trivialCode W32 (ADD False False) x y
+ MO_Sub W32 -> trivialCode W32 (SUB False False) x y
+
+ MO_S_MulMayOflo rep -> imulMayOflo rep x y
+
+ MO_S_Quot W32 -> idiv True False x y
+ MO_U_Quot W32 -> idiv False False x y
+
+ MO_S_Rem W32 -> irem True x y
+ MO_U_Rem W32 -> irem False x y
+
+ MO_F_Eq _ -> condFltReg EQQ x y
+ MO_F_Ne _ -> condFltReg NE x y
+
+ MO_F_Gt _ -> condFltReg GTT x y
+ MO_F_Ge _ -> condFltReg GE x y
+ MO_F_Lt _ -> condFltReg LTT x y
+ MO_F_Le _ -> condFltReg LE x y
+
+ MO_F_Add w -> trivialFCode w FADD x y
+ MO_F_Sub w -> trivialFCode w FSUB x y
+ MO_F_Mul w -> trivialFCode w FMUL x y
+ MO_F_Quot w -> trivialFCode w FDIV x y
+
+ MO_And rep -> trivialCode rep (AND False) x y
+ MO_Or rep -> trivialCode rep (OR False) x y
+ MO_Xor rep -> trivialCode rep (XOR False) x y
+
+ MO_Mul rep -> trivialCode rep (SMUL False) x y
+
+ MO_Shl rep -> trivialCode rep SLL x y
+ MO_U_Shr rep -> trivialCode rep SRL x y
+ MO_S_Shr rep -> trivialCode rep SRA x y
+
+ _ -> pprPanic "getRegister(sparc) - binary CmmMachOp (1)" (pprMachOp mop)
+
+getRegister (CmmLoad mem pk) = do
+ Amode src code <- getAmode mem
+ let
+ code__2 dst = code `snocOL` LD (cmmTypeFormat pk) src dst
+ return (Any (cmmTypeFormat pk) code__2)
+
+getRegister (CmmLit (CmmInt i _))
+ | fits13Bits i
+ = let
+ src = ImmInt (fromInteger i)
+ code dst = unitOL (OR False g0 (RIImm src) dst)
+ in
+ return (Any II32 code)
+
+getRegister (CmmLit lit)
+ = let imm = litToImm lit
+ code dst = toOL [
+ SETHI (HI imm) dst,
+ OR False dst (RIImm (LO imm)) dst]
+ in return (Any II32 code)
+
+
+getRegister _
+ = panic "SPARC.CodeGen.Gen32.getRegister: no match"
+
+
+-- | sign extend and widen
+integerExtend
+ :: Width -- ^ width of source expression
+ -> Width -- ^ width of result
+ -> CmmExpr -- ^ source expression
+ -> NatM Register
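+-- For example, sign-extending an 8-bit value to 32 bits shifts left by 24 and
+-- then arithmetic-shifts right by 24: 0x000000ff becomes 0xff000000 and then
+-- 0xffffffff (i.e. -1), while 0x0000007f round-trips back to 0x0000007f.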
+
+integerExtend from to expr
+ = do -- load the expr into some register
+ (reg, e_code) <- getSomeReg expr
+ tmp <- getNewRegNat II32
+ let bitCount
+ = case (from, to) of
+ (W8, W32) -> 24
+ (W16, W32) -> 16
+ (W8, W16) -> 24
+ _ -> panic "SPARC.CodeGen.Gen32: no match"
+ let code dst
+ = e_code
+
+ -- local shift word left to load the sign bit
+ `snocOL` SLL reg (RIImm (ImmInt bitCount)) tmp
+
+ -- arithmetic shift right to sign extend
+ `snocOL` SRA tmp (RIImm (ImmInt bitCount)) dst
+
+ return (Any (intFormat to) code)
+
+
+-- | For no-op word format conversions we set the resulting value to have the
+-- required format, but don't need to generate any actual code.
+--
+conversionNop
+ :: Format -> CmmExpr -> NatM Register
+
+conversionNop new_rep expr
+ = do e_code <- getRegister expr
+ return (setFormatOfRegister e_code new_rep)
+
+
+
+-- | Generate an integer division instruction.
+idiv :: Bool -> Bool -> CmmExpr -> CmmExpr -> NatM Register
+
+-- For unsigned division with a 32 bit numerator,
+-- we can just clear the Y register.
+idiv False cc x y
+ = do
+ (a_reg, a_code) <- getSomeReg x
+ (b_reg, b_code) <- getSomeReg y
+
+ let code dst
+ = a_code
+ `appOL` b_code
+ `appOL` toOL
+ [ WRY g0 g0
+ , UDIV cc a_reg (RIReg b_reg) dst]
+
+ return (Any II32 code)
+
+
+-- For _signed_ division with a 32 bit numerator,
+-- we have to sign extend the numerator into the Y register.
+idiv True cc x y
+ = do
+ (a_reg, a_code) <- getSomeReg x
+ (b_reg, b_code) <- getSomeReg y
+
+ tmp <- getNewRegNat II32
+
+ let code dst
+ = a_code
+ `appOL` b_code
+ `appOL` toOL
+ [ SRA a_reg (RIImm (ImmInt 16)) tmp -- sign extend
+ , SRA tmp (RIImm (ImmInt 16)) tmp
+
+ , WRY tmp g0
+ , SDIV cc a_reg (RIReg b_reg) dst]
+
+ return (Any II32 code)
+
+
+-- | Do an integer remainder.
+--
+-- NOTE: The SPARC v8 architecture manual says that integer division
+-- instructions _may_ generate a remainder, depending on the implementation.
+-- If so, it is _recommended_ that the remainder is placed in the Y register.
+--
+-- The UltraSPARC 2007 manual says Y is _undefined_ after division.
+--
+-- The SPARC T2 doesn't store the remainder, not sure about the others.
+-- It's probably best not to worry about it, and just generate our own
+-- remainders.
+--
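+-- In other words, we synthesise the remainder ourselves as
+--      rem = a - (a `quot` b) * b
+-- using a divide, a multiply and a subtract.
+--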
+irem :: Bool -> CmmExpr -> CmmExpr -> NatM Register
+
+-- For unsigned operands:
+-- Division is between a 64 bit numerator and a 32 bit denominator,
+-- so we still have to clear the Y register.
+irem False x y
+ = do
+ (a_reg, a_code) <- getSomeReg x
+ (b_reg, b_code) <- getSomeReg y
+
+ tmp_reg <- getNewRegNat II32
+
+ let code dst
+ = a_code
+ `appOL` b_code
+ `appOL` toOL
+ [ WRY g0 g0
+ , UDIV False a_reg (RIReg b_reg) tmp_reg
+ , UMUL False tmp_reg (RIReg b_reg) tmp_reg
+ , SUB False False a_reg (RIReg tmp_reg) dst]
+
+ return (Any II32 code)
+
+
+
+-- For signed operands:
+-- Make sure to sign extend into the Y register, or the remainder
+-- will have the wrong sign when the numerator is negative.
+--
+-- TODO: When sign extending, GCC only shifts the a_reg right by 17 bits,
+--       not the full 32. Not sure why this is; perhaps something to do with overflow?
+-- If anyone cares enough about the speed of signed remainder they
+-- can work it out themselves (then tell me). -- BL 2009/01/20
+irem True x y
+ = do
+ (a_reg, a_code) <- getSomeReg x
+ (b_reg, b_code) <- getSomeReg y
+
+ tmp1_reg <- getNewRegNat II32
+ tmp2_reg <- getNewRegNat II32
+
+ let code dst
+ = a_code
+ `appOL` b_code
+ `appOL` toOL
+ [ SRA a_reg (RIImm (ImmInt 16)) tmp1_reg -- sign extend
+ , SRA tmp1_reg (RIImm (ImmInt 16)) tmp1_reg -- sign extend
+ , WRY tmp1_reg g0
+
+ , SDIV False a_reg (RIReg b_reg) tmp2_reg
+ , SMUL False tmp2_reg (RIReg b_reg) tmp2_reg
+ , SUB False False a_reg (RIReg tmp2_reg) dst]
+
+ return (Any II32 code)
+
+
+imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
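+-- A note on the check below (not from the original source): SMUL leaves the
+-- low 32 bits of the product in res_lo and the high 32 bits in the Y register
+-- (read back with RDY). If the multiply did not overflow a signed 32-bit
+-- result, the high word equals the arithmetic right shift of the low word by
+-- 31, so the final SUB yields zero exactly when there was no overflow.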
+imulMayOflo rep a b
+ = do
+ (a_reg, a_code) <- getSomeReg a
+ (b_reg, b_code) <- getSomeReg b
+ res_lo <- getNewRegNat II32
+ res_hi <- getNewRegNat II32
+
+ let shift_amt = case rep of
+ W32 -> 31
+ W64 -> 63
+ _ -> panic "shift_amt"
+
+ let code dst = a_code `appOL` b_code `appOL`
+ toOL [
+ SMUL False a_reg (RIReg b_reg) res_lo,
+ RDY res_hi,
+ SRA res_lo (RIImm (ImmInt shift_amt)) res_lo,
+ SUB False False res_lo (RIReg res_hi) dst
+ ]
+ return (Any II32 code)
+
+
+-- -----------------------------------------------------------------------------
+-- 'trivial*Code': deal with trivial instructions
+
+-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
+-- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions.
+-- Only look for constants on the right hand side, because that's
+-- where the generic optimizer will have put them.
+
+-- Similarly, for unary instructions, we don't have to worry about
+-- matching an StInt as the argument, because genericOpt will already
+-- have handled the constant-folding.
+
+trivialCode
+ :: Width
+ -> (Reg -> RI -> Reg -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+
+trivialCode _ instr x (CmmLit (CmmInt y _))
+ | fits13Bits y
+ = do
+ (src1, code) <- getSomeReg x
+ let
+ src2 = ImmInt (fromInteger y)
+ code__2 dst = code `snocOL` instr src1 (RIImm src2) dst
+ return (Any II32 code__2)
+
+
+trivialCode _ instr x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ code__2 dst = code1 `appOL` code2 `snocOL`
+ instr src1 (RIReg src2) dst
+ return (Any II32 code__2)
+
+
+trivialFCode
+ :: Width
+ -> (Format -> Reg -> Reg -> Reg -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+
+trivialFCode pk instr x y = do
+ dflags <- getDynFlags
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ tmp <- getNewRegNat FF64
+ let
+ promote x = FxTOy FF32 FF64 x tmp
+
+ pk1 = cmmExprType dflags x
+ pk2 = cmmExprType dflags y
+
+ code__2 dst =
+ if pk1 `cmmEqType` pk2 then
+ code1 `appOL` code2 `snocOL`
+ instr (floatFormat pk) src1 src2 dst
+ else if typeWidth pk1 == W32 then
+ code1 `snocOL` promote src1 `appOL` code2 `snocOL`
+ instr FF64 tmp src2 dst
+ else
+ code1 `appOL` code2 `snocOL` promote src2 `snocOL`
+ instr FF64 src1 tmp dst
+ return (Any (cmmTypeFormat $ if pk1 `cmmEqType` pk2 then pk1 else cmmFloat W64)
+ code__2)
+
+
+
+trivialUCode
+ :: Format
+ -> (RI -> Reg -> Instr)
+ -> CmmExpr
+ -> NatM Register
+
+trivialUCode format instr x = do
+ (src, code) <- getSomeReg x
+ let
+ code__2 dst = code `snocOL` instr (RIReg src) dst
+ return (Any format code__2)
+
+
+trivialUFCode
+ :: Format
+ -> (Reg -> Reg -> Instr)
+ -> CmmExpr
+ -> NatM Register
+
+trivialUFCode pk instr x = do
+ (src, code) <- getSomeReg x
+ let
+ code__2 dst = code `snocOL` instr src dst
+ return (Any pk code__2)
+
+
+
+
+-- Coercions -------------------------------------------------------------------
+
+-- | Coerce a integer value to floating point
+coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
+coerceInt2FP width1 width2 x = do
+ (src, code) <- getSomeReg x
+ let
+ code__2 dst = code `appOL` toOL [
+ ST (intFormat width1) src (spRel (-2)),
+ LD (intFormat width1) (spRel (-2)) dst,
+ FxTOy (intFormat width1) (floatFormat width2) dst dst]
+ return (Any (floatFormat $ width2) code__2)
+
+
+
+-- | Coerce a floating point value to integer
+--
+-- NOTE: On sparc v9 there are no instructions to move a value from an
+-- FP register directly to an int register, so we have to use a load/store.
+--
+coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
+coerceFP2Int width1 width2 x
+ = do let fformat1 = floatFormat width1
+ fformat2 = floatFormat width2
+
+ iformat2 = intFormat width2
+
+ (fsrc, code) <- getSomeReg x
+ fdst <- getNewRegNat fformat2
+
+ let code2 dst
+ = code
+ `appOL` toOL
+ -- convert float to int format, leaving it in a float reg.
+ [ FxTOy fformat1 iformat2 fsrc fdst
+
+ -- store the int into mem, then load it back to move
+ -- it into an actual int reg.
+ , ST fformat2 fdst (spRel (-2))
+ , LD iformat2 (spRel (-2)) dst]
+
+ return (Any iformat2 code2)
+
+
+-- | Coerce a double precision floating point value to single precision.
+coerceDbl2Flt :: CmmExpr -> NatM Register
+coerceDbl2Flt x = do
+ (src, code) <- getSomeReg x
+ return (Any FF32 (\dst -> code `snocOL` FxTOy FF64 FF32 src dst))
+
+
+-- | Coerce a single precision floating point value to double precision
+coerceFlt2Dbl :: CmmExpr -> NatM Register
+coerceFlt2Dbl x = do
+ (src, code) <- getSomeReg x
+ return (Any FF64 (\dst -> code `snocOL` FxTOy FF32 FF64 src dst))
+
+
+
+
+-- Condition Codes -------------------------------------------------------------
+--
+-- Evaluate a comparison, and get the result into a register.
+--
+-- Do not fill the delay slots here; you will confuse the register allocator.
+--
+condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
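+-- A note on the subcc/subx idiom used below (not from the original source),
+-- reading SUB's two Bool flags as (use carry, set condition codes):
+-- in the EQQ-against-zero case,
+--      SUB False True g0 (RIReg src) g0
+-- computes 0 - src and sets the carry (borrow) bit exactly when src /= 0, and
+--      SUB True False g0 (RIImm (ImmInt (-1))) dst
+-- then computes 0 - (-1) - carry = 1 - carry, so dst is 1 iff src == 0.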
+condIntReg EQQ x (CmmLit (CmmInt 0 _)) = do
+ (src, code) <- getSomeReg x
+ let
+ code__2 dst = code `appOL` toOL [
+ SUB False True g0 (RIReg src) g0,
+ SUB True False g0 (RIImm (ImmInt (-1))) dst]
+ return (Any II32 code__2)
+
+condIntReg EQQ x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ code__2 dst = code1 `appOL` code2 `appOL` toOL [
+ XOR False src1 (RIReg src2) dst,
+ SUB False True g0 (RIReg dst) g0,
+ SUB True False g0 (RIImm (ImmInt (-1))) dst]
+ return (Any II32 code__2)
+
+condIntReg NE x (CmmLit (CmmInt 0 _)) = do
+ (src, code) <- getSomeReg x
+ let
+ code__2 dst = code `appOL` toOL [
+ SUB False True g0 (RIReg src) g0,
+ ADD True False g0 (RIImm (ImmInt 0)) dst]
+ return (Any II32 code__2)
+
+condIntReg NE x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ code__2 dst = code1 `appOL` code2 `appOL` toOL [
+ XOR False src1 (RIReg src2) dst,
+ SUB False True g0 (RIReg dst) g0,
+ ADD True False g0 (RIImm (ImmInt 0)) dst]
+ return (Any II32 code__2)
+
+condIntReg cond x y = do
+ bid1 <- liftM (\a -> seq a a) getBlockIdNat
+ bid2 <- liftM (\a -> seq a a) getBlockIdNat
+ CondCode _ cond cond_code <- condIntCode cond x y
+ let
+ code__2 dst
+ = cond_code
+ `appOL` toOL
+ [ BI cond False bid1
+ , NOP
+
+ , OR False g0 (RIImm (ImmInt 0)) dst
+ , BI ALWAYS False bid2
+ , NOP
+
+ , NEWBLOCK bid1
+ , OR False g0 (RIImm (ImmInt 1)) dst
+ , BI ALWAYS False bid2
+ , NOP
+
+ , NEWBLOCK bid2]
+
+ return (Any II32 code__2)
+
+
+condFltReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
+condFltReg cond x y = do
+ bid1 <- liftM (\a -> seq a a) getBlockIdNat
+ bid2 <- liftM (\a -> seq a a) getBlockIdNat
+
+ CondCode _ cond cond_code <- condFltCode cond x y
+ let
+ code__2 dst
+ = cond_code
+ `appOL` toOL
+ [ NOP
+ , BF cond False bid1
+ , NOP
+
+ , OR False g0 (RIImm (ImmInt 0)) dst
+ , BI ALWAYS False bid2
+ , NOP
+
+ , NEWBLOCK bid1
+ , OR False g0 (RIImm (ImmInt 1)) dst
+ , BI ALWAYS False bid2
+ , NOP
+
+ , NEWBLOCK bid2 ]
+
+ return (Any II32 code__2)
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot
new file mode 100644
index 0000000000..cf9553a63c
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot
@@ -0,0 +1,16 @@
+
+module GHC.CmmToAsm.SPARC.CodeGen.Gen32 (
+ getSomeReg,
+ getRegister
+)
+
+where
+
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.Monad
+import GHC.Platform.Reg
+
+import GHC.Cmm
+
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getRegister :: CmmExpr -> NatM Register
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs
new file mode 100644
index 0000000000..43807ec027
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs
@@ -0,0 +1,216 @@
+-- | Evaluation of 64 bit values on 32 bit platforms.
+module GHC.CmmToAsm.SPARC.CodeGen.Gen64 (
+ assignMem_I64Code,
+ assignReg_I64Code,
+ iselExpr64
+)
+
+where
+
+import GhcPrelude
+
+import {-# SOURCE #-} GHC.CmmToAsm.SPARC.CodeGen.Gen32
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.SPARC.CodeGen.Amode
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.Instr
+-- GHC.CmmToAsm.SPARC.Ppr()
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+
+import GHC.Cmm
+
+import GHC.Driver.Session
+import OrdList
+import Outputable
+
+-- | Code to assign a 64 bit value to memory.
+assignMem_I64Code
+ :: CmmExpr -- ^ expr producing the destination address
+ -> CmmExpr -- ^ expr producing the source value.
+ -> NatM InstrBlock
+
+assignMem_I64Code addrTree valueTree
+ = do
+ ChildCode64 vcode rlo <- iselExpr64 valueTree
+
+ (src, acode) <- getSomeReg addrTree
+ let
+ rhi = getHiVRegFromLo rlo
+
+ -- Big-endian store
+ mov_hi = ST II32 rhi (AddrRegImm src (ImmInt 0))
+ mov_lo = ST II32 rlo (AddrRegImm src (ImmInt 4))
+
+ code = vcode `appOL` acode `snocOL` mov_hi `snocOL` mov_lo
+
+{- pprTrace "assignMem_I64Code"
+ (vcat [ text "addrTree: " <+> ppr addrTree
+ , text "valueTree: " <+> ppr valueTree
+ , text "vcode:"
+ , vcat $ map ppr $ fromOL vcode
+ , text ""
+ , text "acode:"
+ , vcat $ map ppr $ fromOL acode ])
+ $ -}
+ return code
+
+
+-- | Code to assign a 64 bit value to a register.
+assignReg_I64Code
+ :: CmmReg -- ^ the destination register
+ -> CmmExpr -- ^ expr producing the source value
+ -> NatM InstrBlock
+
+assignReg_I64Code (CmmLocal (LocalReg u_dst pk)) valueTree
+ = do
+ ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
+ let
+ r_dst_lo = RegVirtual $ mkVirtualReg u_dst (cmmTypeFormat pk)
+ r_dst_hi = getHiVRegFromLo r_dst_lo
+ r_src_hi = getHiVRegFromLo r_src_lo
+ mov_lo = mkMOV r_src_lo r_dst_lo
+ mov_hi = mkMOV r_src_hi r_dst_hi
+ mkMOV sreg dreg = OR False g0 (RIReg sreg) dreg
+
+ return (vcode `snocOL` mov_hi `snocOL` mov_lo)
+
+assignReg_I64Code _ _
+ = panic "assignReg_I64Code(sparc): invalid lvalue"
+
+
+
+
+-- | Get the value of an expression into a 64 bit register.
+
+iselExpr64 :: CmmExpr -> NatM ChildCode64
+
+-- Load a 64 bit word
+iselExpr64 (CmmLoad addrTree ty)
+ | isWord64 ty
+ = do Amode amode addr_code <- getAmode addrTree
+ let result
+
+ | AddrRegReg r1 r2 <- amode
+ = do rlo <- getNewRegNat II32
+ tmp <- getNewRegNat II32
+ let rhi = getHiVRegFromLo rlo
+
+ return $ ChildCode64
+ ( addr_code
+ `appOL` toOL
+ [ ADD False False r1 (RIReg r2) tmp
+ , LD II32 (AddrRegImm tmp (ImmInt 0)) rhi
+ , LD II32 (AddrRegImm tmp (ImmInt 4)) rlo ])
+ rlo
+
+ | AddrRegImm r1 (ImmInt i) <- amode
+ = do rlo <- getNewRegNat II32
+ let rhi = getHiVRegFromLo rlo
+
+ return $ ChildCode64
+ ( addr_code
+ `appOL` toOL
+ [ LD II32 (AddrRegImm r1 (ImmInt $ 0 + i)) rhi
+ , LD II32 (AddrRegImm r1 (ImmInt $ 4 + i)) rlo ])
+ rlo
+
+ | otherwise
+ = panic "SPARC.CodeGen.Gen64: no match"
+
+ result
+
+
+-- Add a literal to a 64 bit integer
+iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)])
+ = do ChildCode64 code1 r1_lo <- iselExpr64 e1
+ let r1_hi = getHiVRegFromLo r1_lo
+
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+
+ let code = code1
+ `appOL` toOL
+ [ ADD False True r1_lo (RIImm (ImmInteger i)) r_dst_lo
+ , ADD True False r1_hi (RIReg g0) r_dst_hi ]
+
+ return $ ChildCode64 code r_dst_lo
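   A sketch of the pattern used here and in the general case below: the low
   words are added with the carry-setting variant, and the high words then add
   the carry back in.

       ADD False True  r1_lo (RIImm (ImmInteger i)) r_dst_lo   -- addcc lo, i, lo'   (sets carry)
       ADD True  False r1_hi (RIReg g0)             r_dst_hi   -- addx  hi, %g0, hi' (adds carry in)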
+
+
+-- Addition of II64
+iselExpr64 (CmmMachOp (MO_Add _) [e1, e2])
+ = do ChildCode64 code1 r1_lo <- iselExpr64 e1
+ let r1_hi = getHiVRegFromLo r1_lo
+
+ ChildCode64 code2 r2_lo <- iselExpr64 e2
+ let r2_hi = getHiVRegFromLo r2_lo
+
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+
+ let code = code1
+ `appOL` code2
+ `appOL` toOL
+ [ ADD False True r1_lo (RIReg r2_lo) r_dst_lo
+ , ADD True False r1_hi (RIReg r2_hi) r_dst_hi ]
+
+ return $ ChildCode64 code r_dst_lo
+
+
+iselExpr64 (CmmReg (CmmLocal (LocalReg uq ty)))
+ | isWord64 ty
+ = do
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+ r_src_lo = RegVirtual $ mkVirtualReg uq II32
+ r_src_hi = getHiVRegFromLo r_src_lo
+ mov_lo = mkMOV r_src_lo r_dst_lo
+ mov_hi = mkMOV r_src_hi r_dst_hi
+ mkMOV sreg dreg = OR False g0 (RIReg sreg) dreg
+ return (
+ ChildCode64 (toOL [mov_hi, mov_lo]) r_dst_lo
+ )
+
+-- Convert something into II64
+iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr])
+ = do
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+
+ -- compute expr and load it into r_dst_lo
+ (a_reg, a_code) <- getSomeReg expr
+
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ code = a_code
+ `appOL` toOL
+ [ mkRegRegMoveInstr platform g0 r_dst_hi -- clear high 32 bits
+ , mkRegRegMoveInstr platform a_reg r_dst_lo ]
+
+ return $ ChildCode64 code r_dst_lo
+
+-- only W32 supported for now
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr])
+ = do
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+
+ -- compute expr and load it into r_dst_lo
+ (a_reg, a_code) <- getSomeReg expr
+
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ code = a_code
+ `appOL` toOL
+ [ SRA a_reg (RIImm (ImmInt 31)) r_dst_hi
+ , mkRegRegMoveInstr platform a_reg r_dst_lo ]
+
+ return $ ChildCode64 code r_dst_lo
+
+
+iselExpr64 expr
+ = pprPanic "iselExpr64(sparc)" (ppr expr)
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs
new file mode 100644
index 0000000000..f8648c4549
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs
@@ -0,0 +1,69 @@
+-- | One ounce of sanity checking is worth 10000000000000000 ounces
+-- of staring blindly at assembly code trying to find the problem..
+module GHC.CmmToAsm.SPARC.CodeGen.Sanity (
+ checkBlock
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Ppr () -- For Outputable instances
+import GHC.CmmToAsm.Instr
+
+import GHC.Cmm
+
+import Outputable
+
+
+-- | Enforce intra-block invariants.
+--
+checkBlock :: CmmBlock
+ -> NatBasicBlock Instr
+ -> NatBasicBlock Instr
+
+checkBlock cmm block@(BasicBlock _ instrs)
+ | checkBlockInstrs instrs
+ = block
+
+ | otherwise
+ = pprPanic
+ ("SPARC.CodeGen: bad block\n")
+ ( vcat [ text " -- cmm -----------------\n"
+ , ppr cmm
+ , text " -- native code ---------\n"
+ , ppr block ])
+
+
+checkBlockInstrs :: [Instr] -> Bool
+checkBlockInstrs ii
+
+ -- An unconditional jump ends the block.
+ -- There must be an unconditional jump in the block, otherwise
+ -- the register liveness determinator will get the liveness
+ -- information wrong.
+ --
+ -- If the block ends with a cmm call that never returns
+ -- then there can be unreachable instructions after the jump,
+ -- but we don't mind here.
+ --
+ | instr : NOP : _ <- ii
+ , isUnconditionalJump instr
+ = True
+
+ -- All jumps must have a NOP in their branch delay slot.
+ -- The liveness determinator and register allocators aren't smart
+ -- enough to handle branch delay slots.
+ --
+ | instr : NOP : is <- ii
+ , isJumpishInstr instr
+ = checkBlockInstrs is
+
+ -- keep checking
+ | _:i2:is <- ii
+ = checkBlockInstrs (i2:is)
+
+ -- this block is no good
+ | otherwise
+ = False
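   For instance, with hypothetical block ids l1 and l2, a block shaped like

       [ ADD ..., BI GE False l1, NOP, BI ALWAYS False l2, NOP ]

   is accepted (each jump carries its delay-slot NOP and an unconditional jump
   is reached), whereas

       [ ADD ..., BI GE False l1 ]

   is rejected: the branch has no NOP in its delay slot and nothing
   unconditional ends the block.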
diff --git a/compiler/GHC/CmmToAsm/SPARC/Cond.hs b/compiler/GHC/CmmToAsm/SPARC/Cond.hs
new file mode 100644
index 0000000000..89b64b7c3a
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Cond.hs
@@ -0,0 +1,54 @@
+module GHC.CmmToAsm.SPARC.Cond (
+ Cond(..),
+ condUnsigned,
+ condToSigned,
+ condToUnsigned
+)
+
+where
+
+import GhcPrelude
+
+-- | Branch condition codes.
+data Cond
+ = ALWAYS
+ | EQQ
+ | GE
+ | GEU
+ | GTT
+ | GU
+ | LE
+ | LEU
+ | LTT
+ | LU
+ | NE
+ | NEG
+ | NEVER
+ | POS
+ | VC
+ | VS
+ deriving Eq
+
+
+condUnsigned :: Cond -> Bool
+condUnsigned GU = True
+condUnsigned LU = True
+condUnsigned GEU = True
+condUnsigned LEU = True
+condUnsigned _ = False
+
+
+condToSigned :: Cond -> Cond
+condToSigned GU = GTT
+condToSigned LU = LTT
+condToSigned GEU = GE
+condToSigned LEU = LE
+condToSigned x = x
+
+
+condToUnsigned :: Cond -> Cond
+condToUnsigned GTT = GU
+condToUnsigned LTT = LU
+condToUnsigned GE = GEU
+condToUnsigned LE = LEU
+condToUnsigned x = x
diff --git a/compiler/GHC/CmmToAsm/SPARC/Imm.hs b/compiler/GHC/CmmToAsm/SPARC/Imm.hs
new file mode 100644
index 0000000000..71b0257ac5
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Imm.hs
@@ -0,0 +1,67 @@
+module GHC.CmmToAsm.SPARC.Imm (
+ -- immediate values
+ Imm(..),
+ strImmLit,
+ litToImm
+)
+
+where
+
+import GhcPrelude
+
+import GHC.Cmm
+import GHC.Cmm.CLabel
+
+import Outputable
+
+-- | An immediate value.
+-- Not all of these are directly representable by the machine.
+-- Things like ImmLit are slurped out and put in a data segment instead.
+--
+data Imm
+ = ImmInt Int
+
+ -- Sigh.
+ | ImmInteger Integer
+
+ -- AbstractC Label (with baggage)
+ | ImmCLbl CLabel
+
+ -- Simple string
+ | ImmLit SDoc
+ | ImmIndex CLabel Int
+ | ImmFloat Rational
+ | ImmDouble Rational
+
+ | ImmConstantSum Imm Imm
+ | ImmConstantDiff Imm Imm
+
+ | LO Imm
+ | HI Imm
+
+
+-- | Create a ImmLit containing this string.
+strImmLit :: String -> Imm
+strImmLit s = ImmLit (text s)
+
+
+-- | Convert a CmmLit to an Imm.
+-- Narrow to the width: a CmmInt might be out of
+-- range, but we assume that ImmInteger only contains
+-- in-range values. A signed value should be fine here.
+--
+litToImm :: CmmLit -> Imm
+litToImm lit
+ = case lit of
+ CmmInt i w -> ImmInteger (narrowS w i)
+ CmmFloat f W32 -> ImmFloat f
+ CmmFloat f W64 -> ImmDouble f
+ CmmLabel l -> ImmCLbl l
+ CmmLabelOff l off -> ImmIndex l off
+
+ CmmLabelDiffOff l1 l2 off _
+ -> ImmConstantSum
+ (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
+ (ImmInt off)
+
+ _ -> panic "SPARC.Regs.litToImm: no match"
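   A small illustration of the narrowing, assuming narrowS truncates to the
   given width and sign-extends:

       litToImm (CmmInt 300 W8)    ==>  ImmInteger 44     -- 300 wrapped into a signed byte
       litToImm (CmmInt (-1) W32)  ==>  ImmInteger (-1)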
diff --git a/compiler/GHC/CmmToAsm/SPARC/Instr.hs b/compiler/GHC/CmmToAsm/SPARC/Instr.hs
new file mode 100644
index 0000000000..ec74d3723b
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Instr.hs
@@ -0,0 +1,481 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Machine-dependent assembly language
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-----------------------------------------------------------------------------
+#include "HsVersions.h"
+
+module GHC.CmmToAsm.SPARC.Instr (
+ RI(..),
+ riZero,
+
+ fpRelEA,
+ moveSp,
+
+ isUnconditionalJump,
+
+ Instr(..),
+ maxSpillSlots
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Stack
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Cond
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.Reg.Target
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Format
+
+import GHC.Cmm.CLabel
+import GHC.Platform.Regs
+import GHC.Cmm.BlockId
+import GHC.Driver.Session
+import GHC.Cmm
+import FastString
+import Outputable
+import GHC.Platform
+
+
+-- | Register or immediate
+data RI
+ = RIReg Reg
+ | RIImm Imm
+
+-- | Check if a RI represents a zero value.
+-- - a literal zero
+-- - register %g0, which is always zero.
+--
+riZero :: RI -> Bool
+riZero (RIImm (ImmInt 0)) = True
+riZero (RIImm (ImmInteger 0)) = True
+riZero (RIReg (RegReal (RealRegSingle 0))) = True
+riZero _ = False
+
+
+-- | Calculate the effective address which would be used by the
+-- corresponding fpRel sequence.
+fpRelEA :: Int -> Reg -> Instr
+fpRelEA n dst
+ = ADD False False fp (RIImm (ImmInt (n * wordLength))) dst
+
+
+-- | Code to shift the stack pointer by n words.
+moveSp :: Int -> Instr
+moveSp n
+ = ADD False False sp (RIImm (ImmInt (n * wordLength))) sp
+
+-- | An instruction that will cause the one after it never to be executed
+isUnconditionalJump :: Instr -> Bool
+isUnconditionalJump ii
+ = case ii of
+ CALL{} -> True
+ JMP{} -> True
+ JMP_TBL{} -> True
+ BI ALWAYS _ _ -> True
+ BF ALWAYS _ _ -> True
+ _ -> False
+
+
+-- | instance for sparc instruction set
+instance Instruction Instr where
+ regUsageOfInstr = sparc_regUsageOfInstr
+ patchRegsOfInstr = sparc_patchRegsOfInstr
+ isJumpishInstr = sparc_isJumpishInstr
+ jumpDestsOfInstr = sparc_jumpDestsOfInstr
+ patchJumpInstr = sparc_patchJumpInstr
+ mkSpillInstr = sparc_mkSpillInstr
+ mkLoadInstr = sparc_mkLoadInstr
+ takeDeltaInstr = sparc_takeDeltaInstr
+ isMetaInstr = sparc_isMetaInstr
+ mkRegRegMoveInstr = sparc_mkRegRegMoveInstr
+ takeRegRegMoveInstr = sparc_takeRegRegMoveInstr
+ mkJumpInstr = sparc_mkJumpInstr
+ mkStackAllocInstr = panic "no sparc_mkStackAllocInstr"
+ mkStackDeallocInstr = panic "no sparc_mkStackDeallocInstr"
+
+
+-- | SPARC instruction set.
+--      Not complete; these are only the ones we need.
+--
+data Instr
+
+ -- meta ops --------------------------------------------------
+ -- comment pseudo-op
+ = COMMENT FastString
+
+ -- some static data spat out during code generation.
+ -- Will be extracted before pretty-printing.
+ | LDATA Section RawCmmStatics
+
+ -- Start a new basic block. Useful during codegen, removed later.
+ -- Preceding instruction should be a jump, as per the invariants
+ -- for a BasicBlock (see Cmm).
+ | NEWBLOCK BlockId
+
+ -- specify current stack offset for benefit of subsequent passes.
+ | DELTA Int
+
+ -- real instrs -----------------------------------------------
+ -- Loads and stores.
+ | LD Format AddrMode Reg -- format, src, dst
+ | ST Format Reg AddrMode -- format, src, dst
+
+ -- Int Arithmetic.
+ -- x: add/sub with carry bit.
+ -- In SPARC V9 addx and friends were renamed addc.
+ --
+ -- cc: modify condition codes
+ --
+ | ADD Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
+ | SUB Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
+
+ | UMUL Bool Reg RI Reg -- cc?, src1, src2, dst
+ | SMUL Bool Reg RI Reg -- cc?, src1, src2, dst
+
+
+ -- The SPARC divide instructions perform 64bit by 32bit division
+ -- The Y register is xored into the first operand.
+
+ -- On _some implementations_ the Y register is overwritten by
+ -- the remainder, so we have to make sure it is 0 each time.
+
+ -- dst <- ((Y `shiftL` 32) `or` src1) `div` src2
+ | UDIV Bool Reg RI Reg -- cc?, src1, src2, dst
+ | SDIV Bool Reg RI Reg -- cc?, src1, src2, dst
+
+ | RDY Reg -- move contents of Y register to reg
+ | WRY Reg Reg -- Y <- src1 `xor` src2
+
+ -- Logic operations.
+ | AND Bool Reg RI Reg -- cc?, src1, src2, dst
+ | ANDN Bool Reg RI Reg -- cc?, src1, src2, dst
+ | OR Bool Reg RI Reg -- cc?, src1, src2, dst
+ | ORN Bool Reg RI Reg -- cc?, src1, src2, dst
+ | XOR Bool Reg RI Reg -- cc?, src1, src2, dst
+ | XNOR Bool Reg RI Reg -- cc?, src1, src2, dst
+ | SLL Reg RI Reg -- src1, src2, dst
+ | SRL Reg RI Reg -- src1, src2, dst
+ | SRA Reg RI Reg -- src1, src2, dst
+
+ -- Load immediates.
+ | SETHI Imm Reg -- src, dst
+
+ -- Do nothing.
+ -- Implemented by the assembler as SETHI 0, %g0, but worth an alias
+ | NOP
+
+ -- Float Arithmetic.
+ -- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single
+ -- instructions right up until we spit them out.
+ --
+ | FABS Format Reg Reg -- src dst
+ | FADD Format Reg Reg Reg -- src1, src2, dst
+ | FCMP Bool Format Reg Reg -- exception?, src1, src2, dst
+ | FDIV Format Reg Reg Reg -- src1, src2, dst
+ | FMOV Format Reg Reg -- src, dst
+ | FMUL Format Reg Reg Reg -- src1, src2, dst
+ | FNEG Format Reg Reg -- src, dst
+ | FSQRT Format Reg Reg -- src, dst
+ | FSUB Format Reg Reg Reg -- src1, src2, dst
+ | FxTOy Format Format Reg Reg -- src, dst
+
+ -- Jumping around.
+ | BI Cond Bool BlockId -- cond, annul?, target
+ | BF Cond Bool BlockId -- cond, annul?, target
+
+ | JMP AddrMode -- target
+
+ -- With a tabled jump we know all the possible destinations.
+ -- We also need this info so we can work out what regs are live across the jump.
+ --
+ | JMP_TBL AddrMode [Maybe BlockId] CLabel
+
+ | CALL (Either Imm Reg) Int Bool -- target, args, terminal
+
+
+-- | regUsage returns the sets of src and destination registers used
+-- by a particular instruction. Machine registers that are
+-- pre-allocated to stgRegs are filtered out, because they are
+-- uninteresting from a register allocation standpoint. (We wouldn't
+-- want them to end up on the free list!) As far as we are concerned,
+-- the fixed registers simply don't exist (for allocation purposes,
+-- anyway).
+
+-- regUsage doesn't need to do any trickery for jumps and such. Just
+-- state precisely the regs read and written by that insn. The
+-- consequences of control flow transfers, as far as register
+-- allocation goes, are taken care of by the register allocator.
+--
+sparc_regUsageOfInstr :: Platform -> Instr -> RegUsage
+sparc_regUsageOfInstr platform instr
+ = case instr of
+ LD _ addr reg -> usage (regAddr addr, [reg])
+ ST _ reg addr -> usage (reg : regAddr addr, [])
+ ADD _ _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SUB _ _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ UMUL _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SMUL _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ UDIV _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SDIV _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ RDY rd -> usage ([], [rd])
+ WRY r1 r2 -> usage ([r1, r2], [])
+ AND _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ ANDN _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ OR _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ ORN _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ XOR _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ XNOR _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SLL r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SRL r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SRA r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SETHI _ reg -> usage ([], [reg])
+ FABS _ r1 r2 -> usage ([r1], [r2])
+ FADD _ r1 r2 r3 -> usage ([r1, r2], [r3])
+ FCMP _ _ r1 r2 -> usage ([r1, r2], [])
+ FDIV _ r1 r2 r3 -> usage ([r1, r2], [r3])
+ FMOV _ r1 r2 -> usage ([r1], [r2])
+ FMUL _ r1 r2 r3 -> usage ([r1, r2], [r3])
+ FNEG _ r1 r2 -> usage ([r1], [r2])
+ FSQRT _ r1 r2 -> usage ([r1], [r2])
+ FSUB _ r1 r2 r3 -> usage ([r1, r2], [r3])
+ FxTOy _ _ r1 r2 -> usage ([r1], [r2])
+
+ JMP addr -> usage (regAddr addr, [])
+ JMP_TBL addr _ _ -> usage (regAddr addr, [])
+
+ CALL (Left _ ) _ True -> noUsage
+ CALL (Left _ ) n False -> usage (argRegs n, callClobberedRegs)
+ CALL (Right reg) _ True -> usage ([reg], [])
+ CALL (Right reg) n False -> usage (reg : (argRegs n), callClobberedRegs)
+ _ -> noUsage
+
+ where
+ usage (src, dst)
+ = RU (filter (interesting platform) src)
+ (filter (interesting platform) dst)
+
+ regAddr (AddrRegReg r1 r2) = [r1, r2]
+ regAddr (AddrRegImm r1 _) = [r1]
+
+ regRI (RIReg r) = [r]
+ regRI _ = []
+
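   For example, before the 'interesting' filter is applied:

       ADD False False r1 (RIReg r2) r3          reads [r1, r2], writes [r3]
       ST  II32 r (AddrRegImm base (ImmInt 8))   reads [r, base], writes []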
+
+-- | Interesting regs are virtuals, or ones that are allocatable
+-- by the register allocator.
+interesting :: Platform -> Reg -> Bool
+interesting platform reg
+ = case reg of
+ RegVirtual _ -> True
+ RegReal (RealRegSingle r1) -> freeReg platform r1
+ RegReal (RealRegPair r1 _) -> freeReg platform r1
+
+
+
+-- | Apply a given mapping to all the register references in this instruction.
+sparc_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
+sparc_patchRegsOfInstr instr env = case instr of
+ LD fmt addr reg -> LD fmt (fixAddr addr) (env reg)
+ ST fmt reg addr -> ST fmt (env reg) (fixAddr addr)
+
+ ADD x cc r1 ar r2 -> ADD x cc (env r1) (fixRI ar) (env r2)
+ SUB x cc r1 ar r2 -> SUB x cc (env r1) (fixRI ar) (env r2)
+ UMUL cc r1 ar r2 -> UMUL cc (env r1) (fixRI ar) (env r2)
+ SMUL cc r1 ar r2 -> SMUL cc (env r1) (fixRI ar) (env r2)
+ UDIV cc r1 ar r2 -> UDIV cc (env r1) (fixRI ar) (env r2)
+ SDIV cc r1 ar r2 -> SDIV cc (env r1) (fixRI ar) (env r2)
+ RDY rd -> RDY (env rd)
+ WRY r1 r2 -> WRY (env r1) (env r2)
+ AND b r1 ar r2 -> AND b (env r1) (fixRI ar) (env r2)
+ ANDN b r1 ar r2 -> ANDN b (env r1) (fixRI ar) (env r2)
+ OR b r1 ar r2 -> OR b (env r1) (fixRI ar) (env r2)
+ ORN b r1 ar r2 -> ORN b (env r1) (fixRI ar) (env r2)
+ XOR b r1 ar r2 -> XOR b (env r1) (fixRI ar) (env r2)
+ XNOR b r1 ar r2 -> XNOR b (env r1) (fixRI ar) (env r2)
+ SLL r1 ar r2 -> SLL (env r1) (fixRI ar) (env r2)
+ SRL r1 ar r2 -> SRL (env r1) (fixRI ar) (env r2)
+ SRA r1 ar r2 -> SRA (env r1) (fixRI ar) (env r2)
+
+ SETHI imm reg -> SETHI imm (env reg)
+
+ FABS s r1 r2 -> FABS s (env r1) (env r2)
+ FADD s r1 r2 r3 -> FADD s (env r1) (env r2) (env r3)
+ FCMP e s r1 r2 -> FCMP e s (env r1) (env r2)
+ FDIV s r1 r2 r3 -> FDIV s (env r1) (env r2) (env r3)
+ FMOV s r1 r2 -> FMOV s (env r1) (env r2)
+ FMUL s r1 r2 r3 -> FMUL s (env r1) (env r2) (env r3)
+ FNEG s r1 r2 -> FNEG s (env r1) (env r2)
+ FSQRT s r1 r2 -> FSQRT s (env r1) (env r2)
+ FSUB s r1 r2 r3 -> FSUB s (env r1) (env r2) (env r3)
+ FxTOy s1 s2 r1 r2 -> FxTOy s1 s2 (env r1) (env r2)
+
+ JMP addr -> JMP (fixAddr addr)
+ JMP_TBL addr ids l -> JMP_TBL (fixAddr addr) ids l
+
+ CALL (Left i) n t -> CALL (Left i) n t
+ CALL (Right r) n t -> CALL (Right (env r)) n t
+ _ -> instr
+
+ where
+ fixAddr (AddrRegReg r1 r2) = AddrRegReg (env r1) (env r2)
+ fixAddr (AddrRegImm r1 i) = AddrRegImm (env r1) i
+
+ fixRI (RIReg r) = RIReg (env r)
+ fixRI other = other
+
+
+--------------------------------------------------------------------------------
+sparc_isJumpishInstr :: Instr -> Bool
+sparc_isJumpishInstr instr
+ = case instr of
+ BI{} -> True
+ BF{} -> True
+ JMP{} -> True
+ JMP_TBL{} -> True
+ CALL{} -> True
+ _ -> False
+
+sparc_jumpDestsOfInstr :: Instr -> [BlockId]
+sparc_jumpDestsOfInstr insn
+ = case insn of
+ BI _ _ id -> [id]
+ BF _ _ id -> [id]
+ JMP_TBL _ ids _ -> [id | Just id <- ids]
+ _ -> []
+
+
+sparc_patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr
+sparc_patchJumpInstr insn patchF
+ = case insn of
+ BI cc annul id -> BI cc annul (patchF id)
+ BF cc annul id -> BF cc annul (patchF id)
+ JMP_TBL n ids l -> JMP_TBL n (map (fmap patchF) ids) l
+ _ -> insn
+
+
+--------------------------------------------------------------------------------
+-- | Make a spill instruction.
+-- On SPARC we spill below the frame pointer, leaving 2 words per spill.
+sparc_mkSpillInstr
+ :: DynFlags
+ -> Reg -- ^ register to spill
+ -> Int -- ^ current stack delta
+ -> Int -- ^ spill slot to use
+ -> Instr
+
+sparc_mkSpillInstr dflags reg _ slot
+ = let platform = targetPlatform dflags
+ off = spillSlotToOffset dflags slot
+ off_w = 1 + (off `div` 4)
+ fmt = case targetClassOfReg platform reg of
+ RcInteger -> II32
+ RcFloat -> FF32
+ RcDouble -> FF64
+
+ in ST fmt reg (fpRel (negate off_w))
+
+
+-- | Make a spill reload instruction.
+sparc_mkLoadInstr
+ :: DynFlags
+ -> Reg -- ^ register to load into
+ -> Int -- ^ current stack delta
+ -> Int -- ^ spill slot to use
+ -> Instr
+
+sparc_mkLoadInstr dflags reg _ slot
+ = let platform = targetPlatform dflags
+ off = spillSlotToOffset dflags slot
+ off_w = 1 + (off `div` 4)
+ fmt = case targetClassOfReg platform reg of
+ RcInteger -> II32
+ RcFloat -> FF32
+ RcDouble -> FF64
+
+ in LD fmt (fpRel (- off_w)) reg
+
+
+--------------------------------------------------------------------------------
+-- | See if this instruction is telling us the current C stack delta
+sparc_takeDeltaInstr
+ :: Instr
+ -> Maybe Int
+
+sparc_takeDeltaInstr instr
+ = case instr of
+ DELTA i -> Just i
+ _ -> Nothing
+
+
+sparc_isMetaInstr
+ :: Instr
+ -> Bool
+
+sparc_isMetaInstr instr
+ = case instr of
+ COMMENT{} -> True
+ LDATA{} -> True
+ NEWBLOCK{} -> True
+ DELTA{} -> True
+ _ -> False
+
+
+-- | Make a reg-reg move instruction.
+-- On SPARC v8 there are no instructions to move directly between
+-- floating point and integer regs. If we need to do that then we
+-- have to go via memory.
+--
+sparc_mkRegRegMoveInstr
+ :: Platform
+ -> Reg
+ -> Reg
+ -> Instr
+
+sparc_mkRegRegMoveInstr platform src dst
+ | srcClass <- targetClassOfReg platform src
+ , dstClass <- targetClassOfReg platform dst
+ , srcClass == dstClass
+ = case srcClass of
+ RcInteger -> ADD False False src (RIReg g0) dst
+ RcDouble -> FMOV FF64 src dst
+ RcFloat -> FMOV FF32 src dst
+
+ | otherwise
+ = panic "SPARC.Instr.mkRegRegMoveInstr: classes of src and dest not the same"
+
+
+-- | Check whether an instruction represents a reg-reg move.
+-- The register allocator attempts to eliminate reg->reg moves whenever it can,
+-- by assigning the src and dest temporaries to the same real register.
+--
+sparc_takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg)
+sparc_takeRegRegMoveInstr instr
+ = case instr of
+ ADD False False src (RIReg src2) dst
+ | g0 == src2 -> Just (src, dst)
+
+ FMOV FF64 src dst -> Just (src, dst)
+ FMOV FF32 src dst -> Just (src, dst)
+ _ -> Nothing
+
+
+-- | Make an unconditional branch instruction.
+sparc_mkJumpInstr
+ :: BlockId
+ -> [Instr]
+
+sparc_mkJumpInstr id
+ = [BI ALWAYS False id
+ , NOP] -- fill the branch delay slot.
diff --git a/compiler/GHC/CmmToAsm/SPARC/Ppr.hs b/compiler/GHC/CmmToAsm/SPARC/Ppr.hs
new file mode 100644
index 0000000000..2f3ea778f7
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Ppr.hs
@@ -0,0 +1,645 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Pretty-printing assembly language
+--
+-- (c) The University of Glasgow 1993-2005
+--
+-----------------------------------------------------------------------------
+
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+module GHC.CmmToAsm.SPARC.Ppr (
+ pprNatCmmDecl,
+ pprBasicBlock,
+ pprData,
+ pprInstr,
+ pprFormat,
+ pprImm,
+ pprDataItem
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Cond
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Ppr
+
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Cmm.Ppr() -- For Outputable instances
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.Dataflow.Collections
+
+import Unique ( pprUniqueAlways )
+import Outputable
+import GHC.Platform
+import FastString
+
+-- -----------------------------------------------------------------------------
+-- Printing this stuff out
+
+pprNatCmmDecl :: NatCmmDecl RawCmmStatics Instr -> SDoc
+pprNatCmmDecl (CmmData section dats) =
+ pprSectionAlign section $$ pprDatas dats
+
+pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
+ case topInfoTable proc of
+ Nothing ->
+ -- special case for code without info table:
+ pprSectionAlign (Section Text lbl) $$
+ pprLabel lbl $$ -- blocks guaranteed not null, so label needed
+ vcat (map (pprBasicBlock top_info) blocks)
+
+ Just (RawCmmStatics info_lbl _) ->
+ sdocWithPlatform $ \platform ->
+ (if platformHasSubsectionsViaSymbols platform
+ then pprSectionAlign dspSection $$
+ ppr (mkDeadStripPreventer info_lbl) <> char ':'
+ else empty) $$
+ vcat (map (pprBasicBlock top_info) blocks) $$
+ -- above: Even the first block gets a label, because with branch-chain
+ -- elimination, it might be the target of a goto.
+ (if platformHasSubsectionsViaSymbols platform
+ then
+ -- See Note [Subsections Via Symbols] in X86/Ppr.hs
+ text "\t.long "
+ <+> ppr info_lbl
+ <+> char '-'
+ <+> ppr (mkDeadStripPreventer info_lbl)
+ else empty)
+
+dspSection :: Section
+dspSection = Section Text $
+ panic "subsections-via-symbols doesn't combine with split-sections"
+
+pprBasicBlock :: LabelMap RawCmmStatics -> NatBasicBlock Instr -> SDoc
+pprBasicBlock info_env (BasicBlock blockid instrs)
+ = maybe_infotable $$
+ pprLabel (blockLbl blockid) $$
+ vcat (map pprInstr instrs)
+ where
+ maybe_infotable = case mapLookup blockid info_env of
+ Nothing -> empty
+ Just (RawCmmStatics info_lbl info) ->
+ pprAlignForSection Text $$
+ vcat (map pprData info) $$
+ pprLabel info_lbl
+
+
+pprDatas :: RawCmmStatics -> SDoc
+-- See note [emit-time elimination of static indirections] in CLabel.
+pprDatas (RawCmmStatics alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
+ | lbl == mkIndStaticInfoLabel
+ , let labelInd (CmmLabelOff l _) = Just l
+ labelInd (CmmLabel l) = Just l
+ labelInd _ = Nothing
+ , Just ind' <- labelInd ind
+ , alias `mayRedirectTo` ind'
+ = pprGloblDecl alias
+ $$ text ".equiv" <+> ppr alias <> comma <> ppr (CmmLabel ind')
+pprDatas (RawCmmStatics lbl dats) = vcat (pprLabel lbl : map pprData dats)
+
+pprData :: CmmStatic -> SDoc
+pprData (CmmString str) = pprBytes str
+pprData (CmmUninitialised bytes) = text ".skip " <> int bytes
+pprData (CmmStaticLit lit) = pprDataItem lit
+
+pprGloblDecl :: CLabel -> SDoc
+pprGloblDecl lbl
+ | not (externallyVisibleCLabel lbl) = empty
+ | otherwise = text ".global " <> ppr lbl
+
+pprTypeAndSizeDecl :: CLabel -> SDoc
+pprTypeAndSizeDecl lbl
+ = sdocWithPlatform $ \platform ->
+ if platformOS platform == OSLinux && externallyVisibleCLabel lbl
+ then text ".type " <> ppr lbl <> ptext (sLit ", @object")
+ else empty
+
+pprLabel :: CLabel -> SDoc
+pprLabel lbl = pprGloblDecl lbl
+ $$ pprTypeAndSizeDecl lbl
+ $$ (ppr lbl <> char ':')
+
+-- -----------------------------------------------------------------------------
+-- pprInstr: print an 'Instr'
+
+instance Outputable Instr where
+ ppr instr = pprInstr instr
+
+
+-- | Pretty print a register.
+pprReg :: Reg -> SDoc
+pprReg reg
+ = case reg of
+ RegVirtual vr
+ -> case vr of
+ VirtualRegI u -> text "%vI_" <> pprUniqueAlways u
+ VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u
+ VirtualRegF u -> text "%vF_" <> pprUniqueAlways u
+ VirtualRegD u -> text "%vD_" <> pprUniqueAlways u
+
+
+ RegReal rr
+ -> case rr of
+ RealRegSingle r1
+ -> pprReg_ofRegNo r1
+
+ RealRegPair r1 r2
+ -> text "(" <> pprReg_ofRegNo r1
+ <> vbar <> pprReg_ofRegNo r2
+ <> text ")"
+
+
+
+-- | Pretty print a register name, based on this register number.
+-- The definition has been unfolded so we get a jump-table in the
+-- object code. This function is called quite a lot when emitting
+-- the asm file..
+--
+pprReg_ofRegNo :: Int -> SDoc
+pprReg_ofRegNo i
+ = ptext
+ (case i of {
+ 0 -> sLit "%g0"; 1 -> sLit "%g1";
+ 2 -> sLit "%g2"; 3 -> sLit "%g3";
+ 4 -> sLit "%g4"; 5 -> sLit "%g5";
+ 6 -> sLit "%g6"; 7 -> sLit "%g7";
+ 8 -> sLit "%o0"; 9 -> sLit "%o1";
+ 10 -> sLit "%o2"; 11 -> sLit "%o3";
+ 12 -> sLit "%o4"; 13 -> sLit "%o5";
+ 14 -> sLit "%o6"; 15 -> sLit "%o7";
+ 16 -> sLit "%l0"; 17 -> sLit "%l1";
+ 18 -> sLit "%l2"; 19 -> sLit "%l3";
+ 20 -> sLit "%l4"; 21 -> sLit "%l5";
+ 22 -> sLit "%l6"; 23 -> sLit "%l7";
+ 24 -> sLit "%i0"; 25 -> sLit "%i1";
+ 26 -> sLit "%i2"; 27 -> sLit "%i3";
+ 28 -> sLit "%i4"; 29 -> sLit "%i5";
+ 30 -> sLit "%i6"; 31 -> sLit "%i7";
+ 32 -> sLit "%f0"; 33 -> sLit "%f1";
+ 34 -> sLit "%f2"; 35 -> sLit "%f3";
+ 36 -> sLit "%f4"; 37 -> sLit "%f5";
+ 38 -> sLit "%f6"; 39 -> sLit "%f7";
+ 40 -> sLit "%f8"; 41 -> sLit "%f9";
+ 42 -> sLit "%f10"; 43 -> sLit "%f11";
+ 44 -> sLit "%f12"; 45 -> sLit "%f13";
+ 46 -> sLit "%f14"; 47 -> sLit "%f15";
+ 48 -> sLit "%f16"; 49 -> sLit "%f17";
+ 50 -> sLit "%f18"; 51 -> sLit "%f19";
+ 52 -> sLit "%f20"; 53 -> sLit "%f21";
+ 54 -> sLit "%f22"; 55 -> sLit "%f23";
+ 56 -> sLit "%f24"; 57 -> sLit "%f25";
+ 58 -> sLit "%f26"; 59 -> sLit "%f27";
+ 60 -> sLit "%f28"; 61 -> sLit "%f29";
+ 62 -> sLit "%f30"; 63 -> sLit "%f31";
+ _ -> sLit "very naughty sparc register" })
+
+
+-- | Pretty print a format for an instruction suffix.
+pprFormat :: Format -> SDoc
+pprFormat x
+ = ptext
+ (case x of
+ II8 -> sLit "ub"
+ II16 -> sLit "uh"
+ II32 -> sLit ""
+ II64 -> sLit "d"
+ FF32 -> sLit ""
+ FF64 -> sLit "d")
+
+
+-- | Pretty print a format for an instruction suffix.
+-- eg LD is 32bit on sparc, but LDD is 64 bit.
+pprStFormat :: Format -> SDoc
+pprStFormat x
+ = ptext
+ (case x of
+ II8 -> sLit "b"
+ II16 -> sLit "h"
+ II32 -> sLit ""
+ II64 -> sLit "x"
+ FF32 -> sLit ""
+ FF64 -> sLit "d")
+
+
+
+-- | Pretty print a condition code.
+pprCond :: Cond -> SDoc
+pprCond c
+ = ptext
+ (case c of
+ ALWAYS -> sLit ""
+ NEVER -> sLit "n"
+ GEU -> sLit "geu"
+ LU -> sLit "lu"
+ EQQ -> sLit "e"
+ GTT -> sLit "g"
+ GE -> sLit "ge"
+ GU -> sLit "gu"
+ LTT -> sLit "l"
+ LE -> sLit "le"
+ LEU -> sLit "leu"
+ NE -> sLit "ne"
+ NEG -> sLit "neg"
+ POS -> sLit "pos"
+ VC -> sLit "vc"
+ VS -> sLit "vs")
+
+
+-- | Pretty print an address mode.
+pprAddr :: AddrMode -> SDoc
+pprAddr am
+ = case am of
+ AddrRegReg r1 (RegReal (RealRegSingle 0))
+ -> pprReg r1
+
+ AddrRegReg r1 r2
+ -> hcat [ pprReg r1, char '+', pprReg r2 ]
+
+ AddrRegImm r1 (ImmInt i)
+ | i == 0 -> pprReg r1
+ | not (fits13Bits i) -> largeOffsetError i
+ | otherwise -> hcat [ pprReg r1, pp_sign, int i ]
+ where
+ pp_sign = if i > 0 then char '+' else empty
+
+ AddrRegImm r1 (ImmInteger i)
+ | i == 0 -> pprReg r1
+ | not (fits13Bits i) -> largeOffsetError i
+ | otherwise -> hcat [ pprReg r1, pp_sign, integer i ]
+ where
+ pp_sign = if i > 0 then char '+' else empty
+
+ AddrRegImm r1 imm
+ -> hcat [ pprReg r1, char '+', pprImm imm ]
+
+
+-- | Pretty print an immediate value.
+pprImm :: Imm -> SDoc
+pprImm imm
+ = case imm of
+ ImmInt i -> int i
+ ImmInteger i -> integer i
+ ImmCLbl l -> ppr l
+ ImmIndex l i -> ppr l <> char '+' <> int i
+ ImmLit s -> s
+
+ ImmConstantSum a b
+ -> pprImm a <> char '+' <> pprImm b
+
+ ImmConstantDiff a b
+ -> pprImm a <> char '-' <> lparen <> pprImm b <> rparen
+
+ LO i
+ -> hcat [ text "%lo(", pprImm i, rparen ]
+
+ HI i
+ -> hcat [ text "%hi(", pprImm i, rparen ]
+
+ -- these should have been converted to bytes and placed
+ -- in the data section.
+ ImmFloat _ -> text "naughty float immediate"
+ ImmDouble _ -> text "naughty double immediate"
+
+
+-- | Pretty print a section \/ segment header.
+-- On SPARC all the data sections must be at least 8 byte aligned
+-- in case we store doubles in them.
+--
+pprSectionAlign :: Section -> SDoc
+pprSectionAlign sec@(Section seg _) =
+ sdocWithPlatform $ \platform ->
+ pprSectionHeader platform sec $$
+ pprAlignForSection seg
+
+-- | Print appropriate alignment for the given section type.
+pprAlignForSection :: SectionType -> SDoc
+pprAlignForSection seg =
+ ptext (case seg of
+ Text -> sLit ".align 4"
+ Data -> sLit ".align 8"
+ ReadOnlyData -> sLit ".align 8"
+ RelocatableReadOnlyData
+ -> sLit ".align 8"
+ UninitialisedData -> sLit ".align 8"
+ ReadOnlyData16 -> sLit ".align 16"
+ -- TODO: This is copied from the ReadOnlyData case, but it can likely be
+ -- made more efficient.
+ CString -> sLit ".align 8"
+ OtherSection _ -> panic "PprMach.pprSectionHeader: unknown section")
+
+-- | Pretty print a data item.
+pprDataItem :: CmmLit -> SDoc
+pprDataItem lit
+ = sdocWithDynFlags $ \dflags ->
+ vcat (ppr_item (cmmTypeFormat $ cmmLitType dflags lit) lit)
+ where
+ imm = litToImm lit
+
+ ppr_item II8 _ = [text "\t.byte\t" <> pprImm imm]
+ ppr_item II32 _ = [text "\t.long\t" <> pprImm imm]
+
+ ppr_item FF32 (CmmFloat r _)
+ = let bs = floatToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item FF64 (CmmFloat r _)
+ = let bs = doubleToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item II16 _ = [text "\t.short\t" <> pprImm imm]
+ ppr_item II64 _ = [text "\t.quad\t" <> pprImm imm]
+ ppr_item _ _ = panic "SPARC.Ppr.pprDataItem: no match"
+
+
+-- | Pretty print an instruction.
+pprInstr :: Instr -> SDoc
+
+-- nuke comments.
+pprInstr (COMMENT _)
+ = empty
+
+pprInstr (DELTA d)
+ = pprInstr (COMMENT (mkFastString ("\tdelta = " ++ show d)))
+
+-- Newblocks and LData should have been slurped out before producing the .s file.
+pprInstr (NEWBLOCK _)
+ = panic "X86.Ppr.pprInstr: NEWBLOCK"
+
+pprInstr (LDATA _ _)
+ = panic "PprMach.pprInstr: LDATA"
+
+-- 64 bit FP loads are expanded into individual instructions in CodeGen.Expand
+pprInstr (LD FF64 _ reg)
+ | RegReal (RealRegSingle{}) <- reg
+ = panic "SPARC.Ppr: not emitting potentially misaligned LD FF64 instr"
+
+pprInstr (LD format addr reg)
+ = hcat [
+ text "\tld",
+ pprFormat format,
+ char '\t',
+ lbrack,
+ pprAddr addr,
+ pp_rbracket_comma,
+ pprReg reg
+ ]
+
+-- 64 bit FP stores are expanded into individual instructions in CodeGen.Expand
+pprInstr (ST FF64 reg _)
+ | RegReal (RealRegSingle{}) <- reg
+ = panic "SPARC.Ppr: not emitting potentially misaligned ST FF64 instr"
+
+-- no distinction is made between signed and unsigned bytes on stores for the
+-- Sparc opcodes (at least I cannot see any, and gas is nagging me --SOF),
+-- so we call a special-purpose pprFormat for ST..
+pprInstr (ST format reg addr)
+ = hcat [
+ text "\tst",
+ pprStFormat format,
+ char '\t',
+ pprReg reg,
+ pp_comma_lbracket,
+ pprAddr addr,
+ rbrack
+ ]
+
+
+pprInstr (ADD x cc reg1 ri reg2)
+ | not x && not cc && riZero ri
+ = hcat [ text "\tmov\t", pprReg reg1, comma, pprReg reg2 ]
+
+ | otherwise
+ = pprRegRIReg (if x then sLit "addx" else sLit "add") cc reg1 ri reg2
+
+
+pprInstr (SUB x cc reg1 ri reg2)
+ | not x && cc && reg2 == g0
+ = hcat [ text "\tcmp\t", pprReg reg1, comma, pprRI ri ]
+
+ | not x && not cc && riZero ri
+ = hcat [ text "\tmov\t", pprReg reg1, comma, pprReg reg2 ]
+
+ | otherwise
+ = pprRegRIReg (if x then sLit "subx" else sLit "sub") cc reg1 ri reg2
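   So, for example, the special cases above print some of the arithmetic as the
   usual synthetic instructions:

       SUB False True  r1 (RIReg r2) g0   ==>   cmp  r1, r2
       SUB False False r1 (RIReg g0) r2   ==>   mov  r1, r2
       ADD False False r1 (RIReg g0) r2   ==>   mov  r1, r2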
+
+pprInstr (AND b reg1 ri reg2) = pprRegRIReg (sLit "and") b reg1 ri reg2
+
+pprInstr (ANDN b reg1 ri reg2) = pprRegRIReg (sLit "andn") b reg1 ri reg2
+
+pprInstr (OR b reg1 ri reg2)
+ | not b && reg1 == g0
+ = let doit = hcat [ text "\tmov\t", pprRI ri, comma, pprReg reg2 ]
+ in case ri of
+ RIReg rrr | rrr == reg2 -> empty
+ _ -> doit
+
+ | otherwise
+ = pprRegRIReg (sLit "or") b reg1 ri reg2
+
+pprInstr (ORN b reg1 ri reg2) = pprRegRIReg (sLit "orn") b reg1 ri reg2
+
+pprInstr (XOR b reg1 ri reg2) = pprRegRIReg (sLit "xor") b reg1 ri reg2
+pprInstr (XNOR b reg1 ri reg2) = pprRegRIReg (sLit "xnor") b reg1 ri reg2
+
+pprInstr (SLL reg1 ri reg2) = pprRegRIReg (sLit "sll") False reg1 ri reg2
+pprInstr (SRL reg1 ri reg2) = pprRegRIReg (sLit "srl") False reg1 ri reg2
+pprInstr (SRA reg1 ri reg2) = pprRegRIReg (sLit "sra") False reg1 ri reg2
+
+pprInstr (RDY rd) = text "\trd\t%y," <> pprReg rd
+pprInstr (WRY reg1 reg2)
+ = text "\twr\t"
+ <> pprReg reg1
+ <> char ','
+ <> pprReg reg2
+ <> char ','
+ <> text "%y"
+
+pprInstr (SMUL b reg1 ri reg2) = pprRegRIReg (sLit "smul") b reg1 ri reg2
+pprInstr (UMUL b reg1 ri reg2) = pprRegRIReg (sLit "umul") b reg1 ri reg2
+pprInstr (SDIV b reg1 ri reg2) = pprRegRIReg (sLit "sdiv") b reg1 ri reg2
+pprInstr (UDIV b reg1 ri reg2) = pprRegRIReg (sLit "udiv") b reg1 ri reg2
+
+pprInstr (SETHI imm reg)
+ = hcat [
+ text "\tsethi\t",
+ pprImm imm,
+ comma,
+ pprReg reg
+ ]
+
+pprInstr NOP
+ = text "\tnop"
+
+pprInstr (FABS format reg1 reg2)
+ = pprFormatRegReg (sLit "fabs") format reg1 reg2
+
+pprInstr (FADD format reg1 reg2 reg3)
+ = pprFormatRegRegReg (sLit "fadd") format reg1 reg2 reg3
+
+pprInstr (FCMP e format reg1 reg2)
+ = pprFormatRegReg (if e then sLit "fcmpe" else sLit "fcmp")
+ format reg1 reg2
+
+pprInstr (FDIV format reg1 reg2 reg3)
+ = pprFormatRegRegReg (sLit "fdiv") format reg1 reg2 reg3
+
+pprInstr (FMOV format reg1 reg2)
+ = pprFormatRegReg (sLit "fmov") format reg1 reg2
+
+pprInstr (FMUL format reg1 reg2 reg3)
+ = pprFormatRegRegReg (sLit "fmul") format reg1 reg2 reg3
+
+pprInstr (FNEG format reg1 reg2)
+ = pprFormatRegReg (sLit "fneg") format reg1 reg2
+
+pprInstr (FSQRT format reg1 reg2)
+ = pprFormatRegReg (sLit "fsqrt") format reg1 reg2
+
+pprInstr (FSUB format reg1 reg2 reg3)
+ = pprFormatRegRegReg (sLit "fsub") format reg1 reg2 reg3
+
+pprInstr (FxTOy format1 format2 reg1 reg2)
+ = hcat [
+ text "\tf",
+ ptext
+ (case format1 of
+ II32 -> sLit "ito"
+ FF32 -> sLit "sto"
+ FF64 -> sLit "dto"
+ _ -> panic "SPARC.Ppr.pprInstr.FxToY: no match"),
+ ptext
+ (case format2 of
+ II32 -> sLit "i\t"
+ II64 -> sLit "x\t"
+ FF32 -> sLit "s\t"
+ FF64 -> sLit "d\t"
+ _ -> panic "SPARC.Ppr.pprInstr.FxToY: no match"),
+ pprReg reg1, comma, pprReg reg2
+ ]
+
+
+pprInstr (BI cond b blockid)
+ = hcat [
+ text "\tb", pprCond cond,
+ if b then pp_comma_a else empty,
+ char '\t',
+ ppr (blockLbl blockid)
+ ]
+
+pprInstr (BF cond b blockid)
+ = hcat [
+ text "\tfb", pprCond cond,
+ if b then pp_comma_a else empty,
+ char '\t',
+ ppr (blockLbl blockid)
+ ]
+
+pprInstr (JMP addr) = text "\tjmp\t" <> pprAddr addr
+pprInstr (JMP_TBL op _ _) = pprInstr (JMP op)
+
+pprInstr (CALL (Left imm) n _)
+ = hcat [ text "\tcall\t", pprImm imm, comma, int n ]
+
+pprInstr (CALL (Right reg) n _)
+ = hcat [ text "\tcall\t", pprReg reg, comma, int n ]
+
+
+-- | Pretty print a RI
+pprRI :: RI -> SDoc
+pprRI (RIReg r) = pprReg r
+pprRI (RIImm r) = pprImm r
+
+
+-- | Pretty print a two reg instruction.
+pprFormatRegReg :: PtrString -> Format -> Reg -> Reg -> SDoc
+pprFormatRegReg name format reg1 reg2
+ = hcat [
+ char '\t',
+ ptext name,
+ (case format of
+ FF32 -> text "s\t"
+ FF64 -> text "d\t"
+ _ -> panic "SPARC.Ppr.pprFormatRegReg: no match"),
+
+ pprReg reg1,
+ comma,
+ pprReg reg2
+ ]
+
+
+-- | Pretty print a three reg instruction.
+pprFormatRegRegReg :: PtrString -> Format -> Reg -> Reg -> Reg -> SDoc
+pprFormatRegRegReg name format reg1 reg2 reg3
+ = hcat [
+ char '\t',
+ ptext name,
+ (case format of
+ FF32 -> text "s\t"
+ FF64 -> text "d\t"
+ _ -> panic "SPARC.Ppr.pprFormatRegReg: no match"),
+ pprReg reg1,
+ comma,
+ pprReg reg2,
+ comma,
+ pprReg reg3
+ ]
+
+
+-- | Pretty print an instruction of two regs and a ri.
+pprRegRIReg :: PtrString -> Bool -> Reg -> RI -> Reg -> SDoc
+pprRegRIReg name b reg1 ri reg2
+ = hcat [
+ char '\t',
+ ptext name,
+ if b then text "cc\t" else char '\t',
+ pprReg reg1,
+ comma,
+ pprRI ri,
+ comma,
+ pprReg reg2
+ ]
+
+{-
+pprRIReg :: PtrString -> Bool -> RI -> Reg -> SDoc
+pprRIReg name b ri reg1
+ = hcat [
+ char '\t',
+ ptext name,
+ if b then text "cc\t" else char '\t',
+ pprRI ri,
+ comma,
+ pprReg reg1
+ ]
+-}
+
+{-
+pp_ld_lbracket :: SDoc
+pp_ld_lbracket = text "\tld\t["
+-}
+
+pp_rbracket_comma :: SDoc
+pp_rbracket_comma = text "],"
+
+
+pp_comma_lbracket :: SDoc
+pp_comma_lbracket = text ",["
+
+
+pp_comma_a :: SDoc
+pp_comma_a = text ",a"
diff --git a/compiler/GHC/CmmToAsm/SPARC/Regs.hs b/compiler/GHC/CmmToAsm/SPARC/Regs.hs
new file mode 100644
index 0000000000..ba22470912
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Regs.hs
@@ -0,0 +1,259 @@
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 1994-2004
+--
+-- -----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.SPARC.Regs (
+ -- registers
+ showReg,
+ virtualRegSqueeze,
+ realRegSqueeze,
+ classOfRealReg,
+ allRealRegs,
+
+ -- machine specific info
+ gReg, iReg, lReg, oReg, fReg,
+ fp, sp, g0, g1, g2, o0, o1, f0, f1, f6, f8, f22, f26, f27,
+
+ -- allocatable
+ allocatableRegs,
+
+ -- args
+ argRegs,
+ allArgRegs,
+ callClobberedRegs,
+
+ --
+ mkVirtualReg,
+ regDotColor
+)
+
+where
+
+
+import GhcPrelude
+
+import GHC.Platform.SPARC
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+import GHC.CmmToAsm.Format
+
+import Unique
+import Outputable
+
+{-
+ The SPARC has 64 registers of interest; 32 integer registers and 32
+ floating point registers. The mapping of STG registers to SPARC
+ machine registers is defined in StgRegs.h. We are, of course,
+ prepared for any eventuality.
+
+ The whole fp-register pairing thing on sparcs is a huge nuisance. See
+ includes/stg/MachRegs.h for a description of what's going on
+ here.
+-}
+
+
+-- | Get the standard name for the register with this number.
+showReg :: RegNo -> String
+showReg n
+ | n >= 0 && n < 8 = "%g" ++ show n
+ | n >= 8 && n < 16 = "%o" ++ show (n-8)
+ | n >= 16 && n < 24 = "%l" ++ show (n-16)
+ | n >= 24 && n < 32 = "%i" ++ show (n-24)
+ | n >= 32 && n < 64 = "%f" ++ show (n-32)
+ | otherwise = panic "SPARC.Regs.showReg: unknown sparc register"
+
+
+-- Get the register class of a certain real reg
+classOfRealReg :: RealReg -> RegClass
+classOfRealReg reg
+ = case reg of
+ RealRegSingle i
+ | i < 32 -> RcInteger
+ | otherwise -> RcFloat
+
+ RealRegPair{} -> RcDouble
+
+
+-- | regSqueeze_class reg
+-- Calculate the maximum number of register colors that could be
+-- denied to a node of this class due to having this reg
+-- as a neighbour.
+--
+{-# INLINE virtualRegSqueeze #-}
+virtualRegSqueeze :: RegClass -> VirtualReg -> Int
+
+virtualRegSqueeze cls vr
+ = case cls of
+ RcInteger
+ -> case vr of
+ VirtualRegI{} -> 1
+ VirtualRegHi{} -> 1
+ _other -> 0
+
+ RcFloat
+ -> case vr of
+ VirtualRegF{} -> 1
+ VirtualRegD{} -> 2
+ _other -> 0
+
+ RcDouble
+ -> case vr of
+ VirtualRegF{} -> 1
+ VirtualRegD{} -> 1
+ _other -> 0
+
+
+{-# INLINE realRegSqueeze #-}
+realRegSqueeze :: RegClass -> RealReg -> Int
+
+realRegSqueeze cls rr
+ = case cls of
+ RcInteger
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < 32 -> 1
+ | otherwise -> 0
+
+ RealRegPair{} -> 0
+
+ RcFloat
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < 32 -> 0
+ | otherwise -> 1
+
+ RealRegPair{} -> 2
+
+ RcDouble
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < 32 -> 0
+ | otherwise -> 1
+
+ RealRegPair{} -> 1
+
+
+-- | All the allocatable registers in the machine,
+-- including register pairs.
+allRealRegs :: [RealReg]
+allRealRegs
+ = [ (RealRegSingle i) | i <- [0..63] ]
+ ++ [ (RealRegPair i (i+1)) | i <- [32, 34 .. 62 ] ]
+
+
+-- | Get the regno for this sort of reg
+gReg, lReg, iReg, oReg, fReg :: Int -> RegNo
+
+gReg x = x -- global regs
+oReg x = (8 + x) -- output regs
+lReg x = (16 + x) -- local regs
+iReg x = (24 + x) -- input regs
+fReg x = (32 + x) -- float regs
+
+
+-- | Some specific regs used by the code generator.
+g0, g1, g2, fp, sp, o0, o1, f0, f1, f6, f8, f22, f26, f27 :: Reg
+
+f6 = RegReal (RealRegSingle (fReg 6))
+f8 = RegReal (RealRegSingle (fReg 8))
+f22 = RegReal (RealRegSingle (fReg 22))
+f26 = RegReal (RealRegSingle (fReg 26))
+f27 = RegReal (RealRegSingle (fReg 27))
+
+-- g0 is always zero, and writes to it vanish.
+g0 = RegReal (RealRegSingle (gReg 0))
+g1 = RegReal (RealRegSingle (gReg 1))
+g2 = RegReal (RealRegSingle (gReg 2))
+
+-- FP, SP, int and float return (from C) regs.
+fp = RegReal (RealRegSingle (iReg 6))
+sp = RegReal (RealRegSingle (oReg 6))
+o0 = RegReal (RealRegSingle (oReg 0))
+o1 = RegReal (RealRegSingle (oReg 1))
+f0 = RegReal (RealRegSingle (fReg 0))
+f1 = RegReal (RealRegSingle (fReg 1))
+
+-- | Produce the second-half-of-a-double register given the first half.
+{-
+fPair :: Reg -> Maybe Reg
+fPair (RealReg n)
+ | n >= 32 && n `mod` 2 == 0 = Just (RealReg (n+1))
+
+fPair (VirtualRegD u)
+ = Just (VirtualRegHi u)
+
+fPair reg
+ = trace ("MachInstrs.fPair: can't get high half of supposed double reg " ++ showPpr reg)
+ Nothing
+-}
+
+
+-- | All the regs that the register allocator can allocate to,
+-- with the fixed use regs removed.
+--
+allocatableRegs :: [RealReg]
+allocatableRegs
+ = let isFree rr
+ = case rr of
+ RealRegSingle r -> freeReg r
+ RealRegPair r1 r2 -> freeReg r1 && freeReg r2
+ in filter isFree allRealRegs
+
+
+-- | The registers to place arguments for function calls,
+-- for some number of arguments.
+--
+argRegs :: RegNo -> [Reg]
+argRegs r
+ = case r of
+ 0 -> []
+ 1 -> map (RegReal . RealRegSingle . oReg) [0]
+ 2 -> map (RegReal . RealRegSingle . oReg) [0,1]
+ 3 -> map (RegReal . RealRegSingle . oReg) [0,1,2]
+ 4 -> map (RegReal . RealRegSingle . oReg) [0,1,2,3]
+ 5 -> map (RegReal . RealRegSingle . oReg) [0,1,2,3,4]
+ 6 -> map (RegReal . RealRegSingle . oReg) [0,1,2,3,4,5]
+ _ -> panic "MachRegs.argRegs(sparc): don't know about >6 arguments!"
+
+
+-- | All the regs that could possibly be returned by argRegs
+--
+allArgRegs :: [Reg]
+allArgRegs
+ = map (RegReal . RealRegSingle) [oReg i | i <- [0..5]]
+
+
+-- These are the regs that we cannot assume stay alive over a C call.
+-- TODO: Why can we assume that o6 isn't clobbered? -- BL 2009/02
+--
+callClobberedRegs :: [Reg]
+callClobberedRegs
+ = map (RegReal . RealRegSingle)
+ ( oReg 7 :
+ [oReg i | i <- [0..5]] ++
+ [gReg i | i <- [1..7]] ++
+ [fReg i | i <- [0..31]] )
+
+
+
+-- | Make a virtual reg with this format.
+mkVirtualReg :: Unique -> Format -> VirtualReg
+mkVirtualReg u format
+ | not (isFloatFormat format)
+ = VirtualRegI u
+
+ | otherwise
+ = case format of
+ FF32 -> VirtualRegF u
+ FF64 -> VirtualRegD u
+ _ -> panic "mkVReg"
+
+
+regDotColor :: RealReg -> SDoc
+regDotColor reg
+ = case classOfRealReg reg of
+ RcInteger -> text "blue"
+ RcFloat -> text "red"
+ _other -> text "green"
diff --git a/compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs b/compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs
new file mode 100644
index 0000000000..2d1f77d737
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs
@@ -0,0 +1,74 @@
+module GHC.CmmToAsm.SPARC.ShortcutJump (
+ JumpDest(..), getJumpDestBlockId,
+ canShortcut,
+ shortcutJump,
+ shortcutStatics,
+ shortBlockId
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Imm
+
+import GHC.Cmm.CLabel
+import GHC.Cmm.BlockId
+import GHC.Cmm
+
+import Panic
+import Outputable
+
+data JumpDest
+ = DestBlockId BlockId
+ | DestImm Imm
+
+-- Debug Instance
+instance Outputable JumpDest where
+ ppr (DestBlockId bid) = text "blk:" <> ppr bid
+ ppr (DestImm _bid) = text "imm:?"
+
+getJumpDestBlockId :: JumpDest -> Maybe BlockId
+getJumpDestBlockId (DestBlockId bid) = Just bid
+getJumpDestBlockId _ = Nothing
+
+
+canShortcut :: Instr -> Maybe JumpDest
+canShortcut _ = Nothing
+
+
+shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
+shortcutJump _ other = other
+
+
+
+shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics
+shortcutStatics fn (RawCmmStatics lbl statics)
+ = RawCmmStatics lbl $ map (shortcutStatic fn) statics
+ -- we need to get the jump tables, so apply the mapping to the entries
+ -- of a CmmData too.
+
+shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
+shortcutLabel fn lab
+ | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn blkId
+ | otherwise = lab
+
+shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
+shortcutStatic fn (CmmStaticLit (CmmLabel lab))
+ = CmmStaticLit (CmmLabel (shortcutLabel fn lab))
+shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
+ = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
+-- slightly dodgy, we're ignoring the second label, but this
+-- works with the way we use CmmLabelDiffOff for jump tables now.
+shortcutStatic _ other_static
+ = other_static
+
+
+shortBlockId :: (BlockId -> Maybe JumpDest) -> BlockId -> CLabel
+shortBlockId fn blockid =
+ case fn blockid of
+ Nothing -> blockLbl blockid
+ Just (DestBlockId blockid') -> shortBlockId fn blockid'
+ Just (DestImm (ImmCLbl lbl)) -> lbl
+ _other -> panic "shortBlockId"
diff --git a/compiler/GHC/CmmToAsm/SPARC/Stack.hs b/compiler/GHC/CmmToAsm/SPARC/Stack.hs
new file mode 100644
index 0000000000..d9a0ffd7cf
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Stack.hs
@@ -0,0 +1,59 @@
+module GHC.CmmToAsm.SPARC.Stack (
+ spRel,
+ fpRel,
+ spillSlotToOffset,
+ maxSpillSlots
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.SPARC.Imm
+
+import GHC.Driver.Session
+import Outputable
+
+-- | Get an AddrMode relative to the address in sp.
+-- This gives us a stack relative addressing mode for volatile
+-- temporaries and for excess call arguments.
+--
+spRel :: Int -- ^ stack offset in words, positive or negative
+ -> AddrMode
+
+spRel n = AddrRegImm sp (ImmInt (n * wordLength))
+
+
+-- | Get an address relative to the frame pointer.
+-- This doesn't work for offsets greater than 13 bits; we just hope for the best
+--
+fpRel :: Int -> AddrMode
+fpRel n
+ = AddrRegImm fp (ImmInt (n * wordLength))
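   For example, assuming wordLength is 4 bytes on this 32-bit target:

       spRel 2     ==>  AddrRegImm sp (ImmInt 8)      -- [%sp + 8]
       fpRel (-1)  ==>  AddrRegImm fp (ImmInt (-4))   -- [%fp - 4]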
+
+
+-- | Convert a spill slot number to a *byte* offset, with no sign.
+--
+spillSlotToOffset :: DynFlags -> Int -> Int
+spillSlotToOffset dflags slot
+ | slot >= 0 && slot < maxSpillSlots dflags
+ = 64 + spillSlotSize * slot
+
+ | otherwise
+ = pprPanic "spillSlotToOffset:"
+ ( text "invalid spill location: " <> int slot
+ $$ text "maxSpillSlots: " <> int (maxSpillSlots dflags))
+
+
+-- | The maximum number of spill slots available on the C stack.
+-- If we use up all of the slots, then we're screwed.
+--
+-- Why do we reserve 64 bytes, instead of using the whole thing??
+-- -- BL 2009/02/15
+--
+maxSpillSlots :: DynFlags -> Int
+maxSpillSlots dflags
+ = ((spillAreaLength dflags - 64) `div` spillSlotSize) - 1
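   A worked example of the offset arithmetic, assuming spillSlotSize is 8 bytes
   (large enough for a double):

       spillSlotToOffset dflags 0   ==>  64
       spillSlotToOffset dflags 3   ==>  64 + 8*3 = 88

   sparc_mkSpillInstr in SPARC.Instr then addresses the slot as
   fpRel (negate (1 + off `div` 4)).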
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
new file mode 100644
index 0000000000..f3b20c19e1
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -0,0 +1,3747 @@
+{-# LANGUAGE CPP, GADTs, NondecreasingIndentation #-}
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE BangPatterns #-}
+
+#if __GLASGOW_HASKELL__ <= 808
+-- GHC 8.10 deprecates this flag, but GHC 8.8 needs it
+-- The default iteration limit is a bit too low for the definitions
+-- in this module.
+{-# OPTIONS_GHC -fmax-pmcheck-iterations=10000000 #-}
+#endif
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-----------------------------------------------------------------------------
+--
+-- Generating machine code (instruction selection)
+--
+-- (c) The University of Glasgow 1996-2004
+--
+-----------------------------------------------------------------------------
+
+-- This is a big module, but, if you pay attention to
+-- (a) the sectioning, and (b) the type signatures, the
+-- structure should not be too overwhelming.
+
+module GHC.CmmToAsm.X86.CodeGen (
+ cmmTopCodeGen,
+ generateJumpTableForInstr,
+ extractUnwindPoints,
+ invertCondBranches,
+ InstrBlock
+)
+
+where
+
+#include "HsVersions.h"
+
+-- NCG stuff:
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Instr
+import GHC.CmmToAsm.X86.Cond
+import GHC.CmmToAsm.X86.Regs
+import GHC.CmmToAsm.X86.Ppr ( )
+import GHC.CmmToAsm.X86.RegInfo
+
+import GHC.Platform.Regs
+import GHC.CmmToAsm.CPrim
+import GHC.Cmm.DebugBlock
+ ( DebugBlock(..), UnwindPoint(..), UnwindTable
+ , UnwindExpr(UwReg), toUnwindExpr
+ )
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.PIC
+import GHC.CmmToAsm.Monad
+ ( NatM, getNewRegNat, getNewLabelNat, setDeltaNat
+ , getDeltaNat, getBlockIdNat, getPicBaseNat, getNewRegPairNat
+ , getPicBaseMaybeNat, getDebugBlock, getFileId
+ , addImmediateSuccessorNat, updateCfgNat
+ )
+import GHC.CmmToAsm.CFG
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.Platform
+
+-- Our intermediate code:
+import BasicTypes
+import GHC.Cmm.BlockId
+import Module ( primUnitId )
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Graph
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.CLabel
+import CoreSyn ( Tickish(..) )
+import SrcLoc ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
+
+-- The rest:
+import ForeignCall ( CCallConv(..) )
+import OrdList
+import Outputable
+import FastString
+import GHC.Driver.Session
+import Util
+import UniqSupply ( getUniqueM )
+
+import Control.Monad
+import Data.Bits
+import Data.Foldable (fold)
+import Data.Int
+import Data.Maybe
+import Data.Word
+
+import qualified Data.Map as M
+
+is32BitPlatform :: NatM Bool
+is32BitPlatform = do
+ dflags <- getDynFlags
+ return $ target32Bit (targetPlatform dflags)
+
+sse2Enabled :: NatM Bool
+sse2Enabled = do
+ dflags <- getDynFlags
+ case platformArch (targetPlatform dflags) of
+ -- We assume SSE1 and SSE2 operations are available on both
+ -- x86 and x86_64. Historically we didn't default to SSE2 and
+ -- SSE1 on x86, which results in de facto nondeterminism for how
+ -- rounding behaves in the associated x87 floating point instructions
+ -- because variations in the spill/fpu stack placement of arguments for
+ -- operations would change the precision and final result of what
+ -- would otherwise be the same expressions with respect to single or
+ -- double precision IEEE floating point computations.
+ ArchX86_64 -> return True
+ ArchX86 -> return True
+ _ -> panic "trying to generate x86/x86_64 on the wrong platform"
+
+
+sse4_2Enabled :: NatM Bool
+sse4_2Enabled = do
+ dflags <- getDynFlags
+ return (isSse4_2Enabled dflags)
+
+
+cmmTopCodeGen
+ :: RawCmmDecl
+ -> NatM [NatCmmDecl (Alignment, RawCmmStatics) Instr]
+
+cmmTopCodeGen (CmmProc info lab live graph) = do
+ let blocks = toBlockListEntryFirst graph
+ (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
+ picBaseMb <- getPicBaseMaybeNat
+ dflags <- getDynFlags
+ let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
+ tops = proc : concat statics
+ os = platformOS $ targetPlatform dflags
+
+ case picBaseMb of
+ Just picBase -> initializePicBase_x86 ArchX86 os picBase tops
+ Nothing -> return tops
+
+cmmTopCodeGen (CmmData sec dat) = do
+ return [CmmData sec (mkAlignment 1, dat)] -- no translation, we just use CmmStatic
+
+{- Note [Verifying basic blocks]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ We want to guarantee a few things about the results
+ of instruction selection.
+
+ Namely that each basic block consists of:
+ * A (potentially empty) sequence of straight line instructions
+ followed by
+ * A (potentially empty) sequence of jump like instructions.
+
+ We can verify this by going through the instructions and
+ making sure that any non-jumpish instruction can't appear
+ after a jumpish instruction.
+
+ There are gotchas however:
+ * CALLs are, strictly speaking, control flow, but we do not
+ care about them here. Hence we treat them as regular instructions.
+
+ It's safe for them to appear inside a basic block
+ because (ignoring side effects inside the call) they result in
+ straight line code.
+
+ * NEWBLOCK marks the start of a new basic block, so it can
+ be followed by any instructions.
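+
+ As an illustrative example, the instruction sequence
+
+ NEWBLOCK, MOV, CALL, JXX, JMP
+
+ is accepted, while
+
+ NEWBLOCK, JXX, MOV
+
+ makes verifyBasicBlock panic, because a non-jumpish MOV appears
+ after a jumpish instruction.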
+-}
+
+-- Verifying basic blocks is cheap, but not cheap enough to enable it
+-- unconditionally, so we only do it in debug builds (when 'debugIsOn' holds).
+verifyBasicBlock :: [Instr] -> ()
+verifyBasicBlock instrs
+ | debugIsOn = go False instrs
+ | otherwise = ()
+ where
+ go _ [] = ()
+ go atEnd (i:instr)
+ = case i of
+ -- Start a new basic block
+ NEWBLOCK {} -> go False instr
+ -- Calls are not viable block terminators
+ CALL {} | atEnd -> faultyBlockWith i
+ | not atEnd -> go atEnd instr
+ -- All instructions ok, check if we reached the end and continue.
+ _ | not atEnd -> go (isJumpishInstr i) instr
+ -- Only jumps allowed at the end of basic blocks.
+ | otherwise -> if isJumpishInstr i
+ then go True instr
+ else faultyBlockWith i
+ faultyBlockWith i
+ = pprPanic "Non control flow instructions after end of basic block."
+ (ppr i <+> text "in:" $$ vcat (map ppr instrs))
+
+basicBlockCodeGen
+ :: CmmBlock
+ -> NatM ( [NatBasicBlock Instr]
+ , [NatCmmDecl (Alignment, RawCmmStatics) Instr])
+
+basicBlockCodeGen block = do
+ let (_, nodes, tail) = blockSplit block
+ id = entryLabel block
+ stmts = blockToList nodes
+ -- Generate location directive
+ dbg <- getDebugBlock (entryLabel block)
+ loc_instrs <- case dblSourceTick =<< dbg of
+ Just (SourceNote span name)
+ -> do fileId <- getFileId (srcSpanFile span)
+ let line = srcSpanStartLine span; col = srcSpanStartCol span
+ return $ unitOL $ LOCATION fileId line col name
+ _ -> return nilOL
+ (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
+ (!tail_instrs,_) <- stmtToInstrs mid_bid tail
+ let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs
+ return $! verifyBasicBlock (fromOL instrs)
+ instrs' <- fold <$> traverse addSpUnwindings instrs
+ -- code generation may introduce new basic block boundaries, which
+ -- are indicated by the NEWBLOCK instruction. We must split up the
+ -- instruction stream into basic blocks again. We also extract
+ -- LDATAs here.
+ let
+ (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs'
+
+ mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
+ = ([], BasicBlock id instrs : blocks, statics)
+ mkBlocks (LDATA sec dat) (instrs,blocks,statics)
+ = (instrs, blocks, CmmData sec dat:statics)
+ mkBlocks instr (instrs,blocks,statics)
+ = (instr:instrs, blocks, statics)
+ return (BasicBlock id top : other_blocks, statics)
+
+-- | Convert 'DELTA' instructions into 'UNWIND' instructions to capture changes
+-- in the @sp@ register. See Note [What is this unwinding business?] in
+-- GHC.Cmm.DebugBlock for details.
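+--
+-- For example (illustrative), with @-g1@ or higher a @DELTA (-16)@ becomes
+-- @DELTA (-16)@ followed by an @UNWIND@ entry mapping @MachSp@ to
+-- @UwReg MachSp 16@.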
+addSpUnwindings :: Instr -> NatM (OrdList Instr)
+addSpUnwindings instr@(DELTA d) = do
+ dflags <- getDynFlags
+ if debugLevel dflags >= 1
+ then do lbl <- mkAsmTempLabel <$> getUniqueM
+ let unwind = M.singleton MachSp (Just $ UwReg MachSp $ negate d)
+ return $ toOL [ instr, UNWIND lbl unwind ]
+ else return (unitOL instr)
+addSpUnwindings instr = return $ unitOL instr
+
+{- Note [Keeping track of the current block]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When generating instructions for Cmm we sometimes require
+the current block for things like retry loops.
+
+We also sometimes change the current block, if a MachOP
+results in branching control flow.
+
+Issues arise if we have two statements in the same block,
+which both depend on the current block id *and* change the
+basic block after them. This happens for atomic primops
+in the X86 backend where we want to update the CFG data structure
+when introducing new basic blocks.
+
+For example in #17334 we got this Cmm code:
+
+ c3Bf: // global
+ (_s3t1::I64) = call MO_AtomicRMW W64 AMO_And(_s3sQ::P64 + 88, 18);
+ (_s3t4::I64) = call MO_AtomicRMW W64 AMO_Or(_s3sQ::P64 + 88, 0);
+ _s3sT::I64 = _s3sV::I64;
+ goto c3B1;
+
+This resulted in two new basic blocks being inserted:
+
+ c3Bf:
+ movl $18,%vI_n3Bo
+ movq 88(%vI_s3sQ),%rax
+ jmp _n3Bp
+ n3Bp:
+ ...
+ cmpxchgq %vI_n3Bq,88(%vI_s3sQ)
+ jne _n3Bp
+ ...
+ jmp _n3Bs
+ n3Bs:
+ ...
+ cmpxchgq %vI_n3Bt,88(%vI_s3sQ)
+ jne _n3Bs
+ ...
+ jmp _c3B1
+ ...
+
+When stmtToInstrs was called on this Cmm, both atomic operations were
+translated under the assumption that they would be placed into their Cmm
+basic block `c3Bf`.
+However for the retry loop we introduce new labels, so this is not the case
+for the second statement.
+This resulted in a desync between the explicit control flow graph
+we construct as a separate data type and the actual control flow graph in the code.
+
+Instead we now return the new basic block if a statement causes a change
+in the current block, and use that block for all following statements.
+
+For this reason genCCall is also split into two parts:
+one for calls which *won't* change the basic block in
+which successive instructions will be placed, and
+another for calls which *are* known to change the
+basic block.
+
+-}
+
+-- See Note [Keeping track of the current block] for why
+-- we pass the BlockId.
+stmtsToInstrs :: BlockId -- ^ Basic block these statements will initially be placed in.
+ -> [CmmNode O O] -- ^ Cmm statements
+ -> NatM (InstrBlock, BlockId) -- ^ Resulting instructions and the basic block
+ -- the final statement is placed in
+stmtsToInstrs bid stmts =
+ go bid stmts nilOL
+ where
+ go bid [] instrs = return (instrs,bid)
+ go bid (s:stmts) instrs = do
+ (instrs',bid') <- stmtToInstrs bid s
+ -- If the statement introduced a new block, we use that one
+ let !newBid = fromMaybe bid bid'
+ go newBid stmts (instrs `appOL` instrs')
+
+-- | `bid` refers to the current block and is used to update the CFG
+-- if new blocks are inserted in the control flow.
+-- See Note [Keeping track of the current block] for more details.
+stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in.
+ -> CmmNode e x
+ -> NatM (InstrBlock, Maybe BlockId)
+ -- ^ Instructions, and bid of new block if successive
+ -- statements are placed in a different basic block.
+stmtToInstrs bid stmt = do
+ dflags <- getDynFlags
+ is32Bit <- is32BitPlatform
+ case stmt of
+ CmmUnsafeForeignCall target result_regs args
+ -> genCCall dflags is32Bit target result_regs args bid
+
+ _ -> (,Nothing) <$> case stmt of
+ CmmComment s -> return (unitOL (COMMENT s))
+ CmmTick {} -> return nilOL
+
+ CmmUnwind regs -> do
+ let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
+ to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr)
+ case foldMap to_unwind_entry regs of
+ tbl | M.null tbl -> return nilOL
+ | otherwise -> do
+ lbl <- mkAsmTempLabel <$> getUniqueM
+ return $ unitOL $ UNWIND lbl tbl
+
+ CmmAssign reg src
+ | isFloatType ty -> assignReg_FltCode format reg src
+ | is32Bit && isWord64 ty -> assignReg_I64Code reg src
+ | otherwise -> assignReg_IntCode format reg src
+ where ty = cmmRegType dflags reg
+ format = cmmTypeFormat ty
+
+ CmmStore addr src
+ | isFloatType ty -> assignMem_FltCode format addr src
+ | is32Bit && isWord64 ty -> assignMem_I64Code addr src
+ | otherwise -> assignMem_IntCode format addr src
+ where ty = cmmExprType dflags src
+ format = cmmTypeFormat ty
+
+ CmmBranch id -> return $ genBranch id
+
+ -- We try to arrange blocks such that the likely branch is the fallthrough
+ -- in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
+ CmmCondBranch arg true false _ -> genCondBranch bid true false arg
+ CmmSwitch arg ids -> do dflags <- getDynFlags
+ genSwitch dflags arg ids
+ CmmCall { cml_target = arg
+ , cml_args_regs = gregs } -> do
+ dflags <- getDynFlags
+ genJump arg (jumpRegs dflags gregs)
+ _ ->
+ panic "stmtToInstrs: statement should have been cps'd away"
+
+
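+-- | The registers to annotate as live across a jump: those global
+-- registers that map to a real machine register on this platform.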
+jumpRegs :: DynFlags -> [GlobalReg] -> [Reg]
+jumpRegs dflags gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ]
+ where platform = targetPlatform dflags
+
+--------------------------------------------------------------------------------
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+-- They are really trees of insns to facilitate fast appending, where a
+-- left-to-right traversal yields the insns in the correct order.
+--
+type InstrBlock
+ = OrdList Instr
+
+
+-- | Condition codes passed up the tree.
+--
+data CondCode
+ = CondCode Bool Cond InstrBlock
+
+
+-- | a.k.a. "Register64"
+-- Reg is the lower 32-bit temporary which contains the result.
+-- Use getHiVRegFromLo to find the other VRegUnique.
+--
+-- The rule of this simplified insn selection game is therefore that
+-- the returned Reg may be modified.
+--
+data ChildCode64
+ = ChildCode64
+ InstrBlock
+ Reg
+
+
+-- | Registers passed up the tree. If the Cmm code forces the register
+-- to live in a pre-decided machine register, it comes out as @Fixed@;
+-- otherwise, it comes out as @Any@, and the parent can decide which
+-- register to put it in.
+--
+data Register
+ = Fixed Format Reg InstrBlock
+ | Any Format (Reg -> InstrBlock)
+
+
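+-- | Change the format attached to a 'Register' without generating any
+-- code; used for conversions that are no-ops at the machine level
+-- (see @conversionNop@ below).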
+swizzleRegisterRep :: Register -> Format -> Register
+swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
+swizzleRegisterRep (Any _ codefn) format = Any format codefn
+
+
+-- | Grab the Reg for a CmmReg
+getRegisterReg :: Platform -> CmmReg -> Reg
+
+getRegisterReg _ (CmmLocal (LocalReg u pk))
+ = -- assuming SSE2, Int, Word, Float and Double can all be register allocated
+ let fmt = cmmTypeFormat pk in
+ RegVirtual (mkVirtualReg u fmt)
+
+getRegisterReg platform (CmmGlobal mid)
+ = case globalRegMaybe platform mid of
+ Just reg -> RegReal $ reg
+ Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
+ -- By this stage, the only MagicIds remaining should be the
+ -- ones which map to a real machine register on this
+ -- platform. Hence ...
+
+
+-- | Memory addressing modes passed up the tree.
+data Amode
+ = Amode AddrMode InstrBlock
+
+{-
+Now, given a tree (the argument to a CmmLoad) that references memory,
+produce a suitable addressing mode.
+
+A Rule of the Game (tm) for Amodes: use of the addr bit must
+immediately follow use of the code part, since the code part puts
+values in registers which the addr then refers to. So you can't put
+anything in between, lest it overwrite some of those registers. If
+you need to do some other computation between the code part and use of
+the addr bit, first store the effective address from the amode in a
+temporary, then do the other computation, and then use the temporary:
+
+ code
+ LEA amode, tmp
+ ... other computation ...
+ ... (tmp) ...
+-}
+
+
+-- | Check whether an integer will fit in 32 bits.
+-- A CmmInt is intended to be truncated to the appropriate
+-- number of bits, so here we truncate it to Int64. This is
+-- important because e.g. -1 as a CmmInt might be either
+-- -1 or 18446744073709551615.
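+-- For example, @is32BitInteger 0xffffffffffffffff@ is True: the value
+-- truncates to (-1) :: Int64, which fits in 32 bits.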
+--
+is32BitInteger :: Integer -> Bool
+is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
+ where i64 = fromIntegral i :: Int64
+
+
+-- | Convert a BlockId to some CmmStatic data
+jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
+jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
+ where blockLabel = blockLbl blockid
+
+
+-- -----------------------------------------------------------------------------
+-- General things for putting together code sequences
+
+-- Expand CmmRegOff. ToDo: should we do it this way around, or convert
+-- CmmExprs into CmmRegOff?
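+--
+-- For example (illustrative), at width W64 @mangleIndexTree dflags r 8@
+-- yields @CmmMachOp (MO_Add W64) [CmmReg r, CmmLit (CmmInt 8 W64)]@.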
+mangleIndexTree :: DynFlags -> CmmReg -> Int -> CmmExpr
+mangleIndexTree dflags reg off
+ = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
+ where width = typeWidth (cmmRegType dflags reg)
+
+-- | The dual to getAnyReg: compute an expression into a register, but
+-- we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getSomeReg expr = do
+ r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed _ reg code ->
+ return (reg, code)
+
+
+assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
+assignMem_I64Code addrTree valueTree = do
+ Amode addr addr_code <- getAmode addrTree
+ ChildCode64 vcode rlo <- iselExpr64 valueTree
+ let
+ rhi = getHiVRegFromLo rlo
+
+ -- Little-endian store
+ mov_lo = MOV II32 (OpReg rlo) (OpAddr addr)
+ mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
+ return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
+
+
+assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock
+assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do
+ ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
+ let
+ r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
+ r_dst_hi = getHiVRegFromLo r_dst_lo
+ r_src_hi = getHiVRegFromLo r_src_lo
+ mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo)
+ mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi)
+ return (
+ vcode `snocOL` mov_lo `snocOL` mov_hi
+ )
+
+assignReg_I64Code _ _
+ = panic "assignReg_I64Code(i386): invalid lvalue"
+
+
+iselExpr64 :: CmmExpr -> NatM ChildCode64
+iselExpr64 (CmmLit (CmmInt i _)) = do
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r = fromIntegral (fromIntegral i :: Word32)
+ q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
+ code = toOL [
+ MOV II32 (OpImm (ImmInteger r)) (OpReg rlo),
+ MOV II32 (OpImm (ImmInteger q)) (OpReg rhi)
+ ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
+ Amode addr addr_code <- getAmode addrTree
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ mov_lo = MOV II32 (OpAddr addr) (OpReg rlo)
+ mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
+ return (
+ ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
+ rlo
+ )
+
+iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
+ = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))
+
+-- we handle addition, but rather badly
+iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r = fromIntegral (fromIntegral i :: Word32)
+ q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
+ r1hi = getHiVRegFromLo r1lo
+ code = code1 `appOL`
+ toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+ ADD II32 (OpImm (ImmInteger r)) (OpReg rlo),
+ MOV II32 (OpReg r1hi) (OpReg rhi),
+ ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ ChildCode64 code2 r2lo <- iselExpr64 e2
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r1hi = getHiVRegFromLo r1lo
+ r2hi = getHiVRegFromLo r2lo
+ code = code1 `appOL`
+ code2 `appOL`
+ toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+ ADD II32 (OpReg r2lo) (OpReg rlo),
+ MOV II32 (OpReg r1hi) (OpReg rhi),
+ ADC II32 (OpReg r2hi) (OpReg rhi) ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ ChildCode64 code2 r2lo <- iselExpr64 e2
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r1hi = getHiVRegFromLo r1lo
+ r2hi = getHiVRegFromLo r2lo
+ code = code1 `appOL`
+ code2 `appOL`
+ toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+ SUB II32 (OpReg r2lo) (OpReg rlo),
+ MOV II32 (OpReg r1hi) (OpReg rhi),
+ SBB II32 (OpReg r2hi) (OpReg rhi) ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
+ fn <- getAnyReg expr
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+ code = fn r_dst_lo
+ return (
+ ChildCode64 (code `snocOL`
+ MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
+ r_dst_lo
+ )
+
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
+ fn <- getAnyReg expr
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+ code = fn r_dst_lo
+ return (
+ ChildCode64 (code `snocOL`
+ MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
+ CLTD II32 `snocOL`
+ MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
+ MOV II32 (OpReg edx) (OpReg r_dst_hi))
+ r_dst_lo
+ )
+
+iselExpr64 expr
+ = pprPanic "iselExpr64(i386)" (ppr expr)
+
+
+--------------------------------------------------------------------------------
+getRegister :: CmmExpr -> NatM Register
+getRegister e = do dflags <- getDynFlags
+ is32Bit <- is32BitPlatform
+ getRegister' dflags is32Bit e
+
+getRegister' :: DynFlags -> Bool -> CmmExpr -> NatM Register
+
+getRegister' dflags is32Bit (CmmReg reg)
+ = case reg of
+ CmmGlobal PicBaseReg
+ | is32Bit ->
+ -- on x86_64, we have %rip for PicBaseReg, but it's not
+ -- a full-featured register, it can only be used for
+ -- rip-relative addressing.
+ do reg' <- getPicBaseNat (archWordFormat is32Bit)
+ return (Fixed (archWordFormat is32Bit) reg' nilOL)
+ _ ->
+ do
+ let
+ fmt = cmmTypeFormat (cmmRegType dflags reg)
+ format = fmt
+ --
+ let platform = targetPlatform dflags
+ return (Fixed format
+ (getRegisterReg platform reg)
+ nilOL)
+
+
+getRegister' dflags is32Bit (CmmRegOff r n)
+ = getRegister' dflags is32Bit $ mangleIndexTree dflags r n
+
+getRegister' dflags is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e])
+ = addAlignmentCheck align <$> getRegister' dflags is32Bit e
+
+-- for 32-bit architectures, support some 64 -> 32 bit conversions:
+-- TO_W_(x), TO_W_(x >> 32)
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32) [x])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
+ float_const_sse2 where
+ float_const_sse2
+ | f == 0.0 = do
+ let
+ format = floatFormat w
+ code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
+ -- I don't know why there are xorpd, xorps, and pxor instructions.
+ -- They all appear to do the same thing --SDM
+ return (Any format code)
+
+ | otherwise = do
+ Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
+ loadFloatAmode w addr code
+
+-- catch simple cases of zero- or sign-extended load
+getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVZxL II8) addr
+ return (Any II32 code)
+
+getRegister' _ _ (CmmMachOp (MO_SS_Conv W8 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVSxL II8) addr
+ return (Any II32 code)
+
+getRegister' _ _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVZxL II16) addr
+ return (Any II32 code)
+
+getRegister' _ _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVSxL II16) addr
+ return (Any II32 code)
+
+-- catch simple cases of zero- or sign-extended load
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVZxL II8) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W8 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVSxL II8) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVZxL II16) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVSxL II16) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVSxL II32) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
+ CmmLit displacement])
+ | not is32Bit = do
+ return $ Any II64 (\dst -> unitOL $
+ LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
+
+getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
+ case mop of
+ MO_F_Neg w -> sse2NegCode w x
+
+
+ MO_S_Neg w -> triv_ucode NEGI (intFormat w)
+ MO_Not w -> triv_ucode NOT (intFormat w)
+
+ -- Nop conversions
+ MO_UU_Conv W32 W8 -> toI8Reg W32 x
+ MO_SS_Conv W32 W8 -> toI8Reg W32 x
+ MO_XX_Conv W32 W8 -> toI8Reg W32 x
+ MO_UU_Conv W16 W8 -> toI8Reg W16 x
+ MO_SS_Conv W16 W8 -> toI8Reg W16 x
+ MO_XX_Conv W16 W8 -> toI8Reg W16 x
+ MO_UU_Conv W32 W16 -> toI16Reg W32 x
+ MO_SS_Conv W32 W16 -> toI16Reg W32 x
+ MO_XX_Conv W32 W16 -> toI16Reg W32 x
+
+ MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x
+ MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x
+ MO_XX_Conv W64 W32 | not is32Bit -> conversionNop II64 x
+ MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
+ MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
+ MO_XX_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
+ MO_UU_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
+ MO_SS_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
+ MO_XX_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
+
+ MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
+ MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
+ MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
+
+ -- widenings
+ MO_UU_Conv W8 W32 -> integerExtend W8 W32 MOVZxL x
+ MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
+ MO_UU_Conv W8 W16 -> integerExtend W8 W16 MOVZxL x
+
+ MO_SS_Conv W8 W32 -> integerExtend W8 W32 MOVSxL x
+ MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
+ MO_SS_Conv W8 W16 -> integerExtend W8 W16 MOVSxL x
+
+ -- We don't care about the upper bits for MO_XX_Conv, so MOV is enough. However, on 32-bit
+ -- only a few registers have an 8-bit variant (as opposed to x86-64 where every register
+ -- has one). So for 32-bit code, we'll just zero-extend.
+ MO_XX_Conv W8 W32
+ | is32Bit -> integerExtend W8 W32 MOVZxL x
+ | otherwise -> integerExtend W8 W32 MOV x
+ MO_XX_Conv W8 W16
+ | is32Bit -> integerExtend W8 W16 MOVZxL x
+ | otherwise -> integerExtend W8 W16 MOV x
+ MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
+
+ MO_UU_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVZxL x
+ MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
+ MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x
+ MO_SS_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVSxL x
+ MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x
+ MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x
+ -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl.
+ -- However, we don't want the register allocator to throw it
+ -- away as an unnecessary reg-to-reg move, so we keep it in
+ -- the form of a movzl and print it as a movl later.
+ -- This doesn't apply to MO_XX_Conv since in this case we don't care about
+ -- the upper bits. So we can just use MOV.
+ MO_XX_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOV x
+ MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
+ MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
+
+ MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
+
+
+ MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
+
+ MO_FS_Conv from to -> coerceFP2Int from to x
+ MO_SF_Conv from to -> coerceInt2FP from to x
+
+ MO_V_Insert {} -> needLlvm
+ MO_V_Extract {} -> needLlvm
+ MO_V_Add {} -> needLlvm
+ MO_V_Sub {} -> needLlvm
+ MO_V_Mul {} -> needLlvm
+ MO_VS_Quot {} -> needLlvm
+ MO_VS_Rem {} -> needLlvm
+ MO_VS_Neg {} -> needLlvm
+ MO_VU_Quot {} -> needLlvm
+ MO_VU_Rem {} -> needLlvm
+ MO_VF_Insert {} -> needLlvm
+ MO_VF_Extract {} -> needLlvm
+ MO_VF_Add {} -> needLlvm
+ MO_VF_Sub {} -> needLlvm
+ MO_VF_Mul {} -> needLlvm
+ MO_VF_Quot {} -> needLlvm
+ MO_VF_Neg {} -> needLlvm
+
+ _other -> pprPanic "getRegister" (pprMachOp mop)
+ where
+ triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register
+ triv_ucode instr format = trivialUCode format (instr format) x
+
+ -- signed or unsigned extension.
+ integerExtend :: Width -> Width
+ -> (Format -> Operand -> Operand -> Instr)
+ -> CmmExpr -> NatM Register
+ integerExtend from to instr expr = do
+ (reg,e_code) <- if from == W8 then getByteReg expr
+ else getSomeReg expr
+ let
+ code dst =
+ e_code `snocOL`
+ instr (intFormat from) (OpReg reg) (OpReg dst)
+ return (Any (intFormat to) code)
+
+ toI8Reg :: Width -> CmmExpr -> NatM Register
+ toI8Reg new_rep expr
+ = do codefn <- getAnyReg expr
+ return (Any (intFormat new_rep) codefn)
+ -- HACK: use getAnyReg to get a byte-addressable register.
+ -- If the source was a Fixed register, this will add the
+ -- mov instruction to put it into the desired destination.
+ -- We're assuming that the destination won't be a fixed
+ -- non-byte-addressable register; it won't be, because all
+ -- fixed registers are word-sized.
+
+ toI16Reg = toI8Reg -- for now
+
+ conversionNop :: Format -> CmmExpr -> NatM Register
+ conversionNop new_format expr
+ = do e_code <- getRegister' dflags is32Bit expr
+ return (swizzleRegisterRep e_code new_format)
+
+
+getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
+ case mop of
+ MO_F_Eq _ -> condFltReg is32Bit EQQ x y
+ MO_F_Ne _ -> condFltReg is32Bit NE x y
+ MO_F_Gt _ -> condFltReg is32Bit GTT x y
+ MO_F_Ge _ -> condFltReg is32Bit GE x y
+ -- Invert comparison condition and swap operands
+ -- See Note [SSE Parity Checks]
+ MO_F_Lt _ -> condFltReg is32Bit GTT y x
+ MO_F_Le _ -> condFltReg is32Bit GE y x
+
+ MO_Eq _ -> condIntReg EQQ x y
+ MO_Ne _ -> condIntReg NE x y
+
+ MO_S_Gt _ -> condIntReg GTT x y
+ MO_S_Ge _ -> condIntReg GE x y
+ MO_S_Lt _ -> condIntReg LTT x y
+ MO_S_Le _ -> condIntReg LE x y
+
+ MO_U_Gt _ -> condIntReg GU x y
+ MO_U_Ge _ -> condIntReg GEU x y
+ MO_U_Lt _ -> condIntReg LU x y
+ MO_U_Le _ -> condIntReg LEU x y
+
+ MO_F_Add w -> trivialFCode_sse2 w ADD x y
+
+ MO_F_Sub w -> trivialFCode_sse2 w SUB x y
+
+ MO_F_Quot w -> trivialFCode_sse2 w FDIV x y
+
+ MO_F_Mul w -> trivialFCode_sse2 w MUL x y
+
+
+ MO_Add rep -> add_code rep x y
+ MO_Sub rep -> sub_code rep x y
+
+ MO_S_Quot rep -> div_code rep True True x y
+ MO_S_Rem rep -> div_code rep True False x y
+ MO_U_Quot rep -> div_code rep False True x y
+ MO_U_Rem rep -> div_code rep False False x y
+
+ MO_S_MulMayOflo rep -> imulMayOflo rep x y
+
+ MO_Mul W8 -> imulW8 x y
+ MO_Mul rep -> triv_op rep IMUL
+ MO_And rep -> triv_op rep AND
+ MO_Or rep -> triv_op rep OR
+ MO_Xor rep -> triv_op rep XOR
+
+ {- Shift ops on x86 have constraints on their source: it
+ has to be an immediate, CL, or 1,
+ so trivialCode is not restrictive enough (sigh.)
+ -}
+ MO_Shl rep -> shift_code rep SHL x y {-False-}
+ MO_U_Shr rep -> shift_code rep SHR x y {-False-}
+ MO_S_Shr rep -> shift_code rep SAR x y {-False-}
+
+ MO_V_Insert {} -> needLlvm
+ MO_V_Extract {} -> needLlvm
+ MO_V_Add {} -> needLlvm
+ MO_V_Sub {} -> needLlvm
+ MO_V_Mul {} -> needLlvm
+ MO_VS_Quot {} -> needLlvm
+ MO_VS_Rem {} -> needLlvm
+ MO_VS_Neg {} -> needLlvm
+ MO_VF_Insert {} -> needLlvm
+ MO_VF_Extract {} -> needLlvm
+ MO_VF_Add {} -> needLlvm
+ MO_VF_Sub {} -> needLlvm
+ MO_VF_Mul {} -> needLlvm
+ MO_VF_Quot {} -> needLlvm
+ MO_VF_Neg {} -> needLlvm
+
+ _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
+ where
+ --------------------
+ triv_op width instr = trivialCode width op (Just op) x y
+ where op = instr (intFormat width)
+
+ -- Special case for IMUL for bytes, since the result of IMULB will be in
+ -- %ax, the split to %dx/%edx/%rdx and %ax/%eax/%rax happens only for wider
+ -- values.
+ imulW8 :: CmmExpr -> CmmExpr -> NatM Register
+ imulW8 arg_a arg_b = do
+ (a_reg, a_code) <- getNonClobberedReg arg_a
+ b_code <- getAnyReg arg_b
+
+ let code = a_code `appOL` b_code eax `appOL`
+ toOL [ IMUL2 format (OpReg a_reg) ]
+ format = intFormat W8
+
+ return (Fixed format eax code)
+
+
+ imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
+ imulMayOflo rep a b = do
+ (a_reg, a_code) <- getNonClobberedReg a
+ b_code <- getAnyReg b
+ let
+ shift_amt = case rep of
+ W32 -> 31
+ W64 -> 63
+ _ -> panic "shift_amt"
+
+ format = intFormat rep
+ code = a_code `appOL` b_code eax `appOL`
+ toOL [
+ IMUL2 format (OpReg a_reg), -- result in %edx:%eax
+ SAR format (OpImm (ImmInt shift_amt)) (OpReg eax),
+ -- sign extend lower part
+ SUB format (OpReg edx) (OpReg eax)
+ -- compare against upper
+ -- eax==0 if high part == sign extended low part
+ ]
+ return (Fixed format eax code)
+
+ --------------------
+ shift_code :: Width
+ -> (Format -> Operand -> Operand -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+
+ {- Case1: shift length as immediate -}
+ shift_code width instr x (CmmLit lit) = do
+ x_code <- getAnyReg x
+ let
+ format = intFormat width
+ code dst
+ = x_code dst `snocOL`
+ instr format (OpImm (litToImm lit)) (OpReg dst)
+ return (Any format code)
+
+ {- Case2: shift length is complex (non-immediate)
+ * y must go in %ecx.
+ * we cannot do y first *and* put its result in %ecx, because
+ %ecx might be clobbered by x.
+ * if we do y second, then x cannot be
+ in a clobbered reg. Also, we cannot clobber x's reg
+ with the instruction itself.
+ * so we can either:
+ - do y first, put its result in a fresh tmp, then copy it to %ecx later
+ - do y second and put its result into %ecx. x gets placed in a fresh
+ tmp. This is likely to be better, because the reg alloc can
+ eliminate this reg->reg move here (it won't eliminate the other one,
+ because the move is into the fixed %ecx).
+ -}
+ shift_code width instr x y{-amount-} = do
+ x_code <- getAnyReg x
+ let format = intFormat width
+ tmp <- getNewRegNat format
+ y_code <- getAnyReg y
+ let
+ code = x_code tmp `appOL`
+ y_code ecx `snocOL`
+ instr format (OpReg ecx) (OpReg tmp)
+ return (Fixed format tmp code)
+
+ --------------------
+ add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
+ add_code rep x (CmmLit (CmmInt y _))
+ | is32BitInteger y = add_int rep x y
+ add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y
+ where format = intFormat rep
+ -- TODO: There are other interesting patterns we want to replace
+ -- with a LEA, e.g. `(x + offset) + (y << shift)`.
+
+ --------------------
+ sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
+ sub_code rep x (CmmLit (CmmInt y _))
+ | is32BitInteger (-y) = add_int rep x (-y)
+ sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y
+
+ -- our three-operand add instruction:
+ add_int width x y = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ format = intFormat width
+ imm = ImmInt (fromInteger y)
+ code dst
+ = x_code `snocOL`
+ LEA format
+ (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
+ (OpReg dst)
+ --
+ return (Any format code)
+
+ ----------------------
+
+ -- See Note [DIV/IDIV for bytes]
+ div_code W8 signed quotient x y = do
+ let widen | signed = MO_SS_Conv W8 W16
+ | otherwise = MO_UU_Conv W8 W16
+ div_code
+ W16
+ signed
+ quotient
+ (CmmMachOp widen [x])
+ (CmmMachOp widen [y])
+
+ div_code width signed quotient x y = do
+ (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
+ x_code <- getAnyReg x
+ let
+ format = intFormat width
+ widen | signed = CLTD format
+ | otherwise = XOR format (OpReg edx) (OpReg edx)
+
+ instr | signed = IDIV
+ | otherwise = DIV
+
+ code = y_code `appOL`
+ x_code eax `appOL`
+ toOL [widen, instr format y_op]
+
+ result | quotient = eax
+ | otherwise = edx
+
+ return (Fixed format result code)
+
+
+getRegister' _ _ (CmmLoad mem pk)
+ | isFloatType pk
+ = do
+ Amode addr mem_code <- getAmode mem
+ loadFloatAmode (typeWidth pk) addr mem_code
+
+getRegister' _ is32Bit (CmmLoad mem pk)
+ | is32Bit && not (isWord64 pk)
+ = do
+ code <- intLoadCode instr mem
+ return (Any format code)
+ where
+ width = typeWidth pk
+ format = intFormat width
+ instr = case width of
+ W8 -> MOVZxL II8
+ _other -> MOV format
+ -- We always zero-extend 8-bit loads, if we
+ -- can't think of anything better. This is because
+ -- we can't guarantee access to an 8-bit variant of every register
+ -- (esi and edi don't have 8-bit variants), so to make things
+ -- simpler we do our 8-bit arithmetic with full 32-bit registers.
+
+-- Simpler memory load code on x86_64
+getRegister' _ is32Bit (CmmLoad mem pk)
+ | not is32Bit
+ = do
+ code <- intLoadCode (MOV format) mem
+ return (Any format code)
+ where format = intFormat $ typeWidth pk
+
+getRegister' _ is32Bit (CmmLit (CmmInt 0 width))
+ = let
+ format = intFormat width
+
+ -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
+ format1 = if is32Bit then format
+ else case format of
+ II64 -> II32
+ _ -> format
+ code dst
+ = unitOL (XOR format1 (OpReg dst) (OpReg dst))
+ in
+ return (Any format code)
+
+ -- optimisation for loading small literals on x86_64: take advantage
+ -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
+ -- instruction forms are shorter.
+getRegister' dflags is32Bit (CmmLit lit)
+ | not is32Bit, isWord64 (cmmLitType dflags lit), not (isBigLit lit)
+ = let
+ imm = litToImm lit
+ code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
+ in
+ return (Any II64 code)
+ where
+ isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
+ isBigLit _ = False
+ -- note1: not the same as (not.is32BitLit), because that checks for
+ -- signed literals that fit in 32 bits, but we want unsigned
+ -- literals here.
+ -- note2: all labels are small, because we're assuming the
+ -- small memory model (see gcc docs, -mcmodel=small).
+
+getRegister' dflags _ (CmmLit lit)
+ = do let format = cmmTypeFormat (cmmLitType dflags lit)
+ imm = litToImm lit
+ code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
+ return (Any format code)
+
+getRegister' _ _ other
+ | isVecExpr other = needLlvm
+ | otherwise = pprPanic "getRegister(x86)" (ppr other)
+
+
+intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
+ -> NatM (Reg -> InstrBlock)
+intLoadCode instr mem = do
+ Amode src mem_code <- getAmode mem
+ return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst))
+
+-- Compute an expression into *any* register, adding the appropriate
+-- move instruction if necessary.
+getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock)
+getAnyReg expr = do
+ r <- getRegister expr
+ anyReg r
+
+anyReg :: Register -> NatM (Reg -> InstrBlock)
+anyReg (Any _ code) = return code
+anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)
+
+-- A bit like getSomeReg, but we want a reg that can be byte-addressed.
+-- Fixed registers might not be byte-addressable, so we make sure we've
+-- got a temporary, inserting an extra reg copy if necessary.
+getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getByteReg expr = do
+ is32Bit <- is32BitPlatform
+ if is32Bit
+ then do r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed rep reg code
+ | isVirtualReg reg -> return (reg,code)
+ | otherwise -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code `snocOL` reg2reg rep reg tmp)
+ -- ToDo: could optimise slightly by checking for
+ -- byte-addressable real registers, but that will
+ -- happen very rarely if at all.
+ else getSomeReg expr -- all regs are byte-addressable on x86_64
+
+-- Another variant: this time we want the result in a register that cannot
+-- be modified by code to evaluate an arbitrary expression.
+getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getNonClobberedReg expr = do
+ dflags <- getDynFlags
+ r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed rep reg code
+ -- only certain regs can be clobbered
+ | reg `elem` instrClobberedRegs (targetPlatform dflags)
+ -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code `snocOL` reg2reg rep reg tmp)
+ | otherwise ->
+ return (reg, code)
+
+reg2reg :: Format -> Reg -> Reg -> Instr
+reg2reg format src dst = MOV format (OpReg src) (OpReg dst)
+
+
+--------------------------------------------------------------------------------
+getAmode :: CmmExpr -> NatM Amode
+getAmode e = do is32Bit <- is32BitPlatform
+ getAmode' is32Bit e
+
+getAmode' :: Bool -> CmmExpr -> NatM Amode
+getAmode' _ (CmmRegOff r n) = do dflags <- getDynFlags
+ getAmode $ mangleIndexTree dflags r n
+
+getAmode' is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
+ CmmLit displacement])
+ | not is32Bit
+ = return $ Amode (ripRel (litToImm displacement)) nilOL
+
+
+-- This is all just ridiculous, since it carefully undoes
+-- what mangleIndexTree has just done.
+getAmode' is32Bit (CmmMachOp (MO_Sub _rep) [x, CmmLit lit@(CmmInt i _)])
+ | is32BitLit is32Bit lit
+ -- ASSERT(rep == II32)???
+ = do (x_reg, x_code) <- getSomeReg x
+ let off = ImmInt (-(fromInteger i))
+ return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
+
+getAmode' is32Bit (CmmMachOp (MO_Add _rep) [x, CmmLit lit])
+ | is32BitLit is32Bit lit
+ -- ASSERT(rep == II32)???
+ = do (x_reg, x_code) <- getSomeReg x
+ let off = litToImm lit
+ return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
+
+-- Turn (lit1 << n + lit2) into (lit2 + lit1 << n) so it will be
+-- recognised by the next rule.
+getAmode' is32Bit (CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _),
+ b@(CmmLit _)])
+ = getAmode' is32Bit (CmmMachOp (MO_Add rep) [b,a])
+
+-- Matches: (x + offset) + (y << shift)
+getAmode' _ (CmmMachOp (MO_Add _) [CmmRegOff x offset,
+ CmmMachOp (MO_Shl _)
+ [y, CmmLit (CmmInt shift _)]])
+ | shift == 0 || shift == 1 || shift == 2 || shift == 3
+ = x86_complex_amode (CmmReg x) y shift (fromIntegral offset)
+
+getAmode' _ (CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Shl _)
+ [y, CmmLit (CmmInt shift _)]])
+ | shift == 0 || shift == 1 || shift == 2 || shift == 3
+ = x86_complex_amode x y shift 0
+
+getAmode' _ (CmmMachOp (MO_Add _)
+ [x, CmmMachOp (MO_Add _)
+ [CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)],
+ CmmLit (CmmInt offset _)]])
+ | (shift == 0 || shift == 1 || shift == 2 || shift == 3)
+ && is32BitInteger offset
+ = x86_complex_amode x y shift offset
+
+getAmode' _ (CmmMachOp (MO_Add _) [x,y])
+ = x86_complex_amode x y 0 0
+
+getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit
+ = return (Amode (ImmAddr (litToImm lit) 0) nilOL)
+
+getAmode' _ expr = do
+ (reg,code) <- getSomeReg expr
+ return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
+
+-- | Like 'getAmode', but on 32-bit use simple register addressing
+-- (i.e. no index register). This stops us from running out of
+-- registers on x86 when using instructions such as cmpxchg, which can
+-- use up to three virtual registers and one fixed register.
+getSimpleAmode :: DynFlags -> Bool -> CmmExpr -> NatM Amode
+getSimpleAmode dflags is32Bit addr
+ | is32Bit = do
+ addr_code <- getAnyReg addr
+ addr_r <- getNewRegNat (intFormat (wordWidth dflags))
+ let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0)
+ return $! Amode amode (addr_code addr_r)
+ | otherwise = getAmode addr
+
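+-- | Build an addressing mode of the form @base + index*2^shift + offset@.
+-- The base expression is evaluated into a non-clobberable register so it
+-- stays valid while the index is computed.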
+x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
+x86_complex_amode base index shift offset
+ = do (x_reg, x_code) <- getNonClobberedReg base
+ -- x must be in a temp, because it has to stay live over y_code
+ -- we could compare x_reg and y_reg and do something better here...
+ (y_reg, y_code) <- getSomeReg index
+ let
+ code = x_code `appOL` y_code
+ base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8;
+ n -> panic $ "x86_complex_amode: unhandled shift! (" ++ show n ++ ")"
+ return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt (fromIntegral offset)))
+ code)
+
+
+
+
+-- -----------------------------------------------------------------------------
+-- getOperand: sometimes any operand will do.
+
+-- getNonClobberedOperand: the value of the operand will remain valid across
+-- the computation of an arbitrary expression, unless the expression
+-- is computed directly into a register which the operand refers to
+-- (see trivialCode where this function is used for an example).
+
+getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+getNonClobberedOperand (CmmLit lit) = do
+ if isSuitableFloatingPointLit lit
+ then do
+ let CmmFloat _ w = lit
+ Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
+ return (OpAddr addr, code)
+ else do
+
+ is32Bit <- is32BitPlatform
+ dflags <- getDynFlags
+ if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
+ then return (OpImm (litToImm lit), nilOL)
+ else getNonClobberedOperand_generic (CmmLit lit)
+
+getNonClobberedOperand (CmmLoad mem pk) = do
+ is32Bit <- is32BitPlatform
+ -- this logic could be simplified
+ -- TODO FIXME
+ if (if is32Bit then not (isWord64 pk) else True)
+ -- if 32bit and pk is a float/double/simd value
+ -- or if 64bit
+ -- this could use some eyeballs or I'll need to stare at it more later
+ then do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ Amode src mem_code <- getAmode mem
+ (src',save_code) <-
+ if (amodeCouldBeClobbered platform src)
+ then do
+ tmp <- getNewRegNat (archWordFormat is32Bit)
+ return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
+ unitOL (LEA (archWordFormat is32Bit)
+ (OpAddr src)
+ (OpReg tmp)))
+ else
+ return (src, nilOL)
+ return (OpAddr src', mem_code `appOL` save_code)
+ else do
+ -- if it's a word or gcptr on 32bit?
+ getNonClobberedOperand_generic (CmmLoad mem pk)
+
+getNonClobberedOperand e = getNonClobberedOperand_generic e
+
+getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
+getNonClobberedOperand_generic e = do
+ (reg, code) <- getNonClobberedReg e
+ return (OpReg reg, code)
+
+amodeCouldBeClobbered :: Platform -> AddrMode -> Bool
+amodeCouldBeClobbered platform amode = any (regClobbered platform) (addrModeRegs amode)
+
+regClobbered :: Platform -> Reg -> Bool
+regClobbered platform (RegReal (RealRegSingle rr)) = freeReg platform rr
+regClobbered _ _ = False
+
+-- getOperand: the operand is not required to remain valid across the
+-- computation of an arbitrary expression.
+getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+
+getOperand (CmmLit lit) = do
+ use_sse2 <- sse2Enabled
+ if (use_sse2 && isSuitableFloatingPointLit lit)
+ then do
+ let CmmFloat _ w = lit
+ Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
+ return (OpAddr addr, code)
+ else do
+
+ is32Bit <- is32BitPlatform
+ dflags <- getDynFlags
+ if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
+ then return (OpImm (litToImm lit), nilOL)
+ else getOperand_generic (CmmLit lit)
+
+getOperand (CmmLoad mem pk) = do
+ is32Bit <- is32BitPlatform
+ use_sse2 <- sse2Enabled
+ if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
+ then do
+ Amode src mem_code <- getAmode mem
+ return (OpAddr src, mem_code)
+ else
+ getOperand_generic (CmmLoad mem pk)
+
+getOperand e = getOperand_generic e
+
+getOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
+getOperand_generic e = do
+ (reg, code) <- getSomeReg e
+ return (OpReg reg, code)
+
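+-- | Whether the expression can be used directly as an instruction operand:
+-- a memory load, an integer literal representable in 32 bits, or a non-zero
+-- floating point literal (which we can reference from memory).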
+isOperand :: Bool -> CmmExpr -> Bool
+isOperand _ (CmmLoad _ _) = True
+isOperand is32Bit (CmmLit lit) = is32BitLit is32Bit lit
+ || isSuitableFloatingPointLit lit
+isOperand _ _ = False
+
+-- | Given a 'Register', produce a new 'Register' with an instruction block
+-- which will check the value for alignment. Used for @-falignment-sanitisation@.
+addAlignmentCheck :: Int -> Register -> Register
+addAlignmentCheck align reg =
+ case reg of
+ Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt reg)
+ Any fmt f -> Any fmt (\reg -> f reg `appOL` check fmt reg)
+ where
+ check :: Format -> Reg -> InstrBlock
+ check fmt reg =
+ ASSERT(not $ isFloatFormat fmt)
+ toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
+ , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
+ ]
+
+memConstant :: Alignment -> CmmLit -> NatM Amode
+memConstant align lit = do
+ lbl <- getNewLabelNat
+ let rosection = Section ReadOnlyData lbl
+ dflags <- getDynFlags
+ (addr, addr_code) <- if target32Bit (targetPlatform dflags)
+ then do dynRef <- cmmMakeDynamicReference
+ dflags
+ DataReference
+ lbl
+ Amode addr addr_code <- getAmode dynRef
+ return (addr, addr_code)
+ else return (ripRel (ImmCLbl lbl), nilOL)
+ let code =
+ LDATA rosection (align, RawCmmStatics lbl [CmmStaticLit lit])
+ `consOL` addr_code
+ return (Amode addr code)
+
+
+loadFloatAmode :: Width -> AddrMode -> InstrBlock -> NatM Register
+loadFloatAmode w addr addr_code = do
+ let format = floatFormat w
+ code dst = addr_code `snocOL`
+ MOV format (OpAddr addr) (OpReg dst)
+
+ return (Any format code)
+
+
+-- if we want a floating-point literal as an operand, we can
+-- use it directly from memory. However, if the literal is
+-- zero, we're better off generating it into a register using
+-- xor.
+isSuitableFloatingPointLit :: CmmLit -> Bool
+isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
+isSuitableFloatingPointLit _ = False
+
+getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
+getRegOrMem e@(CmmLoad mem pk) = do
+ is32Bit <- is32BitPlatform
+ use_sse2 <- sse2Enabled
+ if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
+ then do
+ Amode src mem_code <- getAmode mem
+ return (OpAddr src, mem_code)
+ else do
+ (reg, code) <- getNonClobberedReg e
+ return (OpReg reg, code)
+getRegOrMem e = do
+ (reg, code) <- getNonClobberedReg e
+ return (OpReg reg, code)
+
+is32BitLit :: Bool -> CmmLit -> Bool
+is32BitLit is32Bit (CmmInt i W64)
+ | not is32Bit
+ = -- assume that labels are in the range 0-2^31-1: this assumes the
+ -- small memory model (see gcc docs, -mcmodel=small).
+ is32BitInteger i
+is32BitLit _ _ = True
+
+
+
+
+-- Set up a condition code for a conditional branch.
+
+getCondCode :: CmmExpr -> NatM CondCode
+
+-- yes, they really do seem to want exactly the same!
+
+getCondCode (CmmMachOp mop [x, y])
+ =
+ case mop of
+ MO_F_Eq W32 -> condFltCode EQQ x y
+ MO_F_Ne W32 -> condFltCode NE x y
+ MO_F_Gt W32 -> condFltCode GTT x y
+ MO_F_Ge W32 -> condFltCode GE x y
+ -- Invert comparison condition and swap operands
+ -- See Note [SSE Parity Checks]
+ MO_F_Lt W32 -> condFltCode GTT y x
+ MO_F_Le W32 -> condFltCode GE y x
+
+ MO_F_Eq W64 -> condFltCode EQQ x y
+ MO_F_Ne W64 -> condFltCode NE x y
+ MO_F_Gt W64 -> condFltCode GTT x y
+ MO_F_Ge W64 -> condFltCode GE x y
+ MO_F_Lt W64 -> condFltCode GTT y x
+ MO_F_Le W64 -> condFltCode GE y x
+
+ _ -> condIntCode (machOpToCond mop) x y
+
+getCondCode other = pprPanic "getCondCode(2)(x86,x86_64)" (ppr other)
+
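+-- | Map an integer comparison 'MachOp' to the corresponding condition code.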
+machOpToCond :: MachOp -> Cond
+machOpToCond mo = case mo of
+ MO_Eq _ -> EQQ
+ MO_Ne _ -> NE
+ MO_S_Gt _ -> GTT
+ MO_S_Ge _ -> GE
+ MO_S_Lt _ -> LTT
+ MO_S_Le _ -> LE
+ MO_U_Gt _ -> GU
+ MO_U_Ge _ -> GEU
+ MO_U_Lt _ -> LU
+ MO_U_Le _ -> LEU
+ _other -> pprPanic "machOpToCond" (pprMachOp mo)
+
+
+-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
+-- passed back up the tree.
+
+condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+condIntCode cond x y = do is32Bit <- is32BitPlatform
+ condIntCode' is32Bit cond x y
+
+condIntCode' :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+
+-- memory vs immediate
+condIntCode' is32Bit cond (CmmLoad x pk) (CmmLit lit)
+ | is32BitLit is32Bit lit = do
+ Amode x_addr x_code <- getAmode x
+ let
+ imm = litToImm lit
+ code = x_code `snocOL`
+ CMP (cmmTypeFormat pk) (OpImm imm) (OpAddr x_addr)
+ --
+ return (CondCode False cond code)
+
+-- anything vs zero, using a mask
+-- TODO: Add some sanity checking!!!!
+condIntCode' is32Bit cond (CmmMachOp (MO_And _) [x,o2]) (CmmLit (CmmInt 0 pk))
+ | (CmmLit lit@(CmmInt mask _)) <- o2, is32BitLit is32Bit lit
+ = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ code = x_code `snocOL`
+ TEST (intFormat pk) (OpImm (ImmInteger mask)) (OpReg x_reg)
+ --
+ return (CondCode False cond code)
+
+-- anything vs zero
+condIntCode' _ cond x (CmmLit (CmmInt 0 pk)) = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ code = x_code `snocOL`
+ TEST (intFormat pk) (OpReg x_reg) (OpReg x_reg)
+ --
+ return (CondCode False cond code)
+
+-- anything vs operand
+condIntCode' is32Bit cond x y
+ | isOperand is32Bit y = do
+ dflags <- getDynFlags
+ (x_reg, x_code) <- getNonClobberedReg x
+ (y_op, y_code) <- getOperand y
+ let
+ code = x_code `appOL` y_code `snocOL`
+ CMP (cmmTypeFormat (cmmExprType dflags x)) y_op (OpReg x_reg)
+ return (CondCode False cond code)
+-- operand vs. anything: invert the comparison so that we can use a
+-- single comparison instruction.
+ | isOperand is32Bit x
+ , Just revcond <- maybeFlipCond cond = do
+ dflags <- getDynFlags
+ (y_reg, y_code) <- getNonClobberedReg y
+ (x_op, x_code) <- getOperand x
+ let
+ code = y_code `appOL` x_code `snocOL`
+ CMP (cmmTypeFormat (cmmExprType dflags x)) x_op (OpReg y_reg)
+ return (CondCode False revcond code)
+
+-- anything vs anything
+condIntCode' _ cond x y = do
+ dflags <- getDynFlags
+ (y_reg, y_code) <- getNonClobberedReg y
+ (x_op, x_code) <- getRegOrMem x
+ let
+ code = y_code `appOL`
+ x_code `snocOL`
+ CMP (cmmTypeFormat (cmmExprType dflags x)) (OpReg y_reg) x_op
+ return (CondCode False cond code)
+
+
+
+--------------------------------------------------------------------------------
+condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+
+condFltCode cond x y
+ = condFltCode_sse2
+ where
+
+
+ -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
+ -- an operand, but the right must be a reg. We can probably do better
+ -- than this general case...
+ condFltCode_sse2 = do
+ dflags <- getDynFlags
+ (x_reg, x_code) <- getNonClobberedReg x
+ (y_op, y_code) <- getOperand y
+ let
+ code = x_code `appOL`
+ y_code `snocOL`
+ CMP (floatFormat $ cmmExprWidth dflags x) y_op (OpReg x_reg)
+ -- NB(1): we need to use the unsigned comparison operators on the
+ -- result of this comparison.
+ return (CondCode True (condToUnsigned cond) code)
+
+-- -----------------------------------------------------------------------------
+-- Generating assignments
+
+-- Assignments are really at the heart of the whole code generation
+-- business. Almost all top-level nodes of any real importance are
+-- assignments, which correspond to loads, stores, or register
+-- transfers. If we're really lucky, some of the register transfers
+-- will go away, because we can use the destination register to
+-- complete the code generation for the right hand side. This only
+-- fails when the right hand side is forced into a fixed register
+-- (e.g. the result of a call).
+
+assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+
+assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+
+
+-- integer assignment to memory
+
+-- specific case of adding/subtracting an integer to a particular address.
+-- ToDo: catch other cases where we can use an operation directly on a memory
+-- address.
+assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _,
+ CmmLit (CmmInt i _)])
+ | addr == addr2, pk /= II64 || is32BitInteger i,
+ Just instr <- check op
+ = do Amode amode code_addr <- getAmode addr
+ let code = code_addr `snocOL`
+ instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode)
+ return code
+ where
+ check (MO_Add _) = Just ADD
+ check (MO_Sub _) = Just SUB
+ check _ = Nothing
+ -- ToDo: more?
+
+-- general case
+assignMem_IntCode pk addr src = do
+ is32Bit <- is32BitPlatform
+ Amode addr code_addr <- getAmode addr
+ (code_src, op_src) <- get_op_RI is32Bit src
+ let
+ code = code_src `appOL`
+ code_addr `snocOL`
+ MOV pk op_src (OpAddr addr)
+ -- NOTE: op_src is stable, so it will still be valid
+ -- after code_addr. This may involve the introduction
+ -- of an extra MOV to a temporary register, but we hope
+ -- the register allocator will get rid of it.
+ --
+ return code
+ where
+ get_op_RI :: Bool -> CmmExpr -> NatM (InstrBlock,Operand) -- code, operator
+ get_op_RI is32Bit (CmmLit lit) | is32BitLit is32Bit lit
+ = return (nilOL, OpImm (litToImm lit))
+ get_op_RI _ op
+ = do (reg,code) <- getNonClobberedReg op
+ return (code, OpReg reg)
+
+
+-- Assign; dst is a reg, rhs is mem
+assignReg_IntCode pk reg (CmmLoad src _) = do
+ load_code <- intLoadCode (MOV pk) src
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ return (load_code (getRegisterReg platform reg))
+
+-- dst is a reg, but src could be anything
+assignReg_IntCode _ reg src = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ code <- getAnyReg src
+ return (code (getRegisterReg platform reg))
+
+
+-- Floating point assignment to memory
+assignMem_FltCode pk addr src = do
+ (src_reg, src_code) <- getNonClobberedReg src
+ Amode addr addr_code <- getAmode addr
+ let
+ code = src_code `appOL`
+ addr_code `snocOL`
+ MOV pk (OpReg src_reg) (OpAddr addr)
+
+ return code
+
+-- Floating point assignment to a register/temporary
+assignReg_FltCode _ reg src = do
+ src_code <- getAnyReg src
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ return (src_code (getRegisterReg platform reg))
+
+
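+-- | Generate an unconditional jump to a computed or literal target; the
+-- @[Reg]@ argument carries the registers live going into the jump.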
+genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
+
+genJump (CmmLoad mem _) regs = do
+ Amode target code <- getAmode mem
+ return (code `snocOL` JMP (OpAddr target) regs)
+
+genJump (CmmLit lit) regs = do
+ return (unitOL (JMP (OpImm (litToImm lit)) regs))
+
+genJump expr regs = do
+ (reg,code) <- getSomeReg expr
+ return (code `snocOL` JMP (OpReg reg) regs)
+
+
+-- -----------------------------------------------------------------------------
+-- Unconditional branches
+
+genBranch :: BlockId -> InstrBlock
+genBranch = toOL . mkJumpInstr
+
+
+
+-- -----------------------------------------------------------------------------
+-- Conditional jumps/branches
+
+{-
+Conditional jumps are always to local labels, so we can use branch
+instructions. We peek at the arguments to decide what kind of
+comparison to do.
+
+I386: First, we have to ensure that the condition
+codes are set according to the supplied comparison operation.
+-}
+
+
+genCondBranch
+ :: BlockId -- the source of the jump
+ -> BlockId -- the true branch target
+ -> BlockId -- the false branch target
+ -> CmmExpr -- the condition on which to branch
+ -> NatM InstrBlock -- Instructions
+
+genCondBranch bid id false expr = do
+ is32Bit <- is32BitPlatform
+ genCondBranch' is32Bit bid id false expr
+
+-- | We return the instructions generated.
+genCondBranch' :: Bool -> BlockId -> BlockId -> BlockId -> CmmExpr
+ -> NatM InstrBlock
+
+-- 64-bit integer comparisons on 32-bit
+genCondBranch' is32Bit _bid true false (CmmMachOp mop [e1,e2])
+ | is32Bit, Just W64 <- maybeIntComparison mop = do
+ ChildCode64 code1 r1_lo <- iselExpr64 e1
+ ChildCode64 code2 r2_lo <- iselExpr64 e2
+ let r1_hi = getHiVRegFromLo r1_lo
+ r2_hi = getHiVRegFromLo r2_lo
+ cond = machOpToCond mop
+ Just cond' = maybeFlipCond cond
+ --TODO: Update CFG for x86
+ let code = code1 `appOL` code2 `appOL` toOL [
+ CMP II32 (OpReg r2_hi) (OpReg r1_hi),
+ JXX cond true,
+ JXX cond' false,
+ CMP II32 (OpReg r2_lo) (OpReg r1_lo),
+ JXX cond true] `appOL` genBranch false
+ return code
+
+genCondBranch' _ bid id false bool = do
+ CondCode is_float cond cond_code <- getCondCode bool
+ use_sse2 <- sse2Enabled
+ if not is_float || not use_sse2
+ then
+ return (cond_code `snocOL` JXX cond id `appOL` genBranch false)
+ else do
+ -- See Note [SSE Parity Checks]
+ let jmpFalse = genBranch false
+ code
+ = case cond of
+ NE -> or_unordered
+ GU -> plain_test
+ GEU -> plain_test
+ -- Use ASSERT so we don't break releases if
+ -- LTT/LE creep in somehow.
+ LTT ->
+ ASSERT2(False, ppr "Should have been turned into >")
+ and_ordered
+ LE ->
+ ASSERT2(False, ppr "Should have been turned into >=")
+ and_ordered
+ _ -> and_ordered
+
+ plain_test = unitOL (
+ JXX cond id
+ ) `appOL` jmpFalse
+ or_unordered = toOL [
+ JXX cond id,
+ JXX PARITY id
+ ] `appOL` jmpFalse
+ and_ordered = toOL [
+ JXX PARITY false,
+ JXX cond id,
+ JXX ALWAYS false
+ ]
+ updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false)
+ return (cond_code `appOL` code)
+
+{- Note [Introducing cfg edges inside basic blocks]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ During instruction selection a statement `s`
+ in a block B with control of the sort: B -> C
+ will sometimes result in control
+ flow of the sort:
+
+ ┌ < ┐
+ v ^
+ B -> B1 ┴ -> C
+
+ as is the case for some atomic operations.
+
+ Now to keep the CFG in sync when introducing B1 we clearly
+ want to insert it between B and C. However there is
+ a catch when we have to deal with self loops.
+
+ We might start with code and a CFG of these forms:
+
+ loop:
+ stmt1 ┌ < ┐
+ .... v ^
+ stmtX loop ┘
+ stmtY
+ ....
+ goto loop:
+
+ Now we introduce B1:
+ ┌ ─ ─ ─ ─ ─┐
+ loop: │ ┌ < ┐ │
+ instrs v │ │ ^
+ .... loop ┴ B1 ┴ ┘
+ instrsFromX
+ stmtY
+ goto loop:
+
+  This is simple: all outgoing edges from loop now simply
+  start from B1 instead, and the code generator knows which
+  new edges it introduced for the self loop of B1.
+
+ Disaster strikes if the statement Y follows the same pattern.
+ If we apply the same rule that all outgoing edges change then
+ we end up with:
+
+ loop ─> B1 ─> B2 ┬─┐
+ │ │ └─<┤ │
+ │ └───<───┘ │
+ └───────<────────┘
+
+ This is problematic. The edge B1->B1 is modified as expected.
+ However the modification is wrong!
+
+ The assembly in this case looked like this:
+
+ _loop:
+ <instrs>
+ _B1:
+ ...
+ cmpxchgq ...
+ jne _B1
+ <instrs>
+ <end _B1>
+ _B2:
+ ...
+ cmpxchgq ...
+ jne _B2
+ <instrs>
+ jmp loop
+
+ There is no edge _B2 -> _B1 here. It's still a self loop onto _B1.
+
+ The problem here is that really B1 should be two basic blocks.
+ Otherwise we have control flow in the *middle* of a basic block.
+ A contradiction!
+
+ So to account for this we add yet another basic block marker:
+
+ _B:
+ <instrs>
+ _B1:
+ ...
+ cmpxchgq ...
+ jne _B1
+ jmp _B1'
+ _B1':
+ <instrs>
+ <end _B1>
+ _B2:
+ ...
+
+ Now when inserting B2 we will only look at the outgoing edges of B1' and
+ everything will work out nicely.
+
+ You might also wonder why we don't insert jumps at the end of _B1'. There is
+ no way another block ends up jumping to the labels _B1 or _B2 since they are
+ essentially invisible to other blocks. View them as control flow labels local
+ to the basic block if you'd like.
+
+ Not doing this ultimately caused (part 2 of) #17334.
+-}
+
+
+-- -----------------------------------------------------------------------------
+-- Generating C calls
+
+-- Now the biggest nightmare---calls. Most of the nastiness is buried in
+-- @get_arg@, which moves the arguments to the correct registers/stack
+-- locations. Apart from that, the code is easy.
+--
+-- (If applicable) Do not fill the delay slots here; you will confuse the
+-- register allocator.
+--
+-- See Note [Keeping track of the current block] for information why we need
+-- to take/return a block id.
+
+genCCall
+ :: DynFlags
+ -> Bool -- 32 bit platform?
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> BlockId -- The block we are in
+ -> NatM (InstrBlock, Maybe BlockId)
+
+-- First we deal with cases which might introduce new blocks in the stream.
+
+genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
+ [dst] [addr, n] bid = do
+ Amode amode addr_code <-
+ if amop `elem` [AMO_Add, AMO_Sub]
+ then getAmode addr
+ else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg
+ arg <- getNewRegNat format
+ arg_code <- getAnyReg n
+ let platform = targetPlatform dflags
+ dst_r = getRegisterReg platform (CmmLocal dst)
+ (code, lbl) <- op_code dst_r arg amode
+ return (addr_code `appOL` arg_code arg `appOL` code, Just lbl)
+ where
+ -- Code for the operation
+ op_code :: Reg -- Destination reg
+ -> Reg -- Register containing argument
+ -> AddrMode -- Address of location to mutate
+ -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId
+ op_code dst_r arg amode = case amop of
+ -- In the common case where dst_r is a virtual register the
+ -- final move should go away, because it's the last use of arg
+ -- and the first use of dst_r.
+ AMO_Add -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
+ , MOV format (OpReg arg) (OpReg dst_r)
+ ], bid)
+ AMO_Sub -> return $ (toOL [ NEGI format (OpReg arg)
+ , LOCK (XADD format (OpReg arg) (OpAddr amode))
+ , MOV format (OpReg arg) (OpReg dst_r)
+ ], bid)
+ -- In these cases we need a new block id, and have to return it so
+ -- that later instruction selection can reference it.
+ AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst)
+ AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst
+ , NOT format dst
+ ])
+ AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst)
+ AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst)
+ where
+ -- Simulate operation that lacks a dedicated instruction using
+ -- cmpxchg.
+ cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
+ -> NatM (OrdList Instr, BlockId)
+ cmpxchg_code instrs = do
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ tmp <- getNewRegNat format
+
+            -- Record inserted blocks
+ -- We turn A -> B into A -> A' -> A'' -> B
+ -- with a self loop on A'.
+ addImmediateSuccessorNat bid lbl1
+ addImmediateSuccessorNat lbl1 lbl2
+ updateCfgNat (addWeightEdge lbl1 lbl1 0)
+
+ return $ (toOL
+ [ MOV format (OpAddr amode) (OpReg eax)
+ , JXX ALWAYS lbl1
+ , NEWBLOCK lbl1
+ -- Keep old value so we can return it:
+ , MOV format (OpReg eax) (OpReg dst_r)
+ , MOV format (OpReg eax) (OpReg tmp)
+ ]
+ `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
+ [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
+ , JXX NE lbl1
+ -- See Note [Introducing cfg edges inside basic blocks]
+ -- why this basic block is required.
+ , JXX ALWAYS lbl2
+ , NEWBLOCK lbl2
+ ],
+ lbl2)
+ format = intFormat width
+
+genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
+ | is32Bit, width == W64 = do
+ ChildCode64 vcode rlo <- iselExpr64 src
+ let rhi = getHiVRegFromLo rlo
+ dst_r = getRegisterReg platform (CmmLocal dst)
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ let format = if width == W8 then II16 else intFormat width
+ tmp_r <- getNewRegNat format
+
+ -- New CFG Edges:
+ -- bid -> lbl2
+ -- bid -> lbl1 -> lbl2
+    --  We also change edges originating at bid to start at lbl2 instead.
+ updateCfgNat (addWeightEdge bid lbl1 110 .
+ addWeightEdge lbl1 lbl2 110 .
+ addImmediateSuccessor bid lbl2)
+
+ -- The following instruction sequence corresponds to the pseudo-code
+ --
+ -- if (src) {
+ -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
+ -- } else {
+ -- dst = 64;
+ -- }
+ let !instrs = vcode `appOL` toOL
+ ([ MOV II32 (OpReg rhi) (OpReg tmp_r)
+ , OR II32 (OpReg rlo) (OpReg tmp_r)
+ , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
+ , JXX EQQ lbl2
+ , JXX ALWAYS lbl1
+
+ , NEWBLOCK lbl1
+ , BSF II32 (OpReg rhi) dst_r
+ , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r)
+ , BSF II32 (OpReg rlo) tmp_r
+ , CMOV NE II32 (OpReg tmp_r) dst_r
+ , JXX ALWAYS lbl2
+
+ , NEWBLOCK lbl2
+ ])
+ return (instrs, Just lbl2)
+
+ | otherwise = do
+ code_src <- getAnyReg src
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+
+ if isBmi2Enabled dflags
+ then do
+ src_r <- getNewRegNat (intFormat width)
+ let instrs = appOL (code_src src_r) $ case width of
+ W8 -> toOL
+ [ OR II32 (OpImm (ImmInteger 0xFFFFFF00)) (OpReg src_r)
+ , TZCNT II32 (OpReg src_r) dst_r
+ ]
+ W16 -> toOL
+ [ TZCNT II16 (OpReg src_r) dst_r
+ , MOVZxL II16 (OpReg dst_r) (OpReg dst_r)
+ ]
+ _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r
+ return (instrs, Nothing)
+ else do
+ -- The following insn sequence makes sure 'ctz 0' has a defined value.
+        -- Starting with Haswell, one could use the TZCNT insn instead.
+ let format = if width == W8 then II16 else intFormat width
+ src_r <- getNewRegNat format
+ tmp_r <- getNewRegNat format
+ let !instrs = code_src src_r `appOL` toOL
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSF format (OpReg src_r) tmp_r
+ , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
+ , CMOV NE format (OpReg tmp_r) dst_r
+ ]) -- NB: We don't need to zero-extend the result for the
+ -- W8/W16 cases because the 'MOV' insn already
+ -- took care of implicitly clearing the upper bits
+ return (instrs, Nothing)
+ where
+ bw = widthInBits width
+ platform = targetPlatform dflags
+
+genCCall dflags bits mop dst args bid = do
+ instr <- genCCall' dflags bits mop dst args bid
+ return (instr, Nothing)
+
+-- genCCall' handles cases not introducing new code blocks.
+genCCall'
+ :: DynFlags
+ -> Bool -- 32 bit platform?
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> BlockId -- The block we are in
+ -> NatM InstrBlock
+
+-- Unroll memcpy calls if the number of bytes to copy isn't too
+-- large. Otherwise, call C's memcpy.
+genCCall' dflags _ (PrimTarget (MO_Memcpy align)) _
+ [dst, src, CmmLit (CmmInt n _)] _
+ | fromInteger insns <= maxInlineMemcpyInsns dflags = do
+ code_dst <- getAnyReg dst
+ dst_r <- getNewRegNat format
+ code_src <- getAnyReg src
+ src_r <- getNewRegNat format
+ tmp_r <- getNewRegNat format
+ return $ code_dst dst_r `appOL` code_src src_r `appOL`
+ go dst_r src_r tmp_r (fromInteger n)
+ where
+ -- The number of instructions we will generate (approx). We need 2
+ -- instructions per move.
+ insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
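+    -- As an illustrative check of the formula above: copying n = 13 bytes
+    -- with an effective alignment of 4 bytes gives sizeBytes = 4, so
+    -- insns = 2 * ((13 + 3) `div` 4) = 8: three word-sized load/store pairs
+    -- plus one byte-sized pair for the tail, emitted by 'go' below.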
+
+ maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported
+ effectiveAlignment = min (alignmentOf align) maxAlignment
+ format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
+
+ -- The size of each move, in bytes.
+ sizeBytes :: Integer
+ sizeBytes = fromIntegral (formatInBytes format)
+
+ go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
+ go dst src tmp i
+ | i >= sizeBytes =
+ unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - sizeBytes)
+ -- Deal with remaining bytes.
+ | i >= 4 = -- Will never happen on 32-bit
+ unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - 4)
+ | i >= 2 =
+ unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - 2)
+ | i >= 1 =
+ unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - 1)
+ | otherwise = nilOL
+ where
+ src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone
+ (ImmInteger (n - i))
+ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
+ (ImmInteger (n - i))
+
+genCCall' dflags _ (PrimTarget (MO_Memset align)) _
+ [dst,
+ CmmLit (CmmInt c _),
+ CmmLit (CmmInt n _)]
+ _
+ | fromInteger insns <= maxInlineMemsetInsns dflags = do
+ code_dst <- getAnyReg dst
+ dst_r <- getNewRegNat format
+ if format == II64 && n >= 8 then do
+ code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64))
+ imm8byte_r <- getNewRegNat II64
+ return $ code_dst dst_r `appOL`
+ code_imm8byte imm8byte_r `appOL`
+ go8 dst_r imm8byte_r (fromInteger n)
+ else
+ return $ code_dst dst_r `appOL`
+ go4 dst_r (fromInteger n)
+ where
+ maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported
+ effectiveAlignment = min (alignmentOf align) maxAlignment
+ format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
+ c2 = c `shiftL` 8 .|. c
+ c4 = c2 `shiftL` 16 .|. c2
+ c8 = c4 `shiftL` 32 .|. c4
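+    -- Illustrative example of the replication above: for a fill byte
+    -- c = 0xAB this gives c2 = 0xABAB, c4 = 0xABABABAB and
+    -- c8 = 0xABABABABABABABAB, the constants used by the wider MOVs below.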
+
+    -- The number of instructions we will generate (approx). We need 1
+    -- instruction per move.
+ insns = (n + sizeBytes - 1) `div` sizeBytes
+
+ -- The size of each move, in bytes.
+ sizeBytes :: Integer
+ sizeBytes = fromIntegral (formatInBytes format)
+
+    -- Depending on the size, returns the widest MOV instruction and its
+ -- width.
+ gen4 :: AddrMode -> Integer -> (InstrBlock, Integer)
+ gen4 addr size
+ | size >= 4 =
+ (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
+ | size >= 2 =
+ (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
+ | size >= 1 =
+ (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
+ | otherwise = (nilOL, 0)
+
+ -- Generates a 64-bit wide MOV instruction from REG to MEM.
+ gen8 :: AddrMode -> Reg -> InstrBlock
+ gen8 addr reg8byte =
+ unitOL (MOV format (OpReg reg8byte) (OpAddr addr))
+
+ -- Unrolls memset when the widest MOV is <= 4 bytes.
+ go4 :: Reg -> Integer -> InstrBlock
+ go4 dst left =
+ if left <= 0 then nilOL
+ else curMov `appOL` go4 dst (left - curWidth)
+ where
+ possibleWidth = minimum [left, sizeBytes]
+ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
+ (curMov, curWidth) = gen4 dst_addr possibleWidth
+
+ -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg
+ -- argument). Falls back to go4 when all 8 byte moves are
+ -- exhausted.
+ go8 :: Reg -> Reg -> Integer -> InstrBlock
+ go8 dst reg8byte left =
+ if possibleWidth >= 8 then
+ let curMov = gen8 dst_addr reg8byte
+ in curMov `appOL` go8 dst reg8byte (left - 8)
+ else go4 dst left
+ where
+ possibleWidth = minimum [left, sizeBytes]
+ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
+
+genCCall' _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL
+genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
+        -- Barriers compile to no code on x86/x86-64;
+        -- we keep them this far through the pipeline to prevent earlier
+        -- optimisations from reordering memory operations across them.
+
+genCCall' _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL
+
+genCCall' _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ =
+ case n of
+ 0 -> genPrefetch src $ PREFETCH NTA format
+ 1 -> genPrefetch src $ PREFETCH Lvl2 format
+ 2 -> genPrefetch src $ PREFETCH Lvl1 format
+ 3 -> genPrefetch src $ PREFETCH Lvl0 format
+ l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l)
+  -- the C / LLVM prefetch convention uses locality levels 0, 1, 2 and 3;
+  -- the corresponding x86 hint levels are NTA, Lvl2, Lvl1 and Lvl0
+ where
+ format = archWordFormat is32bit
+ -- need to know what register width for pointers!
+ genPrefetch inRegSrc prefetchCTor =
+ do
+ code_src <- getAnyReg inRegSrc
+ src_r <- getNewRegNat format
+ return $ code_src src_r `appOL`
+ (unitOL (prefetchCTor (OpAddr
+ ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) ))
+ -- prefetch always takes an address
+
+genCCall' dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
+ let platform = targetPlatform dflags
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ case width of
+ W64 | is32Bit -> do
+ ChildCode64 vcode rlo <- iselExpr64 src
+ let dst_rhi = getHiVRegFromLo dst_r
+ rhi = getHiVRegFromLo rlo
+ return $ vcode `appOL`
+ toOL [ MOV II32 (OpReg rlo) (OpReg dst_rhi),
+ MOV II32 (OpReg rhi) (OpReg dst_r),
+ BSWAP II32 dst_rhi,
+ BSWAP II32 dst_r ]
+ W16 -> do code_src <- getAnyReg src
+ return $ code_src dst_r `appOL`
+ unitOL (BSWAP II32 dst_r) `appOL`
+ unitOL (SHR II32 (OpImm $ ImmInt 16) (OpReg dst_r))
+ _ -> do code_src <- getAnyReg src
+ return $ code_src dst_r `appOL` unitOL (BSWAP format dst_r)
+ where
+ format = intFormat width
+
+genCCall' dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
+ args@[src] bid = do
+ sse4_2 <- sse4_2Enabled
+ let platform = targetPlatform dflags
+ if sse4_2
+ then do code_src <- getAnyReg src
+ src_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ return $ code_src src_r `appOL`
+ (if width == W8 then
+ -- The POPCNT instruction doesn't take a r/m8
+ unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL`
+ unitOL (POPCNT II16 (OpReg src_r) dst_r)
+ else
+ unitOL (POPCNT format (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
+ args@[src, mask] bid = do
+ let platform = targetPlatform dflags
+ if isBmi2Enabled dflags
+ then do code_src <- getAnyReg src
+ code_mask <- getAnyReg mask
+ src_r <- getNewRegNat format
+ mask_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ return $ code_src src_r `appOL` code_mask mask_r `appOL`
+ (if width == W8 then
+ -- The PDEP instruction doesn't take a r/m8
+ unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+ unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+ unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r)
+ else
+ unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
+ args@[src, mask] bid = do
+ let platform = targetPlatform dflags
+ if isBmi2Enabled dflags
+ then do code_src <- getAnyReg src
+ code_mask <- getAnyReg mask
+ src_r <- getNewRegNat format
+ mask_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ return $ code_src src_r `appOL` code_mask mask_r `appOL`
+ (if width == W8 then
+ -- The PEXT instruction doesn't take a r/m8
+ unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+ unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+ unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
+ else
+ unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid
+ | is32Bit && width == W64 = do
+ -- Fallback to `hs_clz64` on i386
+ targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+
+ | otherwise = do
+ code_src <- getAnyReg src
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ if isBmi2Enabled dflags
+ then do
+ src_r <- getNewRegNat (intFormat width)
+ return $ appOL (code_src src_r) $ case width of
+ W8 -> toOL
+ [ MOVZxL II8 (OpReg src_r) (OpReg src_r) -- zero-extend to 32 bit
+ , LZCNT II32 (OpReg src_r) dst_r -- lzcnt with extra 24 zeros
+ , SUB II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros
+ ]
+ W16 -> toOL
+ [ LZCNT II16 (OpReg src_r) dst_r
+ , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit
+ ]
+ _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r)
+ else do
+ let format = if width == W8 then II16 else intFormat width
+ src_r <- getNewRegNat format
+ tmp_r <- getNewRegNat format
+ return $ code_src src_r `appOL` toOL
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSR format (OpReg src_r) tmp_r
+ , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
+ , CMOV NE format (OpReg tmp_r) dst_r
+ , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
+ ]) -- NB: We don't need to zero-extend the result for the
+ -- W8/W16 cases because the 'MOV' insn already
+ -- took care of implicitly clearing the upper bits
+ where
+ bw = widthInBits width
+ platform = targetPlatform dflags
+ lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ lbl = mkCmmCodeLabel primUnitId (fsLit (word2FloatLabel width))
+
+genCCall' dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
+ load_code <- intLoadCode (MOV (intFormat width)) addr
+ let platform = targetPlatform dflags
+
+ return (load_code (getRegisterReg platform (CmmLocal dst)))
+
+genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
+ code <- assignMem_IntCode (intFormat width) addr val
+ return $ code `snocOL` MFENCE
+
+genCCall' dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do
+ -- On x86 we don't have enough registers to use cmpxchg with a
+ -- complicated addressing mode, so on that architecture we
+ -- pre-compute the address first.
+ Amode amode addr_code <- getSimpleAmode dflags is32Bit addr
+ newval <- getNewRegNat format
+ newval_code <- getAnyReg new
+ oldval <- getNewRegNat format
+ oldval_code <- getAnyReg old
+ let platform = targetPlatform dflags
+ dst_r = getRegisterReg platform (CmmLocal dst)
+ code = toOL
+ [ MOV format (OpReg oldval) (OpReg eax)
+ , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
+ , MOV format (OpReg eax) (OpReg dst_r)
+ ]
+ return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
+ `appOL` code
+ where
+ format = intFormat width
+
+genCCall' _ is32Bit target dest_regs args bid = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ case (target, dest_regs) of
+ -- void return type prim op
+ (PrimTarget op, []) ->
+ outOfLineCmmOp bid op Nothing args
+ -- we only cope with a single result for foreign calls
+ (PrimTarget op, [r]) -> case op of
+ MO_F32_Fabs -> case args of
+ [x] -> sse2FabsCode W32 x
+ _ -> panic "genCCall: Wrong number of arguments for fabs"
+ MO_F64_Fabs -> case args of
+ [x] -> sse2FabsCode W64 x
+ _ -> panic "genCCall: Wrong number of arguments for fabs"
+
+ MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args
+ MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args
+ _other_op -> outOfLineCmmOp bid op (Just r) args
+
+ where
+ actuallyInlineSSE2Op = actuallyInlineFloatOp'
+
+ actuallyInlineFloatOp' instr format [x]
+ = do res <- trivialUFCode format (instr format) x
+ any <- anyReg res
+ return (any (getRegisterReg platform (CmmLocal r)))
+
+ actuallyInlineFloatOp' _ _ args
+ = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! ("
+ ++ show (length args) ++ ")"
+
+ sse2FabsCode :: Width -> CmmExpr -> NatM InstrBlock
+ sse2FabsCode w x = do
+ let fmt = floatFormat w
+ x_code <- getAnyReg x
+ let
+ const | FF32 <- fmt = CmmInt 0x7fffffff W32
+ | otherwise = CmmInt 0x7fffffffffffffff W64
+ Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
+ tmp <- getNewRegNat fmt
+ let
+ code dst = x_code dst `appOL` amode_code `appOL` toOL [
+ MOV fmt (OpAddr amode) (OpReg tmp),
+ AND fmt (OpReg tmp) (OpReg dst)
+ ]
+
+ return $ code (getRegisterReg platform (CmmLocal r))
+
+ (PrimTarget (MO_S_QuotRem width), _) -> divOp1 platform True width dest_regs args
+ (PrimTarget (MO_U_QuotRem width), _) -> divOp1 platform False width dest_regs args
+ (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args
+ (PrimTarget (MO_Add2 width), [res_h, res_l]) ->
+ case args of
+ [arg_x, arg_y] ->
+ do hCode <- getAnyReg (CmmLit (CmmInt 0 width))
+ let format = intFormat width
+ lCode <- anyReg =<< trivialCode width (ADD_CC format)
+ (Just (ADD_CC format)) arg_x arg_y
+ let reg_l = getRegisterReg platform (CmmLocal res_l)
+ reg_h = getRegisterReg platform (CmmLocal res_h)
+ code = hCode reg_h `appOL`
+ lCode reg_l `snocOL`
+ ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
+ return code
+ _ -> panic "genCCall: Wrong number of arguments/results for add2"
+ (PrimTarget (MO_AddWordC width), [res_r, res_c]) ->
+ addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args
+ (PrimTarget (MO_SubWordC width), [res_r, res_c]) ->
+ addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args
+ (PrimTarget (MO_AddIntC width), [res_r, res_c]) ->
+ addSubIntC platform ADD_CC (Just . ADD_CC) OFLO width res_r res_c args
+ (PrimTarget (MO_SubIntC width), [res_r, res_c]) ->
+ addSubIntC platform SUB_CC (const Nothing) OFLO width res_r res_c args
+ (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) ->
+ case args of
+ [arg_x, arg_y] ->
+ do (y_reg, y_code) <- getRegOrMem arg_y
+ x_code <- getAnyReg arg_x
+ let format = intFormat width
+ reg_h = getRegisterReg platform (CmmLocal res_h)
+ reg_l = getRegisterReg platform (CmmLocal res_l)
+ code = y_code `appOL`
+ x_code rax `appOL`
+ toOL [MUL2 format y_reg,
+ MOV format (OpReg rdx) (OpReg reg_h),
+ MOV format (OpReg rax) (OpReg reg_l)]
+ return code
+ _ -> panic "genCCall: Wrong number of arguments/results for mul2"
+ (PrimTarget (MO_S_Mul2 width), [res_c, res_h, res_l]) ->
+ case args of
+ [arg_x, arg_y] ->
+ do (y_reg, y_code) <- getRegOrMem arg_y
+ x_code <- getAnyReg arg_x
+ reg_tmp <- getNewRegNat II8
+ let format = intFormat width
+ reg_h = getRegisterReg platform (CmmLocal res_h)
+ reg_l = getRegisterReg platform (CmmLocal res_l)
+ reg_c = getRegisterReg platform (CmmLocal res_c)
+ code = y_code `appOL`
+ x_code rax `appOL`
+ toOL [ IMUL2 format y_reg
+ , MOV format (OpReg rdx) (OpReg reg_h)
+ , MOV format (OpReg rax) (OpReg reg_l)
+ , SETCC CARRY (OpReg reg_tmp)
+ , MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
+ ]
+ return code
+ _ -> panic "genCCall: Wrong number of arguments/results for imul2"
+
+ _ -> if is32Bit
+ then genCCall32' dflags target dest_regs args
+ else genCCall64' dflags target dest_regs args
+
+ where divOp1 platform signed width results [arg_x, arg_y]
+ = divOp platform signed width results Nothing arg_x arg_y
+ divOp1 _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments for divOp1"
+ divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y]
+ = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y
+ divOp2 _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments for divOp2"
+
+ -- See Note [DIV/IDIV for bytes]
+ divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y =
+ let widen | signed = MO_SS_Conv W8 W16
+ | otherwise = MO_UU_Conv W8 W16
+ arg_x_low_16 = CmmMachOp widen [arg_x_low]
+ arg_y_16 = CmmMachOp widen [arg_y]
+ m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high
+ in divOp
+ platform signed W16 [res_q, res_r]
+ m_arg_x_high_16 arg_x_low_16 arg_y_16
+
+ divOp platform signed width [res_q, res_r]
+ m_arg_x_high arg_x_low arg_y
+ = do let format = intFormat width
+ reg_q = getRegisterReg platform (CmmLocal res_q)
+ reg_r = getRegisterReg platform (CmmLocal res_r)
+ widen | signed = CLTD format
+ | otherwise = XOR format (OpReg rdx) (OpReg rdx)
+ instr | signed = IDIV
+ | otherwise = DIV
+ (y_reg, y_code) <- getRegOrMem arg_y
+ x_low_code <- getAnyReg arg_x_low
+ x_high_code <- case m_arg_x_high of
+ Just arg_x_high ->
+ getAnyReg arg_x_high
+ Nothing ->
+ return $ const $ unitOL widen
+ return $ y_code `appOL`
+ x_low_code rax `appOL`
+ x_high_code rdx `appOL`
+ toOL [instr format y_reg,
+ MOV format (OpReg rax) (OpReg reg_q),
+ MOV format (OpReg rdx) (OpReg reg_r)]
+ divOp _ _ _ _ _ _ _
+ = panic "genCCall: Wrong number of results for divOp"
+
+ addSubIntC platform instr mrevinstr cond width
+ res_r res_c [arg_x, arg_y]
+ = do let format = intFormat width
+ rCode <- anyReg =<< trivialCode width (instr format)
+ (mrevinstr format) arg_x arg_y
+ reg_tmp <- getNewRegNat II8
+ let reg_c = getRegisterReg platform (CmmLocal res_c)
+ reg_r = getRegisterReg platform (CmmLocal res_r)
+ code = rCode reg_r `snocOL`
+ SETCC cond (OpReg reg_tmp) `snocOL`
+ MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
+
+ return code
+ addSubIntC _ _ _ _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments/results for addSubIntC"
+
+-- Note [DIV/IDIV for bytes]
+--
+-- IDIV reminder:
+-- Size Dividend Divisor Quotient Remainder
+-- byte %ax r/m8 %al %ah
+-- word %dx:%ax r/m16 %ax %dx
+-- dword %edx:%eax r/m32 %eax %edx
+-- qword %rdx:%rax r/m64 %rax %rdx
+--
+-- We do a special case for the byte division because the current
+-- codegen doesn't deal well with accessing %ah register (also,
+-- accessing %ah in 64-bit mode is complicated because it cannot be an
+-- operand of many instructions). So we just widen operands to 16 bits
+-- and get the results from %al, %dl. This is not optimal, but a few
+-- register moves are probably not a huge deal when doing division.
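+--
+-- As an illustrative sketch (not verbatim Cmm), an unsigned byte division
+--
+--   (q, r) = x `quotRem` y                 -- at W8
+--
+-- is emitted as if it had been written
+--
+--   (q, r) = zx16 x `quotRem` zx16 y       -- at W16, where zx16 is MO_UU_Conv W8 W16
+--
+-- so DIV sees a 16-bit divisor and the byte results are taken from the low
+-- halves of %ax and %dx (i.e. %al and %dl). The signed case widens with
+-- MO_SS_Conv W8 W16 instead.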
+
+genCCall32' :: DynFlags
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+genCCall32' dflags target dest_regs args = do
+ let
+ prom_args = map (maybePromoteCArg dflags W32) args
+
+ -- Align stack to 16n for calls, assuming a starting stack
+            -- alignment of 16n - word_size on procedure entry, which we
+            -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
+ sizes = map (arg_size_bytes . cmmExprType dflags) (reverse args)
+ raw_arg_size = sum sizes + wORD_SIZE dflags
+ arg_pad_size = (roundTo 16 $ raw_arg_size) - raw_arg_size
+ tot_arg_size = raw_arg_size + arg_pad_size - wORD_SIZE dflags
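+            -- Illustrative example of the arithmetic above (4-byte word):
+            -- for two W32 arguments, sizes = [4,4], raw_arg_size = 8 + 4 = 12,
+            -- arg_pad_size = 16 - 12 = 4 and tot_arg_size = 12, so after the
+            -- 4 bytes of padding and 8 bytes of arguments %esp is 16-byte
+            -- aligned at the CALL.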
+ delta0 <- getDeltaNat
+ setDeltaNat (delta0 - arg_pad_size)
+
+ push_codes <- mapM push_arg (reverse prom_args)
+ delta <- getDeltaNat
+ MASSERT(delta == delta0 - tot_arg_size)
+
+ -- deal with static vs dynamic call targets
+ (callinsns,cconv) <-
+ case target of
+ ForeignTarget (CmmLit (CmmLabel lbl)) conv
+ -> -- ToDo: stdcall arg sizes
+ return (unitOL (CALL (Left fn_imm) []), conv)
+ where fn_imm = ImmCLbl lbl
+ ForeignTarget expr conv
+ -> do { (dyn_r, dyn_c) <- getSomeReg expr
+ ; ASSERT( isWord32 (cmmExprType dflags expr) )
+ return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) }
+ PrimTarget _
+ -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
+ ++ "probably because too many return values."
+
+ let push_code
+ | arg_pad_size /= 0
+ = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp),
+ DELTA (delta0 - arg_pad_size)]
+ `appOL` concatOL push_codes
+ | otherwise
+ = concatOL push_codes
+
+ -- Deallocate parameters after call for ccall;
+ -- but not for stdcall (callee does it)
+ --
+ -- We have to pop any stack padding we added
+ -- even if we are doing stdcall, though (#5052)
+ pop_size
+ | ForeignConvention StdCallConv _ _ _ <- cconv = arg_pad_size
+ | otherwise = tot_arg_size
+
+ call = callinsns `appOL`
+ toOL (
+ (if pop_size==0 then [] else
+ [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)])
+ ++
+ [DELTA delta0]
+ )
+ setDeltaNat delta0
+
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ let
+ -- assign the results, if necessary
+ assign_code [] = nilOL
+ assign_code [dest]
+ | isFloatType ty =
+ -- we assume SSE2
+ let tmp_amode = AddrBaseIndex (EABaseReg esp)
+ EAIndexNone
+ (ImmInt 0)
+ fmt = floatFormat w
+ in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
+ DELTA (delta0 - b),
+ X87Store fmt tmp_amode,
+ -- X87Store only supported for the CDECL ABI
+ -- NB: This code will need to be
+                        -- revisited once GHC does more work around
+ -- SIGFPE f
+ MOV fmt (OpAddr tmp_amode) (OpReg r_dest),
+ ADD II32 (OpImm (ImmInt b)) (OpReg esp),
+ DELTA delta0]
+ | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
+ MOV II32 (OpReg edx) (OpReg r_dest_hi)]
+ | otherwise = unitOL (MOV (intFormat w)
+ (OpReg eax)
+ (OpReg r_dest))
+ where
+ ty = localRegType dest
+ w = typeWidth ty
+ b = widthInBytes w
+ r_dest_hi = getHiVRegFromLo r_dest
+ r_dest = getRegisterReg platform (CmmLocal dest)
+ assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
+
+ return (push_code `appOL`
+ call `appOL`
+ assign_code dest_regs)
+
+ where
+ -- If the size is smaller than the word, we widen things (see maybePromoteCArg)
+ arg_size_bytes :: CmmType -> Int
+ arg_size_bytes ty = max (widthInBytes (typeWidth ty)) (widthInBytes (wordWidth dflags))
+
+ roundTo a x | x `mod` a == 0 = x
+ | otherwise = x + a - (x `mod` a)
+
+ push_arg :: CmmActual {-current argument-}
+ -> NatM InstrBlock -- code
+
+ push_arg arg -- we don't need the hints on x86
+ | isWord64 arg_ty = do
+ ChildCode64 code r_lo <- iselExpr64 arg
+ delta <- getDeltaNat
+ setDeltaNat (delta - 8)
+ let r_hi = getHiVRegFromLo r_lo
+ return ( code `appOL`
+ toOL [PUSH II32 (OpReg r_hi), DELTA (delta - 4),
+ PUSH II32 (OpReg r_lo), DELTA (delta - 8),
+ DELTA (delta-8)]
+ )
+
+ | isFloatType arg_ty = do
+ (reg, code) <- getSomeReg arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-size)
+ return (code `appOL`
+ toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp),
+ DELTA (delta-size),
+ let addr = AddrBaseIndex (EABaseReg esp)
+ EAIndexNone
+ (ImmInt 0)
+ format = floatFormat (typeWidth arg_ty)
+ in
+
+ -- assume SSE2
+ MOV format (OpReg reg) (OpAddr addr)
+
+ ]
+ )
+
+ | otherwise = do
+ -- Arguments can be smaller than 32-bit, but we still use @PUSH
+ -- II32@ - the usual calling conventions expect integers to be
+ -- 4-byte aligned.
+ ASSERT((typeWidth arg_ty) <= W32) return ()
+ (operand, code) <- getOperand arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-size)
+ return (code `snocOL`
+ PUSH II32 operand `snocOL`
+ DELTA (delta-size))
+
+ where
+ arg_ty = cmmExprType dflags arg
+ size = arg_size_bytes arg_ty -- Byte size
+
+genCCall64' :: DynFlags
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+genCCall64' dflags target dest_regs args = do
+ -- load up the register arguments
+ let prom_args = map (maybePromoteCArg dflags W32) args
+
+ (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
+ <-
+ if platformOS platform == OSMinGW32
+ then load_args_win prom_args [] [] (allArgRegs platform) nilOL
+ else do
+ (stack_args, aregs, fregs, load_args_code, assign_args_code)
+ <- load_args prom_args (allIntArgRegs platform)
+ (allFPArgRegs platform)
+ nilOL nilOL
+ let used_regs rs as = reverse (drop (length rs) (reverse as))
+ fregs_used = used_regs fregs (allFPArgRegs platform)
+ aregs_used = used_regs aregs (allIntArgRegs platform)
+ return (stack_args, aregs_used, fregs_used, load_args_code
+ , assign_args_code)
+
+ let
+ arg_regs_used = int_regs_used ++ fp_regs_used
+ arg_regs = [eax] ++ arg_regs_used
+                -- for annotating the call instruction with the registers it reads
+ sse_regs = length fp_regs_used
+ arg_stack_slots = if platformOS platform == OSMinGW32
+ then length stack_args + length (allArgRegs platform)
+ else length stack_args
+ tot_arg_size = arg_size * arg_stack_slots
+
+
+ -- Align stack to 16n for calls, assuming a starting stack
+    -- alignment of 16n - word_size on procedure entry, which we
+ -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
+ (real_size, adjust_rsp) <-
+ if (tot_arg_size + wORD_SIZE dflags) `rem` 16 == 0
+ then return (tot_arg_size, nilOL)
+ else do -- we need to adjust...
+ delta <- getDeltaNat
+ setDeltaNat (delta - wORD_SIZE dflags)
+ return (tot_arg_size + wORD_SIZE dflags, toOL [
+ SUB II64 (OpImm (ImmInt (wORD_SIZE dflags))) (OpReg rsp),
+ DELTA (delta - wORD_SIZE dflags) ])
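+    -- Illustrative example: with one stack argument tot_arg_size = 8, and
+    -- 8 + 8 = 16 is already a multiple of 16, so no adjustment is needed;
+    -- with two stack arguments 16 + 8 = 24 is not, so we subtract one extra
+    -- word from %rsp and real_size becomes 24.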
+
+ -- push the stack args, right to left
+ push_code <- push_args (reverse stack_args) nilOL
+ -- On Win64, we also have to leave stack space for the arguments
+ -- that we are passing in registers
+ lss_code <- if platformOS platform == OSMinGW32
+ then leaveStackSpace (length (allArgRegs platform))
+ else return nilOL
+ delta <- getDeltaNat
+
+ -- deal with static vs dynamic call targets
+ (callinsns,_cconv) <-
+ case target of
+ ForeignTarget (CmmLit (CmmLabel lbl)) conv
+ -> -- ToDo: stdcall arg sizes
+ return (unitOL (CALL (Left fn_imm) arg_regs), conv)
+ where fn_imm = ImmCLbl lbl
+ ForeignTarget expr conv
+ -> do (dyn_r, dyn_c) <- getSomeReg expr
+ return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
+ PrimTarget _
+ -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
+ ++ "probably because too many return values."
+
+ let
+ -- The x86_64 ABI requires us to set %al to the number of SSE2
+ -- registers that contain arguments, if the called routine
+ -- is a varargs function. We don't know whether it's a
+ -- varargs function or not, so we have to assume it is.
+ --
+ -- It's not safe to omit this assignment, even if the number
+ -- of SSE2 regs in use is zero. If %al is larger than 8
+ -- on entry to a varargs function, seg faults ensue.
+ assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
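+        -- Illustrative example (hypothetical call): for printf("%f %f", x, y)
+        -- with the two doubles passed in %xmm0/%xmm1 we have sse_regs = 2, so
+        -- assign_eax emits  movl $2, %eax  just before the CALL.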
+
+ let call = callinsns `appOL`
+ toOL (
+ -- Deallocate parameters after call for ccall;
+ -- stdcall has callee do it, but is not supported on
+ -- x86_64 target (see #3336)
+ (if real_size==0 then [] else
+ [ADD (intFormat (wordWidth dflags)) (OpImm (ImmInt real_size)) (OpReg esp)])
+ ++
+ [DELTA (delta + real_size)]
+ )
+ setDeltaNat (delta + real_size)
+
+ let
+ -- assign the results, if necessary
+ assign_code [] = nilOL
+ assign_code [dest] =
+ case typeWidth rep of
+ W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
+ (OpReg xmm0)
+ (OpReg r_dest))
+ W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
+ (OpReg xmm0)
+ (OpReg r_dest))
+ _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
+ where
+ rep = localRegType dest
+ r_dest = getRegisterReg platform (CmmLocal dest)
+ assign_code _many = panic "genCCall.assign_code many"
+
+ return (adjust_rsp `appOL`
+ push_code `appOL`
+ load_args_code `appOL`
+ assign_args_code `appOL`
+ lss_code `appOL`
+ assign_eax sse_regs `appOL`
+ call `appOL`
+ assign_code dest_regs)
+
+ where platform = targetPlatform dflags
+ arg_size = 8 -- always, at the mo
+
+
+ load_args :: [CmmExpr]
+ -> [Reg] -- int regs avail for args
+ -> [Reg] -- FP regs avail for args
+ -> InstrBlock -- code computing args
+ -> InstrBlock -- code assigning args to ABI regs
+ -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
+ -- no more regs to use
+ load_args args [] [] code acode =
+ return (args, [], [], code, acode)
+
+ -- no more args to push
+ load_args [] aregs fregs code acode =
+ return ([], aregs, fregs, code, acode)
+
+ load_args (arg : rest) aregs fregs code acode
+ | isFloatType arg_rep = case fregs of
+ [] -> push_this_arg
+ (r:rs) -> do
+ (code',acode') <- reg_this_arg r
+ load_args rest aregs rs code' acode'
+ | otherwise = case aregs of
+ [] -> push_this_arg
+ (r:rs) -> do
+ (code',acode') <- reg_this_arg r
+ load_args rest rs fregs code' acode'
+ where
+
+ -- put arg into the list of stack pushed args
+ push_this_arg = do
+ (args',ars,frs,code',acode')
+ <- load_args rest aregs fregs code acode
+ return (arg:args', ars, frs, code', acode')
+
+ -- pass the arg into the given register
+ reg_this_arg r
+ -- "operand" args can be directly assigned into r
+ | isOperand False arg = do
+ arg_code <- getAnyReg arg
+ return (code, (acode `appOL` arg_code r))
+ -- The last non-operand arg can be directly assigned after its
+ -- computation without going into a temporary register
+ | all (isOperand False) rest = do
+ arg_code <- getAnyReg arg
+ return (code `appOL` arg_code r,acode)
+
+ -- other args need to be computed beforehand to avoid clobbering
+ -- previously assigned registers used to pass parameters (see
+ -- #11792, #12614). They are assigned into temporary registers
+ -- and get assigned to proper call ABI registers after they all
+ -- have been computed.
+ | otherwise = do
+ arg_code <- getAnyReg arg
+ tmp <- getNewRegNat arg_fmt
+ let
+ code' = code `appOL` arg_code tmp
+ acode' = acode `snocOL` reg2reg arg_fmt tmp r
+ return (code',acode')
+
+ arg_rep = cmmExprType dflags arg
+ arg_fmt = cmmTypeFormat arg_rep
+
+ load_args_win :: [CmmExpr]
+ -> [Reg] -- used int regs
+ -> [Reg] -- used FP regs
+ -> [(Reg, Reg)] -- (int, FP) regs avail for args
+ -> InstrBlock
+ -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
+ load_args_win args usedInt usedFP [] code
+ = return (args, usedInt, usedFP, code, nilOL)
+ -- no more regs to use
+ load_args_win [] usedInt usedFP _ code
+ = return ([], usedInt, usedFP, code, nilOL)
+ -- no more args to push
+ load_args_win (arg : rest) usedInt usedFP
+ ((ireg, freg) : regs) code
+ | isFloatType arg_rep = do
+ arg_code <- getAnyReg arg
+ load_args_win rest (ireg : usedInt) (freg : usedFP) regs
+ (code `appOL`
+ arg_code freg `snocOL`
+ -- If we are calling a varargs function
+ -- then we need to define ireg as well
+ -- as freg
+ MOV II64 (OpReg freg) (OpReg ireg))
+ | otherwise = do
+ arg_code <- getAnyReg arg
+ load_args_win rest (ireg : usedInt) usedFP regs
+ (code `appOL` arg_code ireg)
+ where
+ arg_rep = cmmExprType dflags arg
+
+ push_args [] code = return code
+ push_args (arg:rest) code
+ | isFloatType arg_rep = do
+ (arg_reg, arg_code) <- getSomeReg arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-arg_size)
+ let code' = code `appOL` arg_code `appOL` toOL [
+ SUB (intFormat (wordWidth dflags)) (OpImm (ImmInt arg_size)) (OpReg rsp),
+ DELTA (delta-arg_size),
+ MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel dflags 0))]
+ push_args rest code'
+
+ | otherwise = do
+ -- Arguments can be smaller than 64-bit, but we still use @PUSH
+ -- II64@ - the usual calling conventions expect integers to be
+ -- 8-byte aligned.
+ ASSERT(width <= W64) return ()
+ (arg_op, arg_code) <- getOperand arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-arg_size)
+ let code' = code `appOL` arg_code `appOL` toOL [
+ PUSH II64 arg_op,
+ DELTA (delta-arg_size)]
+ push_args rest code'
+ where
+ arg_rep = cmmExprType dflags arg
+ width = typeWidth arg_rep
+
+ leaveStackSpace n = do
+ delta <- getDeltaNat
+ setDeltaNat (delta - n * arg_size)
+ return $ toOL [
+ SUB II64 (OpImm (ImmInt (n * wORD_SIZE dflags))) (OpReg rsp),
+ DELTA (delta - n * arg_size)]
+
+maybePromoteCArg :: DynFlags -> Width -> CmmExpr -> CmmExpr
+maybePromoteCArg dflags wto arg
+ | wfrom < wto = CmmMachOp (MO_UU_Conv wfrom wto) [arg]
+ | otherwise = arg
+ where
+ wfrom = cmmExprWidth dflags arg
+
+outOfLineCmmOp :: BlockId -> CallishMachOp -> Maybe CmmFormal -> [CmmActual]
+ -> NatM InstrBlock
+outOfLineCmmOp bid mop res args
+ = do
+ dflags <- getDynFlags
+ targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
+ let target = ForeignTarget targetExpr
+ (ForeignConvention CCallConv [] [] CmmMayReturn)
+
+ -- We know foreign calls results in no new basic blocks, so we can ignore
+ -- the returned block id.
+ (instrs, _) <- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args)
+ return instrs
+ where
+ -- Assume we can call these functions directly, and that they're not in a dynamic library.
+      -- TODO: Why is this ok? Under linux this code will be in libm.so.
+      --       Is it because they're really implemented as primitive instructions by the assembler?? -- BL 2009/12/31
+ lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction
+
+ fn = case mop of
+ MO_F32_Sqrt -> fsLit "sqrtf"
+ MO_F32_Fabs -> fsLit "fabsf"
+ MO_F32_Sin -> fsLit "sinf"
+ MO_F32_Cos -> fsLit "cosf"
+ MO_F32_Tan -> fsLit "tanf"
+ MO_F32_Exp -> fsLit "expf"
+ MO_F32_ExpM1 -> fsLit "expm1f"
+ MO_F32_Log -> fsLit "logf"
+ MO_F32_Log1P -> fsLit "log1pf"
+
+ MO_F32_Asin -> fsLit "asinf"
+ MO_F32_Acos -> fsLit "acosf"
+ MO_F32_Atan -> fsLit "atanf"
+
+ MO_F32_Sinh -> fsLit "sinhf"
+ MO_F32_Cosh -> fsLit "coshf"
+ MO_F32_Tanh -> fsLit "tanhf"
+ MO_F32_Pwr -> fsLit "powf"
+
+ MO_F32_Asinh -> fsLit "asinhf"
+ MO_F32_Acosh -> fsLit "acoshf"
+ MO_F32_Atanh -> fsLit "atanhf"
+
+ MO_F64_Sqrt -> fsLit "sqrt"
+ MO_F64_Fabs -> fsLit "fabs"
+ MO_F64_Sin -> fsLit "sin"
+ MO_F64_Cos -> fsLit "cos"
+ MO_F64_Tan -> fsLit "tan"
+ MO_F64_Exp -> fsLit "exp"
+ MO_F64_ExpM1 -> fsLit "expm1"
+ MO_F64_Log -> fsLit "log"
+ MO_F64_Log1P -> fsLit "log1p"
+
+ MO_F64_Asin -> fsLit "asin"
+ MO_F64_Acos -> fsLit "acos"
+ MO_F64_Atan -> fsLit "atan"
+
+ MO_F64_Sinh -> fsLit "sinh"
+ MO_F64_Cosh -> fsLit "cosh"
+ MO_F64_Tanh -> fsLit "tanh"
+ MO_F64_Pwr -> fsLit "pow"
+
+ MO_F64_Asinh -> fsLit "asinh"
+ MO_F64_Acosh -> fsLit "acosh"
+ MO_F64_Atanh -> fsLit "atanh"
+
+ MO_Memcpy _ -> fsLit "memcpy"
+ MO_Memset _ -> fsLit "memset"
+ MO_Memmove _ -> fsLit "memmove"
+ MO_Memcmp _ -> fsLit "memcmp"
+
+ MO_PopCnt _ -> fsLit "popcnt"
+ MO_BSwap _ -> fsLit "bswap"
+ {- Here the C implementation is used as there is no x86
+ instruction to reverse a word's bit order.
+ -}
+ MO_BRev w -> fsLit $ bRevLabel w
+ MO_Clz w -> fsLit $ clzLabel w
+ MO_Ctz _ -> unsupported
+
+ MO_Pdep w -> fsLit $ pdepLabel w
+ MO_Pext w -> fsLit $ pextLabel w
+
+ MO_AtomicRMW _ _ -> fsLit "atomicrmw"
+ MO_AtomicRead _ -> fsLit "atomicread"
+ MO_AtomicWrite _ -> fsLit "atomicwrite"
+ MO_Cmpxchg _ -> fsLit "cmpxchg"
+
+ MO_UF_Conv _ -> unsupported
+
+ MO_S_Mul2 {} -> unsupported
+ MO_S_QuotRem {} -> unsupported
+ MO_U_QuotRem {} -> unsupported
+ MO_U_QuotRem2 {} -> unsupported
+ MO_Add2 {} -> unsupported
+ MO_AddIntC {} -> unsupported
+ MO_SubIntC {} -> unsupported
+ MO_AddWordC {} -> unsupported
+ MO_SubWordC {} -> unsupported
+ MO_U_Mul2 {} -> unsupported
+ MO_ReadBarrier -> unsupported
+ MO_WriteBarrier -> unsupported
+ MO_Touch -> unsupported
+ (MO_Prefetch_Data _ ) -> unsupported
+ unsupported = panic ("outOfLineCmmOp: " ++ show mop
+ ++ " not supported here")
+
+-- -----------------------------------------------------------------------------
+-- Generating a table-branch
+
+genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
+
+genSwitch dflags expr targets
+ | positionIndependent dflags
+ = do
+ (reg,e_code) <- getNonClobberedReg (cmmOffset dflags expr offset)
+ -- getNonClobberedReg because it needs to survive across t_code
+ lbl <- getNewLabelNat
+ dflags <- getDynFlags
+ let is32bit = target32Bit (targetPlatform dflags)
+ os = platformOS (targetPlatform dflags)
+ -- Might want to use .rodata.<function we're in> instead, but as
+ -- long as it's something unique it'll work out since the
+ -- references to the jump table are in the appropriate section.
+ rosection = case os of
+ -- on Mac OS X/x86_64, put the jump table in the text section to
+ -- work around a limitation of the linker.
+ -- ld64 is unable to handle the relocations for
+ -- .quad L1 - L0
+ -- if L0 is not preceded by a non-anonymous label in its section.
+ OSDarwin | not is32bit -> Section Text lbl
+ _ -> Section ReadOnlyData lbl
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ (tableReg,t_code) <- getSomeReg $ dynRef
+ let op = OpAddr (AddrBaseIndex (EABaseReg tableReg)
+ (EAIndex reg (wORD_SIZE dflags)) (ImmInt 0))
+
+ offsetReg <- getNewRegNat (intFormat (wordWidth dflags))
+ return $ if is32bit || os == OSDarwin
+ then e_code `appOL` t_code `appOL` toOL [
+ ADD (intFormat (wordWidth dflags)) op (OpReg tableReg),
+ JMP_TBL (OpReg tableReg) ids rosection lbl
+ ]
+ else -- HACK: On x86_64 binutils<2.17 is only able to generate
+ -- PC32 relocations, hence we only get 32-bit offsets in
+ -- the jump table. As these offsets are always negative
+ -- we need to properly sign extend them to 64-bit. This
+ -- hack should be removed in conjunction with the hack in
+ -- PprMach.hs/pprDataItem once binutils 2.17 is standard.
+ e_code `appOL` t_code `appOL` toOL [
+ MOVSxL II32 op (OpReg offsetReg),
+ ADD (intFormat (wordWidth dflags))
+ (OpReg offsetReg)
+ (OpReg tableReg),
+ JMP_TBL (OpReg tableReg) ids rosection lbl
+ ]
+ | otherwise
+ = do
+ (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
+ lbl <- getNewLabelNat
+ let op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg (wORD_SIZE dflags)) (ImmCLbl lbl))
+ code = e_code `appOL` toOL [
+ JMP_TBL op ids (Section ReadOnlyData lbl) lbl
+ ]
+ return code
+ where
+ (offset, blockIds) = switchTargetsToTable targets
+ ids = map (fmap DestBlockId) blockIds
+
+generateJumpTableForInstr :: DynFlags -> Instr -> Maybe (NatCmmDecl (Alignment, RawCmmStatics) Instr)
+generateJumpTableForInstr dflags (JMP_TBL _ ids section lbl)
+ = let getBlockId (DestBlockId id) = id
+ getBlockId _ = panic "Non-Label target in Jump Table"
+ blockIds = map (fmap getBlockId) ids
+ in Just (createJumpTable dflags blockIds section lbl)
+generateJumpTableForInstr _ _ = Nothing
+
+createJumpTable :: DynFlags -> [Maybe BlockId] -> Section -> CLabel
+ -> GenCmmDecl (Alignment, RawCmmStatics) h g
+createJumpTable dflags ids section lbl
+ = let jumpTable
+ | positionIndependent dflags =
+ let ww = wordWidth dflags
+ jumpTableEntryRel Nothing
+ = CmmStaticLit (CmmInt 0 ww)
+ jumpTableEntryRel (Just blockid)
+ = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 ww)
+ where blockLabel = blockLbl blockid
+ in map jumpTableEntryRel ids
+ | otherwise = map (jumpTableEntry dflags) ids
+ in CmmData section (mkAlignment 1, RawCmmStatics lbl jumpTable)
+
+extractUnwindPoints :: [Instr] -> [UnwindPoint]
+extractUnwindPoints instrs =
+ [ UnwindPoint lbl unwinds | UNWIND lbl unwinds <- instrs]
+
+-- -----------------------------------------------------------------------------
+-- 'condIntReg' and 'condFltReg': condition codes into registers
+
+-- Turn those condition codes into integers now (when they appear on
+-- the right hand side of an assignment).
+--
+-- (If applicable) Do not fill the delay slots here; you will confuse the
+-- register allocator.
+
+condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
+
+condIntReg cond x y = do
+ CondCode _ cond cond_code <- condIntCode cond x y
+ tmp <- getNewRegNat II8
+ let
+ code dst = cond_code `appOL` toOL [
+ SETCC cond (OpReg tmp),
+ MOVZxL II8 (OpReg tmp) (OpReg dst)
+ ]
+ return (Any II32 code)
+
+
+-----------------------------------------------------------
+--- Note [SSE Parity Checks] ---
+-----------------------------------------------------------
+
+-- We have to worry about unordered operands (eg. comparisons
+-- against NaN). If the operands are unordered, the comparison
+-- sets the parity flag, carry flag and zero flag.
+-- All comparisons are supposed to return false for unordered
+-- operands except for !=, which returns true.
+--
+-- Optimisation: we don't have to test the parity flag if we
+-- know the test has already excluded the unordered case: eg >
+-- and >= test for a zero carry flag, which can only occur for
+-- ordered operands.
+--
+-- By reversing comparisons we can avoid testing the parity
+-- for < and <= as well. If any of the arguments is a NaN we
+-- return false either way. If both arguments are valid then
+-- x <= y <-> y >= x holds. So it's safe to swap these.
+--
+-- We invert the condition inside getRegister' and getCondCode,
+-- which should cover all invertible cases.
+-- All other functions translating FP comparisons to assembly
+-- use these two to generate the comparison code.
+--
+-- As an example consider a simple check:
+--
+-- func :: Float -> Float -> Int
+-- func x y = if x < y then 1 else 0
+--
+-- Which in Cmm gives the floating point comparison.
+--
+-- if (%MO_F_Lt_W32(F1, F2)) goto c2gg; else goto c2gf;
+--
+-- We used to compile this to an assembly code block like this:
+-- _c2gh:
+-- ucomiss %xmm2,%xmm1
+-- jp _c2gf
+-- jb _c2gg
+-- jmp _c2gf
+--
+-- Here we have to introduce an explicit
+-- check for unordered results (using jp, jump on parity).
+--
+-- We can avoid this by exchanging the arguments and inverting the direction
+-- of the comparison. This results in the sequence:
+--
+-- ucomiss %xmm1,%xmm2
+-- ja _c2g2
+-- jmp _c2g1
+--
+-- Removing the jump reduces the pressure on the branch prediction system
+-- and plays better with the uOP cache.
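+--
+-- As a further illustrative sketch, for an ordered comparison such as x == y
+-- the and_ordered case below emits roughly
+--
+--     sete    %tmp1
+--     setnp   %tmp2
+--     andb    %tmp1,%tmp2
+--     movzbl  %tmp2,%dst
+--
+-- after the ucomiss: the setnp/andb pair forces the result to 0 whenever the
+-- operands were unordered (a NaN sets the parity flag).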
+
+condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
+condFltReg is32Bit cond x y = condFltReg_sse2
+ where
+
+
+ condFltReg_sse2 = do
+ CondCode _ cond cond_code <- condFltCode cond x y
+ tmp1 <- getNewRegNat (archWordFormat is32Bit)
+ tmp2 <- getNewRegNat (archWordFormat is32Bit)
+ let -- See Note [SSE Parity Checks]
+ code dst =
+ cond_code `appOL`
+ (case cond of
+ NE -> or_unordered dst
+ GU -> plain_test dst
+ GEU -> plain_test dst
+ -- Use ASSERT so we don't break releases if these creep in.
+ LTT -> ASSERT2(False, ppr "Should have been turned into >")
+ and_ordered dst
+ LE -> ASSERT2(False, ppr "Should have been turned into >=")
+ and_ordered dst
+ _ -> and_ordered dst)
+
+ plain_test dst = toOL [
+ SETCC cond (OpReg tmp1),
+ MOVZxL II8 (OpReg tmp1) (OpReg dst)
+ ]
+ or_unordered dst = toOL [
+ SETCC cond (OpReg tmp1),
+ SETCC PARITY (OpReg tmp2),
+ OR II8 (OpReg tmp1) (OpReg tmp2),
+ MOVZxL II8 (OpReg tmp2) (OpReg dst)
+ ]
+ and_ordered dst = toOL [
+ SETCC cond (OpReg tmp1),
+ SETCC NOTPARITY (OpReg tmp2),
+ AND II8 (OpReg tmp1) (OpReg tmp2),
+ MOVZxL II8 (OpReg tmp2) (OpReg dst)
+ ]
+ return (Any II32 code)
+
+
+-- -----------------------------------------------------------------------------
+-- 'trivial*Code': deal with trivial instructions
+
+-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
+-- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions.
+-- Only look for constants on the right hand side, because that's
+-- where the generic optimizer will have put them.
+
+-- Similarly, for unary instructions, we don't have to worry about
+-- matching an StInt as the argument, because genericOpt will already
+-- have handled the constant-folding.
+
+
+{-
+The Rules of the Game are:
+
+* You cannot assume anything about the destination register dst;
+ it may be anything, including a fixed reg.
+
+* You may compute an operand into a fixed reg, but you may not
+ subsequently change the contents of that fixed reg. If you
+ want to do so, first copy the value either to a temporary
+ or into dst. You are free to modify dst even if it happens
+ to be a fixed reg -- that's not your problem.
+
+* You cannot assume that a fixed reg will stay live over an
+ arbitrary computation. The same applies to the dst reg.
+
+* Temporary regs obtained from getNewRegNat are distinct from
+ each other and from all other regs, and stay live over
+ arbitrary computations.
+
+--------------------
+
+SDM's version of The Rules:
+
+* If getRegister returns Any, that means it can generate correct
+ code which places the result in any register, period. Even if that
+ register happens to be read during the computation.
+
+ Corollary #1: this means that if you are generating code for an
+ operation with two arbitrary operands, you cannot assign the result
+ of the first operand into the destination register before computing
+ the second operand. The second operand might require the old value
+ of the destination register.
+
+ Corollary #2: A function might be able to generate more efficient
+ code if it knows the destination register is a new temporary (and
+ therefore not read by any of the sub-computations).
+
+* If getRegister returns Any, then the code it generates may modify only:
+ (a) fresh temporaries
+ (b) the destination register
+ (c) known registers (eg. %ecx is used by shifts)
+ In particular, it may *not* modify global registers, unless the global
+ register happens to be the destination register.
+-}
+
+trivialCode :: Width -> (Operand -> Operand -> Instr)
+ -> Maybe (Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialCode width instr m a b
+ = do is32Bit <- is32BitPlatform
+ trivialCode' is32Bit width instr m a b
+
+trivialCode' :: Bool -> Width -> (Operand -> Operand -> Instr)
+ -> Maybe (Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialCode' is32Bit width _ (Just revinstr) (CmmLit lit_a) b
+ | is32BitLit is32Bit lit_a = do
+ b_code <- getAnyReg b
+ let
+ code dst
+ = b_code dst `snocOL`
+ revinstr (OpImm (litToImm lit_a)) (OpReg dst)
+ return (Any (intFormat width) code)
+
+trivialCode' _ width instr _ a b
+ = genTrivialCode (intFormat width) instr a b
+
+-- This is re-used for floating pt instructions too.
+genTrivialCode :: Format -> (Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+genTrivialCode rep instr a b = do
+ (b_op, b_code) <- getNonClobberedOperand b
+ a_code <- getAnyReg a
+ tmp <- getNewRegNat rep
+ let
+ -- We want the value of b to stay alive across the computation of a.
+ -- But, we want to calculate a straight into the destination register,
+ -- because the instruction only has two operands (dst := dst `op` src).
+ -- The troublesome case is when the result of b is in the same register
+ -- as the destination reg. In this case, we have to save b in a
+ -- new temporary across the computation of a.
+ code dst
+ | dst `regClashesWithOp` b_op =
+ b_code `appOL`
+ unitOL (MOV rep b_op (OpReg tmp)) `appOL`
+ a_code dst `snocOL`
+ instr (OpReg tmp) (OpReg dst)
+ | otherwise =
+ b_code `appOL`
+ a_code dst `snocOL`
+ instr b_op (OpReg dst)
+ return (Any rep code)
+
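+-- A sketch of the two cases above (operand names illustrative): when the
+-- register chosen for dst clashes with b_op we emit
+--      <b_code>; MOV rep b_op (OpReg tmp); <a_code dst>; instr (OpReg tmp) (OpReg dst)
+-- whereas in the non-clashing case we emit simply
+--      <b_code>; <a_code dst>; instr b_op (OpReg dst)
+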
+regClashesWithOp :: Reg -> Operand -> Bool
+reg `regClashesWithOp` OpReg reg2 = reg == reg2
+reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
+_ `regClashesWithOp` _ = False
+
+-----------
+
+trivialUCode :: Format -> (Operand -> Instr)
+ -> CmmExpr -> NatM Register
+trivialUCode rep instr x = do
+ x_code <- getAnyReg x
+ let
+ code dst =
+ x_code dst `snocOL`
+ instr (OpReg dst)
+ return (Any rep code)
+
+-----------
+
+
+trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialFCode_sse2 pk instr x y
+ = genTrivialCode format (instr format) x y
+ where format = floatFormat pk
+
+
+trivialUFCode :: Format -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register
+trivialUFCode format instr x = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ code dst =
+ x_code `snocOL`
+ instr x_reg dst
+ return (Any format code)
+
+
+--------------------------------------------------------------------------------
+coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
+coerceInt2FP from to x = coerce_sse2
+ where
+
+ coerce_sse2 = do
+ (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
+ let
+ opc = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
+ n -> panic $ "coerceInt2FP.sse: unhandled width ("
+ ++ show n ++ ")"
+ code dst = x_code `snocOL` opc (intFormat from) x_op dst
+ return (Any (floatFormat to) code)
+ -- works even if the destination rep is <II32
+
+--------------------------------------------------------------------------------
+coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
+coerceFP2Int from to x = coerceFP2Int_sse2
+ where
+ coerceFP2Int_sse2 = do
+ (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
+ let
+ opc = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ;
+                 n -> panic $ "coerceFP2Int.sse: unhandled width ("
+ ++ show n ++ ")"
+ code dst = x_code `snocOL` opc (intFormat to) x_op dst
+ return (Any (intFormat to) code)
+ -- works even if the destination rep is <II32
+
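+-- For example (values illustrative): the SSE2 CVTT* conversions truncate
+-- towards zero, so coercing the double 3.9 to an integer yields 3 and
+-- coercing -3.9 yields -3; they do not round to nearest.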
+
+--------------------------------------------------------------------------------
+coerceFP2FP :: Width -> CmmExpr -> NatM Register
+coerceFP2FP to x = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ opc = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
+ n -> panic $ "coerceFP2FP: unhandled width ("
+ ++ show n ++ ")"
+ code dst = x_code `snocOL` opc x_reg dst
+ return (Any ( floatFormat to) code)
+
+--------------------------------------------------------------------------------
+
+sse2NegCode :: Width -> CmmExpr -> NatM Register
+sse2NegCode w x = do
+ let fmt = floatFormat w
+ x_code <- getAnyReg x
+ -- This is how gcc does it, so it can't be that bad:
+ let
+ const = case fmt of
+ FF32 -> CmmInt 0x80000000 W32
+ FF64 -> CmmInt 0x8000000000000000 W64
+ x@II8 -> wrongFmt x
+ x@II16 -> wrongFmt x
+ x@II32 -> wrongFmt x
+ x@II64 -> wrongFmt x
+
+ where
+ wrongFmt x = panic $ "sse2NegCode: " ++ show x
+ Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
+ tmp <- getNewRegNat fmt
+ let
+ code dst = x_code dst `appOL` amode_code `appOL` toOL [
+ MOV fmt (OpAddr amode) (OpReg tmp),
+ XOR fmt (OpReg tmp) (OpReg dst)
+ ]
+ --
+ return (Any fmt code)
+
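+-- Sketch of the sequence emitted above (registers illustrative): for a W64
+-- input we load the constant 0x8000000000000000 into a fresh temporary and
+-- XOR it into dst, flipping only the IEEE-754 sign bit, so 1.5 becomes -1.5
+-- and 0.0 becomes -0.0.
+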
+isVecExpr :: CmmExpr -> Bool
+isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True
+isVecExpr (CmmMachOp _ [e]) = isVecExpr e
+isVecExpr _ = False
+
+needLlvm :: NatM a
+needLlvm =
+ sorry $ unlines ["The native code generator does not support vector"
+ ,"instructions. Please use -fllvm."]
+
+-- | This works on the invariant that all jumps in the given blocks are required.
+-- Starting from there we try to make a few more jumps redundant by reordering
+-- them.
+-- We depend on the information in the CFG to do so, so without a given CFG
+-- we do nothing.
+invertCondBranches :: Maybe CFG -- ^ CFG if present
+ -> LabelMap a -- ^ Blocks with info tables
+ -> [NatBasicBlock Instr] -- ^ List of basic blocks
+ -> [NatBasicBlock Instr]
+invertCondBranches Nothing _ bs = bs
+invertCondBranches (Just cfg) keep bs =
+ invert bs
+ where
+ invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr]
+ invert ((BasicBlock lbl1 ins@(_:_:_xs)):b2@(BasicBlock lbl2 _):bs)
+ | --pprTrace "Block" (ppr lbl1) True,
+ (jmp1,jmp2) <- last2 ins
+ , JXX cond1 target1 <- jmp1
+ , target1 == lbl2
+ --, pprTrace "CutChance" (ppr b1) True
+ , JXX ALWAYS target2 <- jmp2
+      -- We have enough information to check if we can perform the inversion
+      -- TODO: We could also check for the last asm instruction which sets
+      -- status flags instead. I suspect that is worse in terms of compiler
+      -- performance, but it might be applicable to more cases.
+ , Just edgeInfo1 <- getEdgeInfo lbl1 target1 cfg
+ , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg
+ -- Both jumps come from the same cmm statement
+ , transitionSource edgeInfo1 == transitionSource edgeInfo2
+ , CmmSource {trans_cmmNode = cmmCondBranch} <- transitionSource edgeInfo1
+
+      -- Int comparisons are invertible
+ , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch
+ , Just _ <- maybeIntComparison op
+ , Just invCond <- maybeInvertCond cond1
+
+ --Swap the last two jumps, invert the conditional jumps condition.
+ = let jumps =
+ case () of
+              -- We are free to eliminate the jmp, so we do so.
+ _ | not (mapMember target1 keep)
+ -> [JXX invCond target2]
+ -- If the conditional target is unlikely we put the other
+ -- target at the front.
+ | edgeWeight edgeInfo2 > edgeWeight edgeInfo1
+ -> [JXX invCond target2, JXX ALWAYS target1]
+ -- Keep things as-is otherwise
+ | otherwise
+ -> [jmp1, jmp2]
+ in --pprTrace "Cutable" (ppr [jmp1,jmp2] <+> text "=>" <+> ppr jumps) $
+ (BasicBlock lbl1
+ (dropTail 2 ins ++ jumps))
+ : invert (b2:bs)
+ invert (b:bs) = b : invert bs
+ invert [] = []
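+
+-- An illustrative run (block labels hypothetical): suppose block L1 ends with
+--      JXX LTT L2 ; JXX ALWAYS L3
+-- and L2 is the block laid out directly after L1. If L2 carries no info table
+-- (so we may fall through into it) we emit just
+--      JXX GE L3
+-- and fall through into L2; otherwise, if the edge to L3 is the heavier one,
+-- we emit
+--      JXX GE L3 ; JXX ALWAYS L2
+-- and we leave the two jumps unchanged in the remaining case.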
diff --git a/compiler/GHC/CmmToAsm/X86/Cond.hs b/compiler/GHC/CmmToAsm/X86/Cond.hs
new file mode 100644
index 0000000000..bb8f61438b
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Cond.hs
@@ -0,0 +1,109 @@
+module GHC.CmmToAsm.X86.Cond (
+ Cond(..),
+ condUnsigned,
+ condToSigned,
+ condToUnsigned,
+ maybeFlipCond,
+ maybeInvertCond
+)
+
+where
+
+import GhcPrelude
+
+data Cond
+ = ALWAYS -- What's really used? ToDo
+ | EQQ
+ | GE
+ | GEU
+ | GTT
+ | GU
+ | LE
+ | LEU
+ | LTT
+ | LU
+ | NE
+ | NEG
+ | POS
+ | CARRY
+ | OFLO
+ | PARITY
+ | NOTPARITY
+ deriving Eq
+
+condUnsigned :: Cond -> Bool
+condUnsigned GU = True
+condUnsigned LU = True
+condUnsigned GEU = True
+condUnsigned LEU = True
+condUnsigned _ = False
+
+
+condToSigned :: Cond -> Cond
+condToSigned GU = GTT
+condToSigned LU = LTT
+condToSigned GEU = GE
+condToSigned LEU = LE
+condToSigned x = x
+
+
+condToUnsigned :: Cond -> Cond
+condToUnsigned GTT = GU
+condToUnsigned LTT = LU
+condToUnsigned GE = GEU
+condToUnsigned LE = LEU
+condToUnsigned x = x
+
+-- | @maybeFlipCond c@ returns @Just c'@ if it is possible to flip the
+-- arguments to the conditional @c@, and the new condition should be @c'@.
+maybeFlipCond :: Cond -> Maybe Cond
+maybeFlipCond cond = case cond of
+ EQQ -> Just EQQ
+ NE -> Just NE
+ LU -> Just GU
+ GU -> Just LU
+ LEU -> Just GEU
+ GEU -> Just LEU
+ LTT -> Just GTT
+ GTT -> Just LTT
+ LE -> Just GE
+ GE -> Just LE
+ _other -> Nothing
+
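+-- For example, a comparison tested with LTT on operands (x, y), i.e. x < y,
+-- can be tested with GTT once the operands are swapped to (y, x), so
+-- @maybeFlipCond LTT == Just GTT@; flag-only conditions such as OFLO, CARRY
+-- and PARITY have no operand-swapped counterpart and yield Nothing.
+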
+-- | If we apply @maybeInvertCond@ to the condition of a jump we turn
+-- jumps taken into jumps not taken and vice versa.
+--
+-- Careful! If the used comparison and the conditional jump
+-- don't match, the above behaviour will NOT hold.
+-- When used for FP comparisons this does not consider unordered
+-- numbers.
+-- Also inverting twice might return a synonym for the original condition.
+maybeInvertCond :: Cond -> Maybe Cond
+maybeInvertCond cond = case cond of
+ ALWAYS -> Nothing
+ EQQ -> Just NE
+ NE -> Just EQQ
+
+ NEG -> Just POS
+ POS -> Just NEG
+
+ GEU -> Just LU
+ LU -> Just GEU
+
+ GE -> Just LTT
+ LTT -> Just GE
+
+ GTT -> Just LE
+ LE -> Just GTT
+
+ GU -> Just LEU
+ LEU -> Just GU
+
+  -- GEU "==" NOTCARRY; they are synonyms
+  -- at the assembly level.
+ CARRY -> Just GEU
+
+ OFLO -> Nothing
+
+ PARITY -> Just NOTPARITY
+ NOTPARITY -> Just PARITY
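+
+-- For example, inverting CARRY yields GEU (i.e. NOTCARRY), but inverting GEU
+-- yields LU rather than CARRY, so inverting twice can land on a synonym of
+-- the original condition instead of the original constructor, as warned above.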
diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs
new file mode 100644
index 0000000000..4171806695
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Instr.hs
@@ -0,0 +1,1056 @@
+{-# LANGUAGE CPP, TypeFamilies #-}
+
+-----------------------------------------------------------------------------
+--
+-- Machine-dependent assembly language
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.X86.Instr
+ ( Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..)
+ , getJumpDestBlockId, canShortcut, shortcutStatics
+ , shortcutJump, allocMoreStack
+ , maxSpillSlots, archWordFormat
+ )
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Cond
+import GHC.CmmToAsm.X86.Regs
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Reg.Target
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Platform.Regs
+import GHC.Cmm
+import FastString
+import Outputable
+import GHC.Platform
+
+import BasicTypes (Alignment)
+import GHC.Cmm.CLabel
+import GHC.Driver.Session
+import UniqSet
+import Unique
+import UniqSupply
+import GHC.Cmm.DebugBlock (UnwindTable)
+
+import Control.Monad
+import Data.Maybe (fromMaybe)
+
+-- Format of a machine word on x86/x86_64: II32 on 32-bit targets, II64 otherwise.
+--
+archWordFormat :: Bool -> Format
+archWordFormat is32Bit
+ | is32Bit = II32
+ | otherwise = II64
+
+-- | Instruction instance for x86 instruction set.
+instance Instruction Instr where
+ regUsageOfInstr = x86_regUsageOfInstr
+ patchRegsOfInstr = x86_patchRegsOfInstr
+ isJumpishInstr = x86_isJumpishInstr
+ jumpDestsOfInstr = x86_jumpDestsOfInstr
+ patchJumpInstr = x86_patchJumpInstr
+ mkSpillInstr = x86_mkSpillInstr
+ mkLoadInstr = x86_mkLoadInstr
+ takeDeltaInstr = x86_takeDeltaInstr
+ isMetaInstr = x86_isMetaInstr
+ mkRegRegMoveInstr = x86_mkRegRegMoveInstr
+ takeRegRegMoveInstr = x86_takeRegRegMoveInstr
+ mkJumpInstr = x86_mkJumpInstr
+ mkStackAllocInstr = x86_mkStackAllocInstr
+ mkStackDeallocInstr = x86_mkStackDeallocInstr
+
+
+-- -----------------------------------------------------------------------------
+-- Intel x86 instructions
+
+{-
+Intel, in their infinite wisdom, selected a stack model for floating
+point registers on x86. That might have made sense back in 1979 --
+nowadays we can see it for the nonsense it really is. A stack model
+fits poorly with the existing nativeGen infrastructure, which assumes
+flat integer and FP register sets. Prior to this commit, nativeGen
+could not generate correct x86 FP code -- to do so would have meant
+somehow working the register-stack paradigm into the register
+allocator and spiller, which sounds very difficult.
+
+We have decided to cheat, and go for a simple fix which requires no
+infrastructure modifications, at the expense of generating ropey but
+correct FP code. All notions of the x86 FP stack and its insns have
+been removed. Instead, we pretend (to the instruction selector and
+register allocator) that x86 has six floating point registers, %fake0
+.. %fake5, which can be used in the usual flat manner. We further
+claim that x86 has floating point instructions very similar to SPARC
+and Alpha, that is, a simple 3-operand register-register arrangement.
+Code generation and register allocation proceed on this basis.
+
+When we come to print out the final assembly, our convenient fiction
+is converted to dismal reality. Each fake instruction is
+independently converted to a series of real x86 instructions.
+%fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
+arithmetic operations, the two operands are pushed onto the top of the
+FP stack, the operation done, and the result copied back into the
+relevant register. There are only six %fake registers because 2 are
+needed for the translation, and x86 has 8 in total.
+
+The translation is inefficient but is simple and it works. A cleverer
+translation would handle a sequence of insns, simulating the FP stack
+contents, would not impose a fixed mapping from %fake to %st regs, and
+hopefully could avoid most of the redundant reg-reg moves of the
+current translation.
+
+We might as well make use of whatever unique FP facilities Intel have
+chosen to bless us with (let's not be churlish, after all).
+Hence GLDZ and GLD1. Bwahahahahahahaha!
+-}
+
+{-
+Note [x86 Floating point precision]
+
+Intel's internal floating point registers are by default 80 bit
+extended precision. This means that all operations done on values in
+registers are done at 80 bits, and unless the intermediate values are
+truncated to the appropriate size (32 or 64 bits) by storing in
+memory, calculations in registers will give different results from
+calculations which pass intermediate values in memory (eg. via
+function calls).
+
+One solution is to set the FPU into 64 bit precision mode. Some OSs
+do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
+that this will only affect 64-bit precision arithmetic; 32-bit
+calculations will still be done at 64-bit precision in registers. So
+it doesn't solve the whole problem.
+
+There's also the issue of what the C library is expecting in terms of
+precision. It seems to be the case that glibc on Linux expects the
+FPU to be set to 80 bit precision, so setting it to 64 bit could have
+unexpected effects. Changing the default could have undesirable
+effects on other 3rd-party library code too, so the right thing would
+be to save/restore the FPU control word across Haskell code if we were
+to do this.
+
+gcc's -ffloat-store gives consistent results by always storing the
+results of floating-point calculations in memory, which works for both
+32 and 64-bit precision. However, it only affects the values of
+user-declared floating point variables in C, not intermediate results.
+GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
+flag).
+
+Another problem is how to spill floating point registers in the
+register allocator. Should we spill the whole 80 bits, or just 64?
+On an OS which is set to 64 bit precision, spilling 64 is fine. On
+Linux, spilling 64 bits will round the results of some operations.
+This is what gcc does. Spilling at 80 bits requires taking up a full
+128 bit slot (so we get alignment). We spill at 80-bits and ignore
+the alignment problems.
+
+In the future [edit: now available in GHC 7.0.1, with the -msse2
+flag], we'll use the SSE registers for floating point. This requires
+a CPU that supports SSE2 (ordinary SSE only supports 32 bit precision
+float ops), which means P4 or Xeon and above. Using SSE will solve
+all these problems, because the SSE registers use fixed 32 bit or 64
+bit precision.
+
+--SDM 1/2003
+-}
+
+data Instr
+ -- comment pseudo-op
+ = COMMENT FastString
+
+ -- location pseudo-op (file, line, col, name)
+ | LOCATION Int Int Int String
+
+ -- some static data spat out during code
+ -- generation. Will be extracted before
+ -- pretty-printing.
+ | LDATA Section (Alignment, RawCmmStatics)
+
+ -- start a new basic block. Useful during
+ -- codegen, removed later. Preceding
+ -- instruction should be a jump, as per the
+ -- invariants for a BasicBlock (see Cmm).
+ | NEWBLOCK BlockId
+
+ -- unwinding information
+ -- See Note [Unwinding information in the NCG].
+ | UNWIND CLabel UnwindTable
+
+        -- specify the current stack offset for the benefit of subsequent passes;
+        -- the offset is also used when emitting unwinding information.
+ | DELTA Int
+
+ -- Moves.
+ | MOV Format Operand Operand
+ | CMOV Cond Format Operand Reg
+ | MOVZxL Format Operand Operand -- format is the size of operand 1
+ | MOVSxL Format Operand Operand -- format is the size of operand 1
+ -- x86_64 note: plain mov into a 32-bit register always zero-extends
+ -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
+ -- don't affect the high bits of the register.
+
+ -- Load effective address (also a very useful three-operand add instruction :-)
+ | LEA Format Operand Operand
+
+ -- Int Arithmetic.
+ | ADD Format Operand Operand
+ | ADC Format Operand Operand
+ | SUB Format Operand Operand
+ | SBB Format Operand Operand
+
+ | MUL Format Operand Operand
+ | MUL2 Format Operand -- %edx:%eax = operand * %rax
+ | IMUL Format Operand Operand -- signed int mul
+ | IMUL2 Format Operand -- %edx:%eax = operand * %eax
+
+ | DIV Format Operand -- eax := eax:edx/op, edx := eax:edx%op
+ | IDIV Format Operand -- ditto, but signed
+
+ -- Int Arithmetic, where the effects on the condition register
+ -- are important. Used in specialized sequences such as MO_Add2.
+ -- Do not rewrite these instructions to "equivalent" ones that
+ -- have different effect on the condition register! (See #9013.)
+ | ADD_CC Format Operand Operand
+ | SUB_CC Format Operand Operand
+
+ -- Simple bit-twiddling.
+ | AND Format Operand Operand
+ | OR Format Operand Operand
+ | XOR Format Operand Operand
+ | NOT Format Operand
+ | NEGI Format Operand -- NEG instruction (name clash with Cond)
+ | BSWAP Format Reg
+
+ -- Shifts (amount may be immediate or %cl only)
+ | SHL Format Operand{-amount-} Operand
+ | SAR Format Operand{-amount-} Operand
+ | SHR Format Operand{-amount-} Operand
+
+ | BT Format Imm Operand
+ | NOP
+
+
+ -- We need to support the FSTP (x87 store and pop) instruction
+ -- so that we can correctly read off the return value of an
+        -- x86 CDECL C function call when it's a floating point value.
+        -- We don't include a register argument and just use st(0);
+        -- this instruction is used ONLY for return values of C ffi calls
+        -- in the x86_32 ABI.
+ | X87Store Format AddrMode -- st(0), dst
+
+
+ -- SSE2 floating point: we use a restricted set of the available SSE2
+ -- instructions for floating-point.
+ -- use MOV for moving (either movss or movsd (movlpd better?))
+ | CVTSS2SD Reg Reg -- F32 to F64
+ | CVTSD2SS Reg Reg -- F64 to F32
+ | CVTTSS2SIQ Format Operand Reg -- F32 to I32/I64 (with truncation)
+ | CVTTSD2SIQ Format Operand Reg -- F64 to I32/I64 (with truncation)
+ | CVTSI2SS Format Operand Reg -- I32/I64 to F32
+ | CVTSI2SD Format Operand Reg -- I32/I64 to F64
+
+ -- use ADD, SUB, and SQRT for arithmetic. In both cases, operands
+ -- are Operand Reg.
+
+ -- SSE2 floating-point division:
+ | FDIV Format Operand Operand -- divisor, dividend(dst)
+
+ -- use CMP for comparisons. ucomiss and ucomisd instructions
+ -- compare single/double prec floating point respectively.
+
+ | SQRT Format Operand Reg -- src, dst
+
+
+ -- Comparison
+ | TEST Format Operand Operand
+ | CMP Format Operand Operand
+ | SETCC Cond Operand
+
+ -- Stack Operations.
+ | PUSH Format Operand
+ | POP Format Operand
+ -- both unused (SDM):
+ -- | PUSHA
+ -- | POPA
+
+ -- Jumping around.
+ | JMP Operand [Reg] -- including live Regs at the call
+ | JXX Cond BlockId -- includes unconditional branches
+ | JXX_GBL Cond Imm -- non-local version of JXX
+ -- Table jump
+ | JMP_TBL Operand -- Address to jump to
+ [Maybe JumpDest] -- Targets of the jump table
+ Section -- Data section jump table should be put in
+ CLabel -- Label of jump table
+ -- | X86 call instruction
+ | CALL (Either Imm Reg) -- ^ Jump target
+ [Reg] -- ^ Arguments (required for register allocation)
+
+ -- Other things.
+ | CLTD Format -- sign extend %eax into %edx:%eax
+
+ | FETCHGOT Reg -- pseudo-insn for ELF position-independent code
+ -- pretty-prints as
+ -- call 1f
+ -- 1: popl %reg
+ -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
+ | FETCHPC Reg -- pseudo-insn for Darwin position-independent code
+ -- pretty-prints as
+ -- call 1f
+ -- 1: popl %reg
+
+ -- bit counting instructions
+ | POPCNT Format Operand Reg -- [SSE4.2] count number of bits set to 1
+ | LZCNT Format Operand Reg -- [BMI2] count number of leading zeros
+ | TZCNT Format Operand Reg -- [BMI2] count number of trailing zeros
+ | BSF Format Operand Reg -- bit scan forward
+ | BSR Format Operand Reg -- bit scan reverse
+
+ -- bit manipulation instructions
+ | PDEP Format Operand Operand Reg -- [BMI2] deposit bits to the specified mask
+ | PEXT Format Operand Operand Reg -- [BMI2] extract bits from the specified mask
+
+ -- prefetch
+ | PREFETCH PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
+ -- variant can be NTA, Lvl0, Lvl1, or Lvl2
+
+ | LOCK Instr -- lock prefix
+ | XADD Format Operand Operand -- src (r), dst (r/m)
+ | CMPXCHG Format Operand Operand -- src (r), dst (r/m), eax implicit
+ | MFENCE
+
+data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
+
+
+data Operand
+ = OpReg Reg -- register
+ | OpImm Imm -- immediate value
+ | OpAddr AddrMode -- memory reference
+
+
+
+-- | Returns which registers are read and written as a (read, written)
+-- pair.
+x86_regUsageOfInstr :: Platform -> Instr -> RegUsage
+x86_regUsageOfInstr platform instr
+ = case instr of
+ MOV _ src dst -> usageRW src dst
+ CMOV _ _ src dst -> mkRU (use_R src [dst]) [dst]
+ MOVZxL _ src dst -> usageRW src dst
+ MOVSxL _ src dst -> usageRW src dst
+ LEA _ src dst -> usageRW src dst
+ ADD _ src dst -> usageRM src dst
+ ADC _ src dst -> usageRM src dst
+ SUB _ src dst -> usageRM src dst
+ SBB _ src dst -> usageRM src dst
+ IMUL _ src dst -> usageRM src dst
+
+    -- Result of IMULB will be just in %ax
+ IMUL2 II8 src -> mkRU (eax:use_R src []) [eax]
+    -- Result of IMUL for wider values will be split between %dx/%edx/%rdx and
+ -- %ax/%eax/%rax.
+ IMUL2 _ src -> mkRU (eax:use_R src []) [eax,edx]
+
+ MUL _ src dst -> usageRM src dst
+ MUL2 _ src -> mkRU (eax:use_R src []) [eax,edx]
+ DIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
+ IDIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
+ ADD_CC _ src dst -> usageRM src dst
+ SUB_CC _ src dst -> usageRM src dst
+ AND _ src dst -> usageRM src dst
+ OR _ src dst -> usageRM src dst
+
+ XOR _ (OpReg src) (OpReg dst)
+ | src == dst -> mkRU [] [dst]
+
+ XOR _ src dst -> usageRM src dst
+ NOT _ op -> usageM op
+ BSWAP _ reg -> mkRU [reg] [reg]
+ NEGI _ op -> usageM op
+ SHL _ imm dst -> usageRM imm dst
+ SAR _ imm dst -> usageRM imm dst
+ SHR _ imm dst -> usageRM imm dst
+ BT _ _ src -> mkRUR (use_R src [])
+
+ PUSH _ op -> mkRUR (use_R op [])
+ POP _ op -> mkRU [] (def_W op)
+ TEST _ src dst -> mkRUR (use_R src $! use_R dst [])
+ CMP _ src dst -> mkRUR (use_R src $! use_R dst [])
+ SETCC _ op -> mkRU [] (def_W op)
+ JXX _ _ -> mkRU [] []
+ JXX_GBL _ _ -> mkRU [] []
+ JMP op regs -> mkRUR (use_R op regs)
+ JMP_TBL op _ _ _ -> mkRUR (use_R op [])
+ CALL (Left _) params -> mkRU params (callClobberedRegs platform)
+ CALL (Right reg) params -> mkRU (reg:params) (callClobberedRegs platform)
+ CLTD _ -> mkRU [eax] [edx]
+ NOP -> mkRU [] []
+
+ X87Store _ dst -> mkRUR ( use_EA dst [])
+
+ CVTSS2SD src dst -> mkRU [src] [dst]
+ CVTSD2SS src dst -> mkRU [src] [dst]
+ CVTTSS2SIQ _ src dst -> mkRU (use_R src []) [dst]
+ CVTTSD2SIQ _ src dst -> mkRU (use_R src []) [dst]
+ CVTSI2SS _ src dst -> mkRU (use_R src []) [dst]
+ CVTSI2SD _ src dst -> mkRU (use_R src []) [dst]
+ FDIV _ src dst -> usageRM src dst
+ SQRT _ src dst -> mkRU (use_R src []) [dst]
+
+ FETCHGOT reg -> mkRU [] [reg]
+ FETCHPC reg -> mkRU [] [reg]
+
+ COMMENT _ -> noUsage
+ LOCATION{} -> noUsage
+ UNWIND{} -> noUsage
+ DELTA _ -> noUsage
+
+ POPCNT _ src dst -> mkRU (use_R src []) [dst]
+ LZCNT _ src dst -> mkRU (use_R src []) [dst]
+ TZCNT _ src dst -> mkRU (use_R src []) [dst]
+ BSF _ src dst -> mkRU (use_R src []) [dst]
+ BSR _ src dst -> mkRU (use_R src []) [dst]
+
+ PDEP _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+ PEXT _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+
+    -- note: there might be a better way to do this
+ PREFETCH _ _ src -> mkRU (use_R src []) []
+ LOCK i -> x86_regUsageOfInstr platform i
+ XADD _ src dst -> usageMM src dst
+ CMPXCHG _ src dst -> usageRMM src dst (OpReg eax)
+ MFENCE -> noUsage
+
+ _other -> panic "regUsage: unrecognised instr"
+ where
+ -- # Definitions
+ --
+ -- Written: If the operand is a register, it's written. If it's an
+ -- address, registers mentioned in the address are read.
+ --
+ -- Modified: If the operand is a register, it's both read and
+ -- written. If it's an address, registers mentioned in the address
+ -- are read.
+
+ -- 2 operand form; first operand Read; second Written
+ usageRW :: Operand -> Operand -> RegUsage
+ usageRW op (OpReg reg) = mkRU (use_R op []) [reg]
+ usageRW op (OpAddr ea) = mkRUR (use_R op $! use_EA ea [])
+ usageRW _ _ = panic "X86.RegInfo.usageRW: no match"
+
+ -- 2 operand form; first operand Read; second Modified
+ usageRM :: Operand -> Operand -> RegUsage
+ usageRM op (OpReg reg) = mkRU (use_R op [reg]) [reg]
+ usageRM op (OpAddr ea) = mkRUR (use_R op $! use_EA ea [])
+ usageRM _ _ = panic "X86.RegInfo.usageRM: no match"
+
+ -- 2 operand form; first operand Modified; second Modified
+ usageMM :: Operand -> Operand -> RegUsage
+ usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst]
+ usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src]
+ usageMM _ _ = panic "X86.RegInfo.usageMM: no match"
+
+ -- 3 operand form; first operand Read; second Modified; third Modified
+ usageRMM :: Operand -> Operand -> Operand -> RegUsage
+ usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg]
+ usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
+ usageRMM _ _ _ = panic "X86.RegInfo.usageRMM: no match"
+
+ -- 1 operand form; operand Modified
+ usageM :: Operand -> RegUsage
+ usageM (OpReg reg) = mkRU [reg] [reg]
+ usageM (OpAddr ea) = mkRUR (use_EA ea [])
+ usageM _ = panic "X86.RegInfo.usageM: no match"
+
+ -- Registers defd when an operand is written.
+ def_W (OpReg reg) = [reg]
+ def_W (OpAddr _ ) = []
+ def_W _ = panic "X86.RegInfo.def_W: no match"
+
+ -- Registers used when an operand is read.
+ use_R (OpReg reg) tl = reg : tl
+ use_R (OpImm _) tl = tl
+ use_R (OpAddr ea) tl = use_EA ea tl
+
+ -- Registers used to compute an effective address.
+ use_EA (ImmAddr _ _) tl = tl
+ use_EA (AddrBaseIndex base index _) tl =
+ use_base base $! use_index index tl
+ where use_base (EABaseReg r) tl = r : tl
+ use_base _ tl = tl
+ use_index EAIndexNone tl = tl
+ use_index (EAIndex i _) tl = i : tl
+
+ mkRUR src = src' `seq` RU src' []
+ where src' = filter (interesting platform) src
+
+ mkRU src dst = src' `seq` dst' `seq` RU src' dst'
+ where src' = filter (interesting platform) src
+ dst' = filter (interesting platform) dst
+
+-- | Is this register interesting for the register allocator?
+interesting :: Platform -> Reg -> Bool
+interesting _ (RegVirtual _) = True
+interesting platform (RegReal (RealRegSingle i)) = freeReg platform i
+interesting _ (RegReal (RealRegPair{})) = panic "X86.interesting: no reg pairs on this arch"
+
+
+
+-- | Applies the supplied function to all registers in instructions.
+-- Typically used to change virtual registers to real registers.
+x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
+x86_patchRegsOfInstr instr env
+ = case instr of
+ MOV fmt src dst -> patch2 (MOV fmt) src dst
+ CMOV cc fmt src dst -> CMOV cc fmt (patchOp src) (env dst)
+ MOVZxL fmt src dst -> patch2 (MOVZxL fmt) src dst
+ MOVSxL fmt src dst -> patch2 (MOVSxL fmt) src dst
+ LEA fmt src dst -> patch2 (LEA fmt) src dst
+ ADD fmt src dst -> patch2 (ADD fmt) src dst
+ ADC fmt src dst -> patch2 (ADC fmt) src dst
+ SUB fmt src dst -> patch2 (SUB fmt) src dst
+ SBB fmt src dst -> patch2 (SBB fmt) src dst
+ IMUL fmt src dst -> patch2 (IMUL fmt) src dst
+ IMUL2 fmt src -> patch1 (IMUL2 fmt) src
+ MUL fmt src dst -> patch2 (MUL fmt) src dst
+ MUL2 fmt src -> patch1 (MUL2 fmt) src
+ IDIV fmt op -> patch1 (IDIV fmt) op
+ DIV fmt op -> patch1 (DIV fmt) op
+ ADD_CC fmt src dst -> patch2 (ADD_CC fmt) src dst
+ SUB_CC fmt src dst -> patch2 (SUB_CC fmt) src dst
+ AND fmt src dst -> patch2 (AND fmt) src dst
+ OR fmt src dst -> patch2 (OR fmt) src dst
+ XOR fmt src dst -> patch2 (XOR fmt) src dst
+ NOT fmt op -> patch1 (NOT fmt) op
+ BSWAP fmt reg -> BSWAP fmt (env reg)
+ NEGI fmt op -> patch1 (NEGI fmt) op
+ SHL fmt imm dst -> patch1 (SHL fmt imm) dst
+ SAR fmt imm dst -> patch1 (SAR fmt imm) dst
+ SHR fmt imm dst -> patch1 (SHR fmt imm) dst
+ BT fmt imm src -> patch1 (BT fmt imm) src
+ TEST fmt src dst -> patch2 (TEST fmt) src dst
+ CMP fmt src dst -> patch2 (CMP fmt) src dst
+ PUSH fmt op -> patch1 (PUSH fmt) op
+ POP fmt op -> patch1 (POP fmt) op
+ SETCC cond op -> patch1 (SETCC cond) op
+ JMP op regs -> JMP (patchOp op) regs
+ JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl
+
+ -- literally only support storing the top x87 stack value st(0)
+ X87Store fmt dst -> X87Store fmt (lookupAddr dst)
+
+ CVTSS2SD src dst -> CVTSS2SD (env src) (env dst)
+ CVTSD2SS src dst -> CVTSD2SS (env src) (env dst)
+ CVTTSS2SIQ fmt src dst -> CVTTSS2SIQ fmt (patchOp src) (env dst)
+ CVTTSD2SIQ fmt src dst -> CVTTSD2SIQ fmt (patchOp src) (env dst)
+ CVTSI2SS fmt src dst -> CVTSI2SS fmt (patchOp src) (env dst)
+ CVTSI2SD fmt src dst -> CVTSI2SD fmt (patchOp src) (env dst)
+ FDIV fmt src dst -> FDIV fmt (patchOp src) (patchOp dst)
+ SQRT fmt src dst -> SQRT fmt (patchOp src) (env dst)
+
+ CALL (Left _) _ -> instr
+ CALL (Right reg) p -> CALL (Right (env reg)) p
+
+ FETCHGOT reg -> FETCHGOT (env reg)
+ FETCHPC reg -> FETCHPC (env reg)
+
+ NOP -> instr
+ COMMENT _ -> instr
+ LOCATION {} -> instr
+ UNWIND {} -> instr
+ DELTA _ -> instr
+
+ JXX _ _ -> instr
+ JXX_GBL _ _ -> instr
+ CLTD _ -> instr
+
+ POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
+ LZCNT fmt src dst -> LZCNT fmt (patchOp src) (env dst)
+ TZCNT fmt src dst -> TZCNT fmt (patchOp src) (env dst)
+ PDEP fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
+ PEXT fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
+ BSF fmt src dst -> BSF fmt (patchOp src) (env dst)
+ BSR fmt src dst -> BSR fmt (patchOp src) (env dst)
+
+ PREFETCH lvl format src -> PREFETCH lvl format (patchOp src)
+
+ LOCK i -> LOCK (x86_patchRegsOfInstr i env)
+ XADD fmt src dst -> patch2 (XADD fmt) src dst
+ CMPXCHG fmt src dst -> patch2 (CMPXCHG fmt) src dst
+ MFENCE -> instr
+
+ _other -> panic "patchRegs: unrecognised instr"
+
+ where
+ patch1 :: (Operand -> a) -> Operand -> a
+ patch1 insn op = insn $! patchOp op
+ patch2 :: (Operand -> Operand -> a) -> Operand -> Operand -> a
+ patch2 insn src dst = (insn $! patchOp src) $! patchOp dst
+
+ patchOp (OpReg reg) = OpReg $! env reg
+ patchOp (OpImm imm) = OpImm imm
+ patchOp (OpAddr ea) = OpAddr $! lookupAddr ea
+
+ lookupAddr (ImmAddr imm off) = ImmAddr imm off
+ lookupAddr (AddrBaseIndex base index disp)
+ = ((AddrBaseIndex $! lookupBase base) $! lookupIndex index) disp
+ where
+ lookupBase EABaseNone = EABaseNone
+ lookupBase EABaseRip = EABaseRip
+ lookupBase (EABaseReg r) = EABaseReg $! env r
+
+ lookupIndex EAIndexNone = EAIndexNone
+ lookupIndex (EAIndex r i) = (EAIndex $! env r) i
+
+
+--------------------------------------------------------------------------------
+x86_isJumpishInstr
+ :: Instr -> Bool
+
+x86_isJumpishInstr instr
+ = case instr of
+ JMP{} -> True
+ JXX{} -> True
+ JXX_GBL{} -> True
+ JMP_TBL{} -> True
+ CALL{} -> True
+ _ -> False
+
+
+x86_jumpDestsOfInstr
+ :: Instr
+ -> [BlockId]
+
+x86_jumpDestsOfInstr insn
+ = case insn of
+ JXX _ id -> [id]
+ JMP_TBL _ ids _ _ -> [id | Just (DestBlockId id) <- ids]
+ _ -> []
+
+
+x86_patchJumpInstr
+ :: Instr -> (BlockId -> BlockId) -> Instr
+
+x86_patchJumpInstr insn patchF
+ = case insn of
+ JXX cc id -> JXX cc (patchF id)
+ JMP_TBL op ids section lbl
+ -> JMP_TBL op (map (fmap (patchJumpDest patchF)) ids) section lbl
+ _ -> insn
+ where
+ patchJumpDest f (DestBlockId id) = DestBlockId (f id)
+ patchJumpDest _ dest = dest
+
+
+
+
+
+-- -----------------------------------------------------------------------------
+-- | Make a spill instruction.
+x86_mkSpillInstr
+ :: DynFlags
+ -> Reg -- register to spill
+ -> Int -- current stack delta
+ -> Int -- spill slot to use
+ -> Instr
+
+x86_mkSpillInstr dflags reg delta slot
+ = let off = spillSlotToOffset platform slot - delta
+ in
+ case targetClassOfReg platform reg of
+ RcInteger -> MOV (archWordFormat is32Bit)
+ (OpReg reg) (OpAddr (spRel dflags off))
+ RcDouble -> MOV FF64 (OpReg reg) (OpAddr (spRel dflags off))
+ _ -> panic "X86.mkSpillInstr: no match"
+ where platform = targetPlatform dflags
+ is32Bit = target32Bit platform
+
+-- | Make a spill reload instruction.
+x86_mkLoadInstr
+ :: DynFlags
+ -> Reg -- register to load
+ -> Int -- current stack delta
+ -> Int -- spill slot to use
+ -> Instr
+
+x86_mkLoadInstr dflags reg delta slot
+ = let off = spillSlotToOffset platform slot - delta
+ in
+ case targetClassOfReg platform reg of
+ RcInteger -> MOV (archWordFormat is32Bit)
+ (OpAddr (spRel dflags off)) (OpReg reg)
+ RcDouble -> MOV FF64 (OpAddr (spRel dflags off)) (OpReg reg)
+ _ -> panic "X86.x86_mkLoadInstr"
+ where platform = targetPlatform dflags
+ is32Bit = target32Bit platform
+
+spillSlotSize :: Platform -> Int
+spillSlotSize platform = if is32Bit then 12 else 8
+  where is32Bit = target32Bit platform
+
+maxSpillSlots :: DynFlags -> Int
+maxSpillSlots dflags
+ = ((rESERVED_C_STACK_BYTES dflags - 64) `div` spillSlotSize (targetPlatform dflags)) - 1
+-- = 0 -- useful for testing allocMoreStack
+
+-- number of bytes that the stack pointer should be aligned to
+stackAlign :: Int
+stackAlign = 16
+
+-- convert a spill slot number to a *byte* offset, with no sign:
+-- decide on a per arch basis whether you are spilling above or below
+-- the C stack pointer.
+spillSlotToOffset :: Platform -> Int -> Int
+spillSlotToOffset platform slot
+ = 64 + spillSlotSize platform * slot
+
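+-- For example, on x86_64 (8-byte slots) slot 0 lives at byte offset 64 and
+-- slot 3 at 64 + 8*3 = 88; on 32-bit x86 the 12-byte slots put slot 3 at
+-- 64 + 12*3 = 100. Correspondingly, maxSpillSlots divides the reserved C
+-- stack bytes, less this 64-byte base, by the slot size (and keeps one slot
+-- in reserve).
+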
+--------------------------------------------------------------------------------
+
+-- | See if this instruction is telling us the current C stack delta
+x86_takeDeltaInstr
+ :: Instr
+ -> Maybe Int
+
+x86_takeDeltaInstr instr
+ = case instr of
+ DELTA i -> Just i
+ _ -> Nothing
+
+
+x86_isMetaInstr
+ :: Instr
+ -> Bool
+
+x86_isMetaInstr instr
+ = case instr of
+ COMMENT{} -> True
+ LOCATION{} -> True
+ LDATA{} -> True
+ NEWBLOCK{} -> True
+ UNWIND{} -> True
+ DELTA{} -> True
+ _ -> False
+
+
+
+--- TODO: why is there
+-- | Make a reg-reg move instruction.
+-- On SPARC v8 there are no instructions to move directly between
+-- floating point and integer regs. If we need to do that then we
+-- have to go via memory.
+--
+x86_mkRegRegMoveInstr
+ :: Platform
+ -> Reg
+ -> Reg
+ -> Instr
+
+x86_mkRegRegMoveInstr platform src dst
+ = case targetClassOfReg platform src of
+ RcInteger -> case platformArch platform of
+ ArchX86 -> MOV II32 (OpReg src) (OpReg dst)
+ ArchX86_64 -> MOV II64 (OpReg src) (OpReg dst)
+ _ -> panic "x86_mkRegRegMoveInstr: Bad arch"
+ RcDouble -> MOV FF64 (OpReg src) (OpReg dst)
+     -- this code is the lie we tell ourselves because both float and double
+     -- use the same register class on x86_64 and on 32-bit x86 with SSE2;
+     -- more plainly, both use the XMM registers.
+ _ -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"
+
+-- | Check whether an instruction represents a reg-reg move.
+-- The register allocator attempts to eliminate reg->reg moves whenever it can,
+-- by assigning the src and dest temporaries to the same real register.
+--
+x86_takeRegRegMoveInstr
+ :: Instr
+ -> Maybe (Reg,Reg)
+
+x86_takeRegRegMoveInstr (MOV _ (OpReg r1) (OpReg r2))
+ = Just (r1,r2)
+
+x86_takeRegRegMoveInstr _ = Nothing
+
+
+-- | Make an unconditional branch instruction.
+x86_mkJumpInstr
+ :: BlockId
+ -> [Instr]
+
+x86_mkJumpInstr id
+ = [JXX ALWAYS id]
+
+-- Note [Windows stack layout]
+-- | On most OSes the kernel will place a guard page after the current stack
+-- page. If you allocate more than a page's worth you may jump over this
+-- guard page. Not only is this a security issue, but on certain OSes such
+-- as Windows a new page won't be allocated if you don't hit the guard. This
+-- will cause a segfault or access fault.
+--
+-- This function determines whether the current allocation amount requires a probe.
+-- On Windows (for now) we emit a call to _chkstk for this. For other OSes
+-- this is not yet implemented.
+-- See https://docs.microsoft.com/en-us/windows/desktop/DevNotes/-win32-chkstk
+-- The Windows stack looks like this:
+--
+-- +-------------------+
+-- | SP |
+-- +-------------------+
+-- | |
+-- | GUARD PAGE |
+-- | |
+-- +-------------------+
+-- | |
+-- | |
+-- | UNMAPPED |
+-- | |
+-- | |
+-- +-------------------+
+--
+-- In essence each allocation larger than a page size needs to be chunked and
+-- a probe emitted after each page allocation. You have to hit the guard
+-- page so the kernel can map in the next page, otherwise you'll segfault.
+--
+needs_probe_call :: Platform -> Int -> Bool
+needs_probe_call platform amount
+ = case platformOS platform of
+ OSMinGW32 -> case platformArch platform of
+ ArchX86 -> amount > (4 * 1024)
+ ArchX86_64 -> amount > (8 * 1024)
+ _ -> False
+ _ -> False
+
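+-- For example, on 64-bit Windows an allocation of 4096 bytes is below the
+-- 8 KiB threshold and needs no probe, while one of 16384 bytes does and is
+-- routed through the ___chkstk_ms call sequence emitted below.
+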
+x86_mkStackAllocInstr
+ :: Platform
+ -> Int
+ -> [Instr]
+x86_mkStackAllocInstr platform amount
+ = case platformOS platform of
+ OSMinGW32 ->
+ -- These will clobber AX but this should be ok because
+ --
+ -- 1. It is the first thing we do when entering the closure and AX is
+      --      a caller-saved register on Windows, both on x86_64 and x86.
+ --
+ -- 2. The closures are only entered via a call or longjmp in which case
+ -- there are no expectations for volatile registers.
+ --
+ -- 3. When the target is a local branch point it is re-targeted
+ -- after the dealloc, preserving #2. See note [extra spill slots].
+ --
+ -- We emit a call because the stack probes are quite involved and
+ -- would bloat code size a lot. GHC doesn't really have an -Os.
+ -- __chkstk is guaranteed to leave all nonvolatile registers and AX
+ -- untouched. It's part of the standard prologue code for any Windows
+ -- function dropping the stack more than a page.
+ -- See Note [Windows stack layout]
+ case platformArch platform of
+ ArchX86 | needs_probe_call platform amount ->
+ [ MOV II32 (OpImm (ImmInt amount)) (OpReg eax)
+ , CALL (Left $ strImmLit "___chkstk_ms") [eax]
+ , SUB II32 (OpReg eax) (OpReg esp)
+ ]
+ | otherwise ->
+ [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
+ , TEST II32 (OpReg esp) (OpReg esp)
+ ]
+ ArchX86_64 | needs_probe_call platform amount ->
+ [ MOV II64 (OpImm (ImmInt amount)) (OpReg rax)
+ , CALL (Left $ strImmLit "___chkstk_ms") [rax]
+ , SUB II64 (OpReg rax) (OpReg rsp)
+ ]
+ | otherwise ->
+ [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
+ , TEST II64 (OpReg rsp) (OpReg rsp)
+ ]
+ _ -> panic "x86_mkStackAllocInstr"
+ _ ->
+ case platformArch platform of
+ ArchX86 -> [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp) ]
+ ArchX86_64 -> [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp) ]
+ _ -> panic "x86_mkStackAllocInstr"
+
+x86_mkStackDeallocInstr
+ :: Platform
+ -> Int
+ -> [Instr]
+x86_mkStackDeallocInstr platform amount
+ = case platformArch platform of
+ ArchX86 -> [ADD II32 (OpImm (ImmInt amount)) (OpReg esp)]
+ ArchX86_64 -> [ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)]
+ _ -> panic "x86_mkStackDeallocInstr"
+
+
+--
+-- Note [extra spill slots]
+--
+-- If the register allocator used more spill slots than we have
+-- pre-allocated (rESERVED_C_STACK_BYTES), then we must allocate more
+-- C stack space on entry and exit from this proc. Therefore we
+-- insert a "sub $N, %rsp" at every entry point, and an "add $N, %rsp"
+-- before every non-local jump.
+--
+-- This became necessary when the new codegen started bundling entire
+-- functions together into one proc, because the register allocator
+-- assigns a different stack slot to each virtual reg within a proc.
+-- To avoid using so many slots we could also:
+--
+--  - split up the proc into connected components before code generation
+--
+-- - rename the virtual regs, so that we re-use vreg names and hence
+-- stack slots for non-overlapping vregs.
+--
+-- Note that when a block is both a non-local entry point (with an
+-- info table) and a local branch target, we have to split it into
+-- two, like so:
+--
+-- <info table>
+-- L:
+-- <code>
+--
+-- becomes
+--
+-- <info table>
+-- L:
+-- subl $rsp, N
+-- jmp Lnew
+-- Lnew:
+-- <code>
+--
+-- and all branches pointing to L are retargeted to point to Lnew.
+-- Otherwise, we would repeat the $rsp adjustment for each branch to
+-- L.
+--
+-- Returns a list of (L,Lnew) pairs.
+--
+allocMoreStack
+ :: Platform
+ -> Int
+ -> NatCmmDecl statics GHC.CmmToAsm.X86.Instr.Instr
+ -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.X86.Instr.Instr, [(BlockId,BlockId)])
+
+allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
+allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
+ let entries = entryBlocks proc
+
+ uniqs <- replicateM (length entries) getUniqueM
+
+ let
+ delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
+ where x = slots * spillSlotSize platform -- sp delta
+
+ alloc = mkStackAllocInstr platform delta
+ dealloc = mkStackDeallocInstr platform delta
+
+ retargetList = (zip entries (map mkBlockId uniqs))
+
+ new_blockmap :: LabelMap BlockId
+ new_blockmap = mapFromList retargetList
+
+ insert_stack_insns (BasicBlock id insns)
+ | Just new_blockid <- mapLookup id new_blockmap
+ = [ BasicBlock id $ alloc ++ [JXX ALWAYS new_blockid]
+ , BasicBlock new_blockid block' ]
+ | otherwise
+ = [ BasicBlock id block' ]
+ where
+ block' = foldr insert_dealloc [] insns
+
+ insert_dealloc insn r = case insn of
+ JMP _ _ -> dealloc ++ (insn : r)
+ JXX_GBL _ _ -> panic "insert_dealloc: cannot handle JXX_GBL"
+ _other -> x86_patchJumpInstr insn retarget : r
+ where retarget b = fromMaybe b (mapLookup b new_blockmap)
+
+ new_code = concatMap insert_stack_insns code
+ -- in
+ return (CmmProc info lbl live (ListGraph new_code), retargetList)
+
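+-- A worked example of the delta computation above (slot count hypothetical):
+-- with 5 extra slots on x86_64 we need x = 5 * 8 = 40 bytes, and
+-- ((40 + 15) `quot` 16) * 16 rounds that up to a 16-byte-aligned delta of 48;
+-- that amount is allocated at every entry point and deallocated before every
+-- non-local jump, as described in Note [extra spill slots].
+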
+data JumpDest = DestBlockId BlockId | DestImm Imm
+
+-- Debug Instance
+instance Outputable JumpDest where
+ ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
+ ppr (DestImm _imm) = text "jd<imm>:noShow"
+
+
+getJumpDestBlockId :: JumpDest -> Maybe BlockId
+getJumpDestBlockId (DestBlockId bid) = Just bid
+getJumpDestBlockId _ = Nothing
+
+canShortcut :: Instr -> Maybe JumpDest
+canShortcut (JXX ALWAYS id) = Just (DestBlockId id)
+canShortcut (JMP (OpImm imm) _) = Just (DestImm imm)
+canShortcut _ = Nothing
+
+
+-- This helper shortcuts a sequence of branches.
+-- The blockset helps avoid following cycles.
+shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
+shortcutJump fn insn = shortcutJump' fn (setEmpty :: LabelSet) insn
+ where
+ shortcutJump' :: (BlockId -> Maybe JumpDest) -> LabelSet -> Instr -> Instr
+ shortcutJump' fn seen insn@(JXX cc id) =
+ if setMember id seen then insn
+ else case fn id of
+ Nothing -> insn
+ Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id')
+ Just (DestImm imm) -> shortcutJump' fn seen' (JXX_GBL cc imm)
+ where seen' = setInsert id seen
+ shortcutJump' fn _ (JMP_TBL addr blocks section tblId) =
+ let updateBlock (Just (DestBlockId bid)) =
+ case fn bid of
+ Nothing -> Just (DestBlockId bid )
+ Just dest -> Just dest
+ updateBlock dest = dest
+ blocks' = map updateBlock blocks
+ in JMP_TBL addr blocks' section tblId
+ shortcutJump' _ _ other = other
+
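+-- An illustrative chain (labels hypothetical): if the mapping reports that
+-- block B merely jumps to C and that C merely jumps to D, then
+-- @shortcutJump fn (JXX cc B)@ rewrites to @JXX cc D@ in two steps; the
+-- 'seen' set stops the recursion should the blocks ever form a cycle.
+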
+-- Here because it knows about JumpDest
+shortcutStatics :: (BlockId -> Maybe JumpDest) -> (Alignment, RawCmmStatics) -> (Alignment, RawCmmStatics)
+shortcutStatics fn (align, RawCmmStatics lbl statics)
+ = (align, RawCmmStatics lbl $ map (shortcutStatic fn) statics)
+ -- we need to get the jump tables, so apply the mapping to the entries
+ -- of a CmmData too.
+
+shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
+shortcutLabel fn lab
+ | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn emptyUniqSet blkId
+ | otherwise = lab
+
+shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
+shortcutStatic fn (CmmStaticLit (CmmLabel lab))
+ = CmmStaticLit (CmmLabel (shortcutLabel fn lab))
+shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
+ = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
+ -- slightly dodgy, we're ignoring the second label, but this
+ -- works with the way we use CmmLabelDiffOff for jump tables now.
+shortcutStatic _ other_static
+ = other_static
+
+shortBlockId
+ :: (BlockId -> Maybe JumpDest)
+ -> UniqSet Unique
+ -> BlockId
+ -> CLabel
+
+shortBlockId fn seen blockid =
+ case (elementOfUniqSet uq seen, fn blockid) of
+ (True, _) -> blockLbl blockid
+ (_, Nothing) -> blockLbl blockid
+ (_, Just (DestBlockId blockid')) -> shortBlockId fn (addOneToUniqSet seen uq) blockid'
+ (_, Just (DestImm (ImmCLbl lbl))) -> lbl
+ (_, _other) -> panic "shortBlockId"
+ where uq = getUnique blockid
diff --git a/compiler/GHC/CmmToAsm/X86/Ppr.hs b/compiler/GHC/CmmToAsm/X86/Ppr.hs
new file mode 100644
index 0000000000..a5b9041974
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Ppr.hs
@@ -0,0 +1,1014 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Pretty-printing assembly language
+--
+-- (c) The University of Glasgow 1993-2005
+--
+-----------------------------------------------------------------------------
+
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+module GHC.CmmToAsm.X86.Ppr (
+ pprNatCmmDecl,
+ pprData,
+ pprInstr,
+ pprFormat,
+ pprImm,
+ pprDataItem,
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Regs
+import GHC.CmmToAsm.X86.Instr
+import GHC.CmmToAsm.X86.Cond
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Ppr
+
+
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import BasicTypes (Alignment, mkAlignment, alignmentBytes)
+import GHC.Driver.Session
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import Unique ( pprUniqueAlways )
+import GHC.Platform
+import FastString
+import Outputable
+
+import Data.Word
+import Data.Bits
+
+-- -----------------------------------------------------------------------------
+-- Printing this stuff out
+--
+--
+-- Note [Subsections Via Symbols]
+--
+-- If we are using the .subsections_via_symbols directive
+-- (available on recent versions of Darwin),
+-- we have to make sure that there is some kind of reference
+-- from the entry code to a label on the _top_ of the info table,
+-- so that the linker will not think it is unreferenced and dead-strip
+-- it. That's why the label is called a DeadStripPreventer (_dsp).
+--
+-- The LLVM code gen already creates `iTableSuf` symbols, where
+-- the X86 would generate the DeadStripPreventer (_dsp) symbol.
+-- Therefore all that is left for the LLVM code gen is to ensure
+-- that all the `iTableSuf` symbols are marked as used.
+-- As of this writing the documentation regarding the
+-- .subsections_via_symbols and -dead_strip can be found at
+-- <https://developer.apple.com/library/mac/documentation/DeveloperTools/Reference/Assembler/040-Assembler_Directives/asm_directives.html#//apple_ref/doc/uid/TP30000823-TPXREF101>
+
+pprProcAlignment :: SDoc
+pprProcAlignment = sdocWithDynFlags $ \dflags ->
+ (maybe empty (pprAlign . mkAlignment) (cmmProcAlignment dflags))
+
+pprNatCmmDecl :: NatCmmDecl (Alignment, RawCmmStatics) Instr -> SDoc
+pprNatCmmDecl (CmmData section dats) =
+ pprSectionAlign section $$ pprDatas dats
+
+pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
+ sdocWithDynFlags $ \dflags ->
+ pprProcAlignment $$
+ case topInfoTable proc of
+ Nothing ->
+ -- special case for code without info table:
+ pprSectionAlign (Section Text lbl) $$
+ pprProcAlignment $$
+ pprLabel lbl $$ -- blocks guaranteed not null, so label needed
+ vcat (map (pprBasicBlock top_info) blocks) $$
+ (if debugLevel dflags > 0
+ then ppr (mkAsmTempEndLabel lbl) <> char ':' else empty) $$
+ pprSizeDecl lbl
+
+ Just (RawCmmStatics info_lbl _) ->
+ sdocWithPlatform $ \platform ->
+ pprSectionAlign (Section Text info_lbl) $$
+ pprProcAlignment $$
+ (if platformHasSubsectionsViaSymbols platform
+ then ppr (mkDeadStripPreventer info_lbl) <> char ':'
+ else empty) $$
+ vcat (map (pprBasicBlock top_info) blocks) $$
+ -- above: Even the first block gets a label, because with branch-chain
+ -- elimination, it might be the target of a goto.
+ (if platformHasSubsectionsViaSymbols platform
+ then -- See Note [Subsections Via Symbols]
+ text "\t.long "
+ <+> ppr info_lbl
+ <+> char '-'
+ <+> ppr (mkDeadStripPreventer info_lbl)
+ else empty) $$
+ pprSizeDecl info_lbl
+
+-- | Output the ELF .size directive.
+pprSizeDecl :: CLabel -> SDoc
+pprSizeDecl lbl
+ = sdocWithPlatform $ \platform ->
+ if osElfTarget (platformOS platform)
+ then text "\t.size" <+> ppr lbl <> ptext (sLit ", .-") <> ppr lbl
+ else empty
+
+pprBasicBlock :: LabelMap RawCmmStatics -> NatBasicBlock Instr -> SDoc
+pprBasicBlock info_env (BasicBlock blockid instrs)
+ = maybe_infotable $
+ pprLabel asmLbl $$
+ vcat (map pprInstr instrs) $$
+ (sdocOption sdocDebugLevel $ \level ->
+ if level > 0
+ then ppr (mkAsmTempEndLabel asmLbl) <> char ':'
+ else empty
+ )
+ where
+ asmLbl = blockLbl blockid
+ maybe_infotable c = case mapLookup blockid info_env of
+ Nothing -> c
+ Just (RawCmmStatics infoLbl info) ->
+ pprAlignForSection Text $$
+ infoTableLoc $$
+ vcat (map pprData info) $$
+ pprLabel infoLbl $$
+ c $$
+ (sdocOption sdocDebugLevel $ \level ->
+ if level > 0
+ then ppr (mkAsmTempEndLabel infoLbl) <> char ':'
+ else empty
+ )
+ -- Make sure the info table has the right .loc for the block
+ -- coming right after it. See [Note: Info Offset]
+ infoTableLoc = case instrs of
+ (l@LOCATION{} : _) -> pprInstr l
+ _other -> empty
+
+
+pprDatas :: (Alignment, RawCmmStatics) -> SDoc
+-- See note [emit-time elimination of static indirections] in CLabel.
+pprDatas (_, RawCmmStatics alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
+ | lbl == mkIndStaticInfoLabel
+ , let labelInd (CmmLabelOff l _) = Just l
+ labelInd (CmmLabel l) = Just l
+ labelInd _ = Nothing
+ , Just ind' <- labelInd ind
+ , alias `mayRedirectTo` ind'
+ = pprGloblDecl alias
+ $$ text ".equiv" <+> ppr alias <> comma <> ppr (CmmLabel ind')
+
+pprDatas (align, (RawCmmStatics lbl dats))
+ = vcat (pprAlign align : pprLabel lbl : map pprData dats)
+
+pprData :: CmmStatic -> SDoc
+pprData (CmmString str) = pprBytes str
+
+pprData (CmmUninitialised bytes)
+ = sdocWithPlatform $ \platform ->
+ if platformOS platform == OSDarwin then text ".space " <> int bytes
+ else text ".skip " <> int bytes
+
+pprData (CmmStaticLit lit) = pprDataItem lit
+
+pprGloblDecl :: CLabel -> SDoc
+pprGloblDecl lbl
+ | not (externallyVisibleCLabel lbl) = empty
+ | otherwise = text ".globl " <> ppr lbl
+
+pprLabelType' :: DynFlags -> CLabel -> SDoc
+pprLabelType' dflags lbl =
+ if isCFunctionLabel lbl || functionOkInfoTable then
+ text "@function"
+ else
+ text "@object"
+ where
+ {-
+ NOTE: This is a bit hacky.
+
+    With `tablesNextToCode` enabled, info tables look like this:
+ ```
+ <info table data>
+ label_info:
+ <info table code>
+ ```
+    So actually the info table label points exactly to the code and we can mark
+    the label as @function. (This is required to make perf and potentially other
+    tools work on Haskell binaries.)
+    This usually works well but it can cause issues with the linker.
+    The linker uses different algorithms for the relocation depending on
+    the symbol type. For some reason, the linker will generate a JUMP_SLOT relocation
+    when a constructor info table is referenced from a data section.
+    This only happens with static constructor calls, so
+    we mark _con_info symbols as `@object` to avoid the issue with relocations.
+
+ @SimonMarlow hack explanation:
+ "The reasoning goes like this:
+
+ * The danger when we mark a symbol as `@function` is that the linker will
+ redirect it to point to the PLT and use a `JUMP_SLOT` relocation when
+ the symbol refers to something outside the current shared object.
+ A PLT / JUMP_SLOT reference only works for symbols that we jump to, not
+      for symbols representing data, nor for info table symbol references which
+ we expect to point directly to the info table.
+ * GHC generates code that might refer to any info table symbol from the text
+ segment, but that's OK, because those will be explicit GOT references
+ generated by the code generator.
+ * When we refer to info tables from the data segment, it's either
+ * a FUN_STATIC/THUNK_STATIC local to this module
+ * a `con_info` that could be from anywhere
+
+ So, the only info table symbols that we might refer to from the data segment
+ of another shared object are `con_info` symbols, so those are the ones we
+ need to exclude from getting the @function treatment.
+ "
+
+    A good place to check for more details is
+ https://gitlab.haskell.org/ghc/ghc/wikis/commentary/position-independent-code
+
+ Another possible hack is to create an extra local function symbol for
+    every code-like thing to give the needed information to the tools
+ but mess up with the relocation. https://phabricator.haskell.org/D4730
+ -}
+ functionOkInfoTable = tablesNextToCode dflags &&
+ isInfoTableLabel lbl && not (isConInfoTableLabel lbl)
+
+
+pprTypeDecl :: CLabel -> SDoc
+pprTypeDecl lbl
+ = sdocWithPlatform $ \platform ->
+ if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl
+ then
+ sdocWithDynFlags $ \df ->
+ text ".type " <> ppr lbl <> ptext (sLit ", ") <> pprLabelType' df lbl
+ else empty
+
+pprLabel :: CLabel -> SDoc
+pprLabel lbl = pprGloblDecl lbl
+ $$ pprTypeDecl lbl
+ $$ (ppr lbl <> char ':')
+
+pprAlign :: Alignment -> SDoc
+pprAlign alignment
+ = sdocWithPlatform $ \platform ->
+ text ".align " <> int (alignmentOn platform)
+ where
+ bytes = alignmentBytes alignment
+ alignmentOn platform = if platformOS platform == OSDarwin
+ then log2 bytes
+ else bytes
+
+ log2 :: Int -> Int -- cache the common ones
+ log2 1 = 0
+ log2 2 = 1
+ log2 4 = 2
+ log2 8 = 3
+ log2 n = 1 + log2 (n `quot` 2)
+
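+-- For example, a 16-byte alignment is printed as ".align 16" on ELF targets
+-- but as ".align 4" on Darwin, whose assembler reads the operand as a power
+-- of two (2^4 = 16 bytes).
+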
+-- -----------------------------------------------------------------------------
+-- pprInstr: print an 'Instr'
+
+instance Outputable Instr where
+ ppr instr = pprInstr instr
+
+
+pprReg :: Format -> Reg -> SDoc
+pprReg f r
+ = case r of
+ RegReal (RealRegSingle i) ->
+ sdocWithPlatform $ \platform ->
+ if target32Bit platform then ppr32_reg_no f i
+ else ppr64_reg_no f i
+ RegReal (RealRegPair _ _) -> panic "X86.Ppr: no reg pairs on this arch"
+ RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u
+
+ where
+ ppr32_reg_no :: Format -> Int -> SDoc
+ ppr32_reg_no II8 = ppr32_reg_byte
+ ppr32_reg_no II16 = ppr32_reg_word
+ ppr32_reg_no _ = ppr32_reg_long
+
+ ppr32_reg_byte i = ptext
+ (case i of {
+ 0 -> sLit "%al"; 1 -> sLit "%bl";
+ 2 -> sLit "%cl"; 3 -> sLit "%dl";
+ _ -> sLit $ "very naughty I386 byte register: " ++ show i
+ })
+
+ ppr32_reg_word i = ptext
+ (case i of {
+ 0 -> sLit "%ax"; 1 -> sLit "%bx";
+ 2 -> sLit "%cx"; 3 -> sLit "%dx";
+ 4 -> sLit "%si"; 5 -> sLit "%di";
+ 6 -> sLit "%bp"; 7 -> sLit "%sp";
+ _ -> sLit "very naughty I386 word register"
+ })
+
+ ppr32_reg_long i = ptext
+ (case i of {
+ 0 -> sLit "%eax"; 1 -> sLit "%ebx";
+ 2 -> sLit "%ecx"; 3 -> sLit "%edx";
+ 4 -> sLit "%esi"; 5 -> sLit "%edi";
+ 6 -> sLit "%ebp"; 7 -> sLit "%esp";
+ _ -> ppr_reg_float i
+ })
+
+ ppr64_reg_no :: Format -> Int -> SDoc
+ ppr64_reg_no II8 = ppr64_reg_byte
+ ppr64_reg_no II16 = ppr64_reg_word
+ ppr64_reg_no II32 = ppr64_reg_long
+ ppr64_reg_no _ = ppr64_reg_quad
+
+ ppr64_reg_byte i = ptext
+ (case i of {
+ 0 -> sLit "%al"; 1 -> sLit "%bl";
+ 2 -> sLit "%cl"; 3 -> sLit "%dl";
+ 4 -> sLit "%sil"; 5 -> sLit "%dil"; -- new 8-bit regs!
+ 6 -> sLit "%bpl"; 7 -> sLit "%spl";
+ 8 -> sLit "%r8b"; 9 -> sLit "%r9b";
+ 10 -> sLit "%r10b"; 11 -> sLit "%r11b";
+ 12 -> sLit "%r12b"; 13 -> sLit "%r13b";
+ 14 -> sLit "%r14b"; 15 -> sLit "%r15b";
+ _ -> sLit $ "very naughty x86_64 byte register: " ++ show i
+ })
+
+ ppr64_reg_word i = ptext
+ (case i of {
+ 0 -> sLit "%ax"; 1 -> sLit "%bx";
+ 2 -> sLit "%cx"; 3 -> sLit "%dx";
+ 4 -> sLit "%si"; 5 -> sLit "%di";
+ 6 -> sLit "%bp"; 7 -> sLit "%sp";
+ 8 -> sLit "%r8w"; 9 -> sLit "%r9w";
+ 10 -> sLit "%r10w"; 11 -> sLit "%r11w";
+ 12 -> sLit "%r12w"; 13 -> sLit "%r13w";
+ 14 -> sLit "%r14w"; 15 -> sLit "%r15w";
+ _ -> sLit "very naughty x86_64 word register"
+ })
+
+ ppr64_reg_long i = ptext
+ (case i of {
+ 0 -> sLit "%eax"; 1 -> sLit "%ebx";
+ 2 -> sLit "%ecx"; 3 -> sLit "%edx";
+ 4 -> sLit "%esi"; 5 -> sLit "%edi";
+ 6 -> sLit "%ebp"; 7 -> sLit "%esp";
+ 8 -> sLit "%r8d"; 9 -> sLit "%r9d";
+ 10 -> sLit "%r10d"; 11 -> sLit "%r11d";
+ 12 -> sLit "%r12d"; 13 -> sLit "%r13d";
+ 14 -> sLit "%r14d"; 15 -> sLit "%r15d";
+ _ -> sLit "very naughty x86_64 register"
+ })
+
+ ppr64_reg_quad i = ptext
+ (case i of {
+ 0 -> sLit "%rax"; 1 -> sLit "%rbx";
+ 2 -> sLit "%rcx"; 3 -> sLit "%rdx";
+ 4 -> sLit "%rsi"; 5 -> sLit "%rdi";
+ 6 -> sLit "%rbp"; 7 -> sLit "%rsp";
+ 8 -> sLit "%r8"; 9 -> sLit "%r9";
+ 10 -> sLit "%r10"; 11 -> sLit "%r11";
+ 12 -> sLit "%r12"; 13 -> sLit "%r13";
+ 14 -> sLit "%r14"; 15 -> sLit "%r15";
+ _ -> ppr_reg_float i
+ })
+
+ppr_reg_float :: Int -> PtrString
+ppr_reg_float i = case i of
+ 16 -> sLit "%xmm0" ; 17 -> sLit "%xmm1"
+ 18 -> sLit "%xmm2" ; 19 -> sLit "%xmm3"
+ 20 -> sLit "%xmm4" ; 21 -> sLit "%xmm5"
+ 22 -> sLit "%xmm6" ; 23 -> sLit "%xmm7"
+ 24 -> sLit "%xmm8" ; 25 -> sLit "%xmm9"
+ 26 -> sLit "%xmm10"; 27 -> sLit "%xmm11"
+ 28 -> sLit "%xmm12"; 29 -> sLit "%xmm13"
+ 30 -> sLit "%xmm14"; 31 -> sLit "%xmm15"
+ _ -> sLit "very naughty x86 register"
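+-- For illustration: the same register number prints differently depending on
+-- the requested format; e.g. on x86_64, register number 0 renders as
+--   pprReg II8  (regSingle 0)  ==>  %al
+--   pprReg II32 (regSingle 0)  ==>  %eax
+--   pprReg II64 (regSingle 0)  ==>  %rax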
+
+pprFormat :: Format -> SDoc
+pprFormat x
+ = ptext (case x of
+ II8 -> sLit "b"
+ II16 -> sLit "w"
+ II32 -> sLit "l"
+ II64 -> sLit "q"
+ FF32 -> sLit "ss" -- "scalar single-precision float" (SSE2)
+ FF64 -> sLit "sd" -- "scalar double-precision float" (SSE2)
+ )
+
+pprFormat_x87 :: Format -> SDoc
+pprFormat_x87 x
+ = ptext $ case x of
+ FF32 -> sLit "s"
+ FF64 -> sLit "l"
+ _ -> panic "X86.Ppr.pprFormat_x87"
+
+
+pprCond :: Cond -> SDoc
+pprCond c
+ = ptext (case c of {
+ GEU -> sLit "ae"; LU -> sLit "b";
+ EQQ -> sLit "e"; GTT -> sLit "g";
+ GE -> sLit "ge"; GU -> sLit "a";
+ LTT -> sLit "l"; LE -> sLit "le";
+ LEU -> sLit "be"; NE -> sLit "ne";
+ NEG -> sLit "s"; POS -> sLit "ns";
+ CARRY -> sLit "c"; OFLO -> sLit "o";
+ PARITY -> sLit "p"; NOTPARITY -> sLit "np";
+ ALWAYS -> sLit "mp"})
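+-- Note that ALWAYS renders as "mp": callers prefix the condition with 'j'
+-- (see pprCondInstr, used by JXX/JXX_GBL below), so an unconditional jump
+-- comes out as "jmp".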
+
+
+pprImm :: Imm -> SDoc
+pprImm (ImmInt i) = int i
+pprImm (ImmInteger i) = integer i
+pprImm (ImmCLbl l) = ppr l
+pprImm (ImmIndex l i) = ppr l <> char '+' <> int i
+pprImm (ImmLit s) = s
+
+pprImm (ImmFloat _) = text "naughty float immediate"
+pprImm (ImmDouble _) = text "naughty double immediate"
+
+pprImm (ImmConstantSum a b) = pprImm a <> char '+' <> pprImm b
+pprImm (ImmConstantDiff a b) = pprImm a <> char '-'
+ <> lparen <> pprImm b <> rparen
+
+
+
+pprAddr :: AddrMode -> SDoc
+pprAddr (ImmAddr imm off)
+ = let pp_imm = pprImm imm
+ in
+ if (off == 0) then
+ pp_imm
+ else if (off < 0) then
+ pp_imm <> int off
+ else
+ pp_imm <> char '+' <> int off
+
+pprAddr (AddrBaseIndex base index displacement)
+ = sdocWithPlatform $ \platform ->
+ let
+ pp_disp = ppr_disp displacement
+ pp_off p = pp_disp <> char '(' <> p <> char ')'
+ pp_reg r = pprReg (archWordFormat (target32Bit platform)) r
+ in
+ case (base, index) of
+ (EABaseNone, EAIndexNone) -> pp_disp
+ (EABaseReg b, EAIndexNone) -> pp_off (pp_reg b)
+ (EABaseRip, EAIndexNone) -> pp_off (text "%rip")
+ (EABaseNone, EAIndex r i) -> pp_off (comma <> pp_reg r <> comma <> int i)
+ (EABaseReg b, EAIndex r i) -> pp_off (pp_reg b <> comma <> pp_reg r
+ <> comma <> int i)
+ _ -> panic "X86.Ppr.pprAddr: no match"
+
+ where
+ ppr_disp (ImmInt 0) = empty
+ ppr_disp imm = pprImm imm
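+-- For illustration, a base+index+displacement operand renders in AT&T syntax
+-- roughly as (on a 64-bit target, with rbx/rcx from GHC.CmmToAsm.X86.Regs):
+--   pprAddr (AddrBaseIndex (EABaseReg rbx) (EAIndex rcx 8) (ImmInt 16))
+--     ==>  16(%rbx,%rcx,8)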
+
+-- | Print section header and appropriate alignment for that section.
+pprSectionAlign :: Section -> SDoc
+pprSectionAlign (Section (OtherSection _) _) =
+ panic "X86.Ppr.pprSectionAlign: unknown section"
+pprSectionAlign sec@(Section seg _) =
+ sdocWithPlatform $ \platform ->
+ pprSectionHeader platform sec $$
+ pprAlignForSection seg
+
+-- | Print appropriate alignment for the given section type.
+pprAlignForSection :: SectionType -> SDoc
+pprAlignForSection seg =
+ sdocWithPlatform $ \platform ->
+ text ".align " <>
+ case platformOS platform of
+ -- Darwin: alignments are given as shifts.
+ OSDarwin
+ | target32Bit platform ->
+ case seg of
+ ReadOnlyData16 -> int 4
+ CString -> int 1
+ _ -> int 2
+ | otherwise ->
+ case seg of
+ ReadOnlyData16 -> int 4
+ CString -> int 1
+ _ -> int 3
+ -- Other: alignments are given as bytes.
+ _
+ | target32Bit platform ->
+ case seg of
+ Text -> text "4,0x90"
+ ReadOnlyData16 -> int 16
+ CString -> int 1
+ _ -> int 4
+ | otherwise ->
+ case seg of
+ ReadOnlyData16 -> int 16
+ CString -> int 1
+ _ -> int 8
+
+pprDataItem :: CmmLit -> SDoc
+pprDataItem lit = sdocWithDynFlags $ \dflags -> pprDataItem' dflags lit
+
+pprDataItem' :: DynFlags -> CmmLit -> SDoc
+pprDataItem' dflags lit
+ = vcat (ppr_item (cmmTypeFormat $ cmmLitType dflags lit) lit)
+ where
+ platform = targetPlatform dflags
+ imm = litToImm lit
+
+ -- These seem to be common:
+ ppr_item II8 _ = [text "\t.byte\t" <> pprImm imm]
+ ppr_item II16 _ = [text "\t.word\t" <> pprImm imm]
+ ppr_item II32 _ = [text "\t.long\t" <> pprImm imm]
+
+ ppr_item FF32 (CmmFloat r _)
+ = let bs = floatToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item FF64 (CmmFloat r _)
+ = let bs = doubleToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item II64 _
+ = case platformOS platform of
+ OSDarwin
+ | target32Bit platform ->
+ case lit of
+ CmmInt x _ ->
+ [text "\t.long\t"
+ <> int (fromIntegral (fromIntegral x :: Word32)),
+ text "\t.long\t"
+ <> int (fromIntegral
+ (fromIntegral (x `shiftR` 32) :: Word32))]
+ _ -> panic "X86.Ppr.ppr_item: no match for II64"
+ | otherwise ->
+ [text "\t.quad\t" <> pprImm imm]
+ _
+ | target32Bit platform ->
+ [text "\t.quad\t" <> pprImm imm]
+ | otherwise ->
+ -- x86_64: binutils can't handle the R_X86_64_PC64
+ -- relocation type, which means we can't do
+ -- pc-relative 64-bit addresses. Fortunately we're
+ -- assuming the small memory model, in which all such
+ -- offsets will fit into 32 bits, so we have to stick
+ -- to 32-bit offset fields and modify the RTS
+ -- appropriately
+ --
+ -- See Note [x86-64-relative] in includes/rts/storage/InfoTables.h
+ --
+ case lit of
+ -- A relative relocation:
+ CmmLabelDiffOff _ _ _ _ ->
+ [text "\t.long\t" <> pprImm imm,
+ text "\t.long\t0"]
+ _ ->
+ [text "\t.quad\t" <> pprImm imm]
+
+ ppr_item _ _
+ = panic "X86.Ppr.ppr_item: no match"
+
+
+asmComment :: SDoc -> SDoc
+asmComment c = whenPprDebug $ text "# " <> c
+
+pprInstr :: Instr -> SDoc
+
+pprInstr (COMMENT s)
+ = asmComment (ftext s)
+
+pprInstr (LOCATION file line col _name)
+ = text "\t.loc " <> ppr file <+> ppr line <+> ppr col
+
+pprInstr (DELTA d)
+ = asmComment $ text ("\tdelta = " ++ show d)
+
+pprInstr (NEWBLOCK _)
+ = panic "PprMach.pprInstr: NEWBLOCK"
+
+pprInstr (UNWIND lbl d)
+ = asmComment (text "\tunwind = " <> ppr d)
+ $$ ppr lbl <> colon
+
+pprInstr (LDATA _ _)
+ = panic "PprMach.pprInstr: LDATA"
+
+{-
+pprInstr (SPILL reg slot)
+ = hcat [
+ text "\tSPILL",
+ char ' ',
+ pprUserReg reg,
+ comma,
+ text "SLOT" <> parens (int slot)]
+
+pprInstr (RELOAD slot reg)
+ = hcat [
+ text "\tRELOAD",
+ char ' ',
+ text "SLOT" <> parens (int slot),
+ comma,
+ pprUserReg reg]
+-}
+
+-- Replace 'mov $0x0,%reg' by 'xor %reg,%reg', which is smaller and cheaper.
+-- The code generator catches most of these already, but not all.
+pprInstr (MOV format (OpImm (ImmInt 0)) dst@(OpReg _))
+ = pprInstr (XOR format' dst dst)
+ where format' = case format of
+ II64 -> II32 -- 32-bit version is equivalent, and smaller
+ _ -> format
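+-- For illustration, on x86_64:
+--   MOV II64 (OpImm (ImmInt 0)) (OpReg rax)  ==>  xorl %eax,%eax
+-- i.e. the zeroing move is narrowed to the shorter 32-bit xor encoding.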
+pprInstr (MOV format src dst)
+ = pprFormatOpOp (sLit "mov") format src dst
+
+pprInstr (CMOV cc format src dst)
+ = pprCondOpReg (sLit "cmov") format cc src dst
+
+pprInstr (MOVZxL II32 src dst) = pprFormatOpOp (sLit "mov") II32 src dst
+ -- 32-to-64 bit zero extension on x86_64 is accomplished by a simple
+ -- movl. But we represent it as a MOVZxL instruction, because
+ -- the reg alloc would tend to throw away a plain reg-to-reg
+ -- move, and we still want it to do that.
+
+pprInstr (MOVZxL formats src dst)
+ = pprFormatOpOpCoerce (sLit "movz") formats II32 src dst
+ -- zero-extension only needs to extend to 32 bits: on x86_64,
+ -- the remaining zero-extension to 64 bits is automatic, and the 32-bit
+ -- instruction is shorter.
+
+pprInstr (MOVSxL formats src dst)
+ = sdocWithPlatform $ \platform ->
+ pprFormatOpOpCoerce (sLit "movs") formats (archWordFormat (target32Bit platform)) src dst
+
+-- here we do some patching, since the physical registers are only set late
+-- in the code generation.
+pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) (EAIndex reg2 1) (ImmInt 0))) dst@(OpReg reg3))
+ | reg1 == reg3
+ = pprFormatOpOp (sLit "add") format (OpReg reg2) dst
+
+pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) (EAIndex reg2 1) (ImmInt 0))) dst@(OpReg reg3))
+ | reg2 == reg3
+ = pprFormatOpOp (sLit "add") format (OpReg reg1) dst
+
+pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) EAIndexNone displ)) dst@(OpReg reg3))
+ | reg1 == reg3
+ = pprInstr (ADD format (OpImm displ) dst)
+
+pprInstr (LEA format src dst) = pprFormatOpOp (sLit "lea") format src dst
+
+pprInstr (ADD format (OpImm (ImmInt (-1))) dst)
+ = pprFormatOp (sLit "dec") format dst
+pprInstr (ADD format (OpImm (ImmInt 1)) dst)
+ = pprFormatOp (sLit "inc") format dst
+pprInstr (ADD format src dst) = pprFormatOpOp (sLit "add") format src dst
+pprInstr (ADC format src dst) = pprFormatOpOp (sLit "adc") format src dst
+pprInstr (SUB format src dst) = pprFormatOpOp (sLit "sub") format src dst
+pprInstr (SBB format src dst) = pprFormatOpOp (sLit "sbb") format src dst
+pprInstr (IMUL format op1 op2) = pprFormatOpOp (sLit "imul") format op1 op2
+
+pprInstr (ADD_CC format src dst)
+ = pprFormatOpOp (sLit "add") format src dst
+pprInstr (SUB_CC format src dst)
+ = pprFormatOpOp (sLit "sub") format src dst
+
+{- A hack. The Intel documentation says that "The two and three
+ operand forms [of IMUL] may also be used with unsigned operands
+ because the lower half of the product is the same regardless if
+ (sic) the operands are signed or unsigned. The CF and OF flags,
+ however, cannot be used to determine if the upper half of the
+ result is non-zero." So there.
+-}
+
+-- Use a 32-bit instruction when possible as it saves a byte.
+-- Notably, extracting the tag bits of a pointer has this form.
+-- TODO: we could save a byte in a subsequent CMP instruction too,
+-- but need something like a peephole pass for this
+pprInstr (AND II64 src@(OpImm (ImmInteger mask)) dst)
+ | 0 <= mask && mask < 0xffffffff
+ = pprInstr (AND II32 src dst)
+pprInstr (AND FF32 src dst) = pprOpOp (sLit "andps") FF32 src dst
+pprInstr (AND FF64 src dst) = pprOpOp (sLit "andpd") FF64 src dst
+pprInstr (AND format src dst) = pprFormatOpOp (sLit "and") format src dst
+pprInstr (OR format src dst) = pprFormatOpOp (sLit "or") format src dst
+
+pprInstr (XOR FF32 src dst) = pprOpOp (sLit "xorps") FF32 src dst
+pprInstr (XOR FF64 src dst) = pprOpOp (sLit "xorpd") FF64 src dst
+pprInstr (XOR format src dst) = pprFormatOpOp (sLit "xor") format src dst
+
+pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst)
+pprInstr (LZCNT format src dst) = pprOpOp (sLit "lzcnt") format src (OpReg dst)
+pprInstr (TZCNT format src dst) = pprOpOp (sLit "tzcnt") format src (OpReg dst)
+pprInstr (BSF format src dst) = pprOpOp (sLit "bsf") format src (OpReg dst)
+pprInstr (BSR format src dst) = pprOpOp (sLit "bsr") format src (OpReg dst)
+
+pprInstr (PDEP format src mask dst) = pprFormatOpOpReg (sLit "pdep") format src mask dst
+pprInstr (PEXT format src mask dst) = pprFormatOpOpReg (sLit "pext") format src mask dst
+
+pprInstr (PREFETCH NTA format src ) = pprFormatOp_ (sLit "prefetchnta") format src
+pprInstr (PREFETCH Lvl0 format src) = pprFormatOp_ (sLit "prefetcht0") format src
+pprInstr (PREFETCH Lvl1 format src) = pprFormatOp_ (sLit "prefetcht1") format src
+pprInstr (PREFETCH Lvl2 format src) = pprFormatOp_ (sLit "prefetcht2") format src
+
+pprInstr (NOT format op) = pprFormatOp (sLit "not") format op
+pprInstr (BSWAP format op) = pprFormatOp (sLit "bswap") format (OpReg op)
+pprInstr (NEGI format op) = pprFormatOp (sLit "neg") format op
+
+pprInstr (SHL format src dst) = pprShift (sLit "shl") format src dst
+pprInstr (SAR format src dst) = pprShift (sLit "sar") format src dst
+pprInstr (SHR format src dst) = pprShift (sLit "shr") format src dst
+
+pprInstr (BT format imm src) = pprFormatImmOp (sLit "bt") format imm src
+
+pprInstr (CMP format src dst)
+ | isFloatFormat format = pprFormatOpOp (sLit "ucomi") format src dst -- SSE2
+ | otherwise = pprFormatOpOp (sLit "cmp") format src dst
+
+pprInstr (TEST format src dst) = sdocWithPlatform $ \platform ->
+ let format' = case (src,dst) of
+ -- Match instructions like 'test $0x3,%esi' or 'test $0x7,%rbx'.
+ -- We can replace them by equivalent, but smaller instructions
+ -- by reducing the size of the immediate operand as far as possible.
+ -- (We could handle masks larger than a single byte too,
+ -- but it would complicate the code considerably
+ -- and tag checks are by far the most common case.)
+ -- The mask must have the high bit clear for this smaller encoding
+ -- to be completely equivalent to the original; in particular so
+ -- that the signed comparison condition bits are the same as they
+ -- would be if doing a full word comparison. See #13425.
+ (OpImm (ImmInteger mask), OpReg dstReg)
+ | 0 <= mask && mask < 128 -> minSizeOfReg platform dstReg
+ _ -> format
+ in pprFormatOpOp (sLit "test") format' src dst
+ where
+ minSizeOfReg platform (RegReal (RealRegSingle i))
+ | target32Bit platform && i <= 3 = II8 -- al, bl, cl, dl
+ | target32Bit platform && i <= 7 = II16 -- si, di, bp, sp
+ | not (target32Bit platform) && i <= 15 = II8 -- al .. r15b
+ minSizeOfReg _ _ = format -- other
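+-- For illustration, a pointer-tag check on x86_64 such as
+--   TEST II64 (OpImm (ImmInteger 7)) (OpReg rbx)  ==>  testb $7,%bl
+-- uses the one-byte immediate form instead of a full-width test.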
+
+pprInstr (PUSH format op) = pprFormatOp (sLit "push") format op
+pprInstr (POP format op) = pprFormatOp (sLit "pop") format op
+
+-- both unused (SDM):
+-- pprInstr PUSHA = text "\tpushal"
+-- pprInstr POPA = text "\tpopal"
+
+pprInstr NOP = text "\tnop"
+pprInstr (CLTD II8) = text "\tcbtw"
+pprInstr (CLTD II16) = text "\tcwtd"
+pprInstr (CLTD II32) = text "\tcltd"
+pprInstr (CLTD II64) = text "\tcqto"
+pprInstr (CLTD x) = panic $ "pprInstr: " ++ show x
+
+pprInstr (SETCC cond op) = pprCondInstr (sLit "set") cond (pprOperand II8 op)
+
+pprInstr (JXX cond blockid)
+ = pprCondInstr (sLit "j") cond (ppr lab)
+ where lab = blockLbl blockid
+
+pprInstr (JXX_GBL cond imm) = pprCondInstr (sLit "j") cond (pprImm imm)
+
+pprInstr (JMP (OpImm imm) _) = text "\tjmp " <> pprImm imm
+pprInstr (JMP op _) = sdocWithPlatform $ \platform ->
+ text "\tjmp *"
+ <> pprOperand (archWordFormat (target32Bit platform)) op
+pprInstr (JMP_TBL op _ _ _) = pprInstr (JMP op [])
+pprInstr (CALL (Left imm) _) = text "\tcall " <> pprImm imm
+pprInstr (CALL (Right reg) _) = sdocWithPlatform $ \platform ->
+ text "\tcall *"
+ <> pprReg (archWordFormat (target32Bit platform)) reg
+
+pprInstr (IDIV fmt op) = pprFormatOp (sLit "idiv") fmt op
+pprInstr (DIV fmt op) = pprFormatOp (sLit "div") fmt op
+pprInstr (IMUL2 fmt op) = pprFormatOp (sLit "imul") fmt op
+
+-- x86_64 only
+pprInstr (MUL format op1 op2) = pprFormatOpOp (sLit "mul") format op1 op2
+pprInstr (MUL2 format op) = pprFormatOp (sLit "mul") format op
+
+pprInstr (FDIV format op1 op2) = pprFormatOpOp (sLit "div") format op1 op2
+pprInstr (SQRT format op1 op2) = pprFormatOpReg (sLit "sqrt") format op1 op2
+
+pprInstr (CVTSS2SD from to) = pprRegReg (sLit "cvtss2sd") from to
+pprInstr (CVTSD2SS from to) = pprRegReg (sLit "cvtsd2ss") from to
+pprInstr (CVTTSS2SIQ fmt from to) = pprFormatFormatOpReg (sLit "cvttss2si") FF32 fmt from to
+pprInstr (CVTTSD2SIQ fmt from to) = pprFormatFormatOpReg (sLit "cvttsd2si") FF64 fmt from to
+pprInstr (CVTSI2SS fmt from to) = pprFormatOpReg (sLit "cvtsi2ss") fmt from to
+pprInstr (CVTSI2SD fmt from to) = pprFormatOpReg (sLit "cvtsi2sd") fmt from to
+
+ -- FETCHGOT for PIC on ELF platforms
+pprInstr (FETCHGOT reg)
+ = vcat [ text "\tcall 1f",
+ hcat [ text "1:\tpopl\t", pprReg II32 reg ],
+ hcat [ text "\taddl\t$_GLOBAL_OFFSET_TABLE_+(.-1b), ",
+ pprReg II32 reg ]
+ ]
+
+ -- FETCHPC for PIC on Darwin/x86
+ -- get the instruction pointer into a register
+ -- (Terminology note: the IP is called Program Counter on PPC,
+ -- and it's a good thing to use the same name on both platforms)
+pprInstr (FETCHPC reg)
+ = vcat [ text "\tcall 1f",
+ hcat [ text "1:\tpopl\t", pprReg II32 reg ]
+ ]
+
+
+-- X87 store:
+-- GST fmt src addr ==> FLD dst ; FSTPsz addr
+pprInstr g@(X87Store fmt addr)
+ = pprX87 g (hcat [gtab,
+ text "fstp", pprFormat_x87 fmt, gsp, pprAddr addr])
+
+
+-- Atomics
+
+pprInstr (LOCK i) = text "\tlock" $$ pprInstr i
+
+pprInstr MFENCE = text "\tmfence"
+
+pprInstr (XADD format src dst) = pprFormatOpOp (sLit "xadd") format src dst
+
+pprInstr (CMPXCHG format src dst)
+ = pprFormatOpOp (sLit "cmpxchg") format src dst
+
+
+
+--------------------------
+-- some leftover helpers
+
+
+
+gtab :: SDoc
+gtab = char '\t'
+
+gsp :: SDoc
+gsp = char ' '
+
+
+
+pprX87 :: Instr -> SDoc -> SDoc
+pprX87 fake actual
+ = (char '#' <> pprX87Instr fake) $$ actual
+
+pprX87Instr :: Instr -> SDoc
+pprX87Instr (X87Store fmt dst) = pprFormatAddr (sLit "gst") fmt dst
+pprX87Instr _ = panic "X86.Ppr.pprX87Instr: no match"
+
+pprDollImm :: Imm -> SDoc
+pprDollImm i = text "$" <> pprImm i
+
+
+pprOperand :: Format -> Operand -> SDoc
+pprOperand f (OpReg r) = pprReg f r
+pprOperand _ (OpImm i) = pprDollImm i
+pprOperand _ (OpAddr ea) = pprAddr ea
+
+
+pprMnemonic_ :: PtrString -> SDoc
+pprMnemonic_ name =
+ char '\t' <> ptext name <> space
+
+
+pprMnemonic :: PtrString -> Format -> SDoc
+pprMnemonic name format =
+ char '\t' <> ptext name <> pprFormat format <> space
+
+
+pprFormatImmOp :: PtrString -> Format -> Imm -> Operand -> SDoc
+pprFormatImmOp name format imm op1
+ = hcat [
+ pprMnemonic name format,
+ char '$',
+ pprImm imm,
+ comma,
+ pprOperand format op1
+ ]
+
+
+pprFormatOp_ :: PtrString -> Format -> Operand -> SDoc
+pprFormatOp_ name format op1
+ = hcat [
+ pprMnemonic_ name ,
+ pprOperand format op1
+ ]
+
+pprFormatOp :: PtrString -> Format -> Operand -> SDoc
+pprFormatOp name format op1
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1
+ ]
+
+
+pprFormatOpOp :: PtrString -> Format -> Operand -> Operand -> SDoc
+pprFormatOpOp name format op1 op2
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2
+ ]
+
+
+pprOpOp :: PtrString -> Format -> Operand -> Operand -> SDoc
+pprOpOp name format op1 op2
+ = hcat [
+ pprMnemonic_ name,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2
+ ]
+
+
+
+pprRegReg :: PtrString -> Reg -> Reg -> SDoc
+pprRegReg name reg1 reg2
+ = sdocWithPlatform $ \platform ->
+ hcat [
+ pprMnemonic_ name,
+ pprReg (archWordFormat (target32Bit platform)) reg1,
+ comma,
+ pprReg (archWordFormat (target32Bit platform)) reg2
+ ]
+
+
+pprFormatOpReg :: PtrString -> Format -> Operand -> Reg -> SDoc
+pprFormatOpReg name format op1 reg2
+ = sdocWithPlatform $ \platform ->
+ hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprReg (archWordFormat (target32Bit platform)) reg2
+ ]
+
+pprCondOpReg :: PtrString -> Format -> Cond -> Operand -> Reg -> SDoc
+pprCondOpReg name format cond op1 reg2
+ = hcat [
+ char '\t',
+ ptext name,
+ pprCond cond,
+ space,
+ pprOperand format op1,
+ comma,
+ pprReg format reg2
+ ]
+
+pprFormatFormatOpReg :: PtrString -> Format -> Format -> Operand -> Reg -> SDoc
+pprFormatFormatOpReg name format1 format2 op1 reg2
+ = hcat [
+ pprMnemonic name format2,
+ pprOperand format1 op1,
+ comma,
+ pprReg format2 reg2
+ ]
+
+pprFormatOpOpReg :: PtrString -> Format -> Operand -> Operand -> Reg -> SDoc
+pprFormatOpOpReg name format op1 op2 reg3
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2,
+ comma,
+ pprReg format reg3
+ ]
+
+
+
+pprFormatAddr :: PtrString -> Format -> AddrMode -> SDoc
+pprFormatAddr name format op
+ = hcat [
+ pprMnemonic name format,
+ comma,
+ pprAddr op
+ ]
+
+pprShift :: PtrString -> Format -> Operand -> Operand -> SDoc
+pprShift name format src dest
+ = hcat [
+ pprMnemonic name format,
+ pprOperand II8 src, -- src is 8-bit sized
+ comma,
+ pprOperand format dest
+ ]
+
+
+pprFormatOpOpCoerce :: PtrString -> Format -> Format -> Operand -> Operand -> SDoc
+pprFormatOpOpCoerce name format1 format2 op1 op2
+ = hcat [ char '\t', ptext name, pprFormat format1, pprFormat format2, space,
+ pprOperand format1 op1,
+ comma,
+ pprOperand format2 op2
+ ]
+
+
+pprCondInstr :: PtrString -> Cond -> SDoc -> SDoc
+pprCondInstr name cond arg
+ = hcat [ char '\t', ptext name, pprCond cond, space, arg]
diff --git a/compiler/GHC/CmmToAsm/X86/RegInfo.hs b/compiler/GHC/CmmToAsm/X86/RegInfo.hs
new file mode 100644
index 0000000000..597efe1c3e
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/RegInfo.hs
@@ -0,0 +1,73 @@
+{-# LANGUAGE CPP #-}
+module GHC.CmmToAsm.X86.RegInfo (
+ mkVirtualReg,
+ regDotColor
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+
+import Outputable
+import GHC.Platform
+import Unique
+
+import UniqFM
+import GHC.CmmToAsm.X86.Regs
+
+
+mkVirtualReg :: Unique -> Format -> VirtualReg
+mkVirtualReg u format
+ = case format of
+ FF32 -> VirtualRegD u
+ -- for scalar F32, we use the same xmm as F64!
+ -- this is a hack that needs some improvement.
+ -- For now we map both to being allocated as "Double" Registers
+ -- on X86/X86_64
+ FF64 -> VirtualRegD u
+ _other -> VirtualRegI u
+
+regDotColor :: Platform -> RealReg -> SDoc
+regDotColor platform reg
+ = case (lookupUFM (regColors platform) reg) of
+ Just str -> text str
+ _ -> panic "Register not assigned a color"
+
+regColors :: Platform -> UniqFM [Char]
+regColors platform = listToUFM (normalRegColors platform)
+
+normalRegColors :: Platform -> [(Reg,String)]
+normalRegColors platform =
+ zip (map regSingle [0..lastint platform]) colors
+ ++ zip (map regSingle [firstxmm..lastxmm platform]) greys
+ where
+ -- 16 colors - enough for amd64 gp regs
+ colors = ["#800000","#ff0000","#808000","#ffff00","#008000"
+ ,"#00ff00","#008080","#00ffff","#000080","#0000ff"
+ ,"#800080","#ff00ff","#87005f","#875f00","#87af00"
+ ,"#ff00af"]
+
+ -- 16 shades of grey, enough for the currently supported
+ -- SSE extensions.
+ greys = ["#0e0e0e","#1c1c1c","#2a2a2a","#383838","#464646"
+ ,"#545454","#626262","#707070","#7e7e7e","#8c8c8c"
+ ,"#9a9a9a","#a8a8a8","#b6b6b6","#c4c4c4","#d2d2d2"
+ ,"#e0e0e0"]
+
+
+
+-- 32 shades of grey - use for avx 512 if we ever need it
+-- greys = ["#070707","#0e0e0e","#151515","#1c1c1c"
+-- ,"#232323","#2a2a2a","#313131","#383838","#3f3f3f"
+-- ,"#464646","#4d4d4d","#545454","#5b5b5b","#626262"
+-- ,"#696969","#707070","#777777","#7e7e7e","#858585"
+-- ,"#8c8c8c","#939393","#9a9a9a","#a1a1a1","#a8a8a8"
+-- ,"#afafaf","#b6b6b6","#bdbdbd","#c4c4c4","#cbcbcb"
+-- ,"#d2d2d2","#d9d9d9","#e0e0e0"]
+
+
diff --git a/compiler/GHC/CmmToAsm/X86/Regs.hs b/compiler/GHC/CmmToAsm/X86/Regs.hs
new file mode 100644
index 0000000000..87e31a1428
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Regs.hs
@@ -0,0 +1,442 @@
+{-# LANGUAGE CPP #-}
+
+module GHC.CmmToAsm.X86.Regs (
+ -- squeeze functions for the graph allocator
+ virtualRegSqueeze,
+ realRegSqueeze,
+
+ -- immediates
+ Imm(..),
+ strImmLit,
+ litToImm,
+
+ -- addressing modes
+ AddrMode(..),
+ addrOffset,
+
+ -- registers
+ spRel,
+ argRegs,
+ allArgRegs,
+ allIntArgRegs,
+ callClobberedRegs,
+ instrClobberedRegs,
+ allMachRegNos,
+ classOfRealReg,
+ showReg,
+
+ -- machine specific
+ EABase(..), EAIndex(..), addrModeRegs,
+
+ eax, ebx, ecx, edx, esi, edi, ebp, esp,
+
+
+ rax, rbx, rcx, rdx, rsi, rdi, rbp, rsp,
+ r8, r9, r10, r11, r12, r13, r14, r15,
+ lastint,
+ xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
+ xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
+ xmm,
+ firstxmm, lastxmm,
+
+ ripRel,
+ allFPArgRegs,
+
+ allocatableRegs
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Regs
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+
+import GHC.Cmm
+import GHC.Cmm.CLabel ( CLabel )
+import GHC.Driver.Session
+import Outputable
+import GHC.Platform
+
+import qualified Data.Array as A
+
+-- | regSqueeze_class reg
+-- Calculate the maximum number of register colors that could be
+-- denied to a node of this class due to having this reg
+-- as a neighbour.
+--
+{-# INLINE virtualRegSqueeze #-}
+virtualRegSqueeze :: RegClass -> VirtualReg -> Int
+
+virtualRegSqueeze cls vr
+ = case cls of
+ RcInteger
+ -> case vr of
+ VirtualRegI{} -> 1
+ VirtualRegHi{} -> 1
+ _other -> 0
+
+ RcDouble
+ -> case vr of
+ VirtualRegD{} -> 1
+ VirtualRegF{} -> 0
+ _other -> 0
+
+
+ _other -> 0
+
+{-# INLINE realRegSqueeze #-}
+realRegSqueeze :: RegClass -> RealReg -> Int
+realRegSqueeze cls rr
+ = case cls of
+ RcInteger
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < firstxmm -> 1
+ | otherwise -> 0
+
+ RealRegPair{} -> 0
+
+ RcDouble
+ -> case rr of
+ RealRegSingle regNo
+ | regNo >= firstxmm -> 1
+ | otherwise -> 0
+
+ RealRegPair{} -> 0
+
+
+ _other -> 0
+
+-- -----------------------------------------------------------------------------
+-- Immediates
+
+data Imm
+ = ImmInt Int
+ | ImmInteger Integer -- Sigh.
+ | ImmCLbl CLabel -- AbstractC Label (with baggage)
+ | ImmLit SDoc -- Simple string
+ | ImmIndex CLabel Int
+ | ImmFloat Rational
+ | ImmDouble Rational
+ | ImmConstantSum Imm Imm
+ | ImmConstantDiff Imm Imm
+
+strImmLit :: String -> Imm
+strImmLit s = ImmLit (text s)
+
+
+litToImm :: CmmLit -> Imm
+litToImm (CmmInt i w) = ImmInteger (narrowS w i)
+ -- narrow to the width: a CmmInt might be out of
+ -- range, but we assume that ImmInteger only contains
+ -- in-range values. A signed value should be fine here.
+litToImm (CmmFloat f W32) = ImmFloat f
+litToImm (CmmFloat f W64) = ImmDouble f
+litToImm (CmmLabel l) = ImmCLbl l
+litToImm (CmmLabelOff l off) = ImmIndex l off
+litToImm (CmmLabelDiffOff l1 l2 off _)
+ = ImmConstantSum
+ (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
+ (ImmInt off)
+litToImm _ = panic "X86.Regs.litToImm: no match"
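+-- For illustration, a label difference with an offset becomes a constant
+-- expression over immediates:
+--   litToImm (CmmLabelDiffOff l1 l2 8 W64)
+--     == ImmConstantSum (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2)) (ImmInt 8)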
+
+-- addressing modes ------------------------------------------------------------
+
+data AddrMode
+ = AddrBaseIndex EABase EAIndex Displacement
+ | ImmAddr Imm Int
+
+data EABase = EABaseNone | EABaseReg Reg | EABaseRip
+data EAIndex = EAIndexNone | EAIndex Reg Int
+type Displacement = Imm
+
+
+addrOffset :: AddrMode -> Int -> Maybe AddrMode
+addrOffset addr off
+ = case addr of
+ ImmAddr i off0 -> Just (ImmAddr i (off0 + off))
+
+ AddrBaseIndex r i (ImmInt n) -> Just (AddrBaseIndex r i (ImmInt (n + off)))
+ AddrBaseIndex r i (ImmInteger n)
+ -> Just (AddrBaseIndex r i (ImmInt (fromInteger (n + toInteger off))))
+
+ AddrBaseIndex r i (ImmCLbl lbl)
+ -> Just (AddrBaseIndex r i (ImmIndex lbl off))
+
+ AddrBaseIndex r i (ImmIndex lbl ix)
+ -> Just (AddrBaseIndex r i (ImmIndex lbl (ix+off)))
+
+ _ -> Nothing -- in theory, shouldn't happen
+
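+-- For illustration, bumping a constant displacement:
+--   addrOffset (AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt 8)) 16
+--     == Just (AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt 24))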
+
+addrModeRegs :: AddrMode -> [Reg]
+addrModeRegs (AddrBaseIndex b i _) = b_regs ++ i_regs
+ where
+ b_regs = case b of { EABaseReg r -> [r]; _ -> [] }
+ i_regs = case i of { EAIndex r _ -> [r]; _ -> [] }
+addrModeRegs _ = []
+
+
+-- registers -------------------------------------------------------------------
+
+-- @spRel@ gives us a stack relative addressing mode for volatile
+-- temporaries and for excess call arguments. @fpRel@, where
+-- applicable, is the same but for the frame pointer.
+
+
+spRel :: DynFlags
+ -> Int -- ^ desired stack offset in bytes, positive or negative
+ -> AddrMode
+spRel dflags n
+ | target32Bit (targetPlatform dflags)
+ = AddrBaseIndex (EABaseReg esp) EAIndexNone (ImmInt n)
+ | otherwise
+ = AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt n)
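+-- For illustration, on a 64-bit target:
+--   spRel dflags 8  ==  AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt 8)
+-- which the pretty-printer renders as 8(%rsp); 32-bit targets use %esp.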
+
+-- The register numbers must fit into 32 bits on x86, so that we can
+-- use a Word32 to represent the set of free registers in the register
+-- allocator.
+
+
+
+firstxmm :: RegNo
+firstxmm = 16
+
+-- on 32-bit platforms, only the first 8 XMM/YMM/ZMM registers are available
+lastxmm :: Platform -> RegNo
+lastxmm platform
+ | target32Bit platform = firstxmm + 7 -- xmm0 - xmm7
+ | otherwise = firstxmm + 15 -- xmm0 - xmm15
+
+lastint :: Platform -> RegNo
+lastint platform
+ | target32Bit platform = 7 -- not %r8..%r15
+ | otherwise = 15
+
+intregnos :: Platform -> [RegNo]
+intregnos platform = [0 .. lastint platform]
+
+
+
+xmmregnos :: Platform -> [RegNo]
+xmmregnos platform = [firstxmm .. lastxmm platform]
+
+floatregnos :: Platform -> [RegNo]
+floatregnos platform = xmmregnos platform
+
+-- argRegs is the set of regs which are read for an n-argument call to C.
+-- For archs which pass all args on the stack (x86), it is empty.
+-- Sparc passes up to the first 6 args in regs.
+argRegs :: RegNo -> [Reg]
+argRegs _ = panic "MachRegs.argRegs(x86): should not be used!"
+
+-- | The complete set of machine registers.
+allMachRegNos :: Platform -> [RegNo]
+allMachRegNos platform = intregnos platform ++ floatregnos platform
+
+-- | Take the class of a register.
+{-# INLINE classOfRealReg #-}
+classOfRealReg :: Platform -> RealReg -> RegClass
+-- On x86, we might want to have an 8-bit RegClass, which would
+-- contain just regs 1-4 (the others don't have 8-bit versions).
+-- However, we can get away without this at the moment because the
+-- only allocatable integer regs are also 8-bit compatible (1, 3, 4).
+classOfRealReg platform reg
+ = case reg of
+ RealRegSingle i
+ | i <= lastint platform -> RcInteger
+ | i <= lastxmm platform -> RcDouble
+ | otherwise -> panic "X86.Reg.classOfRealReg registerSingle too high"
+ _ -> panic "X86.Regs.classOfRealReg: RegPairs on this arch"
+
+-- | Get the name of the register with this number.
+-- NOTE: FIXME, we don't track which "way" the XMM registers are used
+showReg :: Platform -> RegNo -> String
+showReg platform n
+ | n >= firstxmm && n <= lastxmm platform = "%xmm" ++ show (n-firstxmm)
+ | n >= 8 && n < firstxmm = "%r" ++ show n
+ | otherwise = regNames platform A.! n
+
+regNames :: Platform -> A.Array Int String
+regNames platform
+ = if target32Bit platform
+ then A.listArray (0,7) ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp"]
+ else A.listArray (0,7) ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp"]
+
+
+
+-- machine specific ------------------------------------------------------------
+
+
+{-
+Intel x86 architecture:
+- All registers except 7 (esp) are available for use.
+- Only ebx, esi, edi and esp are available across a C call (they are callee-saves).
+- Registers 0-7 have 16-bit counterparts (ax, bx etc.)
+- Registers 0-3 have 8 bit counterparts (ah, bh etc.)
+
+The fp registers are all Double registers; we don't have any RcFloat class
+regs. @regClass@ barfs if you give it a VirtualRegF, and mkVReg above should
+never generate them.
+
+TODO: cleanup modelling float vs double registers and how they are the same class.
+-}
+
+
+eax, ebx, ecx, edx, esp, ebp, esi, edi :: Reg
+
+eax = regSingle 0
+ebx = regSingle 1
+ecx = regSingle 2
+edx = regSingle 3
+esi = regSingle 4
+edi = regSingle 5
+ebp = regSingle 6
+esp = regSingle 7
+
+
+
+
+{-
+AMD x86_64 architecture:
+- All 16 integer registers are addressable as 8, 16, 32 and 64-bit values:
+
+ 8 16 32 64
+ ---------------------
+ al ax eax rax
+ bl bx ebx rbx
+ cl cx ecx rcx
+ dl dx edx rdx
+ sil si esi rsi
+ dil di edi rdi
+ bpl bp ebp rbp
+ spl sp esp rsp
+ r8b r8w r8d r8
+ r9b r9w r9d r9
+ r10b r10w r10d r10
+ r11b r11w r11d r11
+ r12b r12w r12d r12
+ r13b r13w r13d r13
+ r14b r14w r14d r14
+ r15b r15w r15d r15
+-}
+
+rax, rbx, rcx, rdx, rsp, rbp, rsi, rdi,
+ r8, r9, r10, r11, r12, r13, r14, r15,
+ xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
+ xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 :: Reg
+
+rax = regSingle 0
+rbx = regSingle 1
+rcx = regSingle 2
+rdx = regSingle 3
+rsi = regSingle 4
+rdi = regSingle 5
+rbp = regSingle 6
+rsp = regSingle 7
+r8 = regSingle 8
+r9 = regSingle 9
+r10 = regSingle 10
+r11 = regSingle 11
+r12 = regSingle 12
+r13 = regSingle 13
+r14 = regSingle 14
+r15 = regSingle 15
+xmm0 = regSingle 16
+xmm1 = regSingle 17
+xmm2 = regSingle 18
+xmm3 = regSingle 19
+xmm4 = regSingle 20
+xmm5 = regSingle 21
+xmm6 = regSingle 22
+xmm7 = regSingle 23
+xmm8 = regSingle 24
+xmm9 = regSingle 25
+xmm10 = regSingle 26
+xmm11 = regSingle 27
+xmm12 = regSingle 28
+xmm13 = regSingle 29
+xmm14 = regSingle 30
+xmm15 = regSingle 31
+
+ripRel :: Displacement -> AddrMode
+ripRel imm = AddrBaseIndex EABaseRip EAIndexNone imm
+
+
+ -- so we can re-use some x86 code:
+{-
+eax = rax
+ebx = rbx
+ecx = rcx
+edx = rdx
+esi = rsi
+edi = rdi
+ebp = rbp
+esp = rsp
+-}
+
+xmm :: RegNo -> Reg
+xmm n = regSingle (firstxmm+n)
+
+
+
+
+-- | These are the regs which we cannot assume stay alive over a C call.
+callClobberedRegs :: Platform -> [Reg]
+-- caller-saves registers
+callClobberedRegs platform
+ | target32Bit platform = [eax,ecx,edx] ++ map regSingle (floatregnos platform)
+ | platformOS platform == OSMinGW32
+ = [rax,rcx,rdx,r8,r9,r10,r11]
+ -- Only xmm0-5 are caller-saves registers on 64bit windows.
+ -- ( https://docs.microsoft.com/en-us/cpp/build/register-usage )
+ -- For details check the Win64 ABI.
+ ++ map xmm [0 .. 5]
+ | otherwise
+ -- all xmm regs are caller-saves
+ -- caller-saves registers
+ = [rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11]
+ ++ map regSingle (floatregnos platform)
+
+allArgRegs :: Platform -> [(Reg, Reg)]
+allArgRegs platform
+ | platformOS platform == OSMinGW32 = zip [rcx,rdx,r8,r9]
+ (map regSingle [firstxmm ..])
+ | otherwise = panic "X86.Regs.allArgRegs: not defined for this arch"
+
+allIntArgRegs :: Platform -> [Reg]
+allIntArgRegs platform
+ | (platformOS platform == OSMinGW32) || target32Bit platform
+ = panic "X86.Regs.allIntArgRegs: not defined for this platform"
+ | otherwise = [rdi,rsi,rdx,rcx,r8,r9]
+
+
+-- | on 64bit platforms we pass the first 8 float/double arguments
+-- in the xmm registers.
+allFPArgRegs :: Platform -> [Reg]
+allFPArgRegs platform
+ | platformOS platform == OSMinGW32
+ = panic "X86.Regs.allFPArgRegs: not defined for this platform"
+ | otherwise = map regSingle [firstxmm .. firstxmm + 7 ]
+
+
+-- Machine registers which might be clobbered by instructions that
+-- generate results into fixed registers, or need arguments in a fixed
+-- register.
+instrClobberedRegs :: Platform -> [Reg]
+instrClobberedRegs platform
+ | target32Bit platform = [ eax, ecx, edx ]
+ | otherwise = [ rax, rcx, rdx ]
+
+--
+
+-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
+-- i.e., these are the regs for which we are prepared to allow the
+-- register allocator to attempt to map VRegs to.
+allocatableRegs :: Platform -> [RealReg]
+allocatableRegs platform
+ = let isFree i = freeReg platform i
+ in map RealRegSingle $ filter isFree (allMachRegNos platform)
+
diff --git a/compiler/GHC/CmmToC.hs b/compiler/GHC/CmmToC.hs
index f7245f5c30..6e6f58ba7d 100644
--- a/compiler/GHC/CmmToC.hs
+++ b/compiler/GHC/CmmToC.hs
@@ -39,7 +39,7 @@ import GHC.Cmm.Utils
import GHC.Cmm.Switch
-- Utils
-import CPrim
+import GHC.CmmToAsm.CPrim
import GHC.Driver.Session
import FastString
import Outputable
diff --git a/compiler/GHC/Driver/CodeOutput.hs b/compiler/GHC/Driver/CodeOutput.hs
index e52d3216d5..507311c039 100644
--- a/compiler/GHC/Driver/CodeOutput.hs
+++ b/compiler/GHC/Driver/CodeOutput.hs
@@ -12,7 +12,7 @@ module GHC.Driver.CodeOutput ( codeOutput, outputForeignStubs ) where
import GhcPrelude
-import AsmCodeGen ( nativeCodeGen )
+import GHC.CmmToAsm ( nativeCodeGen )
import GHC.CmmToLlvm ( llvmCodeGen )
import UniqSupply ( mkSplitUniqSupply )
diff --git a/compiler/GHC/Llvm/Types.hs b/compiler/GHC/Llvm/Types.hs
index f8ca0c826c..3e320634d0 100644
--- a/compiler/GHC/Llvm/Types.hs
+++ b/compiler/GHC/Llvm/Types.hs
@@ -20,7 +20,7 @@ import Outputable
import Unique
-- from NCG
-import PprBase
+import GHC.CmmToAsm.Ppr
import GHC.Float
diff --git a/compiler/GHC/Platform/Reg.hs b/compiler/GHC/Platform/Reg.hs
new file mode 100644
index 0000000000..b856d7c3af
--- /dev/null
+++ b/compiler/GHC/Platform/Reg.hs
@@ -0,0 +1,241 @@
+-- | An architecture independent description of a register.
+-- This needs to stay architecture independent because it is used
+-- by NCGMonad and the register allocators, which are shared
+-- by all architectures.
+--
+module GHC.Platform.Reg (
+ RegNo,
+ Reg(..),
+ regPair,
+ regSingle,
+ isRealReg, takeRealReg,
+ isVirtualReg, takeVirtualReg,
+
+ VirtualReg(..),
+ renameVirtualReg,
+ classOfVirtualReg,
+ getHiVirtualRegFromLo,
+ getHiVRegFromLo,
+
+ RealReg(..),
+ regNosOfRealReg,
+ realRegsAlias,
+
+ liftPatchFnToRegReg
+)
+
+where
+
+import GhcPrelude
+
+import Outputable
+import Unique
+import GHC.Platform.Reg.Class
+import Data.List (intersect)
+
+-- | An identifier for a primitive real machine register.
+type RegNo
+ = Int
+
+-- VirtualRegs are virtual registers. The register allocator will
+-- eventually have to map them into RealRegs, or into spill slots.
+--
+-- VirtualRegs are allocated on the fly, usually to represent a single
+-- value in the abstract assembly code (i.e. dynamic registers are
+-- usually single assignment).
+--
+-- The single assignment restriction isn't necessary to get correct code,
+-- although a better register allocation will result if single
+-- assignment is used -- because the allocator maps a VirtualReg into
+-- a single RealReg, even if the VirtualReg has multiple live ranges.
+--
+-- Virtual regs can be of either class, so that info is attached.
+--
+data VirtualReg
+ = VirtualRegI {-# UNPACK #-} !Unique
+ | VirtualRegHi {-# UNPACK #-} !Unique -- High part of 2-word register
+ | VirtualRegF {-# UNPACK #-} !Unique
+ | VirtualRegD {-# UNPACK #-} !Unique
+
+ deriving (Eq, Show)
+
+-- This is laborious, but necessary. We can't derive Ord because
+-- Unique doesn't have an Ord instance. Note nonDetCmpUnique in the
+-- implementation. See Note [No Ord for Unique]
+-- This is non-deterministic but we do not currently support deterministic
+-- code-generation. See Note [Unique Determinism and code generation]
+instance Ord VirtualReg where
+ compare (VirtualRegI a) (VirtualRegI b) = nonDetCmpUnique a b
+ compare (VirtualRegHi a) (VirtualRegHi b) = nonDetCmpUnique a b
+ compare (VirtualRegF a) (VirtualRegF b) = nonDetCmpUnique a b
+ compare (VirtualRegD a) (VirtualRegD b) = nonDetCmpUnique a b
+
+ compare VirtualRegI{} _ = LT
+ compare _ VirtualRegI{} = GT
+ compare VirtualRegHi{} _ = LT
+ compare _ VirtualRegHi{} = GT
+ compare VirtualRegF{} _ = LT
+ compare _ VirtualRegF{} = GT
+
+
+
+instance Uniquable VirtualReg where
+ getUnique reg
+ = case reg of
+ VirtualRegI u -> u
+ VirtualRegHi u -> u
+ VirtualRegF u -> u
+ VirtualRegD u -> u
+
+instance Outputable VirtualReg where
+ ppr reg
+ = case reg of
+ VirtualRegI u -> text "%vI_" <> pprUniqueAlways u
+ VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u
+ -- this code is kinda wrong on x86
+ -- because float and double occupy the same register set
+ -- namely SSE2 register xmm0 .. xmm15
+ VirtualRegF u -> text "%vFloat_" <> pprUniqueAlways u
+ VirtualRegD u -> text "%vDouble_" <> pprUniqueAlways u
+
+
+
+renameVirtualReg :: Unique -> VirtualReg -> VirtualReg
+renameVirtualReg u r
+ = case r of
+ VirtualRegI _ -> VirtualRegI u
+ VirtualRegHi _ -> VirtualRegHi u
+ VirtualRegF _ -> VirtualRegF u
+ VirtualRegD _ -> VirtualRegD u
+
+
+classOfVirtualReg :: VirtualReg -> RegClass
+classOfVirtualReg vr
+ = case vr of
+ VirtualRegI{} -> RcInteger
+ VirtualRegHi{} -> RcInteger
+ VirtualRegF{} -> RcFloat
+ VirtualRegD{} -> RcDouble
+
+
+
+-- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform
+-- when supplied with the vreg for the lower-half of the quantity.
+-- (NB. Not reversible).
+getHiVirtualRegFromLo :: VirtualReg -> VirtualReg
+getHiVirtualRegFromLo reg
+ = case reg of
+ -- makes a pseudo-unique with tag 'H'
+ VirtualRegI u -> VirtualRegHi (newTagUnique u 'H')
+ _ -> panic "Reg.getHiVirtualRegFromLo"
+
+getHiVRegFromLo :: Reg -> Reg
+getHiVRegFromLo reg
+ = case reg of
+ RegVirtual vr -> RegVirtual (getHiVirtualRegFromLo vr)
+ RegReal _ -> panic "Reg.getHiVRegFromLo"
+
+
+------------------------------------------------------------------------------------
+-- | RealRegs are machine regs which are available for allocation, in
+-- the usual way. We know what class they are, because that's part of
+-- the processor's architecture.
+--
+-- RealRegPairs are pairs of real registers that are allocated together
+-- to hold a larger value, such as with Double regs on SPARC.
+--
+data RealReg
+ = RealRegSingle {-# UNPACK #-} !RegNo
+ | RealRegPair {-# UNPACK #-} !RegNo {-# UNPACK #-} !RegNo
+ deriving (Eq, Show, Ord)
+
+instance Uniquable RealReg where
+ getUnique reg
+ = case reg of
+ RealRegSingle i -> mkRegSingleUnique i
+ RealRegPair r1 r2 -> mkRegPairUnique (r1 * 65536 + r2)
+
+instance Outputable RealReg where
+ ppr reg
+ = case reg of
+ RealRegSingle i -> text "%r" <> int i
+ RealRegPair r1 r2 -> text "%r(" <> int r1
+ <> vbar <> int r2 <> text ")"
+
+regNosOfRealReg :: RealReg -> [RegNo]
+regNosOfRealReg rr
+ = case rr of
+ RealRegSingle r1 -> [r1]
+ RealRegPair r1 r2 -> [r1, r2]
+
+
+realRegsAlias :: RealReg -> RealReg -> Bool
+realRegsAlias rr1 rr2
+ = not $ null $ intersect (regNosOfRealReg rr1) (regNosOfRealReg rr2)
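+-- For illustration:
+--   realRegsAlias (RealRegSingle 1) (RealRegPair 1 2)  == True
+--   realRegsAlias (RealRegSingle 0) (RealRegSingle 1)  == False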
+
+--------------------------------------------------------------------------------
+-- | A register, either virtual or real
+data Reg
+ = RegVirtual !VirtualReg
+ | RegReal !RealReg
+ deriving (Eq, Ord)
+
+regSingle :: RegNo -> Reg
+regSingle regNo = RegReal $ RealRegSingle regNo
+
+regPair :: RegNo -> RegNo -> Reg
+regPair regNo1 regNo2 = RegReal $ RealRegPair regNo1 regNo2
+
+
+-- We like to have Uniques for Reg so that we can make UniqFM and UniqSets
+-- in the register allocator.
+instance Uniquable Reg where
+ getUnique reg
+ = case reg of
+ RegVirtual vr -> getUnique vr
+ RegReal rr -> getUnique rr
+
+-- | Print a reg in a generic manner
+-- If you want the architecture specific names, then use the pprReg
+-- function from the appropriate Ppr module.
+instance Outputable Reg where
+ ppr reg
+ = case reg of
+ RegVirtual vr -> ppr vr
+ RegReal rr -> ppr rr
+
+
+isRealReg :: Reg -> Bool
+isRealReg reg
+ = case reg of
+ RegReal _ -> True
+ RegVirtual _ -> False
+
+takeRealReg :: Reg -> Maybe RealReg
+takeRealReg reg
+ = case reg of
+ RegReal rr -> Just rr
+ _ -> Nothing
+
+
+isVirtualReg :: Reg -> Bool
+isVirtualReg reg
+ = case reg of
+ RegReal _ -> False
+ RegVirtual _ -> True
+
+takeVirtualReg :: Reg -> Maybe VirtualReg
+takeVirtualReg reg
+ = case reg of
+ RegReal _ -> Nothing
+ RegVirtual vr -> Just vr
+
+
+-- | The patch function supplied by the allocator maps VirtualReg to RealReg
+-- regs, but sometimes we want to apply it to plain old Reg.
+--
+liftPatchFnToRegReg :: (VirtualReg -> RealReg) -> (Reg -> Reg)
+liftPatchFnToRegReg patchF reg
+ = case reg of
+ RegVirtual vr -> RegReal (patchF vr)
+ RegReal _ -> reg
diff --git a/compiler/GHC/Platform/Reg/Class.hs b/compiler/GHC/Platform/Reg/Class.hs
new file mode 100644
index 0000000000..225ad05be5
--- /dev/null
+++ b/compiler/GHC/Platform/Reg/Class.hs
@@ -0,0 +1,32 @@
+-- | An architecture independent description of a register's class.
+module GHC.Platform.Reg.Class
+ ( RegClass (..) )
+
+where
+
+import GhcPrelude
+
+import Outputable
+import Unique
+
+
+-- | The class of a register.
+-- Used in the register allocator.
+-- We treat all registers in a class as being interchangeable.
+--
+data RegClass
+ = RcInteger
+ | RcFloat
+ | RcDouble
+ deriving Eq
+
+
+instance Uniquable RegClass where
+ getUnique RcInteger = mkRegClassUnique 0
+ getUnique RcFloat = mkRegClassUnique 1
+ getUnique RcDouble = mkRegClassUnique 2
+
+instance Outputable RegClass where
+ ppr RcInteger = Outputable.text "I"
+ ppr RcFloat = Outputable.text "F"
+ ppr RcDouble = Outputable.text "D"
diff --git a/compiler/GHC/Platform/Regs.hs b/compiler/GHC/Platform/Regs.hs
index 51f7658db2..d214b0d89f 100644
--- a/compiler/GHC/Platform/Regs.hs
+++ b/compiler/GHC/Platform/Regs.hs
@@ -7,7 +7,7 @@ import GhcPrelude
import GHC.Cmm.Expr
import GHC.Platform
-import Reg
+import GHC.Platform.Reg
import qualified GHC.Platform.ARM as ARM
import qualified GHC.Platform.ARM64 as ARM64