author | Sylvain Henry <sylvain@haskus.fr> | 2020-02-22 15:05:20 +0100
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2020-02-24 20:55:25 -0500
commit | 1b1067d14b656bbbfa7c47f156ec2700c9751549 (patch)
tree | 32346e3c4c3f89117190b36364144d85dc260e05 /compiler/GHC
parent | 354e2787be08fb6d973de1a39e58080ff8e107f8 (diff)
download | haskell-1b1067d14b656bbbfa7c47f156ec2700c9751549.tar.gz
Modules: CmmToAsm (#13009)
Diffstat (limited to 'compiler/GHC')
71 files changed, 28475 insertions, 4 deletions
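
For readers skimming the diff below: the commit moves the native code generator under the GHC.CmmToAsm module hierarchy (the added file notes it was the module previously known as "AsmCodeGen"). As a rough, hedged illustration of what this renaming means for import sites — the consumer module name here is hypothetical, but the two modules and the exports mentioned in the comments (nativeCodeGen, sequenceTop) do appear in the diff below:

```haskell
-- Minimal sketch (hypothetical consumer module): the NCG entry points now
-- live under the GHC.CmmToAsm.* hierarchy introduced by this commit.
module MyNcgConsumer where

import qualified GHC.CmmToAsm             as NCG          -- exports nativeCodeGen
import qualified GHC.CmmToAsm.BlockLayout as BlockLayout  -- exports sequenceTop
```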
diff --git a/compiler/GHC/CmmToAsm.hs b/compiler/GHC/CmmToAsm.hs new file mode 100644 index 0000000000..8dc9b61198 --- /dev/null +++ b/compiler/GHC/CmmToAsm.hs @@ -0,0 +1,1236 @@ +-- ----------------------------------------------------------------------------- +-- +-- (c) The University of Glasgow 1993-2004 +-- +-- This is the top-level module in the native code generator. +-- +-- ----------------------------------------------------------------------------- + +{-# LANGUAGE BangPatterns, CPP, GADTs, ScopedTypeVariables, PatternSynonyms, + DeriveFunctor #-} + +#if !defined(GHC_LOADED_INTO_GHCI) +{-# LANGUAGE UnboxedTuples #-} +#endif + +{-# OPTIONS_GHC -Wno-incomplete-record-updates #-} + +module GHC.CmmToAsm ( + -- * Module entry point + nativeCodeGen + + -- * Test-only exports: see trac #12744 + -- used by testGraphNoSpills, which needs to access + -- the register allocator intermediate data structures + -- cmmNativeGen emits + , cmmNativeGen + , NcgImpl(..) + , x86NcgImpl + ) where + +#include "HsVersions.h" + +import GhcPrelude + +import qualified GHC.CmmToAsm.X86.CodeGen as X86.CodeGen +import qualified GHC.CmmToAsm.X86.Regs as X86.Regs +import qualified GHC.CmmToAsm.X86.Instr as X86.Instr +import qualified GHC.CmmToAsm.X86.Ppr as X86.Ppr + +import qualified GHC.CmmToAsm.SPARC.CodeGen as SPARC.CodeGen +import qualified GHC.CmmToAsm.SPARC.Regs as SPARC.Regs +import qualified GHC.CmmToAsm.SPARC.Instr as SPARC.Instr +import qualified GHC.CmmToAsm.SPARC.Ppr as SPARC.Ppr +import qualified GHC.CmmToAsm.SPARC.ShortcutJump as SPARC.ShortcutJump +import qualified GHC.CmmToAsm.SPARC.CodeGen.Expand as SPARC.CodeGen.Expand + +import qualified GHC.CmmToAsm.PPC.CodeGen as PPC.CodeGen +import qualified GHC.CmmToAsm.PPC.Regs as PPC.Regs +import qualified GHC.CmmToAsm.PPC.RegInfo as PPC.RegInfo +import qualified GHC.CmmToAsm.PPC.Instr as PPC.Instr +import qualified GHC.CmmToAsm.PPC.Ppr as PPC.Ppr + +import GHC.CmmToAsm.Reg.Liveness +import qualified GHC.CmmToAsm.Reg.Linear as Linear + +import qualified GraphColor as Color +import qualified GHC.CmmToAsm.Reg.Graph as Color +import qualified GHC.CmmToAsm.Reg.Graph.Stats as Color +import qualified GHC.CmmToAsm.Reg.Graph.TrivColorable as Color + +import AsmUtils +import GHC.CmmToAsm.Reg.Target +import GHC.Platform +import GHC.CmmToAsm.BlockLayout as BlockLayout +import Config +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.PIC +import GHC.Platform.Reg +import GHC.CmmToAsm.Monad +import GHC.CmmToAsm.CFG +import GHC.CmmToAsm.Dwarf +import GHC.Cmm.DebugBlock + +import GHC.Cmm.BlockId +import GHC.StgToCmm.CgUtils ( fixStgRegisters ) +import GHC.Cmm +import GHC.Cmm.Utils +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Opt ( cmmMachOpFold ) +import GHC.Cmm.Ppr +import GHC.Cmm.CLabel + +import UniqFM +import UniqSupply +import GHC.Driver.Session +import Util + +import BasicTypes ( Alignment ) +import qualified Pretty +import BufWrite +import Outputable +import FastString +import UniqSet +import ErrUtils +import Module +import Stream (Stream) +import qualified Stream + +-- DEBUGGING ONLY +--import OrdList + +import Data.List +import Data.Maybe +import Data.Ord ( comparing ) +import Control.Exception +import Control.Monad +import System.IO + +{- +The native-code generator has machine-independent and +machine-dependent modules. + +This module ("AsmCodeGen") is the top-level machine-independent +module. 
Before entering machine-dependent land, we do some +machine-independent optimisations (defined below) on the +'CmmStmts's. + +We convert to the machine-specific 'Instr' datatype with +'cmmCodeGen', assuming an infinite supply of registers. We then use +a machine-independent register allocator ('regAlloc') to rejoin +reality. Obviously, 'regAlloc' has machine-specific helper +functions (see about "RegAllocInfo" below). + +Finally, we order the basic blocks of the function so as to minimise +the number of jumps between blocks, by utilising fallthrough wherever +possible. + +The machine-dependent bits break down as follows: + + * ["MachRegs"] Everything about the target platform's machine + registers (and immediate operands, and addresses, which tend to + intermingle/interact with registers). + + * ["MachInstrs"] Includes the 'Instr' datatype (possibly should + have a module of its own), plus a miscellany of other things + (e.g., 'targetDoubleSize', 'smStablePtrTable', ...) + + * ["MachCodeGen"] is where 'Cmm' stuff turns into + machine instructions. + + * ["PprMach"] 'pprInstr' turns an 'Instr' into text (well, really + a 'SDoc'). + + * ["RegAllocInfo"] In the register allocator, we manipulate + 'MRegsState's, which are 'BitSet's, one bit per machine register. + When we want to say something about a specific machine register + (e.g., ``it gets clobbered by this instruction''), we set/unset + its bit. Obviously, we do this 'BitSet' thing for efficiency + reasons. + + The 'RegAllocInfo' module collects together the machine-specific + info needed to do register allocation. + + * ["RegisterAlloc"] The (machine-independent) register allocator. +-} + +-------------------- +nativeCodeGen :: forall a . DynFlags -> Module -> ModLocation -> Handle -> UniqSupply + -> Stream IO RawCmmGroup a + -> IO a +nativeCodeGen dflags this_mod modLoc h us cmms + = let platform = targetPlatform dflags + nCG' :: ( Outputable statics, Outputable instr + , Outputable jumpDest, Instruction instr) + => NcgImpl statics instr jumpDest -> IO a + nCG' ncgImpl = nativeCodeGen' dflags this_mod modLoc ncgImpl h us cmms + in case platformArch platform of + ArchX86 -> nCG' (x86NcgImpl dflags) + ArchX86_64 -> nCG' (x86_64NcgImpl dflags) + ArchPPC -> nCG' (ppcNcgImpl dflags) + ArchS390X -> panic "nativeCodeGen: No NCG for S390X" + ArchSPARC -> nCG' (sparcNcgImpl dflags) + ArchSPARC64 -> panic "nativeCodeGen: No NCG for SPARC64" + ArchARM {} -> panic "nativeCodeGen: No NCG for ARM" + ArchARM64 -> panic "nativeCodeGen: No NCG for ARM64" + ArchPPC_64 _ -> nCG' (ppcNcgImpl dflags) + ArchAlpha -> panic "nativeCodeGen: No NCG for Alpha" + ArchMipseb -> panic "nativeCodeGen: No NCG for mipseb" + ArchMipsel -> panic "nativeCodeGen: No NCG for mipsel" + ArchUnknown -> panic "nativeCodeGen: No NCG for unknown arch" + ArchJavaScript-> panic "nativeCodeGen: No NCG for JavaScript" + +x86NcgImpl :: DynFlags -> NcgImpl (Alignment, RawCmmStatics) + X86.Instr.Instr X86.Instr.JumpDest +x86NcgImpl dflags + = (x86_64NcgImpl dflags) + +x86_64NcgImpl :: DynFlags -> NcgImpl (Alignment, RawCmmStatics) + X86.Instr.Instr X86.Instr.JumpDest +x86_64NcgImpl dflags + = NcgImpl { + cmmTopCodeGen = X86.CodeGen.cmmTopCodeGen + ,generateJumpTableForInstr = X86.CodeGen.generateJumpTableForInstr dflags + ,getJumpDestBlockId = X86.Instr.getJumpDestBlockId + ,canShortcut = X86.Instr.canShortcut + ,shortcutStatics = X86.Instr.shortcutStatics + ,shortcutJump = X86.Instr.shortcutJump + ,pprNatCmmDecl = X86.Ppr.pprNatCmmDecl + ,maxSpillSlots = X86.Instr.maxSpillSlots 
dflags + ,allocatableRegs = X86.Regs.allocatableRegs platform + ,ncgAllocMoreStack = X86.Instr.allocMoreStack platform + ,ncgExpandTop = id + ,ncgMakeFarBranches = const id + ,extractUnwindPoints = X86.CodeGen.extractUnwindPoints + ,invertCondBranches = X86.CodeGen.invertCondBranches + } + where platform = targetPlatform dflags + +ppcNcgImpl :: DynFlags -> NcgImpl RawCmmStatics PPC.Instr.Instr PPC.RegInfo.JumpDest +ppcNcgImpl dflags + = NcgImpl { + cmmTopCodeGen = PPC.CodeGen.cmmTopCodeGen + ,generateJumpTableForInstr = PPC.CodeGen.generateJumpTableForInstr dflags + ,getJumpDestBlockId = PPC.RegInfo.getJumpDestBlockId + ,canShortcut = PPC.RegInfo.canShortcut + ,shortcutStatics = PPC.RegInfo.shortcutStatics + ,shortcutJump = PPC.RegInfo.shortcutJump + ,pprNatCmmDecl = PPC.Ppr.pprNatCmmDecl + ,maxSpillSlots = PPC.Instr.maxSpillSlots dflags + ,allocatableRegs = PPC.Regs.allocatableRegs platform + ,ncgAllocMoreStack = PPC.Instr.allocMoreStack platform + ,ncgExpandTop = id + ,ncgMakeFarBranches = PPC.Instr.makeFarBranches + ,extractUnwindPoints = const [] + ,invertCondBranches = \_ _ -> id + } + where platform = targetPlatform dflags + +sparcNcgImpl :: DynFlags -> NcgImpl RawCmmStatics SPARC.Instr.Instr SPARC.ShortcutJump.JumpDest +sparcNcgImpl dflags + = NcgImpl { + cmmTopCodeGen = SPARC.CodeGen.cmmTopCodeGen + ,generateJumpTableForInstr = SPARC.CodeGen.generateJumpTableForInstr dflags + ,getJumpDestBlockId = SPARC.ShortcutJump.getJumpDestBlockId + ,canShortcut = SPARC.ShortcutJump.canShortcut + ,shortcutStatics = SPARC.ShortcutJump.shortcutStatics + ,shortcutJump = SPARC.ShortcutJump.shortcutJump + ,pprNatCmmDecl = SPARC.Ppr.pprNatCmmDecl + ,maxSpillSlots = SPARC.Instr.maxSpillSlots dflags + ,allocatableRegs = SPARC.Regs.allocatableRegs + ,ncgAllocMoreStack = noAllocMoreStack + ,ncgExpandTop = map SPARC.CodeGen.Expand.expandTop + ,ncgMakeFarBranches = const id + ,extractUnwindPoints = const [] + ,invertCondBranches = \_ _ -> id + } + +-- +-- Allocating more stack space for spilling is currently only +-- supported for the linear register allocator on x86/x86_64, the rest +-- default to the panic below. To support allocating extra stack on +-- more platforms provide a definition of ncgAllocMoreStack. +-- +noAllocMoreStack :: Int -> NatCmmDecl statics instr + -> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)]) +noAllocMoreStack amount _ + = panic $ "Register allocator: out of stack slots (need " ++ show amount ++ ")\n" + ++ " If you are trying to compile SHA1.hs from the crypto library then this\n" + ++ " is a known limitation in the linear allocator.\n" + ++ "\n" + ++ " Try enabling the graph colouring allocator with -fregs-graph instead." + ++ " You can still file a bug report if you like.\n" + + +-- | Data accumulated during code generation. Mostly about statistics, +-- but also collects debug data for DWARF generation. +data NativeGenAcc statics instr + = NGS { ngs_imports :: ![[CLabel]] + , ngs_natives :: ![[NatCmmDecl statics instr]] + -- ^ Native code generated, for statistics. This might + -- hold a lot of data, so it is important to clear this + -- field as early as possible if it isn't actually + -- required. + , ngs_colorStats :: ![[Color.RegAllocStats statics instr]] + , ngs_linearStats :: ![[Linear.RegAllocStats]] + , ngs_labels :: ![Label] + , ngs_debug :: ![DebugBlock] + , ngs_dwarfFiles :: !DwarfFiles + , ngs_unwinds :: !(LabelMap [UnwindPoint]) + -- ^ see Note [Unwinding information in the NCG] + -- and Note [What is this unwinding business?] in Debug. 
+ } + +{- +Note [Unwinding information in the NCG] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Unwind information is a type of metadata which allows a debugging tool +to reconstruct the values of machine registers at the time a procedure was +entered. For the most part, the production of unwind information is handled by +the Cmm stage, where it is represented by CmmUnwind nodes. + +Unfortunately, the Cmm stage doesn't know everything necessary to produce +accurate unwinding information. For instance, the x86-64 calling convention +requires that the stack pointer be aligned to 16 bytes, which in turn means that +GHC must sometimes add padding to $sp prior to performing a foreign call. When +this happens unwind information must be updated accordingly. +For this reason, we make the NCG backends responsible for producing +unwinding tables (with the extractUnwindPoints function in NcgImpl). + +We accumulate the produced unwind tables over CmmGroups in the ngs_unwinds +field of NativeGenAcc. This is a label map which contains an entry for each +procedure, containing a list of unwinding points (e.g. a label and an associated +unwinding table). + +See also Note [What is this unwinding business?] in Debug. +-} + +nativeCodeGen' :: (Outputable statics, Outputable instr,Outputable jumpDest, + Instruction instr) + => DynFlags + -> Module -> ModLocation + -> NcgImpl statics instr jumpDest + -> Handle + -> UniqSupply + -> Stream IO RawCmmGroup a + -> IO a +nativeCodeGen' dflags this_mod modLoc ncgImpl h us cmms + = do + -- BufHandle is a performance hack. We could hide it inside + -- Pretty if it weren't for the fact that we do lots of little + -- printDocs here (in order to do codegen in constant space). + bufh <- newBufHandle h + let ngs0 = NGS [] [] [] [] [] [] emptyUFM mapEmpty + (ngs, us', a) <- cmmNativeGenStream dflags this_mod modLoc ncgImpl bufh us + cmms ngs0 + _ <- finishNativeGen dflags modLoc bufh us' ngs + return a + +finishNativeGen :: Instruction instr + => DynFlags + -> ModLocation + -> BufHandle + -> UniqSupply + -> NativeGenAcc statics instr + -> IO UniqSupply +finishNativeGen dflags modLoc bufh@(BufHandle _ _ h) us ngs + = withTimingSilent dflags (text "NCG") (`seq` ()) $ do + -- Write debug data and finish + let emitDw = debugLevel dflags > 0 + us' <- if not emitDw then return us else do + (dwarf, us') <- dwarfGen dflags modLoc us (ngs_debug ngs) + emitNativeCode dflags bufh dwarf + return us' + bFlush bufh + + -- dump global NCG stats for graph coloring allocator + let stats = concat (ngs_colorStats ngs) + unless (null stats) $ do + + -- build the global register conflict graph + let graphGlobal + = foldl' Color.union Color.initGraph + $ [ Color.raGraph stat + | stat@Color.RegAllocStatsStart{} <- stats] + + dump_stats (Color.pprStats stats graphGlobal) + + let platform = targetPlatform dflags + dumpIfSet_dyn dflags + Opt_D_dump_asm_conflicts "Register conflict graph" + FormatText + $ Color.dotGraph + (targetRegDotColor platform) + (Color.trivColorable platform + (targetVirtualRegSqueeze platform) + (targetRealRegSqueeze platform)) + $ graphGlobal + + + -- dump global NCG stats for linear allocator + let linearStats = concat (ngs_linearStats ngs) + unless (null linearStats) $ + dump_stats (Linear.pprStats (concat (ngs_natives ngs)) linearStats) + + -- write out the imports + printSDocLn Pretty.LeftMode dflags h (mkCodeStyle AsmStyle) + $ makeImportsDoc dflags (concat (ngs_imports ngs)) + return us' + where + dump_stats = dumpAction dflags (mkDumpStyle dflags alwaysQualify) + 
(dumpOptionsFromFlag Opt_D_dump_asm_stats) "NCG stats" + FormatText + +cmmNativeGenStream :: (Outputable statics, Outputable instr + ,Outputable jumpDest, Instruction instr) + => DynFlags + -> Module -> ModLocation + -> NcgImpl statics instr jumpDest + -> BufHandle + -> UniqSupply + -> Stream IO RawCmmGroup a + -> NativeGenAcc statics instr + -> IO (NativeGenAcc statics instr, UniqSupply, a) + +cmmNativeGenStream dflags this_mod modLoc ncgImpl h us cmm_stream ngs + = do r <- Stream.runStream cmm_stream + case r of + Left a -> + return (ngs { ngs_imports = reverse $ ngs_imports ngs + , ngs_natives = reverse $ ngs_natives ngs + , ngs_colorStats = reverse $ ngs_colorStats ngs + , ngs_linearStats = reverse $ ngs_linearStats ngs + }, + us, + a) + Right (cmms, cmm_stream') -> do + (us', ngs'') <- + withTimingSilent + dflags + ncglabel (\(a, b) -> a `seq` b `seq` ()) $ do + -- Generate debug information + let debugFlag = debugLevel dflags > 0 + !ndbgs | debugFlag = cmmDebugGen modLoc cmms + | otherwise = [] + dbgMap = debugToMap ndbgs + + -- Generate native code + (ngs',us') <- cmmNativeGens dflags this_mod modLoc ncgImpl h + dbgMap us cmms ngs 0 + + -- Link native code information into debug blocks + -- See Note [What is this unwinding business?] in Debug. + let !ldbgs = cmmDebugLink (ngs_labels ngs') (ngs_unwinds ngs') ndbgs + unless (null ldbgs) $ + dumpIfSet_dyn dflags Opt_D_dump_debug "Debug Infos" FormatText + (vcat $ map ppr ldbgs) + + -- Accumulate debug information for emission in finishNativeGen. + let ngs'' = ngs' { ngs_debug = ngs_debug ngs' ++ ldbgs, ngs_labels = [] } + return (us', ngs'') + + cmmNativeGenStream dflags this_mod modLoc ncgImpl h us' + cmm_stream' ngs'' + + where ncglabel = text "NCG" + +-- | Do native code generation on all these cmms. +-- +cmmNativeGens :: forall statics instr jumpDest. + (Outputable statics, Outputable instr + ,Outputable jumpDest, Instruction instr) + => DynFlags + -> Module -> ModLocation + -> NcgImpl statics instr jumpDest + -> BufHandle + -> LabelMap DebugBlock + -> UniqSupply + -> [RawCmmDecl] + -> NativeGenAcc statics instr + -> Int + -> IO (NativeGenAcc statics instr, UniqSupply) + +cmmNativeGens dflags this_mod modLoc ncgImpl h dbgMap = go + where + go :: UniqSupply -> [RawCmmDecl] + -> NativeGenAcc statics instr -> Int + -> IO (NativeGenAcc statics instr, UniqSupply) + + go us [] ngs !_ = + return (ngs, us) + + go us (cmm : cmms) ngs count = do + let fileIds = ngs_dwarfFiles ngs + (us', fileIds', native, imports, colorStats, linearStats, unwinds) + <- {-# SCC "cmmNativeGen" #-} + cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap + cmm count + + -- Generate .file directives for every new file that has been + -- used. Note that it is important that we generate these in + -- ascending order, as Clang's 3.6 assembler complains. 
+ let newFileIds = sortBy (comparing snd) $ + nonDetEltsUFM $ fileIds' `minusUFM` fileIds + -- See Note [Unique Determinism and code generation] + pprDecl (f,n) = text "\t.file " <> ppr n <+> + pprFilePathString (unpackFS f) + + emitNativeCode dflags h $ vcat $ + map pprDecl newFileIds ++ + map (pprNatCmmDecl ncgImpl) native + + -- force evaluation all this stuff to avoid space leaks + {-# SCC "seqString" #-} evaluate $ seqList (showSDoc dflags $ vcat $ map ppr imports) () + + let !labels' = if debugLevel dflags > 0 + then cmmDebugLabels isMetaInstr native else [] + !natives' = if dopt Opt_D_dump_asm_stats dflags + then native : ngs_natives ngs else [] + + mCon = maybe id (:) + ngs' = ngs{ ngs_imports = imports : ngs_imports ngs + , ngs_natives = natives' + , ngs_colorStats = colorStats `mCon` ngs_colorStats ngs + , ngs_linearStats = linearStats `mCon` ngs_linearStats ngs + , ngs_labels = ngs_labels ngs ++ labels' + , ngs_dwarfFiles = fileIds' + , ngs_unwinds = ngs_unwinds ngs `mapUnion` unwinds + } + go us' cmms ngs' (count + 1) + + +emitNativeCode :: DynFlags -> BufHandle -> SDoc -> IO () +emitNativeCode dflags h sdoc = do + + {-# SCC "pprNativeCode" #-} bufLeftRenderSDoc dflags h + (mkCodeStyle AsmStyle) sdoc + + -- dump native code + dumpIfSet_dyn dflags + Opt_D_dump_asm "Asm code" FormatASM + sdoc + +-- | Complete native code generation phase for a single top-level chunk of Cmm. +-- Dumping the output of each stage along the way. +-- Global conflict graph and NGC stats +cmmNativeGen + :: forall statics instr jumpDest. (Instruction instr, + Outputable statics, Outputable instr, Outputable jumpDest) + => DynFlags + -> Module -> ModLocation + -> NcgImpl statics instr jumpDest + -> UniqSupply + -> DwarfFiles + -> LabelMap DebugBlock + -> RawCmmDecl -- ^ the cmm to generate code for + -> Int -- ^ sequence number of this top thing + -> IO ( UniqSupply + , DwarfFiles + , [NatCmmDecl statics instr] -- native code + , [CLabel] -- things imported by this cmm + , Maybe [Color.RegAllocStats statics instr] -- stats for the coloring register allocator + , Maybe [Linear.RegAllocStats] -- stats for the linear register allocators + , LabelMap [UnwindPoint] -- unwinding information for blocks + ) + +cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count + = do + let platform = targetPlatform dflags + + let proc_name = case cmm of + (CmmProc _ entry_label _ _) -> ppr entry_label + _ -> text "DataChunk" + + -- rewrite assignments to global regs + let fixed_cmm = + {-# SCC "fixStgRegisters" #-} + fixStgRegisters dflags cmm + + -- cmm to cmm optimisations + let (opt_cmm, imports) = + {-# SCC "cmmToCmm" #-} + cmmToCmm dflags this_mod fixed_cmm + + dumpIfSet_dyn dflags + Opt_D_dump_opt_cmm "Optimised Cmm" FormatCMM + (pprCmmGroup [opt_cmm]) + + let cmmCfg = {-# SCC "getCFG" #-} + getCfgProc (cfgWeightInfo dflags) opt_cmm + + -- generate native code from cmm + let ((native, lastMinuteImports, fileIds', nativeCfgWeights), usGen) = + {-# SCC "genMachCode" #-} + initUs us $ genMachCode dflags this_mod modLoc + (cmmTopCodeGen ncgImpl) + fileIds dbgMap opt_cmm cmmCfg + + dumpIfSet_dyn dflags + Opt_D_dump_asm_native "Native code" FormatASM + (vcat $ map (pprNatCmmDecl ncgImpl) native) + + maybeDumpCfg dflags (Just nativeCfgWeights) "CFG Weights - Native" proc_name + + -- tag instructions with register liveness information + -- also drops dead code. We don't keep the cfg in sync on + -- some backends, so don't use it there. 
+ let livenessCfg = if (backendMaintainsCfg dflags) + then Just nativeCfgWeights + else Nothing + let (withLiveness, usLive) = + {-# SCC "regLiveness" #-} + initUs usGen + $ mapM (cmmTopLiveness livenessCfg platform) native + + dumpIfSet_dyn dflags + Opt_D_dump_asm_liveness "Liveness annotations added" + FormatCMM + (vcat $ map ppr withLiveness) + + -- allocate registers + (alloced, usAlloc, ppr_raStatsColor, ppr_raStatsLinear, raStats, stack_updt_blks) <- + if ( gopt Opt_RegsGraph dflags + || gopt Opt_RegsIterative dflags ) + then do + -- the regs usable for allocation + let (alloc_regs :: UniqFM (UniqSet RealReg)) + = foldr (\r -> plusUFM_C unionUniqSets + $ unitUFM (targetClassOfRealReg platform r) (unitUniqSet r)) + emptyUFM + $ allocatableRegs ncgImpl + + -- do the graph coloring register allocation + let ((alloced, maybe_more_stack, regAllocStats), usAlloc) + = {-# SCC "RegAlloc-color" #-} + initUs usLive + $ Color.regAlloc + dflags + alloc_regs + (mkUniqSet [0 .. maxSpillSlots ncgImpl]) + (maxSpillSlots ncgImpl) + withLiveness + livenessCfg + + let ((alloced', stack_updt_blks), usAlloc') + = initUs usAlloc $ + case maybe_more_stack of + Nothing -> return (alloced, []) + Just amount -> do + (alloced',stack_updt_blks) <- unzip <$> + (mapM ((ncgAllocMoreStack ncgImpl) amount) alloced) + return (alloced', concat stack_updt_blks ) + + + -- dump out what happened during register allocation + dumpIfSet_dyn dflags + Opt_D_dump_asm_regalloc "Registers allocated" + FormatCMM + (vcat $ map (pprNatCmmDecl ncgImpl) alloced) + + dumpIfSet_dyn dflags + Opt_D_dump_asm_regalloc_stages "Build/spill stages" + FormatText + (vcat $ map (\(stage, stats) + -> text "# --------------------------" + $$ text "# cmm " <> int count <> text " Stage " <> int stage + $$ ppr stats) + $ zip [0..] 
regAllocStats) + + let mPprStats = + if dopt Opt_D_dump_asm_stats dflags + then Just regAllocStats else Nothing + + -- force evaluation of the Maybe to avoid space leak + mPprStats `seq` return () + + return ( alloced', usAlloc' + , mPprStats + , Nothing + , [], stack_updt_blks) + + else do + -- do linear register allocation + let reg_alloc proc = do + (alloced, maybe_more_stack, ra_stats) <- + Linear.regAlloc dflags proc + case maybe_more_stack of + Nothing -> return ( alloced, ra_stats, [] ) + Just amount -> do + (alloced',stack_updt_blks) <- + ncgAllocMoreStack ncgImpl amount alloced + return (alloced', ra_stats, stack_updt_blks ) + + let ((alloced, regAllocStats, stack_updt_blks), usAlloc) + = {-# SCC "RegAlloc-linear" #-} + initUs usLive + $ liftM unzip3 + $ mapM reg_alloc withLiveness + + dumpIfSet_dyn dflags + Opt_D_dump_asm_regalloc "Registers allocated" + FormatCMM + (vcat $ map (pprNatCmmDecl ncgImpl) alloced) + + let mPprStats = + if dopt Opt_D_dump_asm_stats dflags + then Just (catMaybes regAllocStats) else Nothing + + -- force evaluation of the Maybe to avoid space leak + mPprStats `seq` return () + + return ( alloced, usAlloc + , Nothing + , mPprStats, (catMaybes regAllocStats) + , concat stack_updt_blks ) + + -- Fixupblocks the register allocator inserted (from, regMoves, to) + let cfgRegAllocUpdates :: [(BlockId,BlockId,BlockId)] + cfgRegAllocUpdates = (concatMap Linear.ra_fixupList raStats) + + let cfgWithFixupBlks = + (\cfg -> addNodesBetween cfg cfgRegAllocUpdates) <$> livenessCfg + + -- Insert stack update blocks + let postRegCFG = + pure (foldl' (\m (from,to) -> addImmediateSuccessor from to m )) + <*> cfgWithFixupBlks + <*> pure stack_updt_blks + + ---- generate jump tables + let tabled = + {-# SCC "generateJumpTables" #-} + generateJumpTables ncgImpl alloced + + when (not $ null nativeCfgWeights) $ dumpIfSet_dyn dflags + Opt_D_dump_cfg_weights "CFG Update information" + FormatText + ( text "stack:" <+> ppr stack_updt_blks $$ + text "linearAlloc:" <+> ppr cfgRegAllocUpdates ) + + ---- shortcut branches + let (shorted, postShortCFG) = + {-# SCC "shortcutBranches" #-} + shortcutBranches dflags ncgImpl tabled postRegCFG + + let optimizedCFG :: Maybe CFG + optimizedCFG = + optimizeCFG (cfgWeightInfo dflags) cmm <$!> postShortCFG + + maybeDumpCfg dflags optimizedCFG "CFG Weights - Final" proc_name + + --TODO: Partially check validity of the cfg. + let getBlks (CmmProc _info _lbl _live (ListGraph blocks)) = blocks + getBlks _ = [] + + when ( backendMaintainsCfg dflags && + (gopt Opt_DoAsmLinting dflags || debugIsOn )) $ do + let blocks = concatMap getBlks shorted + let labels = setFromList $ fmap blockId blocks :: LabelSet + let cfg = fromJust optimizedCFG + return $! 
seq (sanityCheckCfg cfg labels $ + text "cfg not in lockstep") () + + ---- sequence blocks + let sequenced :: [NatCmmDecl statics instr] + sequenced = + checkLayout shorted $ + {-# SCC "sequenceBlocks" #-} + map (BlockLayout.sequenceTop + dflags + ncgImpl optimizedCFG) + shorted + + let branchOpt :: [NatCmmDecl statics instr] + branchOpt = + {-# SCC "invertCondBranches" #-} + map invert sequenced + where + invertConds :: LabelMap RawCmmStatics -> [NatBasicBlock instr] + -> [NatBasicBlock instr] + invertConds = invertCondBranches ncgImpl optimizedCFG + invert top@CmmData {} = top + invert (CmmProc info lbl live (ListGraph blocks)) = + CmmProc info lbl live (ListGraph $ invertConds info blocks) + + ---- expansion of SPARC synthetic instrs + let expanded = + {-# SCC "sparc_expand" #-} + ncgExpandTop ncgImpl branchOpt + --ncgExpandTop ncgImpl sequenced + + dumpIfSet_dyn dflags + Opt_D_dump_asm_expanded "Synthetic instructions expanded" + FormatCMM + (vcat $ map (pprNatCmmDecl ncgImpl) expanded) + + -- generate unwinding information from cmm + let unwinds :: BlockMap [UnwindPoint] + unwinds = + {-# SCC "unwindingInfo" #-} + foldl' addUnwind mapEmpty expanded + where + addUnwind acc proc = + acc `mapUnion` computeUnwinding dflags ncgImpl proc + + return ( usAlloc + , fileIds' + , expanded + , lastMinuteImports ++ imports + , ppr_raStatsColor + , ppr_raStatsLinear + , unwinds ) + +maybeDumpCfg :: DynFlags -> Maybe CFG -> String -> SDoc -> IO () +maybeDumpCfg _dflags Nothing _ _ = return () +maybeDumpCfg dflags (Just cfg) msg proc_name + | null cfg = return () + | otherwise + = dumpIfSet_dyn + dflags Opt_D_dump_cfg_weights msg + FormatText + (proc_name <> char ':' $$ pprEdgeWeights cfg) + +-- | Make sure all blocks we want the layout algorithm to place have been placed. +checkLayout :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr] + -> [NatCmmDecl statics instr] +checkLayout procsUnsequenced procsSequenced = + ASSERT2(setNull diff, + ppr "Block sequencing dropped blocks:" <> ppr diff) + procsSequenced + where + blocks1 = foldl' (setUnion) setEmpty $ + map getBlockIds procsUnsequenced :: LabelSet + blocks2 = foldl' (setUnion) setEmpty $ + map getBlockIds procsSequenced + diff = setDifference blocks1 blocks2 + + getBlockIds (CmmData _ _) = setEmpty + getBlockIds (CmmProc _ _ _ (ListGraph blocks)) = + setFromList $ map blockId blocks + +-- | Compute unwinding tables for the blocks of a procedure +computeUnwinding :: Instruction instr + => DynFlags -> NcgImpl statics instr jumpDest + -> NatCmmDecl statics instr + -- ^ the native code generated for the procedure + -> LabelMap [UnwindPoint] + -- ^ unwinding tables for all points of all blocks of the + -- procedure +computeUnwinding dflags _ _ + | debugLevel dflags == 0 = mapEmpty +computeUnwinding _ _ (CmmData _ _) = mapEmpty +computeUnwinding _ ncgImpl (CmmProc _ _ _ (ListGraph blks)) = + -- In general we would need to push unwinding information down the + -- block-level call-graph to ensure that we fully account for all + -- relevant register writes within a procedure. + -- + -- However, the only unwinding information that we care about in GHC is for + -- Sp. The fact that GHC.Cmm.LayoutStack already ensures that we have unwind + -- information at the beginning of every block means that there is no need + -- to perform this sort of push-down. + mapFromList [ (blk_lbl, extractUnwindPoints ncgImpl instrs) + | BasicBlock blk_lbl instrs <- blks ] + +-- | Build a doc for all the imports. 
+-- +makeImportsDoc :: DynFlags -> [CLabel] -> SDoc +makeImportsDoc dflags imports + = dyld_stubs imports + $$ + -- On recent versions of Darwin, the linker supports + -- dead-stripping of code and data on a per-symbol basis. + -- There's a hack to make this work in PprMach.pprNatCmmDecl. + (if platformHasSubsectionsViaSymbols platform + then text ".subsections_via_symbols" + else Outputable.empty) + $$ + -- On recent GNU ELF systems one can mark an object file + -- as not requiring an executable stack. If all objects + -- linked into a program have this note then the program + -- will not use an executable stack, which is good for + -- security. GHC generated code does not need an executable + -- stack so add the note in: + (if platformHasGnuNonexecStack platform + then text ".section .note.GNU-stack,\"\"," <> sectionType "progbits" + else Outputable.empty) + $$ + -- And just because every other compiler does, let's stick in + -- an identifier directive: .ident "GHC x.y.z" + (if platformHasIdentDirective platform + then let compilerIdent = text "GHC" <+> text cProjectVersion + in text ".ident" <+> doubleQuotes compilerIdent + else Outputable.empty) + + where + platform = targetPlatform dflags + arch = platformArch platform + os = platformOS platform + + -- Generate "symbol stubs" for all external symbols that might + -- come from a dynamic library. + dyld_stubs :: [CLabel] -> SDoc +{- dyld_stubs imps = vcat $ map pprDyldSymbolStub $ + map head $ group $ sort imps-} + -- (Hack) sometimes two Labels pretty-print the same, but have + -- different uniques; so we compare their text versions... + dyld_stubs imps + | needImportedSymbols dflags arch os + = vcat $ + (pprGotDeclaration dflags arch os :) $ + map ( pprImportedSymbol dflags platform . fst . head) $ + groupBy (\(_,a) (_,b) -> a == b) $ + sortBy (\(_,a) (_,b) -> compare a b) $ + map doPpr $ + imps + | otherwise + = Outputable.empty + + doPpr lbl = (lbl, renderWithStyle + (initSDocContext dflags astyle) + (pprCLabel dflags lbl)) + astyle = mkCodeStyle AsmStyle + +-- ----------------------------------------------------------------------------- +-- Generate jump tables + +-- Analyzes all native code and generates data sections for all jump +-- table instructions. +generateJumpTables + :: NcgImpl statics instr jumpDest + -> [NatCmmDecl statics instr] -> [NatCmmDecl statics instr] +generateJumpTables ncgImpl xs = concatMap f xs + where f p@(CmmProc _ _ _ (ListGraph xs)) = p : concatMap g xs + f p = [p] + g (BasicBlock _ xs) = catMaybes (map (generateJumpTableForInstr ncgImpl) xs) + +-- ----------------------------------------------------------------------------- +-- Shortcut branches + +shortcutBranches + :: forall statics instr jumpDest. (Outputable jumpDest) => DynFlags + -> NcgImpl statics instr jumpDest + -> [NatCmmDecl statics instr] + -> Maybe CFG + -> ([NatCmmDecl statics instr],Maybe CFG) + +shortcutBranches dflags ncgImpl tops weights + | gopt Opt_AsmShortcutting dflags + = ( map (apply_mapping ncgImpl mapping) tops' + , shortcutWeightMap mappingBid <$!> weights ) + | otherwise + = (tops, weights) + where + (tops', mappings) = mapAndUnzip (build_mapping ncgImpl) tops + mapping = mapUnions mappings :: LabelMap jumpDest + mappingBid = fmap (getJumpDestBlockId ncgImpl) mapping + +build_mapping :: forall instr t d statics jumpDest. 
+ NcgImpl statics instr jumpDest + -> GenCmmDecl d (LabelMap t) (ListGraph instr) + -> (GenCmmDecl d (LabelMap t) (ListGraph instr) + ,LabelMap jumpDest) +build_mapping _ top@(CmmData _ _) = (top, mapEmpty) +build_mapping _ (CmmProc info lbl live (ListGraph [])) + = (CmmProc info lbl live (ListGraph []), mapEmpty) +build_mapping ncgImpl (CmmProc info lbl live (ListGraph (head:blocks))) + = (CmmProc info lbl live (ListGraph (head:others)), mapping) + -- drop the shorted blocks, but don't ever drop the first one, + -- because it is pointed to by a global label. + where + -- find all the blocks that just consist of a jump that can be + -- shorted. + -- Don't completely eliminate loops here -- that can leave a dangling jump! + shortcut_blocks :: [(BlockId, jumpDest)] + (_, shortcut_blocks, others) = + foldl' split (setEmpty :: LabelSet, [], []) blocks + split (s, shortcut_blocks, others) b@(BasicBlock id [insn]) + | Just jd <- canShortcut ncgImpl insn + , Just dest <- getJumpDestBlockId ncgImpl jd + , not (has_info id) + , (setMember dest s) || dest == id -- loop checks + = (s, shortcut_blocks, b : others) + split (s, shortcut_blocks, others) (BasicBlock id [insn]) + | Just dest <- canShortcut ncgImpl insn + , not (has_info id) + = (setInsert id s, (id,dest) : shortcut_blocks, others) + split (s, shortcut_blocks, others) other = (s, shortcut_blocks, other : others) + + -- do not eliminate blocks that have an info table + has_info l = mapMember l info + + -- build a mapping from BlockId to JumpDest for shorting branches + mapping = mapFromList shortcut_blocks + +apply_mapping :: NcgImpl statics instr jumpDest + -> LabelMap jumpDest + -> GenCmmDecl statics h (ListGraph instr) + -> GenCmmDecl statics h (ListGraph instr) +apply_mapping ncgImpl ufm (CmmData sec statics) + = CmmData sec (shortcutStatics ncgImpl (\bid -> mapLookup bid ufm) statics) +apply_mapping ncgImpl ufm (CmmProc info lbl live (ListGraph blocks)) + = CmmProc info lbl live (ListGraph $ map short_bb blocks) + where + short_bb (BasicBlock id insns) = BasicBlock id $! map short_insn insns + short_insn i = shortcutJump ncgImpl (\bid -> mapLookup bid ufm) i + -- shortcutJump should apply the mapping repeatedly, + -- just in case we can short multiple branches. + +-- ----------------------------------------------------------------------------- +-- Instruction selection + +-- Native code instruction selection for a chunk of stix code. For +-- this part of the computation, we switch from the UniqSM monad to +-- the NatM monad. The latter carries not only a Unique, but also an +-- Int denoting the current C stack pointer offset in the generated +-- code; this is needed for creating correct spill offsets on +-- architectures which don't offer, or for which it would be +-- prohibitively expensive to employ, a frame pointer register. Viz, +-- x86. + +-- The offset is measured in bytes, and indicates the difference +-- between the current (simulated) C stack-ptr and the value it was at +-- the beginning of the block. For stacks which grow down, this value +-- should be either zero or negative. + +-- Along with the stack pointer offset, we also carry along a LabelMap of +-- DebugBlocks, which we read to generate .location directives. +-- +-- Switching between the two monads whilst carrying along the same +-- Unique supply breaks abstraction. Is that bad? 
+ +genMachCode + :: DynFlags + -> Module -> ModLocation + -> (RawCmmDecl -> NatM [NatCmmDecl statics instr]) + -> DwarfFiles + -> LabelMap DebugBlock + -> RawCmmDecl + -> CFG + -> UniqSM + ( [NatCmmDecl statics instr] + , [CLabel] + , DwarfFiles + , CFG + ) + +genMachCode dflags this_mod modLoc cmmTopCodeGen fileIds dbgMap cmm_top cmm_cfg + = do { initial_us <- getUniqueSupplyM + ; let initial_st = mkNatM_State initial_us 0 dflags this_mod + modLoc fileIds dbgMap cmm_cfg + (new_tops, final_st) = initNat initial_st (cmmTopCodeGen cmm_top) + final_delta = natm_delta final_st + final_imports = natm_imports final_st + final_cfg = natm_cfg final_st + ; if final_delta == 0 + then return (new_tops, final_imports + , natm_fileid final_st, final_cfg) + else pprPanic "genMachCode: nonzero final delta" (int final_delta) + } + +-- ----------------------------------------------------------------------------- +-- Generic Cmm optimiser + +{- +Here we do: + + (a) Constant folding + (c) Position independent code and dynamic linking + (i) introduce the appropriate indirections + and position independent refs + (ii) compile a list of imported symbols + (d) Some arch-specific optimizations + +(a) will be moving to the new Hoopl pipeline, however, (c) and +(d) are only needed by the native backend and will continue to live +here. + +Ideas for other things we could do (put these in Hoopl please!): + + - shortcut jumps-to-jumps + - simple CSE: if an expr is assigned to a temp, then replace later occs of + that expr with the temp, until the expr is no longer valid (can push through + temp assignments, and certain assigns to mem...) +-} + +cmmToCmm :: DynFlags -> Module -> RawCmmDecl -> (RawCmmDecl, [CLabel]) +cmmToCmm _ _ top@(CmmData _ _) = (top, []) +cmmToCmm dflags this_mod (CmmProc info lbl live graph) + = runCmmOpt dflags this_mod $ + do blocks' <- mapM cmmBlockConFold (toBlockList graph) + return $ CmmProc info lbl live (ofBlockList (g_entry graph) blocks') + +-- Avoids using unboxed tuples when loading into GHCi +#if !defined(GHC_LOADED_INTO_GHCI) + +type OptMResult a = (# a, [CLabel] #) + +pattern OptMResult :: a -> b -> (# a, b #) +pattern OptMResult x y = (# x, y #) +{-# COMPLETE OptMResult #-} +#else + +data OptMResult a = OptMResult !a ![CLabel] deriving (Functor) +#endif + +newtype CmmOptM a = CmmOptM (DynFlags -> Module -> [CLabel] -> OptMResult a) + deriving (Functor) + +instance Applicative CmmOptM where + pure x = CmmOptM $ \_ _ imports -> OptMResult x imports + (<*>) = ap + +instance Monad CmmOptM where + (CmmOptM f) >>= g = + CmmOptM $ \dflags this_mod imports0 -> + case f dflags this_mod imports0 of + OptMResult x imports1 -> + case g x of + CmmOptM g' -> g' dflags this_mod imports1 + +instance CmmMakeDynamicReferenceM CmmOptM where + addImport = addImportCmmOpt + getThisModule = CmmOptM $ \_ this_mod imports -> OptMResult this_mod imports + +addImportCmmOpt :: CLabel -> CmmOptM () +addImportCmmOpt lbl = CmmOptM $ \_ _ imports -> OptMResult () (lbl:imports) + +instance HasDynFlags CmmOptM where + getDynFlags = CmmOptM $ \dflags _ imports -> OptMResult dflags imports + +runCmmOpt :: DynFlags -> Module -> CmmOptM a -> (a, [CLabel]) +runCmmOpt dflags this_mod (CmmOptM f) = + case f dflags this_mod [] of + OptMResult result imports -> (result, imports) + +cmmBlockConFold :: CmmBlock -> CmmOptM CmmBlock +cmmBlockConFold block = do + let (entry, middle, last) = blockSplit block + stmts = blockToList middle + stmts' <- mapM cmmStmtConFold stmts + last' <- cmmStmtConFold last + return $ blockJoin 
entry (blockFromList stmts') last' + +-- This does three optimizations, but they're very quick to check, so we don't +-- bother turning them off even when the Hoopl code is active. Since +-- this is on the old Cmm representation, we can't reuse the code either: +-- * reg = reg --> nop +-- * if 0 then jump --> nop +-- * if 1 then jump --> jump +-- We might be tempted to skip this step entirely of not Opt_PIC, but +-- there is some PowerPC code for the non-PIC case, which would also +-- have to be separated. +cmmStmtConFold :: CmmNode e x -> CmmOptM (CmmNode e x) +cmmStmtConFold stmt + = case stmt of + CmmAssign reg src + -> do src' <- cmmExprConFold DataReference src + return $ case src' of + CmmReg reg' | reg == reg' -> CmmComment (fsLit "nop") + new_src -> CmmAssign reg new_src + + CmmStore addr src + -> do addr' <- cmmExprConFold DataReference addr + src' <- cmmExprConFold DataReference src + return $ CmmStore addr' src' + + CmmCall { cml_target = addr } + -> do addr' <- cmmExprConFold JumpReference addr + return $ stmt { cml_target = addr' } + + CmmUnsafeForeignCall target regs args + -> do target' <- case target of + ForeignTarget e conv -> do + e' <- cmmExprConFold CallReference e + return $ ForeignTarget e' conv + PrimTarget _ -> + return target + args' <- mapM (cmmExprConFold DataReference) args + return $ CmmUnsafeForeignCall target' regs args' + + CmmCondBranch test true false likely + -> do test' <- cmmExprConFold DataReference test + return $ case test' of + CmmLit (CmmInt 0 _) -> CmmBranch false + CmmLit (CmmInt _ _) -> CmmBranch true + _other -> CmmCondBranch test' true false likely + + CmmSwitch expr ids + -> do expr' <- cmmExprConFold DataReference expr + return $ CmmSwitch expr' ids + + other + -> return other + +cmmExprConFold :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr +cmmExprConFold referenceKind expr = do + dflags <- getDynFlags + + -- With -O1 and greater, the cmmSink pass does constant-folding, so + -- we don't need to do it again here. + let expr' = if optLevel dflags >= 1 + then expr + else cmmExprCon dflags expr + + cmmExprNative referenceKind expr' + +cmmExprCon :: DynFlags -> CmmExpr -> CmmExpr +cmmExprCon dflags (CmmLoad addr rep) = CmmLoad (cmmExprCon dflags addr) rep +cmmExprCon dflags (CmmMachOp mop args) + = cmmMachOpFold dflags mop (map (cmmExprCon dflags) args) +cmmExprCon _ other = other + +-- handles both PIC and non-PIC cases... a very strange mixture +-- of things to do. +cmmExprNative :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr +cmmExprNative referenceKind expr = do + dflags <- getDynFlags + let platform = targetPlatform dflags + arch = platformArch platform + case expr of + CmmLoad addr rep + -> do addr' <- cmmExprNative DataReference addr + return $ CmmLoad addr' rep + + CmmMachOp mop args + -> do args' <- mapM (cmmExprNative DataReference) args + return $ CmmMachOp mop args' + + CmmLit (CmmBlock id) + -> cmmExprNative referenceKind (CmmLit (CmmLabel (infoTblLbl id))) + -- we must convert block Ids to CLabels here, because we + -- might have to do the PIC transformation. Hence we must + -- not modify BlockIds beyond this point. 
+ + CmmLit (CmmLabel lbl) + -> do + cmmMakeDynamicReference dflags referenceKind lbl + CmmLit (CmmLabelOff lbl off) + -> do + dynRef <- cmmMakeDynamicReference dflags referenceKind lbl + -- need to optimize here, since it's late + return $ cmmMachOpFold dflags (MO_Add (wordWidth dflags)) [ + dynRef, + (CmmLit $ CmmInt (fromIntegral off) (wordWidth dflags)) + ] + + -- On powerpc (non-PIC), it's easier to jump directly to a label than + -- to use the register table, so we replace these registers + -- with the corresponding labels: + CmmReg (CmmGlobal EagerBlackholeInfo) + | arch == ArchPPC && not (positionIndependent dflags) + -> cmmExprNative referenceKind $ + CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_EAGER_BLACKHOLE_info"))) + CmmReg (CmmGlobal GCEnter1) + | arch == ArchPPC && not (positionIndependent dflags) + -> cmmExprNative referenceKind $ + CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_gc_enter_1"))) + CmmReg (CmmGlobal GCFun) + | arch == ArchPPC && not (positionIndependent dflags) + -> cmmExprNative referenceKind $ + CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_gc_fun"))) + + other + -> return other diff --git a/compiler/GHC/CmmToAsm/BlockLayout.hs b/compiler/GHC/CmmToAsm/BlockLayout.hs new file mode 100644 index 0000000000..01a1388b5f --- /dev/null +++ b/compiler/GHC/CmmToAsm/BlockLayout.hs @@ -0,0 +1,895 @@ +-- +-- Copyright (c) 2018 Andreas Klebinger +-- + +{-# LANGUAGE CPP #-} +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE FlexibleContexts #-} + +module GHC.CmmToAsm.BlockLayout + ( sequenceTop ) +where + +#include "HsVersions.h" +import GhcPrelude + +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.Monad +import GHC.CmmToAsm.CFG + +import GHC.Cmm.BlockId +import GHC.Cmm +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label + +import GHC.Driver.Session (gopt, GeneralFlag(..), DynFlags, backendMaintainsCfg) +import UniqFM +import Util +import Unique + +import Digraph +import Outputable +import Maybes + +-- DEBUGGING ONLY +--import GHC.Cmm.DebugBlock +--import Debug.Trace +import ListSetOps (removeDups) + +import OrdList +import Data.List +import Data.Foldable (toList) + +import qualified Data.Set as Set +import Data.STRef +import Control.Monad.ST.Strict +import Control.Monad (foldM) + +{- + Note [CFG based code layout] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + The major steps in placing blocks are as follow: + * Compute a CFG based on the Cmm AST, see getCfgProc. + This CFG will have edge weights representing a guess + on how important they are. + * After we convert Cmm to Asm we run `optimizeCFG` which + adds a few more "educated guesses" to the equation. + * Then we run loop analysis on the CFG (`loopInfo`) which tells us + about loop headers, loop nesting levels and the sort. + * Based on the CFG and loop information refine the edge weights + in the CFG and normalize them relative to the most often visited + node. (See `mkGlobalWeights`) + * Feed this CFG into the block layout code (`sequenceTop`) in this + module. Which will then produce a code layout based on the input weights. + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~~~ Note [Chain based CFG serialization] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + For additional information also look at + https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/code-layout + + We have a CFG with edge weights based on which we try to place blocks next to + each other. 
+ + Edge weights not only represent likelihood of control transfer between blocks + but also how much a block would benefit from being placed sequentially after + it's predecessor. + For example blocks which are preceded by an info table are more likely to end + up in a different cache line than their predecessor and we can't eliminate the jump + so there is less benefit to placing them sequentially. + + For example consider this example: + + A: ... + jmp cond D (weak successor) + jmp B + B: ... + jmp C + C: ... + jmp X + D: ... + jmp B (weak successor) + + We determine a block layout by building up chunks (calling them chains) of + possible control flows for which blocks will be placed sequentially. + + Eg for our example we might end up with two chains like: + [A->B->C->X],[D]. Blocks inside chains will always be placed sequentially. + However there is no particular order in which chains are placed since + (hopefully) the blocks for which sequentiality is important have already + been placed in the same chain. + + ----------------------------------------------------------------------------- + 1) First try to create a list of good chains. + ----------------------------------------------------------------------------- + + Good chains are these which allow us to eliminate jump instructions. + Which further eliminate often executed jumps first. + + We do so by: + + *) Ignore edges which represent instructions which can not be replaced + by fall through control flow. Primarily calls and edges to blocks which + are prefixed by a info table we have to jump across. + + *) Then process remaining edges in order of frequency taken and: + + +) If source and target have not been placed build a new chain from them. + + +) If source and target have been placed, and are ends of differing chains + try to merge the two chains. + + +) If one side of the edge is a end/front of a chain, add the other block of + to edge to the same chain + + Eg if we look at edge (B -> C) and already have the chain (A -> B) + then we extend the chain to (A -> B -> C). + + +) If the edge was used to modify or build a new chain remove the edge from + our working list. + + *) If there any blocks not being placed into a chain after these steps we place + them into a chain consisting of only this block. + + Ranking edges by their taken frequency, if + two edges compete for fall through on the same target block, the one taken + more often will automatically win out. Resulting in fewer instructions being + executed. + + Creating singleton chains is required for situations where we have code of the + form: + + A: goto B: + <infoTable> + B: goto C: + <infoTable> + C: ... + + As the code in block B is only connected to the rest of the program via edges + which will be ignored in this step we make sure that B still ends up in a chain + this way. + + ----------------------------------------------------------------------------- + 2) We also try to fuse chains. + ----------------------------------------------------------------------------- + + As a result from the above step we still end up with multiple chains which + represent sequential control flow chunks. But they are not yet suitable for + code layout as we need to place *all* blocks into a single sequence. + + In this step we combine chains result from the above step via these steps: + + *) Look at the ranked list of *all* edges, including calls/jumps across info tables + and the like. 
+ + *) Look at each edge and + + +) Given an edge (A -> B) try to find two chains for which + * Block A is at the end of one chain + * Block B is at the front of the other chain. + +) If we find such a chain we "fuse" them into a single chain, remove the + edge from working set and continue. + +) If we can't find such chains we skip the edge and continue. + + ----------------------------------------------------------------------------- + 3) Place indirect successors (neighbours) after each other + ----------------------------------------------------------------------------- + + We might have chains [A,B,C,X],[E] in a CFG of the sort: + + A ---> B ---> C --------> X(exit) + \- ->E- -/ + + While E does not follow X it's still beneficial to place them near each other. + This can be advantageous if eg C,X,E will end up in the same cache line. + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~~~ Note [Triangle Control Flow] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Checking if an argument is already evaluated leads to a somewhat + special case which looks like this: + + A: + if (R1 & 7 != 0) goto Leval; else goto Lwork; + Leval: // global + call (I64[R1])(R1) returns to Lwork, args: 8, res: 8, upd: 8; + Lwork: // global + ... + + A + |\ + | Leval + |/ - (This edge can be missing because of optimizations) + Lwork + + Once we hit the metal the call instruction is just 2-3 bytes large + depending on the register used. So we lay out the assembly like this: + + movq %rbx,%rax + andl $7,%eax + cmpq $1,%rax + jne Lwork + Leval: + jmp *(%rbx) # encoded in 2-3 bytes. + <info table> + Lwork: + ... + + We could explicitly check for this control flow pattern. + + This is advantageous because: + * It's optimal if the argument isn't evaluated. + * If it's evaluated we only have the extra cost of jumping over + the 2-3 bytes for the call. + * Guarantees the smaller encoding for the conditional jump. + + However given that Lwork usually has an info table we + penalize this edge. So Leval should get placed first + either way and things work out for the best. + + Optimizing for the evaluated case instead would penalize + the other code path. It adds an jump as we can't fall through + to Lwork because of the info table. + Assuming that Lwork is large the chance that the "call" ends up + in the same cache line is also fairly small. + +-} + + +-- | Look at X number of blocks in two chains to determine +-- if they are "neighbours". +neighbourOverlapp :: Int +neighbourOverlapp = 2 + +-- | Maps blocks near the end of a chain to it's chain AND +-- the other blocks near the end. +-- [A,B,C,D,E] Gives entries like (B -> ([A,B], [A,B,C,D,E])) +-- where [A,B] are blocks in the end region of a chain. +-- This is cheaper then recomputing the ends multiple times. +type FrontierMap = LabelMap ([BlockId],BlockChain) + +-- | A non empty ordered sequence of basic blocks. +-- It is suitable for serialization in this order. +-- +-- We use OrdList instead of [] to allow fast append on both sides +-- when combining chains. +newtype BlockChain + = BlockChain { chainBlocks :: (OrdList BlockId) } + +-- All chains are constructed the same way so comparison +-- including structure is faster. +instance Eq BlockChain where + BlockChain b1 == BlockChain b2 = strictlyEqOL b1 b2 + +-- Useful for things like sets and debugging purposes, sorts by blocks +-- in the chain. 
+instance Ord (BlockChain) where + (BlockChain lbls1) `compare` (BlockChain lbls2) + = ASSERT(toList lbls1 /= toList lbls2 || lbls1 `strictlyEqOL` lbls2) + strictlyOrdOL lbls1 lbls2 + +instance Outputable (BlockChain) where + ppr (BlockChain blks) = + parens (text "Chain:" <+> ppr (fromOL $ blks) ) + +chainFoldl :: (b -> BlockId -> b) -> b -> BlockChain -> b +chainFoldl f z (BlockChain blocks) = foldl' f z blocks + +noDups :: [BlockChain] -> Bool +noDups chains = + let chainBlocks = concatMap chainToBlocks chains :: [BlockId] + (_blocks, dups) = removeDups compare chainBlocks + in if null dups then True + else pprTrace "Duplicates:" (ppr (map toList dups) $$ text "chains" <+> ppr chains ) False + +inFront :: BlockId -> BlockChain -> Bool +inFront bid (BlockChain seq) + = headOL seq == bid + +chainSingleton :: BlockId -> BlockChain +chainSingleton lbl + = BlockChain (unitOL lbl) + +chainFromList :: [BlockId] -> BlockChain +chainFromList = BlockChain . toOL + +chainSnoc :: BlockChain -> BlockId -> BlockChain +chainSnoc (BlockChain blks) lbl + = BlockChain (blks `snocOL` lbl) + +chainCons :: BlockId -> BlockChain -> BlockChain +chainCons lbl (BlockChain blks) + = BlockChain (lbl `consOL` blks) + +chainConcat :: BlockChain -> BlockChain -> BlockChain +chainConcat (BlockChain blks1) (BlockChain blks2) + = BlockChain (blks1 `appOL` blks2) + +chainToBlocks :: BlockChain -> [BlockId] +chainToBlocks (BlockChain blks) = fromOL blks + +-- | Given the Chain A -> B -> C -> D and we break at C +-- we get the two Chains (A -> B, C -> D) as result. +breakChainAt :: BlockId -> BlockChain + -> (BlockChain,BlockChain) +breakChainAt bid (BlockChain blks) + | not (bid == head rblks) + = panic "Block not in chain" + | otherwise + = (BlockChain (toOL lblks), + BlockChain (toOL rblks)) + where + (lblks, rblks) = break (\lbl -> lbl == bid) (fromOL blks) + +takeR :: Int -> BlockChain -> [BlockId] +takeR n (BlockChain blks) = + take n . fromOLReverse $ blks + +takeL :: Int -> BlockChain -> [BlockId] +takeL n (BlockChain blks) = + take n . fromOL $ blks + +-- Note [Combining neighborhood chains] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +-- See also Note [Chain based CFG serialization] +-- We have the chains (A-B-C-D) and (E-F) and an Edge C->E. +-- +-- While placing the latter after the former doesn't result in sequential +-- control flow it is still beneficial. As block C and E might end +-- up in the same cache line. +-- +-- So we place these chains next to each other even if we can't fuse them. +-- +-- A -> B -> C -> D +-- v +-- - -> E -> F ... +-- +-- A simple heuristic to chose which chains we want to combine: +-- * Process edges in descending priority. +-- * Check if there is a edge near the end of one chain which goes +-- to a block near the start of another edge. +-- +-- While we could take into account the space between the two blocks which +-- share an edge this blows up compile times quite a bit. It requires +-- us to find all edges between two chains, check the distance for all edges, +-- rank them based on the distance and only then we can select two chains +-- to combine. Which would add a lot of complexity for little gain. +-- +-- So instead we just rank by the strength of the edge and use the first pair we +-- find. + +-- | For a given list of chains and edges try to combine chains with strong +-- edges between them. 
+combineNeighbourhood :: [CfgEdge] -- ^ Edges to consider + -> [BlockChain] -- ^ Current chains of blocks + -> ([BlockChain], Set.Set (BlockId,BlockId)) + -- ^ Resulting list of block chains, and a set of edges which + -- were used to fuse chains and as such no longer need to be + -- considered. +combineNeighbourhood edges chains + = -- pprTraceIt "Neighbours" $ + -- pprTrace "combineNeighbours" (ppr edges) $ + applyEdges edges endFrontier startFrontier (Set.empty) + where + --Build maps from chain ends to chains + endFrontier, startFrontier :: FrontierMap + endFrontier = + mapFromList $ concatMap (\chain -> + let ends = getEnds chain :: [BlockId] + entry = (ends,chain) + in map (\x -> (x,entry)) ends ) chains + startFrontier = + mapFromList $ concatMap (\chain -> + let front = getFronts chain + entry = (front,chain) + in map (\x -> (x,entry)) front) chains + applyEdges :: [CfgEdge] -> FrontierMap -> FrontierMap -> Set.Set (BlockId, BlockId) + -> ([BlockChain], Set.Set (BlockId,BlockId)) + applyEdges [] chainEnds _chainFronts combined = + (ordNub $ map snd $ mapElems chainEnds, combined) + applyEdges ((CfgEdge from to _w):edges) chainEnds chainFronts combined + | Just (c1_e,c1) <- mapLookup from chainEnds + , Just (c2_f,c2) <- mapLookup to chainFronts + , c1 /= c2 -- Avoid trying to concat a chain with itself. + = let newChain = chainConcat c1 c2 + newChainFrontier = getFronts newChain + newChainEnds = getEnds newChain + newFronts :: FrontierMap + newFronts = + let withoutOld = + foldl' (\m b -> mapDelete b m :: FrontierMap) chainFronts (c2_f ++ getFronts c1) + entry = + (newChainFrontier,newChain) --let bound to ensure sharing + in foldl' (\m x -> mapInsert x entry m) + withoutOld newChainFrontier + + newEnds = + let withoutOld = foldl' (\m b -> mapDelete b m) chainEnds (c1_e ++ getEnds c2) + entry = (newChainEnds,newChain) --let bound to ensure sharing + in foldl' (\m x -> mapInsert x entry m) + withoutOld newChainEnds + in + -- pprTrace "ApplyEdges" + -- (text "before" $$ + -- text "fronts" <+> ppr chainFronts $$ + -- text "ends" <+> ppr chainEnds $$ + + -- text "various" $$ + -- text "newChain" <+> ppr newChain $$ + -- text "newChainFrontier" <+> ppr newChainFrontier $$ + -- text "newChainEnds" <+> ppr newChainEnds $$ + -- text "drop" <+> ppr ((c2_f ++ getFronts c1) ++ (c1_e ++ getEnds c2)) $$ + + -- text "after" $$ + -- text "fronts" <+> ppr newFronts $$ + -- text "ends" <+> ppr newEnds + -- ) + applyEdges edges newEnds newFronts (Set.insert (from,to) combined) + | otherwise + = applyEdges edges chainEnds chainFronts combined + where + + getFronts chain = takeL neighbourOverlapp chain + getEnds chain = takeR neighbourOverlapp chain + +-- In the last stop we combine all chains into a single one. +-- Trying to place chains with strong edges next to each other. +mergeChains :: [CfgEdge] -> [BlockChain] + -> (BlockChain) +mergeChains edges chains + = -- pprTrace "combine" (ppr edges) $ + runST $ do + let addChain m0 chain = do + ref <- newSTRef chain + return $ chainFoldl (\m' b -> mapInsert b ref m') m0 chain + chainMap' <- foldM (\m0 c -> addChain m0 c) mapEmpty chains + merge edges chainMap' + where + -- We keep a map from ALL blocks to their respective chain (sigh) + -- This is required since when looking at an edge we need to find + -- the associated chains quickly. + -- We use a map of STRefs, maintaining a invariant of one STRef per chain. + -- When merging chains we can update the + -- STRef of one chain once (instead of writing to the map for each block). 
+    -- We then overwrite the STRefs for the other chain so there is again only
+    -- a single STRef for the combined chain.
+    -- The difference in terms of allocations saved is ~0.2% with -O so actually
+    -- significant compared to using a regular map.
+
+    merge :: forall s. [CfgEdge] -> LabelMap (STRef s BlockChain) -> ST s BlockChain
+    merge [] chains = do
+        chains' <- ordNub <$> (mapM readSTRef $ mapElems chains) :: ST s [BlockChain]
+        return $ foldl' chainConcat (head chains') (tail chains')
+    merge ((CfgEdge from to _):edges) chains
+    --  | pprTrace "merge" (ppr (from,to) <> ppr chains) False
+    --  = undefined
+        | cFrom == cTo
+        = merge edges chains
+        | otherwise
+        = do
+            chains' <- mergeComb cFrom cTo
+            merge edges chains'
+        where
+          mergeComb :: STRef s BlockChain -> STRef s BlockChain -> ST s (LabelMap (STRef s BlockChain))
+          mergeComb refFrom refTo = do
+              cRight <- readSTRef refTo
+              chain <- pure chainConcat <*> readSTRef refFrom <*> pure cRight
+              writeSTRef refFrom chain
+              return $ chainFoldl (\m b -> mapInsert b refFrom m) chains cRight
+
+          cFrom = expectJust "mergeChains:chainMap:from" $ mapLookup from chains
+          cTo = expectJust "mergeChains:chainMap:to" $ mapLookup to chains
+
+
+-- See Note [Chain based CFG serialization] for the general idea.
+-- This creates and fuses chains at the same time for performance reasons.
+
+-- Try to build chains from a list of edges.
+-- Edges must be sorted **descending** by their priority.
+-- Returns the constructed chains, along with all edges which
+-- are irrelevant past this point; this information doesn't need
+-- to be complete - it's only used to speed up the process.
+-- An edge is irrelevant if both of its ends are part of the same chain.
+-- We say these edges are already linked.
+buildChains :: [CfgEdge] -> [BlockId]
+            -> ( LabelMap BlockChain  -- Resulting chains, indexed by the end of the chain.
+               , Set.Set (BlockId, BlockId)) -- List of fused edges.
+buildChains edges blocks
+  = runST $ buildNext setEmpty mapEmpty mapEmpty edges Set.empty
+  where
+    -- buildNext builds up chains from edges one at a time.
+
+    -- We keep a map from the ends of chains to the chains.
+    -- This way we can easily check if a block should be appended to an
+    -- existing chain!
+    -- We store them using STRefs so we don't have to rebuild the spine of both
+    -- maps every time we update a chain.
+    buildNext :: forall s. LabelSet
+              -> LabelMap (STRef s BlockChain) -- Map from end of chain to chain.
+              -> LabelMap (STRef s BlockChain) -- Map from start of chain to chain.
+              -> [CfgEdge] -- Edges to check - ordered by decreasing weight
+              -> Set.Set (BlockId, BlockId) -- Used edges
+              -> ST s   ( LabelMap BlockChain -- Chains by end
+                        , Set.Set (BlockId, BlockId) -- List of fused edges
+                        )
+    buildNext placed _chainStarts chainEnds [] linked = do
+        ends' <- sequence $ mapMap readSTRef chainEnds :: ST s (LabelMap BlockChain)
+        -- Any remaining blocks have to be made into singleton chains.
+        -- They might be combined with other chains later on outside this function.
+ let unplaced = filter (\x -> not (setMember x placed)) blocks + singletons = map (\x -> (x,chainSingleton x)) unplaced :: [(BlockId,BlockChain)] + return (foldl' (\m (k,v) -> mapInsert k v m) ends' singletons , linked) + buildNext placed chainStarts chainEnds (edge:todo) linked + | from == to + -- We skip self edges + = buildNext placed chainStarts chainEnds todo (Set.insert (from,to) linked) + | not (alreadyPlaced from) && + not (alreadyPlaced to) + = do + --pprTraceM "Edge-Chain:" (ppr edge) + chain' <- newSTRef $ chainFromList [from,to] + buildNext + (setInsert to (setInsert from placed)) + (mapInsert from chain' chainStarts) + (mapInsert to chain' chainEnds) + todo + (Set.insert (from,to) linked) + + | (alreadyPlaced from) && + (alreadyPlaced to) + , Just predChain <- mapLookup from chainEnds + , Just succChain <- mapLookup to chainStarts + , predChain /= succChain -- Otherwise we try to create a cycle. + = do + -- pprTraceM "Fusing edge" (ppr edge) + fuseChain predChain succChain + + | (alreadyPlaced from) && + (alreadyPlaced to) + = --pprTraceM "Skipping:" (ppr edge) >> + buildNext placed chainStarts chainEnds todo linked + + | otherwise + = do -- pprTraceM "Finding chain for:" (ppr edge $$ + -- text "placed" <+> ppr placed) + findChain + where + from = edgeFrom edge + to = edgeTo edge + alreadyPlaced blkId = (setMember blkId placed) + + -- Combine two chains into a single one. + fuseChain :: STRef s BlockChain -> STRef s BlockChain + -> ST s ( LabelMap BlockChain -- Chains by end + , Set.Set (BlockId, BlockId) --List of fused edges + ) + fuseChain fromRef toRef = do + fromChain <- readSTRef fromRef + toChain <- readSTRef toRef + let newChain = chainConcat fromChain toChain + ref <- newSTRef newChain + let start = head $ takeL 1 newChain + let end = head $ takeR 1 newChain + -- chains <- sequence $ mapMap readSTRef chainStarts + -- pprTraceM "pre-fuse chains:" $ ppr chains + buildNext + placed + (mapInsert start ref $ mapDelete to $ chainStarts) + (mapInsert end ref $ mapDelete from $ chainEnds) + todo + (Set.insert (from,to) linked) + + + --Add the block to a existing chain or creates a new chain + findChain :: ST s ( LabelMap BlockChain -- Chains by end + , Set.Set (BlockId, BlockId) --List of fused edges + ) + findChain + -- We can attach the block to the end of a chain + | alreadyPlaced from + , Just predChain <- mapLookup from chainEnds + = do + chain <- readSTRef predChain + let newChain = chainSnoc chain to + writeSTRef predChain newChain + let chainEnds' = mapInsert to predChain $ mapDelete from chainEnds + -- chains <- sequence $ mapMap readSTRef chainStarts + -- pprTraceM "from chains:" $ ppr chains + buildNext (setInsert to placed) chainStarts chainEnds' todo (Set.insert (from,to) linked) + -- We can attack it to the front of a chain + | alreadyPlaced to + , Just succChain <- mapLookup to chainStarts + = do + chain <- readSTRef succChain + let newChain = from `chainCons` chain + writeSTRef succChain newChain + let chainStarts' = mapInsert from succChain $ mapDelete to chainStarts + -- chains <- sequence $ mapMap readSTRef chainStarts' + -- pprTraceM "to chains:" $ ppr chains + buildNext (setInsert from placed) chainStarts' chainEnds todo (Set.insert (from,to) linked) + -- The placed end of the edge is part of a chain already and not an end. 
+ | otherwise + = do + let block = if alreadyPlaced to then from else to + --pprTraceM "Singleton" $ ppr block + let newChain = chainSingleton block + ref <- newSTRef newChain + buildNext (setInsert block placed) (mapInsert block ref chainStarts) + (mapInsert block ref chainEnds) todo (linked) + where + alreadyPlaced blkId = (setMember blkId placed) + +-- | Place basic blocks based on the given CFG. +-- See Note [Chain based CFG serialization] +sequenceChain :: forall a i. (Instruction i, Outputable i) + => LabelMap a -- ^ Keys indicate an info table on the block. + -> CFG -- ^ Control flow graph and some meta data. + -> [GenBasicBlock i] -- ^ List of basic blocks to be placed. + -> [GenBasicBlock i] -- ^ Blocks placed in sequence. +sequenceChain _info _weights [] = [] +sequenceChain _info _weights [x] = [x] +sequenceChain info weights' blocks@((BasicBlock entry _):_) = + let weights :: CFG + weights = --pprTrace "cfg'" (pprEdgeWeights cfg') + cfg' + where + (_, globalEdgeWeights) = {-# SCC mkGlobalWeights #-} mkGlobalWeights entry weights' + cfg' = {-# SCC rewriteEdges #-} + mapFoldlWithKey + (\cfg from m -> + mapFoldlWithKey + (\cfg to w -> setEdgeWeight cfg (EdgeWeight w) from to ) + cfg m ) + weights' + globalEdgeWeights + + directEdges :: [CfgEdge] + directEdges = sortBy (flip compare) $ catMaybes . map relevantWeight $ (infoEdgeList weights) + where + relevantWeight :: CfgEdge -> Maybe CfgEdge + relevantWeight edge@(CfgEdge from to edgeInfo) + | (EdgeInfo CmmSource { trans_cmmNode = CmmCall {} } _) <- edgeInfo + -- Ignore edges across calls + = Nothing + | mapMember to info + , w <- edgeWeight edgeInfo + -- The payoff is small if we jump over an info table + = Just (CfgEdge from to edgeInfo { edgeWeight = w/8 }) + | otherwise + = Just edge + + blockMap :: LabelMap (GenBasicBlock i) + blockMap + = foldl' (\m blk@(BasicBlock lbl _ins) -> + mapInsert lbl blk m) + mapEmpty blocks + + (builtChains, builtEdges) + = {-# SCC "buildChains" #-} + --pprTraceIt "generatedChains" $ + --pprTrace "blocks" (ppr (mapKeys blockMap)) $ + buildChains directEdges (mapKeys blockMap) + + rankedEdges :: [CfgEdge] + -- Sort descending by weight, remove fused edges + rankedEdges = + filter (\edge -> not (Set.member (edgeFrom edge,edgeTo edge) builtEdges)) $ + directEdges + + (neighbourChains, combined) + = ASSERT(noDups $ mapElems builtChains) + {-# SCC "groupNeighbourChains" #-} + -- pprTraceIt "NeighbourChains" $ + combineNeighbourhood rankedEdges (mapElems builtChains) + + + allEdges :: [CfgEdge] + allEdges = {-# SCC allEdges #-} + sortOn (relevantWeight) $ filter (not . 
deadEdge) $ (infoEdgeList weights) + where + deadEdge :: CfgEdge -> Bool + deadEdge (CfgEdge from to _) = let e = (from,to) in Set.member e combined || Set.member e builtEdges + relevantWeight :: CfgEdge -> EdgeWeight + relevantWeight (CfgEdge _ _ edgeInfo) + | EdgeInfo (CmmSource { trans_cmmNode = CmmCall {}}) _ <- edgeInfo + -- Penalize edges across calls + = weight/(64.0) + | otherwise + = weight + where + -- negate to sort descending + weight = negate (edgeWeight edgeInfo) + + masterChain = + {-# SCC "mergeChains" #-} + -- pprTraceIt "MergedChains" $ + mergeChains allEdges neighbourChains + + --Make sure the first block stays first + prepedChains + | inFront entry masterChain + = [masterChain] + | (rest,entry) <- breakChainAt entry masterChain + = [entry,rest] +#if __GLASGOW_HASKELL__ <= 810 + | otherwise = pprPanic "Entry point eliminated" $ + ppr masterChain +#endif + + blockList + = ASSERT(noDups [masterChain]) + (concatMap fromOL $ map chainBlocks prepedChains) + + --chainPlaced = setFromList $ map blockId blockList :: LabelSet + chainPlaced = setFromList $ blockList :: LabelSet + unplaced = + let blocks = mapKeys blockMap + isPlaced b = setMember (b) chainPlaced + in filter (\block -> not (isPlaced block)) blocks + + placedBlocks = + -- We want debug builds to catch this as it's a good indicator for + -- issues with CFG invariants. But we don't want to blow up production + -- builds if something slips through. + ASSERT(null unplaced) + --pprTraceIt "placedBlocks" $ + -- ++ [] is stil kinda expensive + if null unplaced then blockList else blockList ++ unplaced + getBlock bid = expectJust "Block placement" $ mapLookup bid blockMap + in + --Assert we placed all blocks given as input + ASSERT(all (\bid -> mapMember bid blockMap) placedBlocks) + dropJumps info $ map getBlock placedBlocks + +{-# SCC dropJumps #-} +-- | Remove redundant jumps between blocks when we can rely on +-- fall through. +dropJumps :: forall a i. Instruction i => LabelMap a -> [GenBasicBlock i] + -> [GenBasicBlock i] +dropJumps _ [] = [] +dropJumps info ((BasicBlock lbl ins):todo) + | not . null $ ins --This can happen because of shortcutting + , [dest] <- jumpDestsOfInstr (last ins) + , ((BasicBlock nextLbl _) : _) <- todo + , not (mapMember dest info) + , nextLbl == dest + = BasicBlock lbl (init ins) : dropJumps info todo + | otherwise + = BasicBlock lbl ins : dropJumps info todo + + +-- ----------------------------------------------------------------------------- +-- Sequencing the basic blocks + +-- Cmm BasicBlocks are self-contained entities: they always end in a +-- jump, either non-local or to another basic block in the same proc. +-- In this phase, we attempt to place the basic blocks in a sequence +-- such that as many of the local jumps as possible turn into +-- fallthroughs. + +sequenceTop + :: (Instruction instr, Outputable instr) + => DynFlags -- Determine which layout algo to use + -> NcgImpl statics instr jumpDest + -> Maybe CFG -- ^ CFG if we have one. 
+ -> NatCmmDecl statics instr -- ^ Function to serialize + -> NatCmmDecl statics instr + +sequenceTop _ _ _ top@(CmmData _ _) = top +sequenceTop dflags ncgImpl edgeWeights + (CmmProc info lbl live (ListGraph blocks)) + | (gopt Opt_CfgBlocklayout dflags) && backendMaintainsCfg dflags + --Use chain based algorithm + , Just cfg <- edgeWeights + = CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $ + {-# SCC layoutBlocks #-} + sequenceChain info cfg blocks ) + | otherwise + --Use old algorithm + = let cfg = if dontUseCfg then Nothing else edgeWeights + in CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $ + {-# SCC layoutBlocks #-} + sequenceBlocks cfg info blocks) + where + dontUseCfg = gopt Opt_WeightlessBlocklayout dflags || + (not $ backendMaintainsCfg dflags) + +-- The old algorithm: +-- It is very simple (and stupid): We make a graph out of +-- the blocks where there is an edge from one block to another iff the +-- first block ends by jumping to the second. Then we topologically +-- sort this graph. Then traverse the list: for each block, we first +-- output the block, then if it has an out edge, we move the +-- destination of the out edge to the front of the list, and continue. + +-- FYI, the classic layout for basic blocks uses postorder DFS; this +-- algorithm is implemented in Hoopl. + +sequenceBlocks :: Instruction inst => Maybe CFG -> LabelMap a + -> [GenBasicBlock inst] -> [GenBasicBlock inst] +sequenceBlocks _edgeWeight _ [] = [] +sequenceBlocks edgeWeights infos (entry:blocks) = + let entryNode = mkNode edgeWeights entry + bodyNodes = reverse + (flattenSCCs (sccBlocks edgeWeights blocks)) + in dropJumps infos . seqBlocks infos $ ( entryNode : bodyNodes) + -- the first block is the entry point ==> it must remain at the start. 
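+-- To illustrate the idea on a toy representation (only a sketch, not the code
+-- below: labels are plain Ints, each paired with an optional jump target, and
+-- Data.List.partition is assumed in scope):
+--
+-- > seqToy :: [(Int, Maybe Int)] -> [Int]
+-- > seqToy [] = []
+-- > seqToy ((lbl, target):rest) =
+-- >   lbl : seqToy (case target of
+-- >     Just t | (match, others) <- partition ((== t) . fst) rest
+-- >            , not (null match)
+-- >            -> match ++ others
+-- >     _ -> rest)
+--
+-- For the blocks [(1,Just 3),(2,Nothing),(3,Just 2)] this yields [1,3,2]:
+-- block 1 falls through to 3, and 3 falls through to 2.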
+ +sccBlocks + :: Instruction instr + => Maybe CFG -> [NatBasicBlock instr] + -> [SCC (Node BlockId (NatBasicBlock instr))] +sccBlocks edgeWeights blocks = + stronglyConnCompFromEdgedVerticesUniqR + (map (mkNode edgeWeights) blocks) + +mkNode :: (Instruction t) + => Maybe CFG -> GenBasicBlock t + -> Node BlockId (GenBasicBlock t) +mkNode edgeWeights block@(BasicBlock id instrs) = + DigraphNode block id outEdges + where + outEdges :: [BlockId] + outEdges + --Select the heaviest successor, ignore weights <= zero + = successor + where + successor + | Just successors <- fmap (`getSuccEdgesSorted` id) + edgeWeights -- :: Maybe [(Label, EdgeInfo)] + = case successors of + [] -> [] + ((target,info):_) + | length successors > 2 || edgeWeight info <= 0 -> [] + | otherwise -> [target] + | otherwise + = case jumpDestsOfInstr (last instrs) of + [one] -> [one] + _many -> [] + + +seqBlocks :: LabelMap i -> [Node BlockId (GenBasicBlock t1)] + -> [GenBasicBlock t1] +seqBlocks infos blocks = placeNext pullable0 todo0 + where + -- pullable: Blocks that are not yet placed + -- todo: Original order of blocks, to be followed if we have no good + -- reason not to; + -- may include blocks that have already been placed, but then + -- these are not in pullable + pullable0 = listToUFM [ (i,(b,n)) | DigraphNode b i n <- blocks ] + todo0 = map node_key blocks + + placeNext _ [] = [] + placeNext pullable (i:rest) + | Just (block, pullable') <- lookupDeleteUFM pullable i + = place pullable' rest block + | otherwise + -- We already placed this block, so ignore + = placeNext pullable rest + + place pullable todo (block,[]) + = block : placeNext pullable todo + place pullable todo (block@(BasicBlock id instrs),[next]) + | mapMember next infos + = block : placeNext pullable todo + | Just (nextBlock, pullable') <- lookupDeleteUFM pullable next + = BasicBlock id instrs : place pullable' todo nextBlock + | otherwise + = block : placeNext pullable todo + place _ _ (_,tooManyNextNodes) + = pprPanic "seqBlocks" (ppr tooManyNextNodes) + + +lookupDeleteUFM :: Uniquable key => UniqFM elt -> key + -> Maybe (elt, UniqFM elt) +lookupDeleteUFM m k = do -- Maybe monad + v <- lookupUFM m k + return (v, delFromUFM m k) diff --git a/compiler/GHC/CmmToAsm/CFG.hs b/compiler/GHC/CmmToAsm/CFG.hs new file mode 100644 index 0000000000..f52ff514b1 --- /dev/null +++ b/compiler/GHC/CmmToAsm/CFG.hs @@ -0,0 +1,1320 @@ +-- +-- Copyright (c) 2018 Andreas Klebinger +-- + +{-# LANGUAGE TypeFamilies, ScopedTypeVariables #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE Rank2Types #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE DataKinds #-} + +module GHC.CmmToAsm.CFG + ( CFG, CfgEdge(..), EdgeInfo(..), EdgeWeight(..) + , TransitionSource(..) 
+ + --Modify the CFG + , addWeightEdge, addEdge + , delEdge, delNode + , addNodesBetween, shortcutWeightMap + , reverseEdges, filterEdges + , addImmediateSuccessor + , mkWeightInfo, adjustEdgeWeight, setEdgeWeight + + --Query the CFG + , infoEdgeList, edgeList + , getSuccessorEdges, getSuccessors + , getSuccEdgesSorted + , getEdgeInfo + , getCfgNodes, hasNode + + -- Loop Information + , loopMembers, loopLevels, loopInfo + + --Construction/Misc + , getCfg, getCfgProc, pprEdgeWeights, sanityCheckCfg + + --Find backedges and update their weight + , optimizeCFG + , mkGlobalWeights + + ) +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.Cmm.BlockId +import GHC.Cmm as Cmm + +import GHC.Cmm.Utils +import GHC.Cmm.Switch +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.Dataflow.Block +import qualified GHC.Cmm.Dataflow.Graph as G + +import Util +import Digraph +import Maybes + +import Unique +import qualified GHC.CmmToAsm.CFG.Dominators as Dom +import Data.IntMap.Strict (IntMap) +import Data.IntSet (IntSet) + +import qualified Data.IntMap.Strict as IM +import qualified Data.Map as M +import qualified Data.IntSet as IS +import qualified Data.Set as S +import Data.Tree +import Data.Bifunctor + +import Outputable +-- DEBUGGING ONLY +--import GHC.Cmm.DebugBlock +--import OrdList +--import GHC.Cmm.DebugBlock.Trace +import GHC.Cmm.Ppr () -- For Outputable instances +import qualified GHC.Driver.Session as D + +import Data.List (sort, nub, partition) +import Data.STRef.Strict +import Control.Monad.ST + +import Data.Array.MArray +import Data.Array.ST +import Data.Array.IArray +import Data.Array.Unsafe (unsafeFreeze) +import Data.Array.Base (unsafeRead, unsafeWrite) + +import Control.Monad + +type Prob = Double + +type Edge = (BlockId, BlockId) +type Edges = [Edge] + +newtype EdgeWeight + = EdgeWeight { weightToDouble :: Double } + deriving (Eq,Ord,Enum,Num,Real,Fractional) + +instance Outputable EdgeWeight where + ppr (EdgeWeight w) = doublePrec 5 w + +type EdgeInfoMap edgeInfo = LabelMap (LabelMap edgeInfo) + +-- | A control flow graph where edges have been annotated with a weight. +-- Implemented as IntMap (IntMap <edgeData>) +-- We must uphold the invariant that for each edge A -> B we must have: +-- A entry B in the outer map. +-- A entry B in the map we get when looking up A. +-- Maintaining this invariant is useful as any failed lookup now indicates +-- an actual error in code which might go unnoticed for a while +-- otherwise. +type CFG = EdgeInfoMap EdgeInfo + +data CfgEdge + = CfgEdge + { edgeFrom :: !BlockId + , edgeTo :: !BlockId + , edgeInfo :: !EdgeInfo + } + +-- | Careful! Since we assume there is at most one edge from A to B +-- the Eq instance does not consider weight. 
+instance Eq CfgEdge where + (==) (CfgEdge from1 to1 _) (CfgEdge from2 to2 _) + = from1 == from2 && to1 == to2 + +-- | Edges are sorted ascending pointwise by weight, source and destination +instance Ord CfgEdge where + compare (CfgEdge from1 to1 (EdgeInfo {edgeWeight = weight1})) + (CfgEdge from2 to2 (EdgeInfo {edgeWeight = weight2})) + | weight1 < weight2 || weight1 == weight2 && from1 < from2 || + weight1 == weight2 && from1 == from2 && to1 < to2 + = LT + | from1 == from2 && to1 == to2 && weight1 == weight2 + = EQ + | otherwise + = GT + +instance Outputable CfgEdge where + ppr (CfgEdge from1 to1 edgeInfo) + = parens (ppr from1 <+> text "-(" <> ppr edgeInfo <> text ")->" <+> ppr to1) + +-- | Can we trace back a edge to a specific Cmm Node +-- or has it been introduced during assembly codegen. We use this to maintain +-- some information which would otherwise be lost during the +-- Cmm <-> asm transition. +-- See also Note [Inverting Conditional Branches] +data TransitionSource + = CmmSource { trans_cmmNode :: (CmmNode O C) + , trans_info :: BranchInfo } + | AsmCodeGen + deriving (Eq) + +data BranchInfo = NoInfo -- ^ Unknown, but not heap or stack check. + | HeapStackCheck -- ^ Heap or stack check + deriving Eq + +instance Outputable BranchInfo where + ppr NoInfo = text "regular" + ppr HeapStackCheck = text "heap/stack" + +isHeapOrStackCheck :: TransitionSource -> Bool +isHeapOrStackCheck (CmmSource { trans_info = HeapStackCheck}) = True +isHeapOrStackCheck _ = False + +-- | Information about edges +data EdgeInfo + = EdgeInfo + { transitionSource :: !TransitionSource + , edgeWeight :: !EdgeWeight + } deriving (Eq) + +instance Outputable EdgeInfo where + ppr edgeInfo = text "weight:" <+> ppr (edgeWeight edgeInfo) + +-- | Convenience function, generate edge info based +-- on weight not originating from cmm. +mkWeightInfo :: EdgeWeight -> EdgeInfo +mkWeightInfo = EdgeInfo AsmCodeGen + +-- | Adjust the weight between the blocks using the given function. +-- If there is no such edge returns the original map. +adjustEdgeWeight :: CFG -> (EdgeWeight -> EdgeWeight) + -> BlockId -> BlockId -> CFG +adjustEdgeWeight cfg f from to + | Just info <- getEdgeInfo from to cfg + , !weight <- edgeWeight info + , !newWeight <- f weight + = addEdge from to (info { edgeWeight = newWeight}) cfg + | otherwise = cfg + +-- | Set the weight between the blocks to the given weight. +-- If there is no such edge returns the original map. +setEdgeWeight :: CFG -> EdgeWeight + -> BlockId -> BlockId -> CFG +setEdgeWeight cfg !weight from to + | Just info <- getEdgeInfo from to cfg + = addEdge from to (info { edgeWeight = weight}) cfg + | otherwise = cfg + + +getCfgNodes :: CFG -> [BlockId] +getCfgNodes m = + mapKeys m + +-- | Is this block part of this graph? +hasNode :: CFG -> BlockId -> Bool +hasNode m node = + -- Check the invariant that each node must exist in the first map or not at all. + ASSERT( found || not (any (mapMember node) m)) + found + where + found = mapMember node m + + + +-- | Check if the nodes in the cfg and the set of blocks are the same. +-- In a case of a missmatch we panic and show the difference. 
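+-- A typical (hypothetical) call site checks the CFG against the blocks we are
+-- about to lay out, for example:
+--
+-- > sanityCheckCfg cfg (setFromList (map blockId blocks)) (text "after block layout")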
+sanityCheckCfg :: CFG -> LabelSet -> SDoc -> Bool +sanityCheckCfg m blockSet msg + | blockSet == cfgNodes + = True + | otherwise = + pprPanic "Block list and cfg nodes don't match" ( + text "difference:" <+> ppr diff $$ + text "blocks:" <+> ppr blockSet $$ + text "cfg:" <+> pprEdgeWeights m $$ + msg ) + False + where + cfgNodes = setFromList $ getCfgNodes m :: LabelSet + diff = (setUnion cfgNodes blockSet) `setDifference` (setIntersection cfgNodes blockSet) :: LabelSet + +-- | Filter the CFG with a custom function f. +-- Paramaeters are `f from to edgeInfo` +filterEdges :: (BlockId -> BlockId -> EdgeInfo -> Bool) -> CFG -> CFG +filterEdges f cfg = + mapMapWithKey filterSources cfg + where + filterSources from m = + mapFilterWithKey (\to w -> f from to w) m + + +{- Note [Updating the CFG during shortcutting] + +See Note [What is shortcutting] in the control flow optimization +code (GHC.Cmm.ContFlowOpt) for a slightly more in depth explanation on shortcutting. + +In the native backend we shortcut jumps at the assembly level. (AsmCodeGen.hs) +This means we remove blocks containing only one jump from the code +and instead redirecting all jumps targeting this block to the deleted +blocks jump target. + +However we want to have an accurate representation of control +flow in the CFG. So we add/remove edges accordingly to account +for the eliminated blocks and new edges. + +If we shortcut A -> B -> C to A -> C: +* We delete edges A -> B and B -> C +* Replacing them with the edge A -> C + +We also try to preserve jump weights while doing so. + +Note that: +* The edge B -> C can't have interesting weights since + the block B consists of a single unconditional jump without branching. +* We delete the edge A -> B and add the edge A -> C. +* The edge A -> B can be one of many edges originating from A so likely + has edge weights we want to preserve. + +For this reason we simply store the edge info from the original A -> B +edge and apply this information to the new edge A -> C. + +Sometimes we have a scenario where jump target C is not represented by an +BlockId but an immediate value. I'm only aware of this happening without +tables next to code currently. + +Then we go from A ---> B - -> IMM to A - -> IMM where the dashed arrows +are not stored in the CFG. + +In that case we simply delete the edge A -> B. + +In terms of implementation the native backend first builds a mapping +from blocks suitable for shortcutting to their jump targets. +Then it redirects all jump instructions to these blocks using the +built up mapping. +This function (shortcutWeightMap) takes the same mapping and +applies the mapping to the CFG in the way laid out above. + +-} +shortcutWeightMap :: LabelMap (Maybe BlockId) -> CFG -> CFG +shortcutWeightMap cuts cfg = + foldl' applyMapping cfg $ mapToList cuts + where +-- takes the tuple (B,C) from the notation in [Updating the CFG during shortcutting] + applyMapping :: CFG -> (BlockId,Maybe BlockId) -> CFG + --Shortcut immediate + applyMapping m (from, Nothing) = + mapDelete from . + fmap (mapDelete from) $ m + --Regular shortcut + applyMapping m (from, Just to) = + let updatedMap :: CFG + updatedMap + = fmap (shortcutEdge (from,to)) $ + (mapDelete from m :: CFG ) + --Sometimes we can shortcut multiple blocks like so: + -- A -> B -> C -> D -> E => A -> E + -- so we check for such chains. 
+ in case mapLookup to cuts of + Nothing -> updatedMap + Just dest -> applyMapping updatedMap (to, dest) + --Redirect edge from B to C + shortcutEdge :: (BlockId, BlockId) -> LabelMap EdgeInfo -> LabelMap EdgeInfo + shortcutEdge (from, to) m = + case mapLookup from m of + Just info -> mapInsert to info $ mapDelete from m + Nothing -> m + +-- | Sometimes we insert a block which should unconditionally be executed +-- after a given block. This function updates the CFG for these cases. +-- So we get A -> B => A -> A' -> B +-- \ \ +-- -> C => -> C +-- +addImmediateSuccessor :: BlockId -> BlockId -> CFG -> CFG +addImmediateSuccessor node follower cfg + = updateEdges . addWeightEdge node follower uncondWeight $ cfg + where + uncondWeight = fromIntegral . D.uncondWeight . + D.cfgWeightInfo $ D.unsafeGlobalDynFlags + targets = getSuccessorEdges cfg node + successors = map fst targets :: [BlockId] + updateEdges = addNewSuccs . remOldSuccs + remOldSuccs m = foldl' (flip (delEdge node)) m successors + addNewSuccs m = + foldl' (\m' (t,info) -> addEdge follower t info m') m targets + +-- | Adds a new edge, overwrites existing edges if present +addEdge :: BlockId -> BlockId -> EdgeInfo -> CFG -> CFG +addEdge from to info cfg = + mapAlter addFromToEdge from $ + mapAlter addDestNode to cfg + where + -- Simply insert the edge into the edge list. + addFromToEdge Nothing = Just $ mapSingleton to info + addFromToEdge (Just wm) = Just $ mapInsert to info wm + -- We must add the destination node explicitly + addDestNode Nothing = Just $ mapEmpty + addDestNode n@(Just _) = n + + +-- | Adds a edge with the given weight to the cfg +-- If there already existed an edge it is overwritten. +-- `addWeightEdge from to weight cfg` +addWeightEdge :: BlockId -> BlockId -> EdgeWeight -> CFG -> CFG +addWeightEdge from to weight cfg = + addEdge from to (mkWeightInfo weight) cfg + +delEdge :: BlockId -> BlockId -> CFG -> CFG +delEdge from to m = + mapAlter remDest from m + where + remDest Nothing = Nothing + remDest (Just wm) = Just $ mapDelete to wm + +delNode :: BlockId -> CFG -> CFG +delNode node cfg = + fmap (mapDelete node) -- < Edges to the node + (mapDelete node cfg) -- < Edges from the node + +-- | Destinations from bid ordered by weight (descending) +getSuccEdgesSorted :: CFG -> BlockId -> [(BlockId,EdgeInfo)] +getSuccEdgesSorted m bid = + let destMap = mapFindWithDefault mapEmpty bid m + cfgEdges = mapToList destMap + sortedEdges = sortWith (negate . edgeWeight . snd) cfgEdges + in --pprTrace "getSuccEdgesSorted" (ppr bid <+> text "map:" <+> ppr m) + sortedEdges + +-- | Get successors of a given node with edge weights. +getSuccessorEdges :: HasDebugCallStack => CFG -> BlockId -> [(BlockId,EdgeInfo)] +getSuccessorEdges m bid = maybe lookupError mapToList (mapLookup bid m) + where + lookupError = pprPanic "getSuccessorEdges: Block does not exist" $ + ppr bid <+> pprEdgeWeights m + +getEdgeInfo :: BlockId -> BlockId -> CFG -> Maybe EdgeInfo +getEdgeInfo from to m + | Just wm <- mapLookup from m + , Just info <- mapLookup to wm + = Just $! 
info + | otherwise + = Nothing + +getEdgeWeight :: CFG -> BlockId -> BlockId -> EdgeWeight +getEdgeWeight cfg from to = + edgeWeight $ expectJust "Edgeweight for noexisting block" $ + getEdgeInfo from to cfg + +getTransitionSource :: BlockId -> BlockId -> CFG -> TransitionSource +getTransitionSource from to cfg = transitionSource $ expectJust "Source info for noexisting block" $ + getEdgeInfo from to cfg + +reverseEdges :: CFG -> CFG +reverseEdges cfg = mapFoldlWithKey (\cfg from toMap -> go (addNode cfg from) from toMap) mapEmpty cfg + where + -- We must preserve nodes without outgoing edges! + addNode :: CFG -> BlockId -> CFG + addNode cfg b = mapInsertWith mapUnion b mapEmpty cfg + go :: CFG -> BlockId -> (LabelMap EdgeInfo) -> CFG + go cfg from toMap = mapFoldlWithKey (\cfg to info -> addEdge to from info cfg) cfg toMap :: CFG + + +-- | Returns a unordered list of all edges with info +infoEdgeList :: CFG -> [CfgEdge] +infoEdgeList m = + go (mapToList m) [] + where + -- We avoid foldMap to avoid thunk buildup + go :: [(BlockId,LabelMap EdgeInfo)] -> [CfgEdge] -> [CfgEdge] + go [] acc = acc + go ((from,toMap):xs) acc + = go' xs from (mapToList toMap) acc + go' :: [(BlockId,LabelMap EdgeInfo)] -> BlockId -> [(BlockId,EdgeInfo)] -> [CfgEdge] -> [CfgEdge] + go' froms _ [] acc = go froms acc + go' froms from ((to,info):tos) acc + = go' froms from tos (CfgEdge from to info : acc) + +-- | Returns a unordered list of all edges without weights +edgeList :: CFG -> [Edge] +edgeList m = + go (mapToList m) [] + where + -- We avoid foldMap to avoid thunk buildup + go :: [(BlockId,LabelMap EdgeInfo)] -> [Edge] -> [Edge] + go [] acc = acc + go ((from,toMap):xs) acc + = go' xs from (mapKeys toMap) acc + go' :: [(BlockId,LabelMap EdgeInfo)] -> BlockId -> [BlockId] -> [Edge] -> [Edge] + go' froms _ [] acc = go froms acc + go' froms from (to:tos) acc + = go' froms from tos ((from,to) : acc) + +-- | Get successors of a given node without edge weights. +getSuccessors :: HasDebugCallStack => CFG -> BlockId -> [BlockId] +getSuccessors m bid + | Just wm <- mapLookup bid m + = mapKeys wm + | otherwise = lookupError + where + lookupError = pprPanic "getSuccessors: Block does not exist" $ + ppr bid <+> pprEdgeWeights m + +pprEdgeWeights :: CFG -> SDoc +pprEdgeWeights m = + let edges = sort $ infoEdgeList m :: [CfgEdge] + printEdge (CfgEdge from to (EdgeInfo { edgeWeight = weight })) + = text "\t" <> ppr from <+> text "->" <+> ppr to <> + text "[label=\"" <> ppr weight <> text "\",weight=\"" <> + ppr weight <> text "\"];\n" + --for the case that there are no edges from/to this node. + --This should rarely happen but it can save a lot of time + --to immediately see it when it does. + printNode node + = text "\t" <> ppr node <> text ";\n" + getEdgeNodes (CfgEdge from to _) = [from,to] + edgeNodes = setFromList $ concatMap getEdgeNodes edges :: LabelSet + nodes = filter (\n -> (not . setMember n) edgeNodes) . mapKeys $ mapFilter null m + in + text "digraph {\n" <> + (foldl' (<>) empty (map printEdge edges)) <> + (foldl' (<>) empty (map printNode nodes)) <> + text "}\n" + +{-# INLINE updateEdgeWeight #-} --Allows eliminating the tuple when possible +-- | Invariant: The edge **must** exist already in the graph. 
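+-- For example, doubling the weight of an edge that is known to exist:
+--
+-- > updateEdgeWeight (*2) (from, to) cfg
+--
+-- Unlike adjustEdgeWeight, which leaves the CFG unchanged for a missing edge,
+-- this panics if the edge is absent.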
+updateEdgeWeight :: (EdgeWeight -> EdgeWeight) -> Edge -> CFG -> CFG +updateEdgeWeight f (from, to) cfg + | Just oldInfo <- getEdgeInfo from to cfg + = let !oldWeight = edgeWeight oldInfo + !newWeight = f oldWeight + in addEdge from to (oldInfo {edgeWeight = newWeight}) cfg + | otherwise + = panic "Trying to update invalid edge" + +-- from to oldWeight => newWeight +mapWeights :: (BlockId -> BlockId -> EdgeWeight -> EdgeWeight) -> CFG -> CFG +mapWeights f cfg = + foldl' (\cfg (CfgEdge from to info) -> + let oldWeight = edgeWeight info + newWeight = f from to oldWeight + in addEdge from to (info {edgeWeight = newWeight}) cfg) + cfg (infoEdgeList cfg) + + +-- | Insert a block in the control flow between two other blocks. +-- We pass a list of tuples (A,B,C) where +-- * A -> C: Old edge +-- * A -> B -> C : New Arc, where B is the new block. +-- It's possible that a block has two jumps to the same block +-- in the assembly code. However we still only store a single edge for +-- these cases. +-- We assign the old edge info to the edge A -> B and assign B -> C the +-- weight of an unconditional jump. +addNodesBetween :: CFG -> [(BlockId,BlockId,BlockId)] -> CFG +addNodesBetween m updates = + foldl' updateWeight m . + weightUpdates $ updates + where + weight = fromIntegral . D.uncondWeight . + D.cfgWeightInfo $ D.unsafeGlobalDynFlags + -- We might add two blocks for different jumps along a single + -- edge. So we end up with edges: A -> B -> C , A -> D -> C + -- in this case after applying the first update the weight for A -> C + -- is no longer available. So we calculate future weights before updates. + weightUpdates = map getWeight + getWeight :: (BlockId,BlockId,BlockId) -> (BlockId,BlockId,BlockId,EdgeInfo) + getWeight (from,between,old) + | Just edgeInfo <- getEdgeInfo from old m + = (from,between,old,edgeInfo) + | otherwise + = pprPanic "Can't find weight for edge that should have one" ( + text "triple" <+> ppr (from,between,old) $$ + text "updates" <+> ppr updates $$ + text "cfg:" <+> pprEdgeWeights m ) + updateWeight :: CFG -> (BlockId,BlockId,BlockId,EdgeInfo) -> CFG + updateWeight m (from,between,old,edgeInfo) + = addEdge from between edgeInfo . + addWeightEdge between old weight . + delEdge from old $ m + +{- + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~~~ Note [CFG Edge Weights] ~~~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Edge weights assigned do not currently represent a specific + cost model and rather just a ranking of which blocks should + be placed next to each other given their connection type in + the CFG. + This is especially relevant if we whenever two blocks will + jump to the same target. + + A B + \ / + C + + Should A or B be placed in front of C? The block layout algorithm + decides this based on which edge (A,C)/(B,C) is heavier. So we + make a educated guess on which branch should be preferred. + + We rank edges in this order: + * Unconditional Control Transfer - They will always + transfer control to their target. Unless there is a info table + we can turn the jump into a fallthrough as well. + We use 20k as default, so it's easy to spot if values have been + modified but unlikely that we run into issues with overflow. + * If branches (likely) - We assume branches marked as likely + are taken more than 80% of the time. + By ranking them below unconditional jumps we make sure we + prefer the unconditional if there is a conditional and + unconditional edge towards a block. 
+ * If branches (regular) - The false branch can potentially be turned + into a fallthrough so we prefer it slightly over the true branch. + * Unlikely branches - These can be assumed to be taken less than 20% + of the time. So we given them one of the lowest priorities. + * Switches - Switches at this level are implemented as jump tables + so have a larger number of successors. So without more information + we can only say that each individual successor is unlikely to be + jumped to and we rank them accordingly. + * Calls - We currently ignore calls completely: + * By the time we return from a call there is a good chance + that the address we return to has already been evicted from + cache eliminating a main advantage sequential placement brings. + * Calls always require a info table in front of their return + address. This reduces the chance that we return to the same + cache line further. + +-} +-- | Generate weights for a Cmm proc based on some simple heuristics. +getCfgProc :: D.CfgWeights -> RawCmmDecl -> CFG +getCfgProc _ (CmmData {}) = mapEmpty +getCfgProc weights (CmmProc _info _lab _live graph) = getCfg weights graph + +getCfg :: D.CfgWeights -> CmmGraph -> CFG +getCfg weights graph = + foldl' insertEdge edgelessCfg $ concatMap getBlockEdges blocks + where + D.CFGWeights + { D.uncondWeight = uncondWeight + , D.condBranchWeight = condBranchWeight + , D.switchWeight = switchWeight + , D.callWeight = callWeight + , D.likelyCondWeight = likelyCondWeight + , D.unlikelyCondWeight = unlikelyCondWeight + -- Last two are used in other places + --, D.infoTablePenalty = infoTablePenalty + --, D.backEdgeBonus = backEdgeBonus + } = weights + -- Explicitly add all nodes to the cfg to ensure they are part of the + -- CFG. + edgelessCfg = mapFromList $ zip (map G.entryLabel blocks) (repeat mapEmpty) + insertEdge :: CFG -> ((BlockId,BlockId),EdgeInfo) -> CFG + insertEdge m ((from,to),weight) = + mapAlter f from m + where + f :: Maybe (LabelMap EdgeInfo) -> Maybe (LabelMap EdgeInfo) + f Nothing = Just $ mapSingleton to weight + f (Just destMap) = Just $ mapInsert to weight destMap + getBlockEdges :: CmmBlock -> [((BlockId,BlockId),EdgeInfo)] + getBlockEdges block = + case branch of + CmmBranch dest -> [mkEdge dest uncondWeight] + CmmCondBranch cond t f l + | l == Nothing -> + [mkEdge f condBranchWeight, mkEdge t condBranchWeight] + | l == Just True -> + [mkEdge f unlikelyCondWeight, mkEdge t likelyCondWeight] + | l == Just False -> + [mkEdge f likelyCondWeight, mkEdge t unlikelyCondWeight] + where + mkEdgeInfo = -- pprTrace "Info" (ppr branchInfo <+> ppr cond) + EdgeInfo (CmmSource branch branchInfo) . fromIntegral + mkEdge target weight = ((bid,target), mkEdgeInfo weight) + branchInfo = + foldRegsUsed + (panic "foldRegsDynFlags") + (\info r -> if r == SpLim || r == HpLim || r == BaseReg + then HeapStackCheck else info) + NoInfo cond + + (CmmSwitch _e ids) -> + let switchTargets = switchTargetsToList ids + --Compiler performance hack - for very wide switches don't + --consider targets for layout. 
+ adjustedWeight = + if (length switchTargets > 10) then -1 else switchWeight + in map (\x -> mkEdge x adjustedWeight) switchTargets + (CmmCall { cml_cont = Just cont}) -> [mkEdge cont callWeight] + (CmmForeignCall {Cmm.succ = cont}) -> [mkEdge cont callWeight] + (CmmCall { cml_cont = Nothing }) -> [] + other -> + panic "Foo" $ + ASSERT2(False, ppr "Unknown successor cause:" <> + (ppr branch <+> text "=>" <> ppr (G.successors other))) + map (\x -> ((bid,x),mkEdgeInfo 0)) $ G.successors other + where + bid = G.entryLabel block + mkEdgeInfo = EdgeInfo (CmmSource branch NoInfo) . fromIntegral + mkEdge target weight = ((bid,target), mkEdgeInfo weight) + branch = lastNode block :: CmmNode O C + + blocks = revPostorder graph :: [CmmBlock] + +--Find back edges by BFS +findBackEdges :: HasDebugCallStack => BlockId -> CFG -> Edges +findBackEdges root cfg = + --pprTraceIt "Backedges:" $ + map fst . + filter (\x -> snd x == Backward) $ typedEdges + where + edges = edgeList cfg :: [(BlockId,BlockId)] + getSuccs = getSuccessors cfg :: BlockId -> [BlockId] + typedEdges = + classifyEdges root getSuccs edges :: [((BlockId,BlockId),EdgeType)] + + +optimizeCFG :: D.CfgWeights -> RawCmmDecl -> CFG -> CFG +optimizeCFG _ (CmmData {}) cfg = cfg +optimizeCFG weights (CmmProc info _lab _live graph) cfg = + {-# SCC optimizeCFG #-} + -- pprTrace "Initial:" (pprEdgeWeights cfg) $ + -- pprTrace "Initial:" (ppr $ mkGlobalWeights (g_entry graph) cfg) $ + + -- pprTrace "LoopInfo:" (ppr $ loopInfo cfg (g_entry graph)) $ + favourFewerPreds . + penalizeInfoTables info . + increaseBackEdgeWeight (g_entry graph) $ cfg + where + + -- | Increase the weight of all backedges in the CFG + -- this helps to make loop jumpbacks the heaviest edges + increaseBackEdgeWeight :: BlockId -> CFG -> CFG + increaseBackEdgeWeight root cfg = + let backedges = findBackEdges root cfg + update weight + --Keep irrelevant edges irrelevant + | weight <= 0 = 0 + | otherwise + = weight + fromIntegral (D.backEdgeBonus weights) + in foldl' (\cfg edge -> updateEdgeWeight update edge cfg) + cfg backedges + + -- | Since we cant fall through info tables we penalize these. + penalizeInfoTables :: LabelMap a -> CFG -> CFG + penalizeInfoTables info cfg = + mapWeights fupdate cfg + where + fupdate :: BlockId -> BlockId -> EdgeWeight -> EdgeWeight + fupdate _ to weight + | mapMember to info + = weight - (fromIntegral $ D.infoTablePenalty weights) + | otherwise = weight + + -- | If a block has two successors, favour the one with fewer + -- predecessors and/or the one allowing fall through. + favourFewerPreds :: CFG -> CFG + favourFewerPreds cfg = + let + revCfg = + reverseEdges $ filterEdges + (\_from -> fallthroughTarget) cfg + + predCount n = length $ getSuccessorEdges revCfg n + nodes = getCfgNodes cfg + + modifiers :: Int -> Int -> (EdgeWeight, EdgeWeight) + modifiers preds1 preds2 + | preds1 < preds2 = ( 1,-1) + | preds1 == preds2 = ( 0, 0) + | otherwise = (-1, 1) + + update :: CFG -> BlockId -> CFG + update cfg node + | [(s1,e1),(s2,e2)] <- getSuccessorEdges cfg node + , !w1 <- edgeWeight e1 + , !w2 <- edgeWeight e2 + --Only change the weights if there isn't already a ordering. 
+ , w1 == w2 + , (mod1,mod2) <- modifiers (predCount s1) (predCount s2) + = (\cfg' -> + (adjustEdgeWeight cfg' (+mod2) node s2)) + (adjustEdgeWeight cfg (+mod1) node s1) + | otherwise + = cfg + in foldl' update cfg nodes + where + fallthroughTarget :: BlockId -> EdgeInfo -> Bool + fallthroughTarget to (EdgeInfo source _weight) + | mapMember to info = False + | AsmCodeGen <- source = True + | CmmSource { trans_cmmNode = CmmBranch {} } <- source = True + | CmmSource { trans_cmmNode = CmmCondBranch {} } <- source = True + | otherwise = False + +-- | Determine loop membership of blocks based on SCC analysis +-- This is faster but only gives yes/no answers. +loopMembers :: HasDebugCallStack => CFG -> LabelMap Bool +loopMembers cfg = + foldl' (flip setLevel) mapEmpty sccs + where + mkNode :: BlockId -> Node BlockId BlockId + mkNode bid = DigraphNode bid bid (getSuccessors cfg bid) + nodes = map mkNode (getCfgNodes cfg) + + sccs = stronglyConnCompFromEdgedVerticesOrd nodes + + setLevel :: SCC BlockId -> LabelMap Bool -> LabelMap Bool + setLevel (AcyclicSCC bid) m = mapInsert bid False m + setLevel (CyclicSCC bids) m = foldl' (\m k -> mapInsert k True m) m bids + +loopLevels :: CFG -> BlockId -> LabelMap Int +loopLevels cfg root = liLevels loopInfos + where + loopInfos = loopInfo cfg root + +data LoopInfo = LoopInfo + { liBackEdges :: [(Edge)] -- ^ List of back edges + , liLevels :: LabelMap Int -- ^ BlockId -> LoopLevel mapping + , liLoops :: [(Edge, LabelSet)] -- ^ (backEdge, loopBody), body includes header + } + +instance Outputable LoopInfo where + ppr (LoopInfo _ _lvls loops) = + text "Loops:(backEdge, bodyNodes)" $$ + (vcat $ map ppr loops) + +{- Note [Determining the loop body] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Starting with the knowledge that: + * head dominates the loop + * `tail` -> `head` is a backedge + + We can determine all nodes by: + * Deleting the loop head from the graph. + * Collect all blocks which are reachable from the `tail`. + + We do so by performing bfs from the tail node towards the head. + -} + +-- | Determine loop membership of blocks based on Dominator analysis. +-- This is slower but gives loop levels instead of just loop membership. +-- However it only detects natural loops. Irreducible control flow is not +-- recognized even if it loops. But that is rare enough that we don't have +-- to care about that special case. +loopInfo :: HasDebugCallStack => CFG -> BlockId -> LoopInfo +loopInfo cfg root = LoopInfo { liBackEdges = backEdges + , liLevels = mapFromList loopCounts + , liLoops = loopBodies } + where + revCfg = reverseEdges cfg + + graph = -- pprTrace "CFG - loopInfo" (pprEdgeWeights cfg) $ + fmap (setFromList . mapKeys ) cfg :: LabelMap LabelSet + + + --TODO - This should be a no op: Export constructors? Use unsafeCoerce? ... + rooted = ( fromBlockId root + , toIntMap $ fmap toIntSet graph) :: (Int, IntMap IntSet) + tree = fmap toBlockId $ Dom.domTree rooted :: Tree BlockId + + -- Map from Nodes to their dominators + domMap :: LabelMap LabelSet + domMap = mkDomMap tree + + edges = edgeList cfg :: [(BlockId, BlockId)] + -- We can't recompute nodes from edges, there might be blocks not connected via edges. + nodes = getCfgNodes cfg :: [BlockId] + + -- identify back edges + isBackEdge (from,to) + | Just doms <- mapLookup from domMap + , setMember to doms + = True + | otherwise = False + + -- See Note [Determining the loop body] + -- Get the loop body associated with a back edge. 
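+    -- A small worked example (hypothetical labels): for the back edge (T, H)
+    -- in the graph H -> A, A -> B, B -> T, T -> H we delete H from the
+    -- reversed graph and search backwards from T, collecting {T, B, A}.
+    -- Adding the header H back in gives the loop body {H, A, B, T}.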
+ findBody edge@(tail, head) + = ( edge, setInsert head $ go (setSingleton tail) (setSingleton tail) ) + where + -- See Note [Determining the loop body] + cfg' = delNode head revCfg + + go :: LabelSet -> LabelSet -> LabelSet + go found current + | setNull current = found + | otherwise = go (setUnion newSuccessors found) + newSuccessors + where + -- Really predecessors, since we use the reversed cfg. + newSuccessors = setFilter (\n -> not $ setMember n found) successors :: LabelSet + successors = setFromList $ concatMap + (getSuccessors cfg') + -- we filter head as it's no longer part of the cfg. + (filter (/= head) $ setElems current) :: LabelSet + + backEdges = filter isBackEdge edges + loopBodies = map findBody backEdges :: [(Edge, LabelSet)] + + -- Block b is part of n loop bodies => loop nest level of n + loopCounts = + let bodies = map (first snd) loopBodies -- [(Header, Body)] + loopCount n = length $ nub . map fst . filter (setMember n . snd) $ bodies + in map (\n -> (n, loopCount n)) $ nodes :: [(BlockId, Int)] + + toIntSet :: LabelSet -> IntSet + toIntSet s = IS.fromList . map fromBlockId . setElems $ s + toIntMap :: LabelMap a -> IntMap a + toIntMap m = IM.fromList $ map (\(x,y) -> (fromBlockId x,y)) $ mapToList m + + mkDomMap :: Tree BlockId -> LabelMap LabelSet + mkDomMap root = mapFromList $ go setEmpty root + where + go :: LabelSet -> Tree BlockId -> [(Label,LabelSet)] + go parents (Node lbl []) + = [(lbl, parents)] + go parents (Node _ leaves) + = let nodes = map rootLabel leaves + entries = map (\x -> (x,parents)) nodes + in entries ++ concatMap + (\n -> go (setInsert (rootLabel n) parents) n) + leaves + + fromBlockId :: BlockId -> Int + fromBlockId = getKey . getUnique + + toBlockId :: Int -> BlockId + toBlockId = mkBlockId . mkUniqueGrimily + +-- We make the CFG a Hoopl Graph, so we can reuse revPostOrder. +newtype BlockNode (e :: Extensibility) (x :: Extensibility) = BN (BlockId,[BlockId]) + +instance G.NonLocal (BlockNode) where + entryLabel (BN (lbl,_)) = lbl + successors (BN (_,succs)) = succs + +revPostorderFrom :: HasDebugCallStack => CFG -> BlockId -> [BlockId] +revPostorderFrom cfg root = + map fromNode $ G.revPostorderFrom hooplGraph root + where + nodes = getCfgNodes cfg + hooplGraph = foldl' (\m n -> mapInsert n (toNode n) m) mapEmpty nodes + + fromNode :: BlockNode C C -> BlockId + fromNode (BN x) = fst x + + toNode :: BlockId -> BlockNode C C + toNode bid = + BN (bid,getSuccessors cfg $ bid) + + +-- | We take in a CFG which has on its edges weights which are +-- relative only to other edges originating from the same node. +-- +-- We return a CFG for which each edge represents a GLOBAL weight. +-- This means edge weights are comparable across the whole graph. +-- +-- For irreducible control flow results might be imprecise, otherwise they +-- are reliable. +-- +-- The algorithm is based on the Paper +-- "Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus +-- The only big change is that we go over the nodes in the body of loops in +-- reverse post order. Which is required for diamond control flow to work probably. 
+-- +-- We also apply a few prediction heuristics (based on the same paper) + +{-# NOINLINE mkGlobalWeights #-} +{-# SCC mkGlobalWeights #-} +mkGlobalWeights :: HasDebugCallStack => BlockId -> CFG -> (LabelMap Double, LabelMap (LabelMap Double)) +mkGlobalWeights root localCfg + | null localCfg = panic "Error - Empty CFG" + | otherwise + = (blockFreqs', edgeFreqs') + where + -- Calculate fixpoints + (blockFreqs, edgeFreqs) = calcFreqs nodeProbs backEdges' bodies' revOrder' + blockFreqs' = mapFromList $ map (first fromVertex) (assocs blockFreqs) :: LabelMap Double + edgeFreqs' = fmap fromVertexMap $ fromVertexMap edgeFreqs + + fromVertexMap :: IM.IntMap x -> LabelMap x + fromVertexMap m = mapFromList . map (first fromVertex) $ IM.toList m + + revOrder = revPostorderFrom localCfg root :: [BlockId] + loopResults@(LoopInfo backedges _levels bodies) = loopInfo localCfg root + + revOrder' = map toVertex revOrder + backEdges' = map (bimap toVertex toVertex) backedges + bodies' = map calcBody bodies + + estimatedCfg = staticBranchPrediction root loopResults localCfg + -- Normalize the weights to probabilities and apply heuristics + nodeProbs = cfgEdgeProbabilities estimatedCfg toVertex + + -- By mapping vertices to numbers in reverse post order we can bring any subset into reverse post + -- order simply by sorting. + -- TODO: The sort is redundant if we can guarantee that setElems returns elements ascending + calcBody (backedge, blocks) = + (toVertex $ snd backedge, sort . map toVertex $ (setElems blocks)) + + vertexMapping = mapFromList $ zip revOrder [0..] :: LabelMap Int + blockMapping = listArray (0,mapSize vertexMapping - 1) revOrder :: Array Int BlockId + -- Map from blockId to indices starting at zero + toVertex :: BlockId -> Int + toVertex blockId = expectJust "mkGlobalWeights" $ mapLookup blockId vertexMapping + -- Map from indices starting at zero to blockIds + fromVertex :: Int -> BlockId + fromVertex vertex = blockMapping ! vertex + +{- Note [Static Branch Prediction] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The work here has been based on the paper +"Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus. + +The primary differences are that if we branch on the result of a heap +check we do not apply any of the heuristics. +The reason is simple: They look like loops in the control flow graph +but are usually never entered, and if at most once. + +Currently implemented is a heuristic to predict that we do not exit +loops (lehPredicts) and one to predict that backedges are more likely +than any other edge. + +The back edge case is special as it superceeds any other heuristic if it +applies. + +Do NOT rely solely on nofib results for benchmarking this. I recommend at least +comparing megaparsec and container benchmarks. Nofib does not seeem to have +many instances of "loopy" Cmm where these make a difference. + +TODO: +* The paper containers more benchmarks which should be implemented. +* If we turn the likelihood on if/else branches into a probability + instead of true/false we could implement this as a Cmm pass. + + The complete Cmm code still exists and can be accessed by the heuristics + + There is no chance of register allocation/codegen inserting branches/blocks + + making the TransitionSource info wrong. + + potential to use this information in CmmPasses. + - Requires refactoring of all the code relying on the binary nature of likelihood. + - Requires refactoring `loopInfo` to work on both, Cmm Graphs and the backend CFG. 
+-} + +-- | Combination of target node id and information about the branch +-- we are looking at. +type TargetNodeInfo = (BlockId, EdgeInfo) + + +-- | Update branch weights based on certain heuristics. +-- See Note [Static Branch Prediction] +-- TODO: This should be combined with optimizeCFG +{-# SCC staticBranchPrediction #-} +staticBranchPrediction :: BlockId -> LoopInfo -> CFG -> CFG +staticBranchPrediction _root (LoopInfo l_backEdges loopLevels l_loops) cfg = + -- pprTrace "staticEstimatesOn" (ppr (cfg)) $ + foldl' update cfg nodes + where + nodes = getCfgNodes cfg + backedges = S.fromList $ l_backEdges + -- Loops keyed by their back edge + loops = M.fromList $ l_loops :: M.Map Edge LabelSet + loopHeads = S.fromList $ map snd $ M.keys loops + + update :: CFG -> BlockId -> CFG + update cfg node + -- No successors, nothing to do. + | null successors = cfg + + -- Mix of backedges and others: + -- Always predict the backedges. + | not (null m) && length m < length successors + -- Heap/Stack checks "loop", but only once. + -- So we simply exclude any case involving them. + , not $ any (isHeapOrStackCheck . transitionSource . snd) successors + = let loopChance = repeat $! pred_LBH / (fromIntegral $ length m) + exitChance = repeat $! (1 - pred_LBH) / fromIntegral (length not_m) + updates = zip (map fst m) loopChance ++ zip (map fst not_m) exitChance + in -- pprTrace "mix" (ppr (node,successors)) $ + foldl' (\cfg (to,weight) -> setEdgeWeight cfg weight node to) cfg updates + + -- For (regular) non-binary branches we keep the weights from the STG -> Cmm translation. + | length successors /= 2 + = cfg + + -- Only backedges - no need to adjust + | length m > 0 + = cfg + + -- A regular binary branch, we can plug addition predictors in here. + | [(s1,s1_info),(s2,s2_info)] <- successors + , not $ any (isHeapOrStackCheck . transitionSource . snd) successors + = -- Normalize weights to total of 1 + let !w1 = max (edgeWeight s1_info) (0) + !w2 = max (edgeWeight s2_info) (0) + -- Of both weights are <= 0 we set both to 0.5 + normalizeWeight w = if w1 + w2 == 0 then 0.5 else w/(w1+w2) + !cfg' = setEdgeWeight cfg (normalizeWeight w1) node s1 + !cfg'' = setEdgeWeight cfg' (normalizeWeight w2) node s2 + + -- Figure out which heuristics apply to these successors + heuristics = map ($ ((s1,s1_info),(s2,s2_info))) + [lehPredicts, phPredicts, ohPredicts, ghPredicts, lhhPredicts, chPredicts + , shPredicts, rhPredicts] + -- Apply result of a heuristic. Argument is the likelihood + -- predicted for s1. + applyHeuristic :: CFG -> Maybe Prob -> CFG + applyHeuristic cfg Nothing = cfg + applyHeuristic cfg (Just (s1_pred :: Double)) + | s1_old == 0 || s2_old == 0 || + isHeapOrStackCheck (transitionSource s1_info) || + isHeapOrStackCheck (transitionSource s2_info) + = cfg + | otherwise = + let -- Predictions from heuristic + s1_prob = EdgeWeight s1_pred :: EdgeWeight + s2_prob = 1.0 - s1_prob + -- Update + d = (s1_old * s1_prob) + (s2_old * s2_prob) :: EdgeWeight + s1_prob' = s1_old * s1_prob / d + !s2_prob' = s2_old * s2_prob / d + !cfg_s1 = setEdgeWeight cfg s1_prob' node s1 + in -- pprTrace "Applying heuristic!" (ppr (node,s1,s2) $$ ppr (s1_prob', s2_prob')) $ + setEdgeWeight cfg_s1 s2_prob' node s2 + where + -- Old weights + s1_old = getEdgeWeight cfg node s1 + s2_old = getEdgeWeight cfg node s2 + + in + -- pprTraceIt "RegularCfgResult" $ + foldl' applyHeuristic cfg'' heuristics + + -- Branch on heap/stack check + | otherwise = cfg + + where + -- Chance that loops are taken. 
+ pred_LBH = 0.875 + -- successors + successors = getSuccessorEdges cfg node + -- backedges + (m,not_m) = partition (\succ -> S.member (node, fst succ) backedges) successors + + -- Heuristics return nothing if they don't say anything about this branch + -- or Just (prob_s1) where prob_s1 is the likelihood for s1 to be the + -- taken branch. s1 is the branch in the true case. + + -- Loop exit heuristic. + -- We are unlikely to leave a loop unless it's to enter another one. + pred_LEH = 0.75 + -- If and only if no successor is a loopheader, + -- then we will likely not exit the current loop body. + lehPredicts :: (TargetNodeInfo,TargetNodeInfo) -> Maybe Prob + lehPredicts ((s1,_s1_info),(s2,_s2_info)) + | S.member s1 loopHeads || S.member s2 loopHeads + = Nothing + + | otherwise + = --pprTrace "lehPredict:" (ppr $ compare s1Level s2Level) $ + case compare s1Level s2Level of + EQ -> Nothing + LT -> Just (1-pred_LEH) --s1 exits to a shallower loop level (exits loop) + GT -> Just (pred_LEH) --s1 exits to a deeper loop level + where + s1Level = mapLookup s1 loopLevels + s2Level = mapLookup s2 loopLevels + + -- Comparing to a constant is unlikely to be equal. + ohPredicts (s1,_s2) + | CmmSource { trans_cmmNode = src1 } <- getTransitionSource node (fst s1) cfg + , CmmCondBranch cond ltrue _lfalse likely <- src1 + , likely == Nothing + , CmmMachOp mop args <- cond + , MO_Eq {} <- mop + , not (null [x | x@CmmLit{} <- args]) + = if fst s1 == ltrue then Just 0.3 else Just 0.7 + + | otherwise + = Nothing + + -- TODO: These are all the other heuristics from the paper. + -- Not all will apply, for now we just stub them out as Nothing. + phPredicts = const Nothing + ghPredicts = const Nothing + lhhPredicts = const Nothing + chPredicts = const Nothing + shPredicts = const Nothing + rhPredicts = const Nothing + +-- We normalize all edge weights as probabilities between 0 and 1. +-- Ignoring rounding errors all outgoing edges sum up to 1. +cfgEdgeProbabilities :: CFG -> (BlockId -> Int) -> IM.IntMap (IM.IntMap Prob) +cfgEdgeProbabilities cfg toVertex + = mapFoldlWithKey foldEdges IM.empty cfg + where + foldEdges = (\m from toMap -> IM.insert (toVertex from) (normalize toMap) m) + + normalize :: (LabelMap EdgeInfo) -> (IM.IntMap Prob) + normalize weightMap + | edgeCount <= 1 = mapFoldlWithKey (\m k _ -> IM.insert (toVertex k) 1.0 m) IM.empty weightMap + | otherwise = mapFoldlWithKey (\m k _ -> IM.insert (toVertex k) (normalWeight k) m) IM.empty weightMap + where + edgeCount = mapSize weightMap + -- Negative weights are generally allowed but are mapped to zero. + -- We then check if there is at least one non-zero edge and if not + -- assign uniform weights to all branches. + minWeight = 0 :: Prob + weightMap' = fmap (\w -> max (weightToDouble . edgeWeight $ w) minWeight) weightMap + totalWeight = sum weightMap' + + normalWeight :: BlockId -> Prob + normalWeight bid + | totalWeight == 0 + = 1.0 / fromIntegral edgeCount + | Just w <- mapLookup bid weightMap' + = w/totalWeight + | otherwise = panic "impossible" + +-- This is the fixpoint algorithm from +-- "Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus +-- The adaption to Haskell is my own. 
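+-- Sketch of the propagation below: within each loop the header starts with
+-- frequency 1, every other block receives the sum of the frequencies of its
+-- incoming forward edges, and blocks targeted by back edges are scaled by
+-- 1/(1 - cyclicProbability). For example, a combined back-edge probability of
+-- 0.875 scales a block's frequency by 1/(1 - 0.875) = 8, and since the cyclic
+-- probability is capped at 1 - 1/512 the factor never exceeds 512.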
+calcFreqs :: IM.IntMap (IM.IntMap Prob) -> [(Int,Int)] -> [(Int, [Int])] -> [Int] + -> (Array Int Double, IM.IntMap (IM.IntMap Prob)) +calcFreqs graph backEdges loops revPostOrder = runST $ do + visitedNodes <- newArray (0,nodeCount-1) False :: ST s (STUArray s Int Bool) + blockFreqs <- newArray (0,nodeCount-1) 0.0 :: ST s (STUArray s Int Double) + edgeProbs <- newSTRef graph + edgeBackProbs <- newSTRef graph + + -- let traceArray a = do + -- vs <- forM [0..nodeCount-1] $ \i -> readArray a i >>= (\v -> return (i,v)) + -- trace ("array: " ++ show vs) $ return () + + let -- See #1600, we need to inline or unboxing makes perf worse. + -- {-# INLINE getFreq #-} + {-# INLINE visited #-} + visited b = unsafeRead visitedNodes b + getFreq b = unsafeRead blockFreqs b + -- setFreq :: forall s. Int -> Double -> ST s () + setFreq b f = unsafeWrite blockFreqs b f + -- setVisited :: forall s. Node -> ST s () + setVisited b = unsafeWrite visitedNodes b True + -- Frequency/probability that edge is taken. + getProb' arr b1 b2 = readSTRef arr >>= + (\graph -> + return . + fromMaybe (error "getFreq 1") . + IM.lookup b2 . + fromMaybe (error "getFreq 2") $ + (IM.lookup b1 graph) + ) + setProb' arr b1 b2 prob = do + g <- readSTRef arr + let !m = fromMaybe (error "Foo") $ IM.lookup b1 g + !m' = IM.insert b2 prob m + writeSTRef arr $! (IM.insert b1 m' g) + + getEdgeFreq b1 b2 = getProb' edgeProbs b1 b2 + setEdgeFreq b1 b2 = setProb' edgeProbs b1 b2 + getProb b1 b2 = fromMaybe (error "getProb") $ do + m' <- IM.lookup b1 graph + IM.lookup b2 m' + + getBackProb b1 b2 = getProb' edgeBackProbs b1 b2 + setBackProb b1 b2 = setProb' edgeBackProbs b1 b2 + + + let -- calcOutFreqs :: Node -> ST s () + calcOutFreqs bhead block = do + !f <- getFreq block + forM (successors block) $ \bi -> do + let !prob = getProb block bi + let !succFreq = f * prob + setEdgeFreq block bi succFreq + -- traceM $ "SetOut: " ++ show (block, bi, f, prob, succFreq) + when (bi == bhead) $ setBackProb block bi succFreq + + + let propFreq block head = do + -- traceM ("prop:" ++ show (block,head)) + -- traceShowM block + + !v <- visited block + if v then + return () --Dont look at nodes twice + else if block == head then + setFreq block 1.0 -- Loop header frequency is always 1 + else do + let preds = IS.elems $ predecessors block + irreducible <- (fmap or) $ forM preds $ \bp -> do + !bp_visited <- visited bp + let bp_backedge = isBackEdge bp block + return (not bp_visited && not bp_backedge) + + if irreducible + then return () -- Rare we don't care + else do + setFreq block 0 + !cycleProb <- sum <$> (forM preds $ \pred -> do + if isBackEdge pred block + then + getBackProb pred block + else do + !f <- getFreq block + !prob <- getEdgeFreq pred block + setFreq block $! f + prob + return 0) + -- traceM $ "cycleProb:" ++ show cycleProb + let limit = 1 - 1/512 -- Paper uses 1 - epsilon, but this works. + -- determines how large likelyhoods in loops can grow. + !cycleProb <- return $ min cycleProb limit -- <- return $ if cycleProb > limit then limit else cycleProb + -- traceM $ "cycleProb:" ++ show cycleProb + + !f <- getFreq block + setFreq block (f / (1.0 - cycleProb)) + + setVisited block + calcOutFreqs head block + + -- Loops, by nesting, inner to outer + forM_ loops $ \(head, body) -> do + forM_ [0 .. nodeCount - 1] (\i -> unsafeWrite visitedNodes i True) -- Mark all nodes as visited. 
+ forM_ body (\i -> unsafeWrite visitedNodes i False) -- Mark all blocks reachable from head as not visited + forM_ body $ \block -> propFreq block head + + -- After dealing with all loops, deal with non-looping parts of the CFG + forM_ [0 .. nodeCount - 1] (\i -> unsafeWrite visitedNodes i False) -- Everything in revPostOrder is reachable + forM_ revPostOrder $ \block -> propFreq block (head revPostOrder) + + -- trace ("Final freqs:") $ return () + -- let freqString = pprFreqs freqs + -- trace (unlines freqString) $ return () + -- trace (pprFre) $ return () + graph' <- readSTRef edgeProbs + freqs' <- unsafeFreeze blockFreqs + + return (freqs', graph') + where + -- How can these lookups fail? Consider the CFG [A -> B] + predecessors :: Int -> IS.IntSet + predecessors b = fromMaybe IS.empty $ IM.lookup b revGraph + successors :: Int -> [Int] + successors b = fromMaybe (lookupError "succ" b graph)$ IM.keys <$> IM.lookup b graph + lookupError s b g = pprPanic ("Lookup error " ++ s) $ + ( text "node" <+> ppr b $$ + text "graph" <+> + vcat (map (\(k,m) -> ppr (k,m :: IM.IntMap Double)) $ IM.toList g) + ) + + nodeCount = IM.foldl' (\count toMap -> IM.foldlWithKey' countTargets count toMap) (IM.size graph) graph + where + countTargets = (\count k _ -> countNode k + count ) + countNode n = if IM.member n graph then 0 else 1 + + isBackEdge from to = S.member (from,to) backEdgeSet + backEdgeSet = S.fromList backEdges + + revGraph :: IntMap IntSet + revGraph = IM.foldlWithKey' (\m from toMap -> addEdges m from toMap) IM.empty graph + where + addEdges m0 from toMap = IM.foldlWithKey' (\m k _ -> addEdge m from k) m0 toMap + addEdge m0 from to = IM.insertWith IS.union to (IS.singleton from) m0 diff --git a/compiler/GHC/CmmToAsm/CFG/Dominators.hs b/compiler/GHC/CmmToAsm/CFG/Dominators.hs new file mode 100644 index 0000000000..b9dcacd8cb --- /dev/null +++ b/compiler/GHC/CmmToAsm/CFG/Dominators.hs @@ -0,0 +1,597 @@ +{-# LANGUAGE RankNTypes, BangPatterns, FlexibleContexts, Strict #-}
+
+{- |
+ Module : Dominators
+ Copyright : (c) Matt Morrow 2009
+ License : BSD3
+ Maintainer : <morrow@moonpatio.com>
+ Stability : experimental
+ Portability : portable
+
+ The Lengauer-Tarjan graph dominators algorithm.
+
+ \[1\] Lengauer, Tarjan,
+ /A Fast Algorithm for Finding Dominators in a Flowgraph/, 1979.
+
+ \[2\] Muchnick,
+ /Advanced Compiler Design and Implementation/, 1997.
+
+ \[3\] Brisk, Sarrafzadeh,
+ /Interference Graphs for Procedures in Static Single/
+ /Information Form are Interval Graphs/, 2007.
+
+ Originally taken from the dom-lt package.
+-}
+
+module GHC.CmmToAsm.CFG.Dominators (
+ Node,Path,Edge
+ ,Graph,Rooted
+ ,idom,ipdom
+ ,domTree,pdomTree
+ ,dom,pdom
+ ,pddfs,rpddfs
+ ,fromAdj,fromEdges
+ ,toAdj,toEdges
+ ,asTree,asGraph
+ ,parents,ancestors
+) where
+
+import GhcPrelude
+
+import Data.Bifunctor
+import Data.Tuple (swap)
+
+import Data.Tree
+import Data.IntMap(IntMap)
+import Data.IntSet(IntSet)
+import qualified Data.IntMap.Strict as IM
+import qualified Data.IntSet as IS
+
+import Control.Monad
+import Control.Monad.ST.Strict
+
+import Data.Array.ST
+import Data.Array.Base hiding ((!))
+ -- (unsafeNewArray_
+ -- ,unsafeWrite,unsafeRead
+ -- ,readArray,writeArray)
+
+import Util (debugIsOn)
+
+-----------------------------------------------------------------------------
+
+type Node = Int
+type Path = [Node]
+type Edge = (Node,Node)
+type Graph = IntMap IntSet
+type Rooted = (Node, Graph)
+
+-----------------------------------------------------------------------------
+
+-- | /Dominators/.
+-- Complexity as for @idom@.
+dom :: Rooted -> [(Node, Path)]
+dom = ancestors . domTree
+
+-- | /Post-dominators/.
+-- Complexity as for @idom@.
+pdom :: Rooted -> [(Node, Path)]
+pdom = ancestors . pdomTree
+
+-- | /Dominator tree/.
+-- Complexity as for @idom@.
+domTree :: Rooted -> Tree Node
+domTree a@(r,_) =
+ let is = filter ((/=r).fst) (idom a)
+ tg = fromEdges (fmap swap is)
+ in asTree (r,tg)
+
+-- | /Post-dominator tree/.
+-- Complexity as for @idom@.
+pdomTree :: Rooted -> Tree Node
+pdomTree a@(r,_) =
+ let is = filter ((/=r).fst) (ipdom a)
+ tg = fromEdges (fmap swap is)
+ in asTree (r,tg)
+
+-- | /Immediate dominators/.
+-- /O(|E|*alpha(|E|,|V|))/, where /alpha(m,n)/ is
+-- \"a functional inverse of Ackermann's function\".
+--
+-- This complexity bound assumes /O(1)/ indexing. Since we're
+-- using @IntMap@, it has an additional /lg |V|/ factor
+-- somewhere in there. I'm not sure where.
+idom :: Rooted -> [(Node,Node)]
+idom rg = runST (evalS idomM =<< initEnv (pruneReach rg))
+
+-- | /Immediate post-dominators/.
+-- Complexity as for @idom@.
+ipdom :: Rooted -> [(Node,Node)]
+ipdom rg = runST (evalS idomM =<< initEnv (pruneReach (second predG rg)))
+
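+-- As an informal usage sketch (the name @diamond@ below is purely
+-- illustrative): for a diamond-shaped graph rooted at 1, where node 1
+-- branches to 2 and 3 and both fall through to 4,
+--
+-- > diamond :: Rooted
+-- > diamond = (1, fromAdj [(1,[2,3]), (2,[4]), (3,[4])])
+--
+-- we would expect @idom diamond@ to report 1 as the immediate dominator of
+-- 2, 3 and 4, and @domTree diamond@ to be a tree with root 1 and the nodes
+-- 2, 3 and 4 as its direct children (in some order).
+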
+-----------------------------------------------------------------------------
+
+-- | /Post-dominated depth-first search/.
+pddfs :: Rooted -> [Node]
+pddfs = reverse . rpddfs
+
+-- | /Reverse post-dominated depth-first search/.
+rpddfs :: Rooted -> [Node]
+rpddfs = concat . levels . pdomTree
+
+-----------------------------------------------------------------------------
+
+type Dom s a = S s (Env s) a
+type NodeSet = IntSet
+type NodeMap a = IntMap a
+data Env s = Env
+ {succE :: !Graph
+ ,predE :: !Graph
+ ,bucketE :: !Graph
+ ,dfsE :: {-# UNPACK #-}!Int
+ ,zeroE :: {-# UNPACK #-}!Node
+ ,rootE :: {-# UNPACK #-}!Node
+ ,labelE :: {-# UNPACK #-}!(Arr s Node)
+ ,parentE :: {-# UNPACK #-}!(Arr s Node)
+ ,ancestorE :: {-# UNPACK #-}!(Arr s Node)
+ ,childE :: {-# UNPACK #-}!(Arr s Node)
+ ,ndfsE :: {-# UNPACK #-}!(Arr s Node)
+ ,dfnE :: {-# UNPACK #-}!(Arr s Int)
+ ,sdnoE :: {-# UNPACK #-}!(Arr s Int)
+ ,sizeE :: {-# UNPACK #-}!(Arr s Int)
+ ,domE :: {-# UNPACK #-}!(Arr s Node)
+ ,rnE :: {-# UNPACK #-}!(Arr s Node)}
+
+-----------------------------------------------------------------------------
+
+idomM :: Dom s [(Node,Node)]
+idomM = do
+ dfsDom =<< rootM
+ n <- gets dfsE
+ forM_ [n,n-1..1] (\i-> do
+ w <- ndfsM i
+ sw <- sdnoM w
+ ps <- predsM w
+ forM_ ps (\v-> do
+ u <- eval v
+ su <- sdnoM u
+ when (su < sw)
+ (store sdnoE w su))
+ z <- ndfsM =<< sdnoM w
+ modify(\e->e{bucketE=IM.adjust
+ (w`IS.insert`)
+ z (bucketE e)})
+ pw <- parentM w
+ link pw w
+ bps <- bucketM pw
+ forM_ bps (\v-> do
+ u <- eval v
+ su <- sdnoM u
+ sv <- sdnoM v
+ let dv = case su < sv of
+ True-> u
+ False-> pw
+ store domE v dv))
+ forM_ [1..n] (\i-> do
+ w <- ndfsM i
+ j <- sdnoM w
+ z <- ndfsM j
+ dw <- domM w
+ when (dw /= z)
+ (do ddw <- domM dw
+ store domE w ddw))
+ fromEnv
+
+-----------------------------------------------------------------------------
+
+eval :: Node -> Dom s Node
+eval v = do
+ n0 <- zeroM
+ a <- ancestorM v
+ case a==n0 of
+ True-> labelM v
+ False-> do
+ compress v
+ a <- ancestorM v
+ l <- labelM v
+ la <- labelM a
+ sl <- sdnoM l
+ sla <- sdnoM la
+ case sl <= sla of
+ True-> return l
+ False-> return la
+
+compress :: Node -> Dom s ()
+compress v = do
+ n0 <- zeroM
+ a <- ancestorM v
+ aa <- ancestorM a
+ when (aa /= n0) (do
+ compress a
+ a <- ancestorM v
+ aa <- ancestorM a
+ l <- labelM v
+ la <- labelM a
+ sl <- sdnoM l
+ sla <- sdnoM la
+ when (sla < sl)
+ (store labelE v la)
+ store ancestorE v aa)
+
+-----------------------------------------------------------------------------
+
+link :: Node -> Node -> Dom s ()
+link v w = do
+ n0 <- zeroM
+ lw <- labelM w
+ slw <- sdnoM lw
+ let balance s = do
+ c <- childM s
+ lc <- labelM c
+ slc <- sdnoM lc
+ case slw < slc of
+ False-> return s
+ True-> do
+ zs <- sizeM s
+ zc <- sizeM c
+ cc <- childM c
+ zcc <- sizeM cc
+ case 2*zc <= zs+zcc of
+ True-> do
+ store ancestorE c s
+ store childE s cc
+ balance s
+ False-> do
+ store sizeE c zs
+ store ancestorE s c
+ balance c
+ s <- balance w
+ lw <- labelM w
+ zw <- sizeM w
+ store labelE s lw
+ store sizeE v . (+zw) =<< sizeM v
+ let follow s = do
+ when (s /= n0) (do
+ store ancestorE s v
+ follow =<< childM s)
+ zv <- sizeM v
+ follow =<< case zv < 2*zw of
+ False-> return s
+ True-> do
+ cv <- childM v
+ store childE v s
+ return cv
+
+-----------------------------------------------------------------------------
+
+dfsDom :: Node -> Dom s ()
+dfsDom i = do
+ _ <- go i
+ n0 <- zeroM
+ r <- rootM
+ store parentE r n0
+ where go i = do
+ n <- nextM
+ store dfnE i n
+ store sdnoE i n
+ store ndfsE n i
+ store labelE i i
+ ss <- succsM i
+ forM_ ss (\j-> do
+ s <- sdnoM j
+ case s==0 of
+ False-> return()
+ True-> do
+ store parentE j i
+ go j)
+
+-----------------------------------------------------------------------------
+
+initEnv :: Rooted -> ST s (Env s)
+initEnv (r0,g0) = do
+ let (g,rnmap) = renum 1 g0
+ pred = predG g
+ r = rnmap IM.! r0
+ n = IM.size g
+ ns = [0..n]
+ m = n+1
+
+ let bucket = IM.fromList
+ (zip ns (repeat mempty))
+
+ rna <- newI m
+ writes rna (fmap swap
+ (IM.toList rnmap))
+
+ doms <- newI m
+ sdno <- newI m
+ size <- newI m
+ parent <- newI m
+ ancestor <- newI m
+ child <- newI m
+ label <- newI m
+ ndfs <- newI m
+ dfn <- newI m
+
+ forM_ [0..n] (doms.=0)
+ forM_ [0..n] (sdno.=0)
+ forM_ [1..n] (size.=1)
+ forM_ [0..n] (ancestor.=0)
+ forM_ [0..n] (child.=0)
+
+ (doms.=r) r
+ (size.=0) 0
+ (label.=0) 0
+
+ return (Env
+ {rnE = rna
+ ,dfsE = 0
+ ,zeroE = 0
+ ,rootE = r
+ ,labelE = label
+ ,parentE = parent
+ ,ancestorE = ancestor
+ ,childE = child
+ ,ndfsE = ndfs
+ ,dfnE = dfn
+ ,sdnoE = sdno
+ ,sizeE = size
+ ,succE = g
+ ,predE = pred
+ ,bucketE = bucket
+ ,domE = doms})
+
+fromEnv :: Dom s [(Node,Node)]
+fromEnv = do
+ dom <- gets domE
+ rn <- gets rnE
+ -- r <- gets rootE
+ (_,n) <- st (getBounds dom)
+ forM [1..n] (\i-> do
+ j <- st(rn!:i)
+ d <- st(dom!:i)
+ k <- st(rn!:d)
+ return (j,k))
+
+-----------------------------------------------------------------------------
+
+zeroM :: Dom s Node
+zeroM = gets zeroE
+domM :: Node -> Dom s Node
+domM = fetch domE
+rootM :: Dom s Node
+rootM = gets rootE
+succsM :: Node -> Dom s [Node]
+succsM i = gets (IS.toList . (! i) . succE)
+predsM :: Node -> Dom s [Node]
+predsM i = gets (IS.toList . (! i) . predE)
+bucketM :: Node -> Dom s [Node]
+bucketM i = gets (IS.toList . (! i) . bucketE)
+sizeM :: Node -> Dom s Int
+sizeM = fetch sizeE
+sdnoM :: Node -> Dom s Int
+sdnoM = fetch sdnoE
+-- dfnM :: Node -> Dom s Int
+-- dfnM = fetch dfnE
+ndfsM :: Int -> Dom s Node
+ndfsM = fetch ndfsE
+childM :: Node -> Dom s Node
+childM = fetch childE
+ancestorM :: Node -> Dom s Node
+ancestorM = fetch ancestorE
+parentM :: Node -> Dom s Node
+parentM = fetch parentE
+labelM :: Node -> Dom s Node
+labelM = fetch labelE
+nextM :: Dom s Int
+nextM = do
+ n <- gets dfsE
+ let n' = n+1
+ modify(\e->e{dfsE=n'})
+ return n'
+
+-----------------------------------------------------------------------------
+
+type A = STUArray
+type Arr s a = A s Int a
+
+infixl 9 !:
+infixr 2 .=
+
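+-- Write (@.=@) and read (@!:@) an array element. With assertions enabled
+-- ('debugIsOn') these use the bounds-checked 'writeArray'/'readArray';
+-- otherwise they go through the unsafe indexing primitives.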
+(.=) :: (MArray (A s) a (ST s))
+ => Arr s a -> a -> Int -> ST s ()
+(v .= x) i
+ | debugIsOn = writeArray v i x
+ | otherwise = unsafeWrite v i x
+
+(!:) :: (MArray (A s) a (ST s))
+ => A s Int a -> Int -> ST s a
+a !: i
+ | debugIsOn = do
+ o <- readArray a i
+ return $! o
+ | otherwise = do
+ o <- unsafeRead a i
+ return $! o
+
+new :: (MArray (A s) a (ST s))
+ => Int -> ST s (Arr s a)
+new n = unsafeNewArray_ (0,n-1)
+
+newI :: Int -> ST s (Arr s Int)
+newI = new
+
+-- newD :: Int -> ST s (Arr s Double)
+-- newD = new
+
+-- dump :: (MArray (A s) a (ST s)) => Arr s a -> ST s [a]
+-- dump a = do
+-- (m,n) <- getBounds a
+-- forM [m..n] (\i -> a!:i)
+
+writes :: (MArray (A s) a (ST s))
+ => Arr s a -> [(Int,a)] -> ST s ()
+writes a xs = forM_ xs (\(i,x) -> (a.=x) i)
+
+-- arr :: (MArray (A s) a (ST s)) => [a] -> ST s (Arr s a)
+-- arr xs = do
+-- let n = length xs
+-- a <- new n
+-- go a n 0 xs
+-- return a
+-- where go _ _ _ [] = return ()
+-- go a n i (x:xs)
+-- | i <= n = (a.=x) i >> go a n (i+1) xs
+-- | otherwise = return ()
+
+-----------------------------------------------------------------------------
+
+(!) :: Monoid a => IntMap a -> Int -> a
+(!) g n = maybe mempty id (IM.lookup n g)
+
+fromAdj :: [(Node, [Node])] -> Graph
+fromAdj = IM.fromList . fmap (second IS.fromList)
+
+fromEdges :: [Edge] -> Graph
+fromEdges = collectI IS.union fst (IS.singleton . snd)
+
+toAdj :: Graph -> [(Node, [Node])]
+toAdj = fmap (second IS.toList) . IM.toList
+
+toEdges :: Graph -> [Edge]
+toEdges = concatMap (uncurry (fmap . (,))) . toAdj
+
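+-- | Predecessor graph: reverses every edge of the argument, keeping every
+-- key of the input (possibly with an empty predecessor set). As a small
+-- example, @predG (fromAdj [(1,[2]),(2,[3])])@ should give
+-- @fromAdj [(1,[]),(2,[1]),(3,[2])]@.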
+predG :: Graph -> Graph
+predG g = IM.unionWith IS.union (go g) g0
+ where g0 = fmap (const mempty) g
+ f :: IntMap IntSet -> Int -> IntSet -> IntMap IntSet
+ f m i a = foldl' (\m p -> IM.insertWith mappend p
+ (IS.singleton i) m)
+ m
+ (IS.toList a)
+ go :: IntMap IntSet -> IntMap IntSet
+ go = flip IM.foldlWithKey' mempty f
+
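+-- | Restrict a rooted graph to the nodes reachable from its root, dropping
+-- unreachable nodes together with their edges. For instance, pruning
+-- @(1, fromAdj [(1,[2]),(2,[]),(3,[1])])@ should keep only nodes 1 and 2.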
+pruneReach :: Rooted -> Rooted
+pruneReach (r,g) = (r,g2)
+ where is = reachable
+ (maybe mempty id
+ . flip IM.lookup g) $ r
+ g2 = IM.fromList
+ . fmap (second (IS.filter (`IS.member`is)))
+ . filter ((`IS.member`is) . fst)
+ . IM.toList $ g
+
+tip :: Tree a -> (a, [Tree a])
+tip (Node a ts) = (a, ts)
+
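+-- @parents@ pairs every non-root node of a tree with its parent, while
+-- @ancestors@ pairs it with the whole path back to the root (nearest
+-- ancestor first). For @Node 1 [Node 2 [], Node 3 [Node 4 []]]@ we would
+-- expect @parents@ to give [(2,1),(3,1),(4,3)] and @ancestors@ to give
+-- [(2,[1]),(3,[1]),(4,[3,1])].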
+parents :: Tree a -> [(a, a)]
+parents (Node i xs) = p i xs
+ ++ concatMap parents xs
+ where p i = fmap (flip (,) i . rootLabel)
+
+ancestors :: Tree a -> [(a, [a])]
+ancestors = go []
+ where go acc (Node i xs)
+ = let acc' = i:acc
+ in p acc' xs ++ concatMap (go acc') xs
+ p is = fmap (flip (,) is . rootLabel)
+
+asGraph :: Tree Node -> Rooted
+asGraph t@(Node a _) = let g = go t in (a, fromAdj g)
+ where go (Node a ts) = let as = (fst . unzip . fmap tip) ts
+ in (a, as) : concatMap go ts
+
+asTree :: Rooted -> Tree Node
+asTree (r,g) = let go a = Node a (fmap go ((IS.toList . f) a))
+ f = (g !)
+ in go r
+
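+-- | The set of nodes reachable from a start node (the start node included),
+-- given a successor function such as @(g !)@ for a 'Graph' @g@.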
+reachable :: (Node -> NodeSet) -> (Node -> NodeSet)
+reachable f a = go (IS.singleton a) a
+ where go seen a = let s = f a
+ as = IS.toList (s `IS.difference` seen)
+ in foldl' go (s `IS.union` seen) as
+
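+-- | Build an 'IntMap' from a list by projecting each element to a key and a
+-- value and combining values that land on the same key. For example,
+-- @collectI (+) fst snd [(1,2),(1,3),(2,5)]@ should produce a map sending
+-- 1 to 5 and 2 to 5.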
+collectI :: (c -> c -> c)
+ -> (a -> Int) -> (a -> c) -> [a] -> IntMap c
+collectI (<>) f g
+ = foldl' (\m a -> IM.insertWith (<>)
+ (f a)
+ (g a) m) mempty
+
+-- collect :: (Ord b) => (c -> c -> c)
+-- -> (a -> b) -> (a -> c) -> [a] -> Map b c
+-- collect (<>) f g
+-- = foldl' (\m a -> SM.insertWith (<>)
+-- (f a)
+-- (g a) m) mempty
+
+-- | Renumber the nodes of a graph densely, starting from the given index.
+-- Returns the renamed graph together with the old-to-new node mapping.
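+-- For instance, @renum 1@ on a graph whose nodes are 3 and 5 should return
+-- an isomorphic graph over 1 and 2 together with the mapping @{3 -> 1, 5 -> 2}@.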
+renum :: Int -> Graph -> (Graph, NodeMap Node)
+renum from = (\(_,m,g)->(g,m))
+ . IM.foldlWithKey'
+ f (from,mempty,mempty)
+ where
+ f :: (Int, NodeMap Node, IntMap IntSet) -> Node -> IntSet
+ -> (Int, NodeMap Node, IntMap IntSet)
+ f (!n,!env,!new) i ss =
+ let (j,n2,env2) = go n env i
+ (n3,env3,ss2) = IS.fold
+ (\k (!n,!env,!new)->
+ case go n env k of
+ (l,n2,env2)-> (n2,env2,l `IS.insert` new))
+ (n2,env2,mempty) ss
+ new2 = IM.insertWith IS.union j ss2 new
+ in (n3,env3,new2)
+ go :: Int
+ -> NodeMap Node
+ -> Node
+ -> (Node,Int,NodeMap Node)
+ go !n !env i =
+ case IM.lookup i env of
+ Just j -> (j,n,env)
+ Nothing -> (n,n+1,IM.insert i n env)
+
+-----------------------------------------------------------------------------
+
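+-- A continuation-passing encoding of a state monad over 'ST': @S z s a@
+-- behaves like @s -> ST z (a, s)@ (cf. the commented-out @runS@ below), with
+-- 'gets', 'modify' and the array accessors 'store'/'fetch' as its basic
+-- operations.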
+newtype S z s a = S {unS :: forall o. (a -> s -> ST z o) -> s -> ST z o}
+instance Functor (S z s) where
+ fmap f (S g) = S (\k -> g (k . f))
+instance Monad (S z s) where
+ return = pure
+ S g >>= f = S (\k -> g (\a -> unS (f a) k))
+instance Applicative (S z s) where
+ pure a = S (\k -> k a)
+ (<*>) = ap
+-- get :: S z s s
+-- get = S (\k s -> k s s)
+gets :: (s -> a) -> S z s a
+gets f = S (\k s -> k (f s) s)
+-- set :: s -> S z s ()
+-- set s = S (\k _ -> k () s)
+modify :: (s -> s) -> S z s ()
+modify f = S (\k -> k () . f)
+-- runS :: S z s a -> s -> ST z (a, s)
+-- runS (S g) = g (\a s -> return (a,s))
+evalS :: S z s a -> s -> ST z a
+evalS (S g) = g ((return .) . const)
+-- execS :: S z s a -> s -> ST z s
+-- execS (S g) = g ((return .) . flip const)
+st :: ST z a -> S z s a
+st m = S (\k s-> do
+ a <- m
+ k a s)
+store :: (MArray (A z) a (ST z))
+ => (s -> Arr z a) -> Int -> a -> S z s ()
+store f i x = do
+ a <- gets f
+ st ((a.=x) i)
+fetch :: (MArray (A z) a (ST z))
+ => (s -> Arr z a) -> Int -> S z s a
+fetch f i = do
+ a <- gets f
+ st (a!:i)
+
diff --git a/compiler/GHC/CmmToAsm/CPrim.hs b/compiler/GHC/CmmToAsm/CPrim.hs new file mode 100644 index 0000000000..34c3a7ff6a --- /dev/null +++ b/compiler/GHC/CmmToAsm/CPrim.hs @@ -0,0 +1,133 @@ +-- | Generating C symbol names emitted by the compiler. +module GHC.CmmToAsm.CPrim + ( atomicReadLabel + , atomicWriteLabel + , atomicRMWLabel + , cmpxchgLabel + , popCntLabel + , pdepLabel + , pextLabel + , bSwapLabel + , bRevLabel + , clzLabel + , ctzLabel + , word2FloatLabel + ) where + +import GhcPrelude + +import GHC.Cmm.Type +import GHC.Cmm.MachOp +import Outputable + +popCntLabel :: Width -> String +popCntLabel w = "hs_popcnt" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "popCntLabel: Unsupported word width " (ppr w) + +pdepLabel :: Width -> String +pdepLabel w = "hs_pdep" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "pdepLabel: Unsupported word width " (ppr w) + +pextLabel :: Width -> String +pextLabel w = "hs_pext" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "pextLabel: Unsupported word width " (ppr w) + +bSwapLabel :: Width -> String +bSwapLabel w = "hs_bswap" ++ pprWidth w + where + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "bSwapLabel: Unsupported word width " (ppr w) + +bRevLabel :: Width -> String +bRevLabel w = "hs_bitrev" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "bRevLabel: Unsupported word width " (ppr w) + +clzLabel :: Width -> String +clzLabel w = "hs_clz" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "clzLabel: Unsupported word width " (ppr w) + +ctzLabel :: Width -> String +ctzLabel w = "hs_ctz" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "ctzLabel: Unsupported word width " (ppr w) + +word2FloatLabel :: Width -> String +word2FloatLabel w = "hs_word2float" ++ pprWidth w + where + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "word2FloatLabel: Unsupported word width " (ppr w) + +atomicRMWLabel :: Width -> AtomicMachOp -> String +atomicRMWLabel w amop = "hs_atomic_" ++ pprFunName amop ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "atomicRMWLabel: Unsupported word width " (ppr w) + + pprFunName AMO_Add = "add" + pprFunName AMO_Sub = "sub" + pprFunName AMO_And = "and" + pprFunName AMO_Nand = "nand" + pprFunName AMO_Or = "or" + pprFunName AMO_Xor = "xor" + +cmpxchgLabel :: Width -> String +cmpxchgLabel w = "hs_cmpxchg" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "cmpxchgLabel: Unsupported word width " (ppr w) + +atomicReadLabel :: Width -> String +atomicReadLabel w = "hs_atomicread" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "atomicReadLabel: Unsupported word width " (ppr w) + +atomicWriteLabel :: Width -> String +atomicWriteLabel w = "hs_atomicwrite" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + 
pprWidth W64 = "64" + pprWidth w = pprPanic "atomicWriteLabel: Unsupported word width " (ppr w) diff --git a/compiler/GHC/CmmToAsm/Dwarf.hs b/compiler/GHC/CmmToAsm/Dwarf.hs new file mode 100644 index 0000000000..8cacd19023 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Dwarf.hs @@ -0,0 +1,269 @@ +module GHC.CmmToAsm.Dwarf ( + dwarfGen + ) where + +import GhcPrelude + +import GHC.Cmm.CLabel +import GHC.Cmm.Expr ( GlobalReg(..) ) +import Config ( cProjectName, cProjectVersion ) +import CoreSyn ( Tickish(..) ) +import GHC.Cmm.DebugBlock +import GHC.Driver.Session +import Module +import Outputable +import GHC.Platform +import Unique +import UniqSupply + +import GHC.CmmToAsm.Dwarf.Constants +import GHC.CmmToAsm.Dwarf.Types + +import Control.Arrow ( first ) +import Control.Monad ( mfilter ) +import Data.Maybe +import Data.List ( sortBy ) +import Data.Ord ( comparing ) +import qualified Data.Map as Map +import System.FilePath +import System.Directory ( getCurrentDirectory ) + +import qualified GHC.Cmm.Dataflow.Label as H +import qualified GHC.Cmm.Dataflow.Collections as H + +-- | Generate DWARF/debug information +dwarfGen :: DynFlags -> ModLocation -> UniqSupply -> [DebugBlock] + -> IO (SDoc, UniqSupply) +dwarfGen _ _ us [] = return (empty, us) +dwarfGen df modLoc us blocks = do + + -- Convert debug data structures to DWARF info records + -- We strip out block information when running with -g0 or -g1. + let procs = debugSplitProcs blocks + stripBlocks dbg + | debugLevel df < 2 = dbg { dblBlocks = [] } + | otherwise = dbg + compPath <- getCurrentDirectory + let lowLabel = dblCLabel $ head procs + highLabel = mkAsmTempEndLabel $ dblCLabel $ last procs + dwarfUnit = DwarfCompileUnit + { dwChildren = map (procToDwarf df) (map stripBlocks procs) + , dwName = fromMaybe "" (ml_hs_file modLoc) + , dwCompDir = addTrailingPathSeparator compPath + , dwProducer = cProjectName ++ " " ++ cProjectVersion + , dwLowLabel = lowLabel + , dwHighLabel = highLabel + , dwLineLabel = dwarfLineLabel + } + + -- Check whether we have any source code information, so we do not + -- end up writing a pointer to an empty .debug_line section + -- (dsymutil on Mac Os gets confused by this). 
+ let haveSrcIn blk = isJust (dblSourceTick blk) && isJust (dblPosition blk) + || any haveSrcIn (dblBlocks blk) + haveSrc = any haveSrcIn procs + + -- .debug_abbrev section: Declare the format we're using + let abbrevSct = pprAbbrevDecls haveSrc + + -- .debug_info section: Information records on procedures and blocks + let -- unique to identify start and end compilation unit .debug_inf + (unitU, us') = takeUniqFromSupply us + infoSct = vcat [ ptext dwarfInfoLabel <> colon + , dwarfInfoSection + , compileUnitHeader unitU + , pprDwarfInfo haveSrc dwarfUnit + , compileUnitFooter unitU + ] + + -- .debug_line section: Generated mainly by the assembler, but we + -- need to label it + let lineSct = dwarfLineSection $$ + ptext dwarfLineLabel <> colon + + -- .debug_frame section: Information about the layout of the GHC stack + let (framesU, us'') = takeUniqFromSupply us' + frameSct = dwarfFrameSection $$ + ptext dwarfFrameLabel <> colon $$ + pprDwarfFrame (debugFrame framesU procs) + + -- .aranges section: Information about the bounds of compilation units + let aranges' | gopt Opt_SplitSections df = map mkDwarfARange procs + | otherwise = [DwarfARange lowLabel highLabel] + let aranges = dwarfARangesSection $$ pprDwarfARanges aranges' unitU + + return (infoSct $$ abbrevSct $$ lineSct $$ frameSct $$ aranges, us'') + +-- | Build an address range entry for one proc. +-- With split sections, each proc needs its own entry, since they may get +-- scattered in the final binary. Without split sections, we could make a +-- single arange based on the first/last proc. +mkDwarfARange :: DebugBlock -> DwarfARange +mkDwarfARange proc = DwarfARange start end + where + start = dblCLabel proc + end = mkAsmTempEndLabel start + +-- | Header for a compilation unit, establishing global format +-- parameters +compileUnitHeader :: Unique -> SDoc +compileUnitHeader unitU = sdocWithPlatform $ \plat -> + let cuLabel = mkAsmTempLabel unitU -- sits right before initialLength field + length = ppr (mkAsmTempEndLabel cuLabel) <> char '-' <> ppr cuLabel + <> text "-4" -- length of initialLength field + in vcat [ ppr cuLabel <> colon + , text "\t.long " <> length -- compilation unit size + , pprHalf 3 -- DWARF version + , sectionOffset (ptext dwarfAbbrevLabel) (ptext dwarfAbbrevLabel) + -- abbrevs offset + , text "\t.byte " <> ppr (platformWordSizeInBytes plat) -- word size + ] + +-- | Compilation unit footer, mainly establishing size of debug sections +compileUnitFooter :: Unique -> SDoc +compileUnitFooter unitU = + let cuEndLabel = mkAsmTempEndLabel $ mkAsmTempLabel unitU + in ppr cuEndLabel <> colon + +-- | Splits the blocks by procedures. In the result all nested blocks +-- will come from the same procedure as the top-level block. See +-- Note [Splitting DebugBlocks] for details. +debugSplitProcs :: [DebugBlock] -> [DebugBlock] +debugSplitProcs b = concat $ H.mapElems $ mergeMaps $ map (split Nothing) b + where mergeMaps = foldr (H.mapUnionWithKey (const (++))) H.mapEmpty + split :: Maybe DebugBlock -> DebugBlock -> H.LabelMap [DebugBlock] + split parent blk = H.mapInsert prc [blk'] nested + where prc = dblProcedure blk + blk' = blk { dblBlocks = own_blks + , dblParent = parent + } + own_blks = fromMaybe [] $ H.mapLookup prc nested + nested = mergeMaps $ map (split parent') $ dblBlocks blk + -- Figure out who should be the parent of nested blocks. + -- If @blk@ is optimized out then it isn't a good choice + -- and we just use its parent. 
+ parent' + | Nothing <- dblPosition blk = parent + | otherwise = Just blk + +{- +Note [Splitting DebugBlocks] + +DWARF requires that we break up the nested DebugBlocks produced from +the C-- AST. For instance, we begin with tick trees containing nested procs. +For example, + + proc A [tick1, tick2] + block B [tick3] + proc C [tick4] + +when producing DWARF we need to procs (which are represented in DWARF as +TAG_subprogram DIEs) to be top-level DIEs. debugSplitProcs is responsible for +this transform, pulling out the nested procs into top-level procs. + +However, in doing this we need to be careful to preserve the parentage of the +nested procs. This is the reason DebugBlocks carry the dblParent field, allowing +us to reorganize the above tree as, + + proc A [tick1, tick2] + block B [tick3] + proc C [tick4] parent=B + +Here we have annotated the new proc C with an attribute giving its original +parent, B. +-} + +-- | Generate DWARF info for a procedure debug block +procToDwarf :: DynFlags -> DebugBlock -> DwarfInfo +procToDwarf df prc + = DwarfSubprogram { dwChildren = map (blockToDwarf df) (dblBlocks prc) + , dwName = case dblSourceTick prc of + Just s@SourceNote{} -> sourceName s + _otherwise -> showSDocDump df $ ppr $ dblLabel prc + , dwLabel = dblCLabel prc + , dwParent = fmap mkAsmTempDieLabel + $ mfilter goodParent + $ fmap dblCLabel (dblParent prc) + } + where + goodParent a | a == dblCLabel prc = False + -- Omit parent if it would be self-referential + goodParent a | not (externallyVisibleCLabel a) + , debugLevel df < 2 = False + -- We strip block information when running -g0 or -g1, don't + -- refer to blocks in that case. Fixes #14894. + goodParent _ = True + +-- | Generate DWARF info for a block +blockToDwarf :: DynFlags -> DebugBlock -> DwarfInfo +blockToDwarf df blk + = DwarfBlock { dwChildren = concatMap (tickToDwarf df) (dblTicks blk) + ++ map (blockToDwarf df) (dblBlocks blk) + , dwLabel = dblCLabel blk + , dwMarker = marker + } + where + marker + | Just _ <- dblPosition blk = Just $ mkAsmTempLabel $ dblLabel blk + | otherwise = Nothing -- block was optimized out + +tickToDwarf :: DynFlags -> Tickish () -> [DwarfInfo] +tickToDwarf _ (SourceNote ss _) = [DwarfSrcNote ss] +tickToDwarf _ _ = [] + +-- | Generates the data for the debug frame section, which encodes the +-- desired stack unwind behaviour for the debugger +debugFrame :: Unique -> [DebugBlock] -> DwarfFrame +debugFrame u procs + = DwarfFrame { dwCieLabel = mkAsmTempLabel u + , dwCieInit = initUws + , dwCieProcs = map (procToFrame initUws) procs + } + where + initUws :: UnwindTable + initUws = Map.fromList [(Sp, Just (UwReg Sp 0))] + +-- | Generates unwind information for a procedure debug block +procToFrame :: UnwindTable -> DebugBlock -> DwarfFrameProc +procToFrame initUws blk + = DwarfFrameProc { dwFdeProc = dblCLabel blk + , dwFdeHasInfo = dblHasInfoTbl blk + , dwFdeBlocks = map (uncurry blockToFrame) + (setHasInfo blockUws) + } + where blockUws :: [(DebugBlock, [UnwindPoint])] + blockUws = map snd $ sortBy (comparing fst) $ flatten blk + + flatten :: DebugBlock + -> [(Int, (DebugBlock, [UnwindPoint]))] + flatten b@DebugBlock{ dblPosition=pos, dblUnwind=uws, dblBlocks=blocks } + | Just p <- pos = (p, (b, uws')):nested + | otherwise = nested -- block was optimized out + where uws' = addDefaultUnwindings initUws uws + nested = concatMap flatten blocks + + -- | If the current procedure has an info table, then we also say that + -- its first block has one to ensure that it gets the necessary -1 + -- offset applied 
to its start address. + -- See Note [Info Offset] in Dwarf.Types. + setHasInfo :: [(DebugBlock, [UnwindPoint])] + -> [(DebugBlock, [UnwindPoint])] + setHasInfo [] = [] + setHasInfo (c0:cs) = first setIt c0 : cs + where + setIt child = + child { dblHasInfoTbl = dblHasInfoTbl child + || dblHasInfoTbl blk } + +blockToFrame :: DebugBlock -> [UnwindPoint] -> DwarfFrameBlock +blockToFrame blk uws + = DwarfFrameBlock { dwFdeBlkHasInfo = dblHasInfoTbl blk + , dwFdeUnwind = uws + } + +addDefaultUnwindings :: UnwindTable -> [UnwindPoint] -> [UnwindPoint] +addDefaultUnwindings tbl pts = + [ UnwindPoint lbl (tbl' `mappend` tbl) + -- mappend is left-biased + | UnwindPoint lbl tbl' <- pts + ] diff --git a/compiler/GHC/CmmToAsm/Dwarf/Constants.hs b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs new file mode 100644 index 0000000000..4ab54b6629 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs @@ -0,0 +1,229 @@ +-- | Constants describing the DWARF format. Most of this simply +-- mirrors /usr/include/dwarf.h. + +module GHC.CmmToAsm.Dwarf.Constants where + +import GhcPrelude + +import AsmUtils +import FastString +import GHC.Platform +import Outputable + +import GHC.Platform.Reg +import GHC.CmmToAsm.X86.Regs + +import Data.Word + +-- | Language ID used for Haskell. +dW_LANG_Haskell :: Word +dW_LANG_Haskell = 0x18 + -- Thanks to Nathan Howell for getting us our very own language ID! + +-- * Dwarf tags +dW_TAG_compile_unit, dW_TAG_subroutine_type, + dW_TAG_file_type, dW_TAG_subprogram, dW_TAG_lexical_block, + dW_TAG_base_type, dW_TAG_structure_type, dW_TAG_pointer_type, + dW_TAG_array_type, dW_TAG_subrange_type, dW_TAG_typedef, + dW_TAG_variable, dW_TAG_arg_variable, dW_TAG_auto_variable, + dW_TAG_ghc_src_note :: Word +dW_TAG_array_type = 1 +dW_TAG_lexical_block = 11 +dW_TAG_pointer_type = 15 +dW_TAG_compile_unit = 17 +dW_TAG_structure_type = 19 +dW_TAG_typedef = 22 +dW_TAG_subroutine_type = 32 +dW_TAG_subrange_type = 33 +dW_TAG_base_type = 36 +dW_TAG_file_type = 41 +dW_TAG_subprogram = 46 +dW_TAG_variable = 52 +dW_TAG_auto_variable = 256 +dW_TAG_arg_variable = 257 + +dW_TAG_ghc_src_note = 0x5b00 + +-- * Dwarf attributes +dW_AT_name, dW_AT_stmt_list, dW_AT_low_pc, dW_AT_high_pc, dW_AT_language, + dW_AT_comp_dir, dW_AT_producer, dW_AT_external, dW_AT_frame_base, + dW_AT_use_UTF8, dW_AT_MIPS_linkage_name :: Word +dW_AT_name = 0x03 +dW_AT_stmt_list = 0x10 +dW_AT_low_pc = 0x11 +dW_AT_high_pc = 0x12 +dW_AT_language = 0x13 +dW_AT_comp_dir = 0x1b +dW_AT_producer = 0x25 +dW_AT_external = 0x3f +dW_AT_frame_base = 0x40 +dW_AT_use_UTF8 = 0x53 +dW_AT_MIPS_linkage_name = 0x2007 + +-- * Custom DWARF attributes +-- Chosen a more or less random section of the vendor-extensible region + +-- ** Describing C-- blocks +-- These appear in DW_TAG_lexical_scope DIEs corresponding to C-- blocks +dW_AT_ghc_tick_parent :: Word +dW_AT_ghc_tick_parent = 0x2b20 + +-- ** Describing source notes +-- These appear in DW_TAG_ghc_src_note DIEs +dW_AT_ghc_span_file, dW_AT_ghc_span_start_line, + dW_AT_ghc_span_start_col, dW_AT_ghc_span_end_line, + dW_AT_ghc_span_end_col :: Word +dW_AT_ghc_span_file = 0x2b00 +dW_AT_ghc_span_start_line = 0x2b01 +dW_AT_ghc_span_start_col = 0x2b02 +dW_AT_ghc_span_end_line = 0x2b03 +dW_AT_ghc_span_end_col = 0x2b04 + + +-- * Abbrev declarations +dW_CHILDREN_no, dW_CHILDREN_yes :: Word8 +dW_CHILDREN_no = 0 +dW_CHILDREN_yes = 1 + +dW_FORM_addr, dW_FORM_data2, dW_FORM_data4, dW_FORM_string, dW_FORM_flag, + dW_FORM_block1, dW_FORM_ref4, dW_FORM_ref_addr, dW_FORM_flag_present :: Word +dW_FORM_addr = 0x01 +dW_FORM_data2 
= 0x05 +dW_FORM_data4 = 0x06 +dW_FORM_string = 0x08 +dW_FORM_flag = 0x0c +dW_FORM_block1 = 0x0a +dW_FORM_ref_addr = 0x10 +dW_FORM_ref4 = 0x13 +dW_FORM_flag_present = 0x19 + +-- * Dwarf native types +dW_ATE_address, dW_ATE_boolean, dW_ATE_float, dW_ATE_signed, + dW_ATE_signed_char, dW_ATE_unsigned, dW_ATE_unsigned_char :: Word +dW_ATE_address = 1 +dW_ATE_boolean = 2 +dW_ATE_float = 4 +dW_ATE_signed = 5 +dW_ATE_signed_char = 6 +dW_ATE_unsigned = 7 +dW_ATE_unsigned_char = 8 + +-- * Call frame information +dW_CFA_set_loc, dW_CFA_undefined, dW_CFA_same_value, + dW_CFA_def_cfa, dW_CFA_def_cfa_offset, dW_CFA_def_cfa_expression, + dW_CFA_expression, dW_CFA_offset_extended_sf, dW_CFA_def_cfa_offset_sf, + dW_CFA_def_cfa_sf, dW_CFA_val_offset, dW_CFA_val_expression, + dW_CFA_offset :: Word8 +dW_CFA_set_loc = 0x01 +dW_CFA_undefined = 0x07 +dW_CFA_same_value = 0x08 +dW_CFA_def_cfa = 0x0c +dW_CFA_def_cfa_offset = 0x0e +dW_CFA_def_cfa_expression = 0x0f +dW_CFA_expression = 0x10 +dW_CFA_offset_extended_sf = 0x11 +dW_CFA_def_cfa_sf = 0x12 +dW_CFA_def_cfa_offset_sf = 0x13 +dW_CFA_val_offset = 0x14 +dW_CFA_val_expression = 0x16 +dW_CFA_offset = 0x80 + +-- * Operations +dW_OP_addr, dW_OP_deref, dW_OP_consts, + dW_OP_minus, dW_OP_mul, dW_OP_plus, + dW_OP_lit0, dW_OP_breg0, dW_OP_call_frame_cfa :: Word8 +dW_OP_addr = 0x03 +dW_OP_deref = 0x06 +dW_OP_consts = 0x11 +dW_OP_minus = 0x1c +dW_OP_mul = 0x1e +dW_OP_plus = 0x22 +dW_OP_lit0 = 0x30 +dW_OP_breg0 = 0x70 +dW_OP_call_frame_cfa = 0x9c + +-- * Dwarf section declarations +dwarfInfoSection, dwarfAbbrevSection, dwarfLineSection, + dwarfFrameSection, dwarfGhcSection, dwarfARangesSection :: SDoc +dwarfInfoSection = dwarfSection "info" +dwarfAbbrevSection = dwarfSection "abbrev" +dwarfLineSection = dwarfSection "line" +dwarfFrameSection = dwarfSection "frame" +dwarfGhcSection = dwarfSection "ghc" +dwarfARangesSection = dwarfSection "aranges" + +dwarfSection :: String -> SDoc +dwarfSection name = sdocWithPlatform $ \plat -> + case platformOS plat of + os | osElfTarget os + -> text "\t.section .debug_" <> text name <> text ",\"\"," + <> sectionType "progbits" + | osMachOTarget os + -> text "\t.section __DWARF,__debug_" <> text name <> text ",regular,debug" + | otherwise + -> text "\t.section .debug_" <> text name <> text ",\"dr\"" + +-- * Dwarf section labels +dwarfInfoLabel, dwarfAbbrevLabel, dwarfLineLabel, dwarfFrameLabel :: PtrString +dwarfInfoLabel = sLit ".Lsection_info" +dwarfAbbrevLabel = sLit ".Lsection_abbrev" +dwarfLineLabel = sLit ".Lsection_line" +dwarfFrameLabel = sLit ".Lsection_frame" + +-- | Mapping of registers to DWARF register numbers +dwarfRegNo :: Platform -> Reg -> Word8 +dwarfRegNo p r = case platformArch p of + ArchX86 + | r == eax -> 0 + | r == ecx -> 1 -- yes, no typo + | r == edx -> 2 + | r == ebx -> 3 + | r == esp -> 4 + | r == ebp -> 5 + | r == esi -> 6 + | r == edi -> 7 + ArchX86_64 + | r == rax -> 0 + | r == rdx -> 1 -- this neither. The order GCC allocates registers in? 
+ | r == rcx -> 2 + | r == rbx -> 3 + | r == rsi -> 4 + | r == rdi -> 5 + | r == rbp -> 6 + | r == rsp -> 7 + | r == r8 -> 8 + | r == r9 -> 9 + | r == r10 -> 10 + | r == r11 -> 11 + | r == r12 -> 12 + | r == r13 -> 13 + | r == r14 -> 14 + | r == r15 -> 15 + | r == xmm0 -> 17 + | r == xmm1 -> 18 + | r == xmm2 -> 19 + | r == xmm3 -> 20 + | r == xmm4 -> 21 + | r == xmm5 -> 22 + | r == xmm6 -> 23 + | r == xmm7 -> 24 + | r == xmm8 -> 25 + | r == xmm9 -> 26 + | r == xmm10 -> 27 + | r == xmm11 -> 28 + | r == xmm12 -> 29 + | r == xmm13 -> 30 + | r == xmm14 -> 31 + | r == xmm15 -> 32 + _other -> error "dwarfRegNo: Unsupported platform or unknown register!" + +-- | Virtual register number to use for return address. +dwarfReturnRegNo :: Platform -> Word8 +dwarfReturnRegNo p + -- We "overwrite" IP with our pseudo register - that makes sense, as + -- when using this mechanism gdb already knows the IP anyway. Clang + -- does this too, so it must be safe. + = case platformArch p of + ArchX86 -> 8 -- eip + ArchX86_64 -> 16 -- rip + _other -> error "dwarfReturnRegNo: Unsupported platform!" diff --git a/compiler/GHC/CmmToAsm/Dwarf/Types.hs b/compiler/GHC/CmmToAsm/Dwarf/Types.hs new file mode 100644 index 0000000000..5eda37a653 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Dwarf/Types.hs @@ -0,0 +1,612 @@ +module GHC.CmmToAsm.Dwarf.Types + ( -- * Dwarf information + DwarfInfo(..) + , pprDwarfInfo + , pprAbbrevDecls + -- * Dwarf address range table + , DwarfARange(..) + , pprDwarfARanges + -- * Dwarf frame + , DwarfFrame(..), DwarfFrameProc(..), DwarfFrameBlock(..) + , pprDwarfFrame + -- * Utilities + , pprByte + , pprHalf + , pprData4' + , pprDwWord + , pprWord + , pprLEBWord + , pprLEBInt + , wordAlign + , sectionOffset + ) + where + +import GhcPrelude + +import GHC.Cmm.DebugBlock +import GHC.Cmm.CLabel +import GHC.Cmm.Expr ( GlobalReg(..) ) +import Encoding +import FastString +import Outputable +import GHC.Platform +import Unique +import GHC.Platform.Reg +import SrcLoc +import Util + +import GHC.CmmToAsm.Dwarf.Constants + +import qualified Data.ByteString as BS +import qualified Control.Monad.Trans.State.Strict as S +import Control.Monad (zipWithM, join) +import Data.Bits +import qualified Data.Map as Map +import Data.Word +import Data.Char + +import GHC.Platform.Regs + +-- | Individual dwarf records. Each one will be encoded as an entry in +-- the @.debug_info@ section. +data DwarfInfo + = DwarfCompileUnit { dwChildren :: [DwarfInfo] + , dwName :: String + , dwProducer :: String + , dwCompDir :: String + , dwLowLabel :: CLabel + , dwHighLabel :: CLabel + , dwLineLabel :: PtrString } + | DwarfSubprogram { dwChildren :: [DwarfInfo] + , dwName :: String + , dwLabel :: CLabel + , dwParent :: Maybe CLabel + -- ^ label of DIE belonging to the parent tick + } + | DwarfBlock { dwChildren :: [DwarfInfo] + , dwLabel :: CLabel + , dwMarker :: Maybe CLabel + } + | DwarfSrcNote { dwSrcSpan :: RealSrcSpan + } + +-- | Abbreviation codes used for encoding above records in the +-- @.debug_info@ section. +data DwarfAbbrev + = DwAbbrNull -- ^ Pseudo, used for marking the end of lists + | DwAbbrCompileUnit + | DwAbbrSubprogram + | DwAbbrSubprogramWithParent + | DwAbbrBlockWithoutCode + | DwAbbrBlock + | DwAbbrGhcSrcNote + deriving (Eq, Enum) + +-- | Generate assembly for the given abbreviation code +pprAbbrev :: DwarfAbbrev -> SDoc +pprAbbrev = pprLEBWord . fromIntegral . fromEnum + +-- | Abbreviation declaration. This explains the binary encoding we +-- use for representing 'DwarfInfo'. 
Be aware that this must be updated +-- along with 'pprDwarfInfo'. +pprAbbrevDecls :: Bool -> SDoc +pprAbbrevDecls haveDebugLine = + let mkAbbrev abbr tag chld flds = + let fld (tag, form) = pprLEBWord tag $$ pprLEBWord form + in pprAbbrev abbr $$ pprLEBWord tag $$ pprByte chld $$ + vcat (map fld flds) $$ pprByte 0 $$ pprByte 0 + -- These are shared between DwAbbrSubprogram and + -- DwAbbrSubprogramWithParent + subprogramAttrs = + [ (dW_AT_name, dW_FORM_string) + , (dW_AT_MIPS_linkage_name, dW_FORM_string) + , (dW_AT_external, dW_FORM_flag) + , (dW_AT_low_pc, dW_FORM_addr) + , (dW_AT_high_pc, dW_FORM_addr) + , (dW_AT_frame_base, dW_FORM_block1) + ] + in dwarfAbbrevSection $$ + ptext dwarfAbbrevLabel <> colon $$ + mkAbbrev DwAbbrCompileUnit dW_TAG_compile_unit dW_CHILDREN_yes + ([(dW_AT_name, dW_FORM_string) + , (dW_AT_producer, dW_FORM_string) + , (dW_AT_language, dW_FORM_data4) + , (dW_AT_comp_dir, dW_FORM_string) + , (dW_AT_use_UTF8, dW_FORM_flag_present) -- not represented in body + , (dW_AT_low_pc, dW_FORM_addr) + , (dW_AT_high_pc, dW_FORM_addr) + ] ++ + (if haveDebugLine + then [ (dW_AT_stmt_list, dW_FORM_data4) ] + else [])) $$ + mkAbbrev DwAbbrSubprogram dW_TAG_subprogram dW_CHILDREN_yes + subprogramAttrs $$ + mkAbbrev DwAbbrSubprogramWithParent dW_TAG_subprogram dW_CHILDREN_yes + (subprogramAttrs ++ [(dW_AT_ghc_tick_parent, dW_FORM_ref_addr)]) $$ + mkAbbrev DwAbbrBlockWithoutCode dW_TAG_lexical_block dW_CHILDREN_yes + [ (dW_AT_name, dW_FORM_string) + ] $$ + mkAbbrev DwAbbrBlock dW_TAG_lexical_block dW_CHILDREN_yes + [ (dW_AT_name, dW_FORM_string) + , (dW_AT_low_pc, dW_FORM_addr) + , (dW_AT_high_pc, dW_FORM_addr) + ] $$ + mkAbbrev DwAbbrGhcSrcNote dW_TAG_ghc_src_note dW_CHILDREN_no + [ (dW_AT_ghc_span_file, dW_FORM_string) + , (dW_AT_ghc_span_start_line, dW_FORM_data4) + , (dW_AT_ghc_span_start_col, dW_FORM_data2) + , (dW_AT_ghc_span_end_line, dW_FORM_data4) + , (dW_AT_ghc_span_end_col, dW_FORM_data2) + ] $$ + pprByte 0 + +-- | Generate assembly for DWARF data +pprDwarfInfo :: Bool -> DwarfInfo -> SDoc +pprDwarfInfo haveSrc d + = case d of + DwarfCompileUnit {} -> hasChildren + DwarfSubprogram {} -> hasChildren + DwarfBlock {} -> hasChildren + DwarfSrcNote {} -> noChildren + where + hasChildren = + pprDwarfInfoOpen haveSrc d $$ + vcat (map (pprDwarfInfo haveSrc) (dwChildren d)) $$ + pprDwarfInfoClose + noChildren = pprDwarfInfoOpen haveSrc d + +-- | Prints assembler data corresponding to DWARF info records. Note +-- that the binary format of this is parameterized in @abbrevDecls@ and +-- has to be kept in synch. 
+pprDwarfInfoOpen :: Bool -> DwarfInfo -> SDoc +pprDwarfInfoOpen haveSrc (DwarfCompileUnit _ name producer compDir lowLabel + highLabel lineLbl) = + pprAbbrev DwAbbrCompileUnit + $$ pprString name + $$ pprString producer + $$ pprData4 dW_LANG_Haskell + $$ pprString compDir + $$ pprWord (ppr lowLabel) + $$ pprWord (ppr highLabel) + $$ if haveSrc + then sectionOffset (ptext lineLbl) (ptext dwarfLineLabel) + else empty +pprDwarfInfoOpen _ (DwarfSubprogram _ name label + parent) = sdocWithDynFlags $ \df -> + ppr (mkAsmTempDieLabel label) <> colon + $$ pprAbbrev abbrev + $$ pprString name + $$ pprString (renderWithStyle (initSDocContext df (mkCodeStyle CStyle)) (ppr label)) + $$ pprFlag (externallyVisibleCLabel label) + $$ pprWord (ppr label) + $$ pprWord (ppr $ mkAsmTempEndLabel label) + $$ pprByte 1 + $$ pprByte dW_OP_call_frame_cfa + $$ parentValue + where + abbrev = case parent of Nothing -> DwAbbrSubprogram + Just _ -> DwAbbrSubprogramWithParent + parentValue = maybe empty pprParentDie parent + pprParentDie sym = sectionOffset (ppr sym) (ptext dwarfInfoLabel) +pprDwarfInfoOpen _ (DwarfBlock _ label Nothing) = sdocWithDynFlags $ \df -> + ppr (mkAsmTempDieLabel label) <> colon + $$ pprAbbrev DwAbbrBlockWithoutCode + $$ pprString (renderWithStyle (initSDocContext df (mkCodeStyle CStyle)) (ppr label)) +pprDwarfInfoOpen _ (DwarfBlock _ label (Just marker)) = sdocWithDynFlags $ \df -> + ppr (mkAsmTempDieLabel label) <> colon + $$ pprAbbrev DwAbbrBlock + $$ pprString (renderWithStyle (initSDocContext df (mkCodeStyle CStyle)) (ppr label)) + $$ pprWord (ppr marker) + $$ pprWord (ppr $ mkAsmTempEndLabel marker) +pprDwarfInfoOpen _ (DwarfSrcNote ss) = + pprAbbrev DwAbbrGhcSrcNote + $$ pprString' (ftext $ srcSpanFile ss) + $$ pprData4 (fromIntegral $ srcSpanStartLine ss) + $$ pprHalf (fromIntegral $ srcSpanStartCol ss) + $$ pprData4 (fromIntegral $ srcSpanEndLine ss) + $$ pprHalf (fromIntegral $ srcSpanEndCol ss) + +-- | Close a DWARF info record with children +pprDwarfInfoClose :: SDoc +pprDwarfInfoClose = pprAbbrev DwAbbrNull + +-- | A DWARF address range. This is used by the debugger to quickly locate +-- which compilation unit a given address belongs to. This type assumes +-- a non-segmented address-space. +data DwarfARange + = DwarfARange + { dwArngStartLabel :: CLabel + , dwArngEndLabel :: CLabel + } + +-- | Print assembler directives corresponding to a DWARF @.debug_aranges@ +-- address table entry. +pprDwarfARanges :: [DwarfARange] -> Unique -> SDoc +pprDwarfARanges arngs unitU = sdocWithPlatform $ \plat -> + let wordSize = platformWordSizeInBytes plat + paddingSize = 4 :: Int + -- header is 12 bytes long. + -- entry is 8 bytes (32-bit platform) or 16 bytes (64-bit platform). + -- pad such that first entry begins at multiple of entry size. + pad n = vcat $ replicate n $ pprByte 0 + -- Fix for #17428 + initialLength = 8 + paddingSize + (1 + length arngs) * 2 * wordSize + in pprDwWord (ppr initialLength) + $$ pprHalf 2 + $$ sectionOffset (ppr $ mkAsmTempLabel $ unitU) + (ptext dwarfInfoLabel) + $$ pprByte (fromIntegral wordSize) + $$ pprByte 0 + $$ pad paddingSize + -- body + $$ vcat (map pprDwarfARange arngs) + -- terminus + $$ pprWord (char '0') + $$ pprWord (char '0') + +pprDwarfARange :: DwarfARange -> SDoc +pprDwarfARange arng = pprWord (ppr $ dwArngStartLabel arng) $$ pprWord length + where + length = ppr (dwArngEndLabel arng) + <> char '-' <> ppr (dwArngStartLabel arng) + +-- | Information about unwind instructions for a procedure. 
This +-- corresponds to a "Common Information Entry" (CIE) in DWARF. +data DwarfFrame + = DwarfFrame + { dwCieLabel :: CLabel + , dwCieInit :: UnwindTable + , dwCieProcs :: [DwarfFrameProc] + } + +-- | Unwind instructions for an individual procedure. Corresponds to a +-- "Frame Description Entry" (FDE) in DWARF. +data DwarfFrameProc + = DwarfFrameProc + { dwFdeProc :: CLabel + , dwFdeHasInfo :: Bool + , dwFdeBlocks :: [DwarfFrameBlock] + -- ^ List of blocks. Order must match asm! + } + +-- | Unwind instructions for a block. Will become part of the +-- containing FDE. +data DwarfFrameBlock + = DwarfFrameBlock + { dwFdeBlkHasInfo :: Bool + , dwFdeUnwind :: [UnwindPoint] + -- ^ these unwind points must occur in the same order as they occur + -- in the block + } + +instance Outputable DwarfFrameBlock where + ppr (DwarfFrameBlock hasInfo unwinds) = braces $ ppr hasInfo <+> ppr unwinds + +-- | Header for the @.debug_frame@ section. Here we emit the "Common +-- Information Entry" record that establishes general call frame +-- parameters and the default stack layout. +pprDwarfFrame :: DwarfFrame -> SDoc +pprDwarfFrame DwarfFrame{dwCieLabel=cieLabel,dwCieInit=cieInit,dwCieProcs=procs} + = sdocWithPlatform $ \plat -> + let cieStartLabel= mkAsmTempDerivedLabel cieLabel (fsLit "_start") + cieEndLabel = mkAsmTempEndLabel cieLabel + length = ppr cieEndLabel <> char '-' <> ppr cieStartLabel + spReg = dwarfGlobalRegNo plat Sp + retReg = dwarfReturnRegNo plat + wordSize = platformWordSizeInBytes plat + pprInit :: (GlobalReg, Maybe UnwindExpr) -> SDoc + pprInit (g, uw) = pprSetUnwind plat g (Nothing, uw) + + -- Preserve C stack pointer: This necessary to override that default + -- unwinding behavior of setting $sp = CFA. + preserveSp = case platformArch plat of + ArchX86 -> pprByte dW_CFA_same_value $$ pprLEBWord 4 + ArchX86_64 -> pprByte dW_CFA_same_value $$ pprLEBWord 7 + _ -> empty + in vcat [ ppr cieLabel <> colon + , pprData4' length -- Length of CIE + , ppr cieStartLabel <> colon + , pprData4' (text "-1") + -- Common Information Entry marker (-1 = 0xf..f) + , pprByte 3 -- CIE version (we require DWARF 3) + , pprByte 0 -- Augmentation (none) + , pprByte 1 -- Code offset multiplicator + , pprByte (128-fromIntegral wordSize) + -- Data offset multiplicator + -- (stacks grow down => "-w" in signed LEB128) + , pprByte retReg -- virtual register holding return address + ] $$ + -- Initial unwind table + vcat (map pprInit $ Map.toList cieInit) $$ + vcat [ -- RET = *CFA + pprByte (dW_CFA_offset+retReg) + , pprByte 0 + + -- Preserve C stack pointer + , preserveSp + + -- Sp' = CFA + -- (we need to set this manually as our (STG) Sp register is + -- often not the architecture's default stack register) + , pprByte dW_CFA_val_offset + , pprLEBWord (fromIntegral spReg) + , pprLEBWord 0 + ] $$ + wordAlign $$ + ppr cieEndLabel <> colon $$ + -- Procedure unwind tables + vcat (map (pprFrameProc cieLabel cieInit) procs) + +-- | Writes a "Frame Description Entry" for a procedure. This consists +-- mainly of referencing the CIE and writing state machine +-- instructions to describe how the frame base (CFA) changes. 
+pprFrameProc :: CLabel -> UnwindTable -> DwarfFrameProc -> SDoc +pprFrameProc frameLbl initUw (DwarfFrameProc procLbl hasInfo blocks) + = let fdeLabel = mkAsmTempDerivedLabel procLbl (fsLit "_fde") + fdeEndLabel = mkAsmTempDerivedLabel procLbl (fsLit "_fde_end") + procEnd = mkAsmTempEndLabel procLbl + ifInfo str = if hasInfo then text str else empty + -- see [Note: Info Offset] + in vcat [ whenPprDebug $ text "# Unwinding for" <+> ppr procLbl <> colon + , pprData4' (ppr fdeEndLabel <> char '-' <> ppr fdeLabel) + , ppr fdeLabel <> colon + , pprData4' (ppr frameLbl <> char '-' <> + ptext dwarfFrameLabel) -- Reference to CIE + , pprWord (ppr procLbl <> ifInfo "-1") -- Code pointer + , pprWord (ppr procEnd <> char '-' <> + ppr procLbl <> ifInfo "+1") -- Block byte length + ] $$ + vcat (S.evalState (mapM pprFrameBlock blocks) initUw) $$ + wordAlign $$ + ppr fdeEndLabel <> colon + +-- | Generates unwind information for a block. We only generate +-- instructions where unwind information actually changes. This small +-- optimisations saves a lot of space, as subsequent blocks often have +-- the same unwind information. +pprFrameBlock :: DwarfFrameBlock -> S.State UnwindTable SDoc +pprFrameBlock (DwarfFrameBlock hasInfo uws0) = + vcat <$> zipWithM pprFrameDecl (True : repeat False) uws0 + where + pprFrameDecl :: Bool -> UnwindPoint -> S.State UnwindTable SDoc + pprFrameDecl firstDecl (UnwindPoint lbl uws) = S.state $ \oldUws -> + let -- Did a register's unwind expression change? + isChanged :: GlobalReg -> Maybe UnwindExpr + -> Maybe (Maybe UnwindExpr, Maybe UnwindExpr) + isChanged g new + -- the value didn't change + | Just new == old = Nothing + -- the value was and still is undefined + | Nothing <- old + , Nothing <- new = Nothing + -- the value changed + | otherwise = Just (join old, new) + where + old = Map.lookup g oldUws + + changed = Map.toList $ Map.mapMaybeWithKey isChanged uws + + in if oldUws == uws + then (empty, oldUws) + else let -- see [Note: Info Offset] + needsOffset = firstDecl && hasInfo + lblDoc = ppr lbl <> + if needsOffset then text "-1" else empty + doc = sdocWithPlatform $ \plat -> + pprByte dW_CFA_set_loc $$ pprWord lblDoc $$ + vcat (map (uncurry $ pprSetUnwind plat) changed) + in (doc, uws) + +-- Note [Info Offset] +-- +-- GDB was pretty much written with C-like programs in mind, and as a +-- result they assume that once you have a return address, it is a +-- good idea to look at (PC-1) to unwind further - as that's where the +-- "call" instruction is supposed to be. +-- +-- Now on one hand, code generated by GHC looks nothing like what GDB +-- expects, and in fact going up from a return pointer is guaranteed +-- to land us inside an info table! On the other hand, that actually +-- gives us some wiggle room, as we expect IP to never *actually* end +-- up inside the info table, so we can "cheat" by putting whatever GDB +-- expects to see there. This is probably pretty safe, as GDB cannot +-- assume (PC-1) to be a valid code pointer in the first place - and I +-- have seen no code trying to correct this. +-- +-- Note that this will not prevent GDB from failing to look-up the +-- correct function name for the frame, as that uses the symbol table, +-- which we can not manipulate as easily. +-- +-- There's a GDB patch to address this at [1]. At the moment of writing +-- it's not merged, so I recommend building GDB with the patch if you +-- care about unwinding. The hack above doesn't cover every case. 
+-- +-- [1] https://sourceware.org/ml/gdb-patches/2018-02/msg00055.html + +-- | Get DWARF register ID for a given GlobalReg +dwarfGlobalRegNo :: Platform -> GlobalReg -> Word8 +dwarfGlobalRegNo p UnwindReturnReg = dwarfReturnRegNo p +dwarfGlobalRegNo p reg = maybe 0 (dwarfRegNo p . RegReal) $ globalRegMaybe p reg + +-- | Generate code for setting the unwind information for a register, +-- optimized using its known old value in the table. Note that "Sp" is +-- special: We see it as synonym for the CFA. +pprSetUnwind :: Platform + -> GlobalReg + -- ^ the register to produce an unwinding table entry for + -> (Maybe UnwindExpr, Maybe UnwindExpr) + -- ^ the old and new values of the register + -> SDoc +pprSetUnwind plat g (_, Nothing) + = pprUndefUnwind plat g +pprSetUnwind _ Sp (Just (UwReg s _), Just (UwReg s' o')) | s == s' + = if o' >= 0 + then pprByte dW_CFA_def_cfa_offset $$ pprLEBWord (fromIntegral o') + else pprByte dW_CFA_def_cfa_offset_sf $$ pprLEBInt o' +pprSetUnwind plat Sp (_, Just (UwReg s' o')) + = if o' >= 0 + then pprByte dW_CFA_def_cfa $$ + pprLEBRegNo plat s' $$ + pprLEBWord (fromIntegral o') + else pprByte dW_CFA_def_cfa_sf $$ + pprLEBRegNo plat s' $$ + pprLEBInt o' +pprSetUnwind _ Sp (_, Just uw) + = pprByte dW_CFA_def_cfa_expression $$ pprUnwindExpr False uw +pprSetUnwind plat g (_, Just (UwDeref (UwReg Sp o))) + | o < 0 && ((-o) `mod` platformWordSizeInBytes plat) == 0 -- expected case + = pprByte (dW_CFA_offset + dwarfGlobalRegNo plat g) $$ + pprLEBWord (fromIntegral ((-o) `div` platformWordSizeInBytes plat)) + | otherwise + = pprByte dW_CFA_offset_extended_sf $$ + pprLEBRegNo plat g $$ + pprLEBInt o +pprSetUnwind plat g (_, Just (UwDeref uw)) + = pprByte dW_CFA_expression $$ + pprLEBRegNo plat g $$ + pprUnwindExpr True uw +pprSetUnwind plat g (_, Just (UwReg g' 0)) + | g == g' + = pprByte dW_CFA_same_value $$ + pprLEBRegNo plat g +pprSetUnwind plat g (_, Just uw) + = pprByte dW_CFA_val_expression $$ + pprLEBRegNo plat g $$ + pprUnwindExpr True uw + +-- | Print the register number of the given 'GlobalReg' as an unsigned LEB128 +-- encoded number. +pprLEBRegNo :: Platform -> GlobalReg -> SDoc +pprLEBRegNo plat = pprLEBWord . fromIntegral . dwarfGlobalRegNo plat + +-- | Generates a DWARF expression for the given unwind expression. If +-- @spIsCFA@ is true, we see @Sp@ as the frame base CFA where it gets +-- mentioned. +pprUnwindExpr :: Bool -> UnwindExpr -> SDoc +pprUnwindExpr spIsCFA expr + = sdocWithPlatform $ \plat -> + let pprE (UwConst i) + | i >= 0 && i < 32 = pprByte (dW_OP_lit0 + fromIntegral i) + | otherwise = pprByte dW_OP_consts $$ pprLEBInt i -- lazy... 
+ pprE (UwReg Sp i) | spIsCFA + = if i == 0 + then pprByte dW_OP_call_frame_cfa + else pprE (UwPlus (UwReg Sp 0) (UwConst i)) + pprE (UwReg g i) = pprByte (dW_OP_breg0+dwarfGlobalRegNo plat g) $$ + pprLEBInt i + pprE (UwDeref u) = pprE u $$ pprByte dW_OP_deref + pprE (UwLabel l) = pprByte dW_OP_addr $$ pprWord (ppr l) + pprE (UwPlus u1 u2) = pprE u1 $$ pprE u2 $$ pprByte dW_OP_plus + pprE (UwMinus u1 u2) = pprE u1 $$ pprE u2 $$ pprByte dW_OP_minus + pprE (UwTimes u1 u2) = pprE u1 $$ pprE u2 $$ pprByte dW_OP_mul + in text "\t.uleb128 2f-1f" $$ -- DW_FORM_block length + -- computed as the difference of the following local labels 2: and 1: + text "1:" $$ + pprE expr $$ + text "2:" + +-- | Generate code for re-setting the unwind information for a +-- register to @undefined@ +pprUndefUnwind :: Platform -> GlobalReg -> SDoc +pprUndefUnwind plat g = pprByte dW_CFA_undefined $$ + pprLEBRegNo plat g + + +-- | Align assembly at (machine) word boundary +wordAlign :: SDoc +wordAlign = sdocWithPlatform $ \plat -> + text "\t.align " <> case platformOS plat of + OSDarwin -> case platformWordSize plat of + PW8 -> char '3' + PW4 -> char '2' + _other -> ppr (platformWordSizeInBytes plat) + +-- | Assembly for a single byte of constant DWARF data +pprByte :: Word8 -> SDoc +pprByte x = text "\t.byte " <> ppr (fromIntegral x :: Word) + +-- | Assembly for a two-byte constant integer +pprHalf :: Word16 -> SDoc +pprHalf x = text "\t.short" <+> ppr (fromIntegral x :: Word) + +-- | Assembly for a constant DWARF flag +pprFlag :: Bool -> SDoc +pprFlag f = pprByte (if f then 0xff else 0x00) + +-- | Assembly for 4 bytes of dynamic DWARF data +pprData4' :: SDoc -> SDoc +pprData4' x = text "\t.long " <> x + +-- | Assembly for 4 bytes of constant DWARF data +pprData4 :: Word -> SDoc +pprData4 = pprData4' . ppr + +-- | Assembly for a DWARF word of dynamic data. This means 32 bit, as +-- we are generating 32 bit DWARF. +pprDwWord :: SDoc -> SDoc +pprDwWord = pprData4' + +-- | Assembly for a machine word of dynamic data. Depends on the +-- architecture we are currently generating code for. +pprWord :: SDoc -> SDoc +pprWord s = (<> s) . sdocWithPlatform $ \plat -> + case platformWordSize plat of + PW4 -> text "\t.long " + PW8 -> text "\t.quad " + +-- | Prints a number in "little endian base 128" format. The idea is +-- to optimize for small numbers by stopping once all further bytes +-- would be 0. The highest bit in every byte signals whether there +-- are further bytes to read. +pprLEBWord :: Word -> SDoc +pprLEBWord x | x < 128 = pprByte (fromIntegral x) + | otherwise = pprByte (fromIntegral $ 128 .|. (x .&. 127)) $$ + pprLEBWord (x `shiftR` 7) + +-- | Same as @pprLEBWord@, but for a signed number +pprLEBInt :: Int -> SDoc +pprLEBInt x | x >= -64 && x < 64 + = pprByte (fromIntegral (x .&. 127)) + | otherwise = pprByte (fromIntegral $ 128 .|. (x .&. 127)) $$ + pprLEBInt (x `shiftR` 7) + +-- | Generates a dynamic null-terminated string. If required the +-- caller needs to make sure that the string is escaped properly. +pprString' :: SDoc -> SDoc +pprString' str = text "\t.asciz \"" <> str <> char '"' + +-- | Generate a string constant. We take care to escape the string. +pprString :: String -> SDoc +pprString str + = pprString' $ hcat $ map escapeChar $ + if str `lengthIs` utf8EncodedLength str + then str + else map (chr . 
fromIntegral) $ BS.unpack $ bytesFS $ mkFastString str + +-- | Escape a single non-unicode character +escapeChar :: Char -> SDoc +escapeChar '\\' = text "\\\\" +escapeChar '\"' = text "\\\"" +escapeChar '\n' = text "\\n" +escapeChar c + | isAscii c && isPrint c && c /= '?' -- prevents trigraph warnings + = char c + | otherwise + = char '\\' <> char (intToDigit (ch `div` 64)) <> + char (intToDigit ((ch `div` 8) `mod` 8)) <> + char (intToDigit (ch `mod` 8)) + where ch = ord c + +-- | Generate an offset into another section. This is tricky because +-- this is handled differently depending on platform: Mac Os expects +-- us to calculate the offset using assembler arithmetic. Linux expects +-- us to just reference the target directly, and will figure out on +-- their own that we actually need an offset. Finally, Windows has +-- a special directive to refer to relative offsets. Fun. +sectionOffset :: SDoc -> SDoc -> SDoc +sectionOffset target section = sdocWithPlatform $ \plat -> + case platformOS plat of + OSDarwin -> pprDwWord (target <> char '-' <> section) + OSMinGW32 -> text "\t.secrel32 " <> target + _other -> pprDwWord target diff --git a/compiler/GHC/CmmToAsm/Format.hs b/compiler/GHC/CmmToAsm/Format.hs new file mode 100644 index 0000000000..446c760939 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Format.hs @@ -0,0 +1,105 @@ +-- | Formats on this architecture +-- A Format is a combination of width and class +-- +-- TODO: Signed vs unsigned? +-- +-- TODO: This module is currently shared by all architectures because +-- NCGMonad need to know about it to make a VReg. It would be better +-- to have architecture specific formats, and do the overloading +-- properly. eg SPARC doesn't care about FF80. +-- +module GHC.CmmToAsm.Format ( + Format(..), + intFormat, + floatFormat, + isFloatFormat, + cmmTypeFormat, + formatToWidth, + formatInBytes +) + +where + +import GhcPrelude + +import GHC.Cmm +import Outputable + +-- It looks very like the old MachRep, but it's now of purely local +-- significance, here in the native code generator. You can change it +-- without global consequences. +-- +-- A major use is as an opcode qualifier; thus the opcode +-- mov.l a b +-- might be encoded +-- MOV II32 a b +-- where the Format field encodes the ".l" part. + +-- ToDo: it's not clear to me that we need separate signed-vs-unsigned formats +-- here. I've removed them from the x86 version, we'll see what happens --SDM + +-- ToDo: quite a few occurrences of Format could usefully be replaced by Width + +data Format + = II8 + | II16 + | II32 + | II64 + | FF32 + | FF64 + deriving (Show, Eq) + + +-- | Get the integer format of this width. +intFormat :: Width -> Format +intFormat width + = case width of + W8 -> II8 + W16 -> II16 + W32 -> II32 + W64 -> II64 + other -> sorry $ "The native code generator cannot " ++ + "produce code for Format.intFormat " ++ show other + ++ "\n\tConsider using the llvm backend with -fllvm" + + +-- | Get the float format of this width. +floatFormat :: Width -> Format +floatFormat width + = case width of + W32 -> FF32 + W64 -> FF64 + + other -> pprPanic "Format.floatFormat" (ppr other) + + +-- | Check if a format represents a floating point value. +isFloatFormat :: Format -> Bool +isFloatFormat format + = case format of + FF32 -> True + FF64 -> True + _ -> False + + +-- | Convert a Cmm type to a Format. +cmmTypeFormat :: CmmType -> Format +cmmTypeFormat ty + | isFloatType ty = floatFormat (typeWidth ty) + | otherwise = intFormat (typeWidth ty) + + +-- | Get the Width of a Format. 
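+--
+-- For illustration (an editor's sketch based on the equations below):
+--
+--   formatToWidth II64 == W64
+--   formatToWidth FF32 == W32
+--   formatInBytes FF64 == 8
+--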
+formatToWidth :: Format -> Width +formatToWidth format + = case format of + II8 -> W8 + II16 -> W16 + II32 -> W32 + II64 -> W64 + FF32 -> W32 + FF64 -> W64 + + +formatInBytes :: Format -> Int +formatInBytes = widthInBytes . formatToWidth diff --git a/compiler/GHC/CmmToAsm/Instr.hs b/compiler/GHC/CmmToAsm/Instr.hs new file mode 100644 index 0000000000..44fa9b7cc9 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Instr.hs @@ -0,0 +1,202 @@ + +module GHC.CmmToAsm.Instr ( + RegUsage(..), + noUsage, + GenBasicBlock(..), blockId, + ListGraph(..), + NatCmm, + NatCmmDecl, + NatBasicBlock, + topInfoTable, + entryBlocks, + Instruction(..) +) + +where + +import GhcPrelude + +import GHC.Platform.Reg + +import GHC.Cmm.BlockId +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import GHC.Driver.Session +import GHC.Cmm hiding (topInfoTable) +import GHC.Platform + +-- | Holds a list of source and destination registers used by a +-- particular instruction. +-- +-- Machine registers that are pre-allocated to stgRegs are filtered +-- out, because they are uninteresting from a register allocation +-- standpoint. (We wouldn't want them to end up on the free list!) +-- +-- As far as we are concerned, the fixed registers simply don't exist +-- (for allocation purposes, anyway). +-- +data RegUsage + = RU [Reg] [Reg] + +-- | No regs read or written to. +noUsage :: RegUsage +noUsage = RU [] [] + +-- Our flavours of the Cmm types +-- Type synonyms for Cmm populated with native code +type NatCmm instr + = GenCmmGroup + RawCmmStatics + (LabelMap RawCmmStatics) + (ListGraph instr) + +type NatCmmDecl statics instr + = GenCmmDecl + statics + (LabelMap RawCmmStatics) + (ListGraph instr) + + +type NatBasicBlock instr + = GenBasicBlock instr + + +-- | Returns the info table associated with the CmmDecl's entry point, +-- if any. +topInfoTable :: GenCmmDecl a (LabelMap i) (ListGraph b) -> Maybe i +topInfoTable (CmmProc infos _ _ (ListGraph (b:_))) + = mapLookup (blockId b) infos +topInfoTable _ + = Nothing + +-- | Return the list of BlockIds in a CmmDecl that are entry points +-- for this proc (i.e. they may be jumped to from outside this proc). +entryBlocks :: GenCmmDecl a (LabelMap i) (ListGraph b) -> [BlockId] +entryBlocks (CmmProc info _ _ (ListGraph code)) = entries + where + infos = mapKeys info + entries = case code of + [] -> infos + BasicBlock entry _ : _ -- first block is the entry point + | entry `elem` infos -> infos + | otherwise -> entry : infos +entryBlocks _ = [] + +-- | Common things that we can do with instructions, on all architectures. +-- These are used by the shared parts of the native code generator, +-- specifically the register allocators. +-- +class Instruction instr where + + -- | Get the registers that are being used by this instruction. + -- regUsage doesn't need to do any trickery for jumps and such. + -- Just state precisely the regs read and written by that insn. + -- The consequences of control flow transfers, as far as register + -- allocation goes, are taken care of by the register allocator. + -- + regUsageOfInstr + :: Platform + -> instr + -> RegUsage + + + -- | Apply a given mapping to all the register references in this + -- instruction. + patchRegsOfInstr + :: instr + -> (Reg -> Reg) + -> instr + + + -- | Checks whether this instruction is a jump/branch instruction. + -- One that can change the flow of control in a way that the + -- register allocator needs to worry about. + isJumpishInstr + :: instr -> Bool + + + -- | Give the possible destinations of this jump instruction. 
+ -- Must be defined for all jumpish instructions. + jumpDestsOfInstr + :: instr -> [BlockId] + + + -- | Change the destination of this jump instruction. + -- Used in the linear allocator when adding fixup blocks for join + -- points. + patchJumpInstr + :: instr + -> (BlockId -> BlockId) + -> instr + + + -- | An instruction to spill a register into a spill slot. + mkSpillInstr + :: DynFlags + -> Reg -- ^ the reg to spill + -> Int -- ^ the current stack delta + -> Int -- ^ spill slot to use + -> instr + + + -- | An instruction to reload a register from a spill slot. + mkLoadInstr + :: DynFlags + -> Reg -- ^ the reg to reload. + -> Int -- ^ the current stack delta + -> Int -- ^ the spill slot to use + -> instr + + -- | See if this instruction is telling us the current C stack delta + takeDeltaInstr + :: instr + -> Maybe Int + + -- | Check whether this instruction is some meta thing inserted into + -- the instruction stream for other purposes. + -- + -- Not something that has to be treated as a real machine instruction + -- and have its registers allocated. + -- + -- eg, comments, delta, ldata, etc. + isMetaInstr + :: instr + -> Bool + + + + -- | Copy the value in a register to another one. + -- Must work for all register classes. + mkRegRegMoveInstr + :: Platform + -> Reg -- ^ source register + -> Reg -- ^ destination register + -> instr + + -- | Take the source and destination from this reg -> reg move instruction + -- or Nothing if it's not one + takeRegRegMoveInstr + :: instr + -> Maybe (Reg, Reg) + + -- | Make an unconditional jump instruction. + -- For architectures with branch delay slots, its ok to put + -- a NOP after the jump. Don't fill the delay slot with an + -- instruction that references regs or you'll confuse the + -- linear allocator. + mkJumpInstr + :: BlockId + -> [instr] + + + -- Subtract an amount from the C stack pointer + mkStackAllocInstr + :: Platform + -> Int + -> [instr] + + -- Add an amount to the C stack pointer + mkStackDeallocInstr + :: Platform + -> Int + -> [instr] diff --git a/compiler/GHC/CmmToAsm/Monad.hs b/compiler/GHC/CmmToAsm/Monad.hs new file mode 100644 index 0000000000..c9414a2eee --- /dev/null +++ b/compiler/GHC/CmmToAsm/Monad.hs @@ -0,0 +1,294 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE BangPatterns #-} + +-- ----------------------------------------------------------------------------- +-- +-- (c) The University of Glasgow 1993-2004 +-- +-- The native code generator's monad. 
+-- +-- ----------------------------------------------------------------------------- + +module GHC.CmmToAsm.Monad ( + NcgImpl(..), + NatM_State(..), mkNatM_State, + + NatM, -- instance Monad + initNat, + addImportNat, + addNodeBetweenNat, + addImmediateSuccessorNat, + updateCfgNat, + getUniqueNat, + mapAccumLNat, + setDeltaNat, + getDeltaNat, + getThisModuleNat, + getBlockIdNat, + getNewLabelNat, + getNewRegNat, + getNewRegPairNat, + getPicBaseMaybeNat, + getPicBaseNat, + getDynFlags, + getModLoc, + getFileId, + getDebugBlock, + + DwarfFiles +) + +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.Platform.Reg +import GHC.CmmToAsm.Format +import GHC.CmmToAsm.Reg.Target + +import GHC.Cmm.BlockId +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.CLabel ( CLabel ) +import GHC.Cmm.DebugBlock +import FastString ( FastString ) +import UniqFM +import UniqSupply +import Unique ( Unique ) +import GHC.Driver.Session +import Module + +import Control.Monad ( ap ) + +import GHC.CmmToAsm.Instr +import Outputable (SDoc, pprPanic, ppr) +import GHC.Cmm (RawCmmDecl, RawCmmStatics) +import GHC.CmmToAsm.CFG + +data NcgImpl statics instr jumpDest = NcgImpl { + cmmTopCodeGen :: RawCmmDecl -> NatM [NatCmmDecl statics instr], + generateJumpTableForInstr :: instr -> Maybe (NatCmmDecl statics instr), + getJumpDestBlockId :: jumpDest -> Maybe BlockId, + canShortcut :: instr -> Maybe jumpDest, + shortcutStatics :: (BlockId -> Maybe jumpDest) -> statics -> statics, + shortcutJump :: (BlockId -> Maybe jumpDest) -> instr -> instr, + pprNatCmmDecl :: NatCmmDecl statics instr -> SDoc, + maxSpillSlots :: Int, + allocatableRegs :: [RealReg], + ncgExpandTop :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr], + ncgAllocMoreStack :: Int -> NatCmmDecl statics instr + -> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)]), + -- ^ The list of block ids records the redirected jumps to allow us to update + -- the CFG. + ncgMakeFarBranches :: LabelMap RawCmmStatics -> [NatBasicBlock instr] -> [NatBasicBlock instr], + extractUnwindPoints :: [instr] -> [UnwindPoint], + -- ^ given the instruction sequence of a block, produce a list of + -- the block's 'UnwindPoint's + -- See Note [What is this unwinding business?] in Debug + -- and Note [Unwinding information in the NCG] in this module. + invertCondBranches :: Maybe CFG -> LabelMap RawCmmStatics -> [NatBasicBlock instr] + -> [NatBasicBlock instr] + -- ^ Turn the sequence of `jcc l1; jmp l2` into `jncc l2; <block_l1>` + -- when possible. + } + +data NatM_State + = NatM_State { + natm_us :: UniqSupply, + natm_delta :: Int, + natm_imports :: [(CLabel)], + natm_pic :: Maybe Reg, + natm_dflags :: DynFlags, + natm_this_module :: Module, + natm_modloc :: ModLocation, + natm_fileid :: DwarfFiles, + natm_debug_map :: LabelMap DebugBlock, + natm_cfg :: CFG + -- ^ Having a CFG with additional information is essential for some + -- operations. However we can't reconstruct all information once we + -- generated instructions. So instead we update the CFG as we go. 
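+ -- (Editorial note: updateCfgNat and addNodeBetweenNat below are
+ -- the usual ways this field is kept up to date.)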
+ } + +type DwarfFiles = UniqFM (FastString, Int) + +newtype NatM result = NatM (NatM_State -> (result, NatM_State)) + deriving (Functor) + +unNat :: NatM a -> NatM_State -> (a, NatM_State) +unNat (NatM a) = a + +mkNatM_State :: UniqSupply -> Int -> DynFlags -> Module -> ModLocation -> + DwarfFiles -> LabelMap DebugBlock -> CFG -> NatM_State +mkNatM_State us delta dflags this_mod + = \loc dwf dbg cfg -> + NatM_State + { natm_us = us + , natm_delta = delta + , natm_imports = [] + , natm_pic = Nothing + , natm_dflags = dflags + , natm_this_module = this_mod + , natm_modloc = loc + , natm_fileid = dwf + , natm_debug_map = dbg + , natm_cfg = cfg + } + +initNat :: NatM_State -> NatM a -> (a, NatM_State) +initNat init_st m + = case unNat m init_st of { (r,st) -> (r,st) } + +instance Applicative NatM where + pure = returnNat + (<*>) = ap + +instance Monad NatM where + (>>=) = thenNat + +instance MonadUnique NatM where + getUniqueSupplyM = NatM $ \st -> + case splitUniqSupply (natm_us st) of + (us1, us2) -> (us1, st {natm_us = us2}) + + getUniqueM = NatM $ \st -> + case takeUniqFromSupply (natm_us st) of + (uniq, us') -> (uniq, st {natm_us = us'}) + +thenNat :: NatM a -> (a -> NatM b) -> NatM b +thenNat expr cont + = NatM $ \st -> case unNat expr st of + (result, st') -> unNat (cont result) st' + +returnNat :: a -> NatM a +returnNat result + = NatM $ \st -> (result, st) + +mapAccumLNat :: (acc -> x -> NatM (acc, y)) + -> acc + -> [x] + -> NatM (acc, [y]) + +mapAccumLNat _ b [] + = return (b, []) +mapAccumLNat f b (x:xs) + = do (b__2, x__2) <- f b x + (b__3, xs__2) <- mapAccumLNat f b__2 xs + return (b__3, x__2:xs__2) + +getUniqueNat :: NatM Unique +getUniqueNat = NatM $ \ st -> + case takeUniqFromSupply $ natm_us st of + (uniq, us') -> (uniq, st {natm_us = us'}) + +instance HasDynFlags NatM where + getDynFlags = NatM $ \ st -> (natm_dflags st, st) + + +getDeltaNat :: NatM Int +getDeltaNat = NatM $ \ st -> (natm_delta st, st) + + +setDeltaNat :: Int -> NatM () +setDeltaNat delta = NatM $ \ st -> ((), st {natm_delta = delta}) + + +getThisModuleNat :: NatM Module +getThisModuleNat = NatM $ \ st -> (natm_this_module st, st) + + +addImportNat :: CLabel -> NatM () +addImportNat imp + = NatM $ \ st -> ((), st {natm_imports = imp : natm_imports st}) + +updateCfgNat :: (CFG -> CFG) -> NatM () +updateCfgNat f + = NatM $ \ st -> let !cfg' = f (natm_cfg st) + in ((), st { natm_cfg = cfg'}) + +-- | Record that we added a block between `from` and `old`. +addNodeBetweenNat :: BlockId -> BlockId -> BlockId -> NatM () +addNodeBetweenNat from between to + = do df <- getDynFlags + let jmpWeight = fromIntegral . uncondWeight . + cfgWeightInfo $ df + updateCfgNat (updateCfg jmpWeight from between to) + where + -- When transforming A -> B to A -> A' -> B + -- A -> A' keeps the old edge info while + -- A' -> B gets the info for an unconditional + -- jump. + updateCfg weight from between old m + | Just info <- getEdgeInfo from old m + = addEdge from between info . + addWeightEdge between old weight . 
+ delEdge from old $ m + | otherwise + = pprPanic "Failed to update cfg: Untracked edge" (ppr (from,to)) + + +-- | Place `succ` after `block` and change any edges +-- block -> X to `succ` -> X +addImmediateSuccessorNat :: BlockId -> BlockId -> NatM () +addImmediateSuccessorNat block succ + = updateCfgNat (addImmediateSuccessor block succ) + +getBlockIdNat :: NatM BlockId +getBlockIdNat + = do u <- getUniqueNat + return (mkBlockId u) + + +getNewLabelNat :: NatM CLabel +getNewLabelNat + = blockLbl <$> getBlockIdNat + + +getNewRegNat :: Format -> NatM Reg +getNewRegNat rep + = do u <- getUniqueNat + dflags <- getDynFlags + return (RegVirtual $ targetMkVirtualReg (targetPlatform dflags) u rep) + + +getNewRegPairNat :: Format -> NatM (Reg,Reg) +getNewRegPairNat rep + = do u <- getUniqueNat + dflags <- getDynFlags + let vLo = targetMkVirtualReg (targetPlatform dflags) u rep + let lo = RegVirtual $ targetMkVirtualReg (targetPlatform dflags) u rep + let hi = RegVirtual $ getHiVirtualRegFromLo vLo + return (lo, hi) + + +getPicBaseMaybeNat :: NatM (Maybe Reg) +getPicBaseMaybeNat + = NatM (\state -> (natm_pic state, state)) + + +getPicBaseNat :: Format -> NatM Reg +getPicBaseNat rep + = do mbPicBase <- getPicBaseMaybeNat + case mbPicBase of + Just picBase -> return picBase + Nothing + -> do + reg <- getNewRegNat rep + NatM (\state -> (reg, state { natm_pic = Just reg })) + +getModLoc :: NatM ModLocation +getModLoc + = NatM $ \ st -> (natm_modloc st, st) + +getFileId :: FastString -> NatM Int +getFileId f = NatM $ \st -> + case lookupUFM (natm_fileid st) f of + Just (_,n) -> (n, st) + Nothing -> let n = 1 + sizeUFM (natm_fileid st) + fids = addToUFM (natm_fileid st) f (f,n) + in n `seq` fids `seq` (n, st { natm_fileid = fids }) + +getDebugBlock :: Label -> NatM (Maybe DebugBlock) +getDebugBlock l = NatM $ \st -> (mapLookup l (natm_debug_map st), st) diff --git a/compiler/GHC/CmmToAsm/PIC.hs b/compiler/GHC/CmmToAsm/PIC.hs new file mode 100644 index 0000000000..323d93d173 --- /dev/null +++ b/compiler/GHC/CmmToAsm/PIC.hs @@ -0,0 +1,837 @@ +{- + This module handles generation of position independent code and + dynamic-linking related issues for the native code generator. + + This depends both the architecture and OS, so we define it here + instead of in one of the architecture specific modules. + + Things outside this module which are related to this: + + + module CLabel + - PIC base label (pretty printed as local label 1) + - DynamicLinkerLabels - several kinds: + CodeStub, SymbolPtr, GotSymbolPtr, GotSymbolOffset + - labelDynamic predicate + + module Cmm + - The GlobalReg datatype has a PicBaseReg constructor + - The CmmLit datatype has a CmmLabelDiffOff constructor + + codeGen & RTS + - When tablesNextToCode, no absolute addresses are stored in info tables + any more. Instead, offsets from the info label are used. + - For Win32 only, SRTs might contain addresses of __imp_ symbol pointers + because Win32 doesn't support external references in data sections. + TODO: make sure this still works, it might be bitrotted + + NCG + - The cmmToCmm pass in AsmCodeGen calls cmmMakeDynamicReference for all + labels. + - nativeCodeGen calls pprImportedSymbol and pprGotDeclaration to output + all the necessary stuff for imported symbols. + - The NCG monad keeps track of a list of imported symbols. + - MachCodeGen invokes initializePicBase to generate code to initialize + the PIC base register when needed. 
+ - MachCodeGen calls cmmMakeDynamicReference whenever it uses a CLabel + that wasn't in the original Cmm code (e.g. floating point literals). +-} + +module GHC.CmmToAsm.PIC ( + cmmMakeDynamicReference, + CmmMakeDynamicReferenceM(..), + ReferenceKind(..), + needImportedSymbols, + pprImportedSymbol, + pprGotDeclaration, + + initializePicBase_ppc, + initializePicBase_x86 +) + +where + +import GhcPrelude + +import qualified GHC.CmmToAsm.PPC.Instr as PPC +import qualified GHC.CmmToAsm.PPC.Regs as PPC +import qualified GHC.CmmToAsm.X86.Instr as X86 + +import GHC.Platform +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg +import GHC.CmmToAsm.Monad + + +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm +import GHC.Cmm.CLabel ( CLabel, ForeignLabelSource(..), pprCLabel, + mkDynamicLinkerLabel, DynamicLinkerLabelInfo(..), + dynamicLinkerLabelInfo, mkPicBaseLabel, + labelDynamic, externallyVisibleCLabel ) + +import GHC.Cmm.CLabel ( mkForeignLabel ) + + +import BasicTypes +import Module + +import Outputable + +import GHC.Driver.Session +import FastString + + + +-------------------------------------------------------------------------------- +-- It gets called by the cmmToCmm pass for every CmmLabel in the Cmm +-- code. It does The Right Thing(tm) to convert the CmmLabel into a +-- position-independent, dynamic-linking-aware reference to the thing +-- in question. +-- Note that this also has to be called from MachCodeGen in order to +-- access static data like floating point literals (labels that were +-- created after the cmmToCmm pass). +-- The function must run in a monad that can keep track of imported symbols +-- A function for recording an imported symbol must be passed in: +-- - addImportCmmOpt for the CmmOptM monad +-- - addImportNat for the NatM monad. + +data ReferenceKind + = DataReference + | CallReference + | JumpReference + deriving(Eq) + +class Monad m => CmmMakeDynamicReferenceM m where + addImport :: CLabel -> m () + getThisModule :: m Module + +instance CmmMakeDynamicReferenceM NatM where + addImport = addImportNat + getThisModule = getThisModuleNat + +cmmMakeDynamicReference + :: CmmMakeDynamicReferenceM m + => DynFlags + -> ReferenceKind -- whether this is the target of a jump + -> CLabel -- the label + -> m CmmExpr + +cmmMakeDynamicReference dflags referenceKind lbl + | Just _ <- dynamicLinkerLabelInfo lbl + = return $ CmmLit $ CmmLabel lbl -- already processed it, pass through + + | otherwise + = do this_mod <- getThisModule + case howToAccessLabel + dflags + (platformArch $ targetPlatform dflags) + (platformOS $ targetPlatform dflags) + this_mod + referenceKind lbl of + + AccessViaStub -> do + let stub = mkDynamicLinkerLabel CodeStub lbl + addImport stub + return $ CmmLit $ CmmLabel stub + + AccessViaSymbolPtr -> do + let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl + addImport symbolPtr + return $ CmmLoad (cmmMakePicReference dflags symbolPtr) (bWord dflags) + + AccessDirectly -> case referenceKind of + -- for data, we might have to make some calculations: + DataReference -> return $ cmmMakePicReference dflags lbl + -- all currently supported processors support + -- PC-relative branch and call instructions, + -- so just jump there if it's a call or a jump + _ -> return $ CmmLit $ CmmLabel lbl + + +-- ----------------------------------------------------------------------------- +-- Create a position independent reference to a label. +-- (but do not bother with dynamic linking). 
+-- We calculate the label's address by adding some (platform-dependent) +-- offset to our base register; this offset is calculated by +-- the function picRelative in the platform-dependent part below. + +cmmMakePicReference :: DynFlags -> CLabel -> CmmExpr +cmmMakePicReference dflags lbl + + -- Windows doesn't need PIC, + -- everything gets relocated at runtime + | OSMinGW32 <- platformOS $ targetPlatform dflags + = CmmLit $ CmmLabel lbl + + | OSAIX <- platformOS $ targetPlatform dflags + = CmmMachOp (MO_Add W32) + [ CmmReg (CmmGlobal PicBaseReg) + , CmmLit $ picRelative dflags + (platformArch $ targetPlatform dflags) + (platformOS $ targetPlatform dflags) + lbl ] + + -- both ABI versions default to medium code model + | ArchPPC_64 _ <- platformArch $ targetPlatform dflags + = CmmMachOp (MO_Add W32) -- code model medium + [ CmmReg (CmmGlobal PicBaseReg) + , CmmLit $ picRelative dflags + (platformArch $ targetPlatform dflags) + (platformOS $ targetPlatform dflags) + lbl ] + + | (positionIndependent dflags || gopt Opt_ExternalDynamicRefs dflags) + && absoluteLabel lbl + = CmmMachOp (MO_Add (wordWidth dflags)) + [ CmmReg (CmmGlobal PicBaseReg) + , CmmLit $ picRelative dflags + (platformArch $ targetPlatform dflags) + (platformOS $ targetPlatform dflags) + lbl ] + + | otherwise + = CmmLit $ CmmLabel lbl + + +absoluteLabel :: CLabel -> Bool +absoluteLabel lbl + = case dynamicLinkerLabelInfo lbl of + Just (GotSymbolPtr, _) -> False + Just (GotSymbolOffset, _) -> False + _ -> True + + +-------------------------------------------------------------------------------- +-- Knowledge about how special dynamic linker labels like symbol +-- pointers, code stubs and GOT offsets look like is located in the +-- module CLabel. + +-- We have to decide which labels need to be accessed +-- indirectly or via a piece of stub code. +data LabelAccessStyle + = AccessViaStub + | AccessViaSymbolPtr + | AccessDirectly + +howToAccessLabel + :: DynFlags -> Arch -> OS -> Module -> ReferenceKind -> CLabel -> LabelAccessStyle + + +-- Windows +-- In Windows speak, a "module" is a set of objects linked into the +-- same Portable Executable (PE) file. (both .exe and .dll files are PEs). +-- +-- If we're compiling a multi-module program then symbols from other modules +-- are accessed by a symbol pointer named __imp_SYMBOL. At runtime we have the +-- following. +-- +-- (in the local module) +-- __imp_SYMBOL: addr of SYMBOL +-- +-- (in the other module) +-- SYMBOL: the real function / data. +-- +-- To access the function at SYMBOL from our local module, we just need to +-- dereference the local __imp_SYMBOL. +-- +-- If not compiling with -dynamic we assume that all our code will be linked +-- into the same .exe file. In this case we always access symbols directly, +-- and never use __imp_SYMBOL. +-- +howToAccessLabel dflags _ OSMinGW32 this_mod _ lbl + + -- Assume all symbols will be in the same PE, so just access them directly. + | not (gopt Opt_ExternalDynamicRefs dflags) + = AccessDirectly + + -- If the target symbol is in another PE we need to access it via the + -- appropriate __imp_SYMBOL pointer. + | labelDynamic dflags this_mod lbl + = AccessViaSymbolPtr + + -- Target symbol is in the same PE as the caller, so just access it directly. 
+ | otherwise + = AccessDirectly + + +-- Mach-O (Darwin, Mac OS X) +-- +-- Indirect access is required in the following cases: +-- * things imported from a dynamic library +-- * (not on x86_64) data from a different module, if we're generating PIC code +-- It is always possible to access something indirectly, +-- even when it's not necessary. +-- +howToAccessLabel dflags arch OSDarwin this_mod DataReference lbl + -- data access to a dynamic library goes via a symbol pointer + | labelDynamic dflags this_mod lbl + = AccessViaSymbolPtr + + -- when generating PIC code, all cross-module data references must + -- must go via a symbol pointer, too, because the assembler + -- cannot generate code for a label difference where one + -- label is undefined. Doesn't apply t x86_64. + -- Unfortunately, we don't know whether it's cross-module, + -- so we do it for all externally visible labels. + -- This is a slight waste of time and space, but otherwise + -- we'd need to pass the current Module all the way in to + -- this function. + | arch /= ArchX86_64 + , positionIndependent dflags && externallyVisibleCLabel lbl + = AccessViaSymbolPtr + + | otherwise + = AccessDirectly + +howToAccessLabel dflags arch OSDarwin this_mod JumpReference lbl + -- dyld code stubs don't work for tailcalls because the + -- stack alignment is only right for regular calls. + -- Therefore, we have to go via a symbol pointer: + | arch == ArchX86 || arch == ArchX86_64 + , labelDynamic dflags this_mod lbl + = AccessViaSymbolPtr + + +howToAccessLabel dflags arch OSDarwin this_mod _ lbl + -- Code stubs are the usual method of choice for imported code; + -- not needed on x86_64 because Apple's new linker, ld64, generates + -- them automatically. + | arch /= ArchX86_64 + , labelDynamic dflags this_mod lbl + = AccessViaStub + + | otherwise + = AccessDirectly + + +---------------------------------------------------------------------------- +-- AIX + +-- quite simple (for now) +howToAccessLabel _dflags _arch OSAIX _this_mod kind _lbl + = case kind of + DataReference -> AccessViaSymbolPtr + CallReference -> AccessDirectly + JumpReference -> AccessDirectly + +-- ELF (Linux) +-- +-- ELF tries to pretend to the main application code that dynamic linking does +-- not exist. While this may sound convenient, it tends to mess things up in +-- very bad ways, so we have to be careful when we generate code for a non-PIE +-- main program (-dynamic but no -fPIC). +-- +-- Indirect access is required for references to imported symbols +-- from position independent code. It is also required from the main program +-- when dynamic libraries containing Haskell code are used. + +howToAccessLabel _ (ArchPPC_64 _) os _ kind _ + | osElfTarget os + = case kind of + -- ELF PPC64 (powerpc64-linux), AIX, MacOS 9, BeOS/PPC + DataReference -> AccessViaSymbolPtr + -- RTLD does not generate stubs for function descriptors + -- in tail calls. Create a symbol pointer and generate + -- the code to load the function descriptor at the call site. + JumpReference -> AccessViaSymbolPtr + -- regular calls are handled by the runtime linker + _ -> AccessDirectly + +howToAccessLabel dflags _ os _ _ _ + -- no PIC -> the dynamic linker does everything for us; + -- if we don't dynamically link to Haskell code, + -- it actually manages to do so without messing things up. 
+ | osElfTarget os + , not (positionIndependent dflags) && + not (gopt Opt_ExternalDynamicRefs dflags) + = AccessDirectly + +howToAccessLabel dflags arch os this_mod DataReference lbl + | osElfTarget os + = case () of + -- A dynamic label needs to be accessed via a symbol pointer. + _ | labelDynamic dflags this_mod lbl + -> AccessViaSymbolPtr + + -- For PowerPC32 -fPIC, we have to access even static data + -- via a symbol pointer (see below for an explanation why + -- PowerPC32 Linux is especially broken). + | arch == ArchPPC + , positionIndependent dflags + -> AccessViaSymbolPtr + + | otherwise + -> AccessDirectly + + + -- In most cases, we have to avoid symbol stubs on ELF, for the following reasons: + -- on i386, the position-independent symbol stubs in the Procedure Linkage Table + -- require the address of the GOT to be loaded into register %ebx on entry. + -- The linker will take any reference to the symbol stub as a hint that + -- the label in question is a code label. When linking executables, this + -- will cause the linker to replace even data references to the label with + -- references to the symbol stub. + + -- This leaves calling a (foreign) function from non-PIC code + -- (AccessDirectly, because we get an implicit symbol stub) + -- and calling functions from PIC code on non-i386 platforms (via a symbol stub) + +howToAccessLabel dflags arch os this_mod CallReference lbl + | osElfTarget os + , labelDynamic dflags this_mod lbl && not (positionIndependent dflags) + = AccessDirectly + + | osElfTarget os + , arch /= ArchX86 + , labelDynamic dflags this_mod lbl + , positionIndependent dflags + = AccessViaStub + +howToAccessLabel dflags _ os this_mod _ lbl + | osElfTarget os + = if labelDynamic dflags this_mod lbl + then AccessViaSymbolPtr + else AccessDirectly + +-- all other platforms +howToAccessLabel dflags _ _ _ _ _ + | not (positionIndependent dflags) + = AccessDirectly + + | otherwise + = panic "howToAccessLabel: PIC not defined for this platform" + + + +-- ------------------------------------------------------------------- +-- | Says what we have to add to our 'PIC base register' in order to +-- get the address of a label. + +picRelative :: DynFlags -> Arch -> OS -> CLabel -> CmmLit + +-- Darwin, but not x86_64: +-- The PIC base register points to the PIC base label at the beginning +-- of the current CmmDecl. We just have to use a label difference to +-- get the offset. +-- We have already made sure that all labels that are not from the current +-- module are accessed indirectly ('as' can't calculate differences between +-- undefined labels). +picRelative dflags arch OSDarwin lbl + | arch /= ArchX86_64 + = CmmLabelDiffOff lbl mkPicBaseLabel 0 (wordWidth dflags) + +-- On AIX we use an indirect local TOC anchored by 'gotLabel'. +-- This way we use up only one global TOC entry per compilation-unit +-- (this is quite similar to GCC's @-mminimal-toc@ compilation mode) +picRelative dflags _ OSAIX lbl + = CmmLabelDiffOff lbl gotLabel 0 (wordWidth dflags) + +-- PowerPC Linux: +-- The PIC base register points to our fake GOT. Use a label difference +-- to get the offset. +-- We have made sure that *everything* is accessed indirectly, so this +-- is only used for offsets from the GOT to symbol pointers inside the +-- GOT. +picRelative dflags ArchPPC os lbl + | osElfTarget os + = CmmLabelDiffOff lbl gotLabel 0 (wordWidth dflags) + + +-- Most Linux versions: +-- The PIC base register points to the GOT. Use foo@got for symbol +-- pointers, and foo@gotoff for everything else. 
+-- Linux and Darwin on x86_64: +-- The PIC base register is %rip, we use foo@gotpcrel for symbol pointers, +-- and a GotSymbolOffset label for other things. +-- For reasons of tradition, the symbol offset label is written as a plain label. +picRelative _ arch os lbl + | osElfTarget os || (os == OSDarwin && arch == ArchX86_64) + = let result + | Just (SymbolPtr, lbl') <- dynamicLinkerLabelInfo lbl + = CmmLabel $ mkDynamicLinkerLabel GotSymbolPtr lbl' + + | otherwise + = CmmLabel $ mkDynamicLinkerLabel GotSymbolOffset lbl + + in result + +picRelative _ _ _ _ + = panic "PositionIndependentCode.picRelative undefined for this platform" + + + +-------------------------------------------------------------------------------- + +needImportedSymbols :: DynFlags -> Arch -> OS -> Bool +needImportedSymbols dflags arch os + | os == OSDarwin + , arch /= ArchX86_64 + = True + + | os == OSAIX + = True + + -- PowerPC Linux: -fPIC or -dynamic + | osElfTarget os + , arch == ArchPPC + = positionIndependent dflags || gopt Opt_ExternalDynamicRefs dflags + + -- PowerPC 64 Linux: always + | osElfTarget os + , arch == ArchPPC_64 ELF_V1 || arch == ArchPPC_64 ELF_V2 + = True + + -- i386 (and others?): -dynamic but not -fPIC + | osElfTarget os + , arch /= ArchPPC_64 ELF_V1 && arch /= ArchPPC_64 ELF_V2 + = gopt Opt_ExternalDynamicRefs dflags && + not (positionIndependent dflags) + + | otherwise + = False + +-- gotLabel +-- The label used to refer to our "fake GOT" from +-- position-independent code. +gotLabel :: CLabel +gotLabel + -- HACK: this label isn't really foreign + = mkForeignLabel + (fsLit ".LCTOC1") + Nothing ForeignLabelInThisPackage IsData + + + +-------------------------------------------------------------------------------- +-- We don't need to declare any offset tables. +-- However, for PIC on x86, we need a small helper function. +pprGotDeclaration :: DynFlags -> Arch -> OS -> SDoc +pprGotDeclaration dflags ArchX86 OSDarwin + | positionIndependent dflags + = vcat [ + text ".section __TEXT,__textcoal_nt,coalesced,no_toc", + text ".weak_definition ___i686.get_pc_thunk.ax", + text ".private_extern ___i686.get_pc_thunk.ax", + text "___i686.get_pc_thunk.ax:", + text "\tmovl (%esp), %eax", + text "\tret" ] + +pprGotDeclaration _ _ OSDarwin + = empty + +-- Emit XCOFF TOC section +pprGotDeclaration _ _ OSAIX + = vcat $ [ text ".toc" + , text ".tc ghc_toc_table[TC],.LCTOC1" + , text ".csect ghc_toc_table[RW]" + -- See Note [.LCTOC1 in PPC PIC code] + , text ".set .LCTOC1,$+0x8000" + ] + + +-- PPC 64 ELF v1 needs a Table Of Contents (TOC) +pprGotDeclaration _ (ArchPPC_64 ELF_V1) _ + = text ".section \".toc\",\"aw\"" +-- In ELF v2 we also need to tell the assembler that we want ABI +-- version 2. This would normally be done at the top of the file +-- right after a file directive, but I could not figure out how +-- to do that. +pprGotDeclaration _ (ArchPPC_64 ELF_V2) _ + = vcat [ text ".abiversion 2", + text ".section \".toc\",\"aw\"" + ] + +-- Emit GOT declaration +-- Output whatever needs to be output once per .s file. 
+pprGotDeclaration dflags arch os + | osElfTarget os + , arch /= ArchPPC_64 ELF_V1 && arch /= ArchPPC_64 ELF_V2 + , not (positionIndependent dflags) + = empty + + | osElfTarget os + , arch /= ArchPPC_64 ELF_V1 && arch /= ArchPPC_64 ELF_V2 + = vcat [ + -- See Note [.LCTOC1 in PPC PIC code] + text ".section \".got2\",\"aw\"", + text ".LCTOC1 = .+32768" ] + +pprGotDeclaration _ _ _ + = panic "pprGotDeclaration: no match" + + +-------------------------------------------------------------------------------- +-- On Darwin, we have to generate our own stub code for lazy binding.. +-- For each processor architecture, there are two versions, one for PIC +-- and one for non-PIC. +-- + +pprImportedSymbol :: DynFlags -> Platform -> CLabel -> SDoc +pprImportedSymbol dflags (Platform { platformMini = PlatformMini { platformMini_arch = ArchX86, platformMini_os = OSDarwin } }) importedLbl + | Just (CodeStub, lbl) <- dynamicLinkerLabelInfo importedLbl + = case positionIndependent dflags of + False -> + vcat [ + text ".symbol_stub", + text "L" <> pprCLabel dflags lbl <> ptext (sLit "$stub:"), + text "\t.indirect_symbol" <+> pprCLabel dflags lbl, + text "\tjmp *L" <> pprCLabel dflags lbl + <> text "$lazy_ptr", + text "L" <> pprCLabel dflags lbl + <> text "$stub_binder:", + text "\tpushl $L" <> pprCLabel dflags lbl + <> text "$lazy_ptr", + text "\tjmp dyld_stub_binding_helper" + ] + True -> + vcat [ + text ".section __TEXT,__picsymbolstub2," + <> text "symbol_stubs,pure_instructions,25", + text "L" <> pprCLabel dflags lbl <> ptext (sLit "$stub:"), + text "\t.indirect_symbol" <+> pprCLabel dflags lbl, + text "\tcall ___i686.get_pc_thunk.ax", + text "1:", + text "\tmovl L" <> pprCLabel dflags lbl + <> text "$lazy_ptr-1b(%eax),%edx", + text "\tjmp *%edx", + text "L" <> pprCLabel dflags lbl + <> text "$stub_binder:", + text "\tlea L" <> pprCLabel dflags lbl + <> text "$lazy_ptr-1b(%eax),%eax", + text "\tpushl %eax", + text "\tjmp dyld_stub_binding_helper" + ] + $+$ vcat [ text ".section __DATA, __la_sym_ptr" + <> (if positionIndependent dflags then int 2 else int 3) + <> text ",lazy_symbol_pointers", + text "L" <> pprCLabel dflags lbl <> ptext (sLit "$lazy_ptr:"), + text "\t.indirect_symbol" <+> pprCLabel dflags lbl, + text "\t.long L" <> pprCLabel dflags lbl + <> text "$stub_binder"] + + | Just (SymbolPtr, lbl) <- dynamicLinkerLabelInfo importedLbl + = vcat [ + text ".non_lazy_symbol_pointer", + char 'L' <> pprCLabel dflags lbl <> text "$non_lazy_ptr:", + text "\t.indirect_symbol" <+> pprCLabel dflags lbl, + text "\t.long\t0"] + + | otherwise + = empty + + +pprImportedSymbol _ (Platform { platformMini = PlatformMini { platformMini_os = OSDarwin } }) _ + = empty + +-- XCOFF / AIX +-- +-- Similar to PPC64 ELF v1, there's dedicated TOC register (r2). To +-- workaround the limitation of a global TOC we use an indirect TOC +-- with the label `ghc_toc_table`. +-- +-- See also GCC's `-mminimal-toc` compilation mode or +-- http://www.ibm.com/developerworks/rational/library/overview-toc-aix/ +-- +-- NB: No DSO-support yet + +pprImportedSymbol dflags (Platform { platformMini = PlatformMini { platformMini_os = OSAIX } }) importedLbl + = case dynamicLinkerLabelInfo importedLbl of + Just (SymbolPtr, lbl) + -> vcat [ + text "LC.." <> pprCLabel dflags lbl <> char ':', + text "\t.long" <+> pprCLabel dflags lbl ] + _ -> empty + +-- ELF / Linux +-- +-- In theory, we don't need to generate any stubs or symbol pointers +-- by hand for Linux. +-- +-- Reality differs from this in two areas. 
+-- +-- 1) If we just use a dynamically imported symbol directly in a read-only +-- section of the main executable (as GCC does), ld generates R_*_COPY +-- relocations, which are fundamentally incompatible with reversed info +-- tables. Therefore, we need a table of imported addresses in a writable +-- section. +-- The "official" GOT mechanism (label@got) isn't intended to be used +-- in position dependent code, so we have to create our own "fake GOT" +-- when not Opt_PIC && WayDyn `elem` ways dflags. +-- +-- 2) PowerPC Linux is just plain broken. +-- While it's theoretically possible to use GOT offsets larger +-- than 16 bit, the standard crt*.o files don't, which leads to +-- linker errors as soon as the GOT size exceeds 16 bit. +-- Also, the assembler doesn't support @gotoff labels. +-- In order to be able to use a larger GOT, we have to circumvent the +-- entire GOT mechanism and do it ourselves (this is also what GCC does). + + +-- When needImportedSymbols is defined, +-- the NCG will keep track of all DynamicLinkerLabels it uses +-- and output each of them using pprImportedSymbol. + +pprImportedSymbol dflags platform@(Platform { platformMini = PlatformMini { platformMini_arch = ArchPPC_64 _ } }) + importedLbl + | osElfTarget (platformOS platform) + = case dynamicLinkerLabelInfo importedLbl of + Just (SymbolPtr, lbl) + -> vcat [ + text ".section \".toc\", \"aw\"", + text ".LC_" <> pprCLabel dflags lbl <> char ':', + text "\t.quad" <+> pprCLabel dflags lbl ] + _ -> empty + +pprImportedSymbol dflags platform importedLbl + | osElfTarget (platformOS platform) + = case dynamicLinkerLabelInfo importedLbl of + Just (SymbolPtr, lbl) + -> let symbolSize = case wordWidth dflags of + W32 -> sLit "\t.long" + W64 -> sLit "\t.quad" + _ -> panic "Unknown wordRep in pprImportedSymbol" + + in vcat [ + text ".section \".got2\", \"aw\"", + text ".LC_" <> pprCLabel dflags lbl <> char ':', + ptext symbolSize <+> pprCLabel dflags lbl ] + + -- PLT code stubs are generated automatically by the dynamic linker. + _ -> empty + +pprImportedSymbol _ _ _ + = panic "PIC.pprImportedSymbol: no match" + +-------------------------------------------------------------------------------- +-- Generate code to calculate the address that should be put in the +-- PIC base register. +-- This is called by MachCodeGen for every CmmProc that accessed the +-- PIC base register. It adds the appropriate instructions to the +-- top of the CmmProc. + +-- It is assumed that the first NatCmmDecl in the input list is a Proc +-- and the rest are CmmDatas. + +-- Darwin is simple: just fetch the address of a local label. +-- The FETCHPC pseudo-instruction is expanded to multiple instructions +-- during pretty-printing so that we don't have to deal with the +-- local label: + +-- PowerPC version: +-- bcl 20,31,1f. +-- 1: mflr picReg + +-- i386 version: +-- call 1f +-- 1: popl %picReg + + + +-- Get a pointer to our own fake GOT, which is defined on a per-module basis. +-- This is exactly how GCC does it in linux. 
+ +initializePicBase_ppc + :: Arch -> OS -> Reg + -> [NatCmmDecl RawCmmStatics PPC.Instr] + -> NatM [NatCmmDecl RawCmmStatics PPC.Instr] + +initializePicBase_ppc ArchPPC os picReg + (CmmProc info lab live (ListGraph blocks) : statics) + | osElfTarget os + = do + let + gotOffset = PPC.ImmConstantDiff + (PPC.ImmCLbl gotLabel) + (PPC.ImmCLbl mkPicBaseLabel) + + blocks' = case blocks of + [] -> [] + (b:bs) -> fetchPC b : map maybeFetchPC bs + + maybeFetchPC b@(BasicBlock bID _) + | bID `mapMember` info = fetchPC b + | otherwise = b + + -- GCC does PIC prologs thusly: + -- bcl 20,31,.L1 + -- .L1: + -- mflr 30 + -- addis 30,30,.LCTOC1-.L1@ha + -- addi 30,30,.LCTOC1-.L1@l + -- TODO: below we use it over temporary register, + -- it can and should be optimised by picking + -- correct PIC reg. + fetchPC (BasicBlock bID insns) = + BasicBlock bID (PPC.FETCHPC picReg + : PPC.ADDIS picReg picReg (PPC.HA gotOffset) + : PPC.ADD picReg picReg + (PPC.RIImm (PPC.LO gotOffset)) + : PPC.MR PPC.r30 picReg + : insns) + + return (CmmProc info lab live (ListGraph blocks') : statics) + +------------------------------------------------------------------------- +-- Load TOC into register 2 +-- PowerPC 64-bit ELF ABI 2.0 requires the address of the callee +-- in register 12. +-- We pass the label to FETCHTOC and create a .localentry too. +-- TODO: Explain this better and refer to ABI spec! +{- +We would like to do approximately this, but spill slot allocation +might be added before the first BasicBlock. That violates the ABI. + +For now we will emit the prologue code in the pretty printer, +which is also what we do for ELF v1. +initializePicBase_ppc (ArchPPC_64 ELF_V2) OSLinux picReg + (CmmProc info lab live (ListGraph (entry:blocks)) : statics) + = do + bID <-getUniqueM + return (CmmProc info lab live (ListGraph (b':entry:blocks)) + : statics) + where BasicBlock entryID _ = entry + b' = BasicBlock bID [PPC.FETCHTOC picReg lab, + PPC.BCC PPC.ALWAYS entryID] +-} + +initializePicBase_ppc _ _ _ _ + = panic "initializePicBase_ppc: not needed" + + +-- We cheat a bit here by defining a pseudo-instruction named FETCHGOT +-- which pretty-prints as: +-- call 1f +-- 1: popl %picReg +-- addl __GLOBAL_OFFSET_TABLE__+.-1b, %picReg +-- (See PprMach.hs) + +initializePicBase_x86 + :: Arch -> OS -> Reg + -> [NatCmmDecl (Alignment, RawCmmStatics) X86.Instr] + -> NatM [NatCmmDecl (Alignment, RawCmmStatics) X86.Instr] + +initializePicBase_x86 ArchX86 os picReg + (CmmProc info lab live (ListGraph blocks) : statics) + | osElfTarget os + = return (CmmProc info lab live (ListGraph blocks') : statics) + where blocks' = case blocks of + [] -> [] + (b:bs) -> fetchGOT b : map maybeFetchGOT bs + + -- we want to add a FETCHGOT instruction to the beginning of + -- every block that is an entry point, which corresponds to + -- the blocks that have entries in the info-table mapping. 
+ maybeFetchGOT b@(BasicBlock bID _) + | bID `mapMember` info = fetchGOT b + | otherwise = b + + fetchGOT (BasicBlock bID insns) = + BasicBlock bID (X86.FETCHGOT picReg : insns) + +initializePicBase_x86 ArchX86 OSDarwin picReg + (CmmProc info lab live (ListGraph (entry:blocks)) : statics) + = return (CmmProc info lab live (ListGraph (block':blocks)) : statics) + + where BasicBlock bID insns = entry + block' = BasicBlock bID (X86.FETCHPC picReg : insns) + +initializePicBase_x86 _ _ _ _ + = panic "initializePicBase_x86: not needed" + diff --git a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs new file mode 100644 index 0000000000..02319171dc --- /dev/null +++ b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs @@ -0,0 +1,2455 @@ +{-# LANGUAGE CPP, GADTs #-} + +----------------------------------------------------------------------------- +-- +-- Generating machine code (instruction selection) +-- +-- (c) The University of Glasgow 1996-2004 +-- +----------------------------------------------------------------------------- + +-- This is a big module, but, if you pay attention to +-- (a) the sectioning, and (b) the type signatures, +-- the structure should not be too overwhelming. + +module GHC.CmmToAsm.PPC.CodeGen ( + cmmTopCodeGen, + generateJumpTableForInstr, + InstrBlock +) + +where + +#include "HsVersions.h" + +-- NCG stuff: +import GhcPrelude + +import GHC.Platform.Regs +import GHC.CmmToAsm.PPC.Instr +import GHC.CmmToAsm.PPC.Cond +import GHC.CmmToAsm.PPC.Regs +import GHC.CmmToAsm.CPrim +import GHC.CmmToAsm.Monad + ( NatM, getNewRegNat, getNewLabelNat + , getBlockIdNat, getPicBaseNat, getNewRegPairNat + , getPicBaseMaybeNat + ) +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.PIC +import GHC.CmmToAsm.Format +import GHC.Platform.Reg.Class +import GHC.Platform.Reg +import GHC.CmmToAsm.Reg.Target +import GHC.Platform + +-- Our intermediate code: +import GHC.Cmm.BlockId +import GHC.Cmm.Ppr ( pprExpr ) +import GHC.Cmm +import GHC.Cmm.Utils +import GHC.Cmm.Switch +import GHC.Cmm.CLabel +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Graph + +-- The rest: +import OrdList +import Outputable +import GHC.Driver.Session + +import Control.Monad ( mapAndUnzipM, when ) +import Data.Bits +import Data.Word + +import BasicTypes +import FastString +import Util + +-- ----------------------------------------------------------------------------- +-- Top-level of the instruction selector + +-- | 'InstrBlock's are the insn sequences generated by the insn selectors. +-- They are really trees of insns to facilitate fast appending, where a +-- left-to-right traversal (pre-order?) yields the insns in the correct +-- order. 
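+--
+-- A rough sketch of how that looks in practice (an editorial
+-- illustration; InstrBlock is a synonym for OrdList Instr, defined
+-- further down in this module):
+--
+--   code  = toOL [ insn1, insn2 ]      -- build a block from a list
+--   code' = code `snocOL` insn3        -- append a single instruction
+--   both  = code' `appOL` otherCode    -- concatenate two blocks
+--
+-- A final left-to-right traversal (e.g. with foldrOL) then recovers
+-- the instructions in program order.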
+ +cmmTopCodeGen + :: RawCmmDecl + -> NatM [NatCmmDecl RawCmmStatics Instr] + +cmmTopCodeGen (CmmProc info lab live graph) = do + let blocks = toBlockListEntryFirst graph + (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks + dflags <- getDynFlags + let proc = CmmProc info lab live (ListGraph $ concat nat_blocks) + tops = proc : concat statics + os = platformOS $ targetPlatform dflags + arch = platformArch $ targetPlatform dflags + case arch of + ArchPPC | os == OSAIX -> return tops + | otherwise -> do + picBaseMb <- getPicBaseMaybeNat + case picBaseMb of + Just picBase -> initializePicBase_ppc arch os picBase tops + Nothing -> return tops + ArchPPC_64 ELF_V1 -> fixup_entry tops + -- generating function descriptor is handled in + -- pretty printer + ArchPPC_64 ELF_V2 -> fixup_entry tops + -- generating function prologue is handled in + -- pretty printer + _ -> panic "PPC.cmmTopCodeGen: unknown arch" + where + fixup_entry (CmmProc info lab live (ListGraph (entry:blocks)) : statics) + = do + let BasicBlock bID insns = entry + bID' <- if lab == (blockLbl bID) + then newBlockId + else return bID + let b' = BasicBlock bID' insns + return (CmmProc info lab live (ListGraph (b':blocks)) : statics) + fixup_entry _ = panic "cmmTopCodegen: Broken CmmProc" + +cmmTopCodeGen (CmmData sec dat) = do + return [CmmData sec dat] -- no translation, we just use CmmStatic + +basicBlockCodeGen + :: Block CmmNode C C + -> NatM ( [NatBasicBlock Instr] + , [NatCmmDecl RawCmmStatics Instr]) + +basicBlockCodeGen block = do + let (_, nodes, tail) = blockSplit block + id = entryLabel block + stmts = blockToList nodes + mid_instrs <- stmtsToInstrs stmts + tail_instrs <- stmtToInstrs tail + let instrs = mid_instrs `appOL` tail_instrs + -- code generation may introduce new basic block boundaries, which + -- are indicated by the NEWBLOCK instruction. We must split up the + -- instruction stream into basic blocks again. Also, we extract + -- LDATAs here too. 
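+ --
+ -- For example (an editorial sketch): an instruction stream
+ -- [i1, i2, NEWBLOCK b2, i3, LDATA sec d, i4]
+ -- is split by the fold below into BasicBlock id [i1,i2] and
+ -- BasicBlock b2 [i3,i4], while (CmmData sec d) is floated out
+ -- into the accompanying statics.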
+ let + (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs + + mkBlocks (NEWBLOCK id) (instrs,blocks,statics) + = ([], BasicBlock id instrs : blocks, statics) + mkBlocks (LDATA sec dat) (instrs,blocks,statics) + = (instrs, blocks, CmmData sec dat:statics) + mkBlocks instr (instrs,blocks,statics) + = (instr:instrs, blocks, statics) + return (BasicBlock id top : other_blocks, statics) + +stmtsToInstrs :: [CmmNode e x] -> NatM InstrBlock +stmtsToInstrs stmts + = do instrss <- mapM stmtToInstrs stmts + return (concatOL instrss) + +stmtToInstrs :: CmmNode e x -> NatM InstrBlock +stmtToInstrs stmt = do + dflags <- getDynFlags + case stmt of + CmmComment s -> return (unitOL (COMMENT s)) + CmmTick {} -> return nilOL + CmmUnwind {} -> return nilOL + + CmmAssign reg src + | isFloatType ty -> assignReg_FltCode format reg src + | target32Bit (targetPlatform dflags) && + isWord64 ty -> assignReg_I64Code reg src + | otherwise -> assignReg_IntCode format reg src + where ty = cmmRegType dflags reg + format = cmmTypeFormat ty + + CmmStore addr src + | isFloatType ty -> assignMem_FltCode format addr src + | target32Bit (targetPlatform dflags) && + isWord64 ty -> assignMem_I64Code addr src + | otherwise -> assignMem_IntCode format addr src + where ty = cmmExprType dflags src + format = cmmTypeFormat ty + + CmmUnsafeForeignCall target result_regs args + -> genCCall target result_regs args + + CmmBranch id -> genBranch id + CmmCondBranch arg true false prediction -> do + b1 <- genCondJump true arg prediction + b2 <- genBranch false + return (b1 `appOL` b2) + CmmSwitch arg ids -> do dflags <- getDynFlags + genSwitch dflags arg ids + CmmCall { cml_target = arg + , cml_args_regs = gregs } -> do + dflags <- getDynFlags + genJump arg (jumpRegs dflags gregs) + _ -> + panic "stmtToInstrs: statement should have been cps'd away" + +jumpRegs :: DynFlags -> [GlobalReg] -> [Reg] +jumpRegs dflags gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ] + where platform = targetPlatform dflags + +-------------------------------------------------------------------------------- +-- | 'InstrBlock's are the insn sequences generated by the insn selectors. +-- They are really trees of insns to facilitate fast appending, where a +-- left-to-right traversal yields the insns in the correct order. +-- +type InstrBlock + = OrdList Instr + + +-- | Register's passed up the tree. If the stix code forces the register +-- to live in a pre-decided machine register, it comes out as @Fixed@; +-- otherwise, it comes out as @Any@, and the parent can decide which +-- register to put it in. +-- +data Register + = Fixed Format Reg InstrBlock + | Any Format (Reg -> InstrBlock) + + +swizzleRegisterRep :: Register -> Format -> Register +swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code +swizzleRegisterRep (Any _ codefn) format = Any format codefn + + +-- | Grab the Reg for a CmmReg +getRegisterReg :: Platform -> CmmReg -> Reg + +getRegisterReg _ (CmmLocal (LocalReg u pk)) + = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk) + +getRegisterReg platform (CmmGlobal mid) + = case globalRegMaybe platform mid of + Just reg -> RegReal reg + Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid) + -- By this stage, the only MagicIds remaining should be the + -- ones which map to a real machine register on this + -- platform. Hence ... 
+ +-- | Convert a BlockId to some CmmStatic data +jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic +jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags)) +jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel) + where blockLabel = blockLbl blockid + + + +-- ----------------------------------------------------------------------------- +-- General things for putting together code sequences + +-- Expand CmmRegOff. ToDo: should we do it this way around, or convert +-- CmmExprs into CmmRegOff? +mangleIndexTree :: DynFlags -> CmmExpr -> CmmExpr +mangleIndexTree dflags (CmmRegOff reg off) + = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)] + where width = typeWidth (cmmRegType dflags reg) + +mangleIndexTree _ _ + = panic "PPC.CodeGen.mangleIndexTree: no match" + +-- ----------------------------------------------------------------------------- +-- Code gen for 64-bit arithmetic on 32-bit platforms + +{- +Simple support for generating 64-bit code (ie, 64 bit values and 64 +bit assignments) on 32-bit platforms. Unlike the main code generator +we merely shoot for generating working code as simply as possible, and +pay little attention to code quality. Specifically, there is no +attempt to deal cleverly with the fixed-vs-floating register +distinction; all values are generated into (pairs of) floating +registers, even if this would mean some redundant reg-reg moves as a +result. Only one of the VRegUniques is returned, since it will be +of the VRegUniqueLo form, and the upper-half VReg can be determined +by applying getHiVRegFromLo to it. +-} + +data ChildCode64 -- a.k.a "Register64" + = ChildCode64 + InstrBlock -- code + Reg -- the lower 32-bit temporary which contains the + -- result; use getHiVRegFromLo to find the other + -- VRegUnique. Rules of this simplified insn + -- selection game are therefore that the returned + -- Reg may be modified + + +-- | Compute an expression into a register, but +-- we don't mind which one it is. 
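+-- In terms of the 'Register' type above: a 'Fixed' result is returned
+-- as-is, while an 'Any' result is materialised into a fresh virtual
+-- register obtained with getNewRegNat.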
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock) +getSomeReg expr = do + r <- getRegister expr + case r of + Any rep code -> do + tmp <- getNewRegNat rep + return (tmp, code tmp) + Fixed _ reg code -> + return (reg, code) + +getI64Amodes :: CmmExpr -> NatM (AddrMode, AddrMode, InstrBlock) +getI64Amodes addrTree = do + Amode hi_addr addr_code <- getAmode D addrTree + case addrOffset hi_addr 4 of + Just lo_addr -> return (hi_addr, lo_addr, addr_code) + Nothing -> do (hi_ptr, code) <- getSomeReg addrTree + return (AddrRegImm hi_ptr (ImmInt 0), + AddrRegImm hi_ptr (ImmInt 4), + code) + + +assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock +assignMem_I64Code addrTree valueTree = do + (hi_addr, lo_addr, addr_code) <- getI64Amodes addrTree + ChildCode64 vcode rlo <- iselExpr64 valueTree + let + rhi = getHiVRegFromLo rlo + + -- Big-endian store + mov_hi = ST II32 rhi hi_addr + mov_lo = ST II32 rlo lo_addr + return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi) + + +assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock +assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do + ChildCode64 vcode r_src_lo <- iselExpr64 valueTree + let + r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32 + r_dst_hi = getHiVRegFromLo r_dst_lo + r_src_hi = getHiVRegFromLo r_src_lo + mov_lo = MR r_dst_lo r_src_lo + mov_hi = MR r_dst_hi r_src_hi + return ( + vcode `snocOL` mov_lo `snocOL` mov_hi + ) + +assignReg_I64Code _ _ + = panic "assignReg_I64Code(powerpc): invalid lvalue" + + +iselExpr64 :: CmmExpr -> NatM ChildCode64 +iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do + (hi_addr, lo_addr, addr_code) <- getI64Amodes addrTree + (rlo, rhi) <- getNewRegPairNat II32 + let mov_hi = LD II32 rhi hi_addr + mov_lo = LD II32 rlo lo_addr + return $ ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi) + rlo + +iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty + = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32)) + +iselExpr64 (CmmLit (CmmInt i _)) = do + (rlo,rhi) <- getNewRegPairNat II32 + let + half0 = fromIntegral (fromIntegral i :: Word16) + half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16) + half2 = fromIntegral (fromIntegral (i `shiftR` 32) :: Word16) + half3 = fromIntegral (fromIntegral (i `shiftR` 48) :: Word16) + + code = toOL [ + LIS rlo (ImmInt half1), + OR rlo rlo (RIImm $ ImmInt half0), + LIS rhi (ImmInt half3), + OR rhi rhi (RIImm $ ImmInt half2) + ] + return (ChildCode64 code rlo) + +iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do + ChildCode64 code1 r1lo <- iselExpr64 e1 + ChildCode64 code2 r2lo <- iselExpr64 e2 + (rlo,rhi) <- getNewRegPairNat II32 + let + r1hi = getHiVRegFromLo r1lo + r2hi = getHiVRegFromLo r2lo + code = code1 `appOL` + code2 `appOL` + toOL [ ADDC rlo r1lo r2lo, + ADDE rhi r1hi r2hi ] + return (ChildCode64 code rlo) + +iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do + ChildCode64 code1 r1lo <- iselExpr64 e1 + ChildCode64 code2 r2lo <- iselExpr64 e2 + (rlo,rhi) <- getNewRegPairNat II32 + let + r1hi = getHiVRegFromLo r1lo + r2hi = getHiVRegFromLo r2lo + code = code1 `appOL` + code2 `appOL` + toOL [ SUBFC rlo r2lo (RIReg r1lo), + SUBFE rhi r2hi r1hi ] + return (ChildCode64 code rlo) + +iselExpr64 (CmmMachOp (MO_UU_Conv W32 W64) [expr]) = do + (expr_reg,expr_code) <- getSomeReg expr + (rlo, rhi) <- getNewRegPairNat II32 + let mov_hi = LI rhi (ImmInt 0) + mov_lo = MR rlo expr_reg + return $ ChildCode64 (expr_code `snocOL` mov_lo `snocOL` mov_hi) + rlo + +iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do + (expr_reg,expr_code) <- 
getSomeReg expr + (rlo, rhi) <- getNewRegPairNat II32 + let mov_hi = SRA II32 rhi expr_reg (RIImm (ImmInt 31)) + mov_lo = MR rlo expr_reg + return $ ChildCode64 (expr_code `snocOL` mov_lo `snocOL` mov_hi) + rlo +iselExpr64 expr + = pprPanic "iselExpr64(powerpc)" (pprExpr expr) + + + +getRegister :: CmmExpr -> NatM Register +getRegister e = do dflags <- getDynFlags + getRegister' dflags e + +getRegister' :: DynFlags -> CmmExpr -> NatM Register + +getRegister' dflags (CmmReg (CmmGlobal PicBaseReg)) + | OSAIX <- platformOS (targetPlatform dflags) = do + let code dst = toOL [ LD II32 dst tocAddr ] + tocAddr = AddrRegImm toc (ImmLit (text "ghc_toc_table[TC]")) + return (Any II32 code) + | target32Bit (targetPlatform dflags) = do + reg <- getPicBaseNat $ archWordFormat (target32Bit (targetPlatform dflags)) + return (Fixed (archWordFormat (target32Bit (targetPlatform dflags))) + reg nilOL) + | otherwise = return (Fixed II64 toc nilOL) + +getRegister' dflags (CmmReg reg) + = return (Fixed (cmmTypeFormat (cmmRegType dflags reg)) + (getRegisterReg (targetPlatform dflags) reg) nilOL) + +getRegister' dflags tree@(CmmRegOff _ _) + = getRegister' dflags (mangleIndexTree dflags tree) + + -- for 32-bit architectures, support some 64 -> 32 bit conversions: + -- TO_W_(x), TO_W_(x >> 32) + +getRegister' dflags (CmmMachOp (MO_UU_Conv W64 W32) + [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) + | target32Bit (targetPlatform dflags) = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 (getHiVRegFromLo rlo) code + +getRegister' dflags (CmmMachOp (MO_SS_Conv W64 W32) + [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) + | target32Bit (targetPlatform dflags) = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 (getHiVRegFromLo rlo) code + +getRegister' dflags (CmmMachOp (MO_UU_Conv W64 W32) [x]) + | target32Bit (targetPlatform dflags) = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 rlo code + +getRegister' dflags (CmmMachOp (MO_SS_Conv W64 W32) [x]) + | target32Bit (targetPlatform dflags) = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 rlo code + +getRegister' dflags (CmmLoad mem pk) + | not (isWord64 pk) = do + let platform = targetPlatform dflags + Amode addr addr_code <- getAmode D mem + let code dst = ASSERT((targetClassOfReg platform dst == RcDouble) == isFloatType pk) + addr_code `snocOL` LD format dst addr + return (Any format code) + | not (target32Bit (targetPlatform dflags)) = do + Amode addr addr_code <- getAmode DS mem + let code dst = addr_code `snocOL` LD II64 dst addr + return (Any II64 code) + + where format = cmmTypeFormat pk + +-- catch simple cases of zero- or sign-extended load +getRegister' _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad mem _]) = do + Amode addr addr_code <- getAmode D mem + return (Any II32 (\dst -> addr_code `snocOL` LD II8 dst addr)) + +getRegister' _ (CmmMachOp (MO_XX_Conv W8 W32) [CmmLoad mem _]) = do + Amode addr addr_code <- getAmode D mem + return (Any II32 (\dst -> addr_code `snocOL` LD II8 dst addr)) + +getRegister' _ (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad mem _]) = do + Amode addr addr_code <- getAmode D mem + return (Any II64 (\dst -> addr_code `snocOL` LD II8 dst addr)) + +getRegister' _ (CmmMachOp (MO_XX_Conv W8 W64) [CmmLoad mem _]) = do + Amode addr addr_code <- getAmode D mem + return (Any II64 (\dst -> addr_code `snocOL` LD II8 dst addr)) + +-- Note: there is no Load Byte Arithmetic instruction, so no signed case here + +getRegister' _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad mem _]) = do + 
Amode addr addr_code <- getAmode D mem + return (Any II32 (\dst -> addr_code `snocOL` LD II16 dst addr)) + +getRegister' _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad mem _]) = do + Amode addr addr_code <- getAmode D mem + return (Any II32 (\dst -> addr_code `snocOL` LA II16 dst addr)) + +getRegister' _ (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad mem _]) = do + Amode addr addr_code <- getAmode D mem + return (Any II64 (\dst -> addr_code `snocOL` LD II16 dst addr)) + +getRegister' _ (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad mem _]) = do + Amode addr addr_code <- getAmode D mem + return (Any II64 (\dst -> addr_code `snocOL` LA II16 dst addr)) + +getRegister' _ (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad mem _]) = do + Amode addr addr_code <- getAmode D mem + return (Any II64 (\dst -> addr_code `snocOL` LD II32 dst addr)) + +getRegister' _ (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad mem _]) = do + -- lwa is DS-form. See Note [Power instruction format] + Amode addr addr_code <- getAmode DS mem + return (Any II64 (\dst -> addr_code `snocOL` LA II32 dst addr)) + +getRegister' dflags (CmmMachOp mop [x]) -- unary MachOps + = case mop of + MO_Not rep -> triv_ucode_int rep NOT + + MO_F_Neg w -> triv_ucode_float w FNEG + MO_S_Neg w -> triv_ucode_int w NEG + + MO_FF_Conv W64 W32 -> trivialUCode FF32 FRSP x + MO_FF_Conv W32 W64 -> conversionNop FF64 x + + MO_FS_Conv from to -> coerceFP2Int from to x + MO_SF_Conv from to -> coerceInt2FP from to x + + MO_SS_Conv from to + | from >= to -> conversionNop (intFormat to) x + | otherwise -> triv_ucode_int to (EXTS (intFormat from)) + + MO_UU_Conv from to + | from >= to -> conversionNop (intFormat to) x + | otherwise -> clearLeft from to + + MO_XX_Conv _ to -> conversionNop (intFormat to) x + + _ -> panic "PPC.CodeGen.getRegister: no match" + + where + triv_ucode_int width instr = trivialUCode (intFormat width) instr x + triv_ucode_float width instr = trivialUCode (floatFormat width) instr x + + conversionNop new_format expr + = do e_code <- getRegister' dflags expr + return (swizzleRegisterRep e_code new_format) + + clearLeft from to + = do (src1, code1) <- getSomeReg x + let arch_fmt = intFormat (wordWidth dflags) + arch_bits = widthInBits (wordWidth dflags) + size = widthInBits from + code dst = code1 `snocOL` + CLRLI arch_fmt dst src1 (arch_bits - size) + return (Any (intFormat to) code) + +getRegister' _ (CmmMachOp mop [x, y]) -- dyadic PrimOps + = case mop of + MO_F_Eq _ -> condFltReg EQQ x y + MO_F_Ne _ -> condFltReg NE x y + MO_F_Gt _ -> condFltReg GTT x y + MO_F_Ge _ -> condFltReg GE x y + MO_F_Lt _ -> condFltReg LTT x y + MO_F_Le _ -> condFltReg LE x y + + MO_Eq rep -> condIntReg EQQ rep x y + MO_Ne rep -> condIntReg NE rep x y + + MO_S_Gt rep -> condIntReg GTT rep x y + MO_S_Ge rep -> condIntReg GE rep x y + MO_S_Lt rep -> condIntReg LTT rep x y + MO_S_Le rep -> condIntReg LE rep x y + + MO_U_Gt rep -> condIntReg GU rep x y + MO_U_Ge rep -> condIntReg GEU rep x y + MO_U_Lt rep -> condIntReg LU rep x y + MO_U_Le rep -> condIntReg LEU rep x y + + MO_F_Add w -> triv_float w FADD + MO_F_Sub w -> triv_float w FSUB + MO_F_Mul w -> triv_float w FMUL + MO_F_Quot w -> triv_float w FDIV + + -- optimize addition with 32-bit immediate + -- (needed for PIC) + MO_Add W32 -> + case y of + CmmLit (CmmInt imm immrep) | Just _ <- makeImmediate W32 True imm + -> trivialCode W32 True ADD x (CmmLit $ CmmInt imm immrep) + CmmLit lit + -> do + (src, srcCode) <- getSomeReg x + let imm = litToImm lit + code dst = srcCode `appOL` toOL [ + ADDIS dst src (HA imm), + ADD dst dst (RIImm (LO 
imm)) + ] + return (Any II32 code) + _ -> trivialCode W32 True ADD x y + + MO_Add rep -> trivialCode rep True ADD x y + MO_Sub rep -> + case y of + CmmLit (CmmInt imm immrep) | Just _ <- makeImmediate rep True (-imm) + -> trivialCode rep True ADD x (CmmLit $ CmmInt (-imm) immrep) + _ -> case x of + CmmLit (CmmInt imm _) + | Just _ <- makeImmediate rep True imm + -- subfi ('subtract from' with immediate) doesn't exist + -> trivialCode rep True SUBFC y x + _ -> trivialCodeNoImm' (intFormat rep) SUBF y x + + MO_Mul rep -> shiftMulCode rep True MULL x y + MO_S_MulMayOflo rep -> do + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + let + format = intFormat rep + code dst = code1 `appOL` code2 + `appOL` toOL [ MULLO format dst src1 src2 + , MFOV format dst + ] + return (Any format code) + + MO_S_Quot rep -> divCode rep True x y + MO_U_Quot rep -> divCode rep False x y + + MO_S_Rem rep -> remainder rep True x y + MO_U_Rem rep -> remainder rep False x y + + MO_And rep -> case y of + (CmmLit (CmmInt imm _)) | imm == -8 || imm == -4 + -> do + (src, srcCode) <- getSomeReg x + let clear_mask = if imm == -4 then 2 else 3 + fmt = intFormat rep + code dst = srcCode + `appOL` unitOL (CLRRI fmt dst src clear_mask) + return (Any fmt code) + _ -> trivialCode rep False AND x y + MO_Or rep -> trivialCode rep False OR x y + MO_Xor rep -> trivialCode rep False XOR x y + + MO_Shl rep -> shiftMulCode rep False SL x y + MO_S_Shr rep -> srCode rep True SRA x y + MO_U_Shr rep -> srCode rep False SR x y + _ -> panic "PPC.CodeGen.getRegister: no match" + + where + triv_float :: Width -> (Format -> Reg -> Reg -> Reg -> Instr) -> NatM Register + triv_float width instr = trivialCodeNoImm (floatFormat width) instr x y + + remainder :: Width -> Bool -> CmmExpr -> CmmExpr -> NatM Register + remainder rep sgn x y = do + let fmt = intFormat rep + tmp <- getNewRegNat fmt + code <- remainderCode rep sgn tmp x y + return (Any fmt code) + + +getRegister' _ (CmmLit (CmmInt i rep)) + | Just imm <- makeImmediate rep True i + = let + code dst = unitOL (LI dst imm) + in + return (Any (intFormat rep) code) + +getRegister' _ (CmmLit (CmmFloat f frep)) = do + lbl <- getNewLabelNat + dflags <- getDynFlags + dynRef <- cmmMakeDynamicReference dflags DataReference lbl + Amode addr addr_code <- getAmode D dynRef + let format = floatFormat frep + code dst = + LDATA (Section ReadOnlyData lbl) + (RawCmmStatics lbl [CmmStaticLit (CmmFloat f frep)]) + `consOL` (addr_code `snocOL` LD format dst addr) + return (Any format code) + +getRegister' dflags (CmmLit lit) + | target32Bit (targetPlatform dflags) + = let rep = cmmLitType dflags lit + imm = litToImm lit + code dst = toOL [ + LIS dst (HA imm), + ADD dst dst (RIImm (LO imm)) + ] + in return (Any (cmmTypeFormat rep) code) + | otherwise + = do lbl <- getNewLabelNat + dflags <- getDynFlags + dynRef <- cmmMakeDynamicReference dflags DataReference lbl + Amode addr addr_code <- getAmode D dynRef + let rep = cmmLitType dflags lit + format = cmmTypeFormat rep + code dst = + LDATA (Section ReadOnlyData lbl) (RawCmmStatics lbl [CmmStaticLit lit]) + `consOL` (addr_code `snocOL` LD format dst addr) + return (Any format code) + +getRegister' _ other = pprPanic "getRegister(ppc)" (pprExpr other) + + -- extend?Rep: wrap integer expression of type `from` + -- in a conversion to `to` +extendSExpr :: Width -> Width -> CmmExpr -> CmmExpr +extendSExpr from to x = CmmMachOp (MO_SS_Conv from to) [x] + +extendUExpr :: Width -> Width -> CmmExpr -> CmmExpr +extendUExpr from to x = CmmMachOp (MO_UU_Conv from 
to) [x] + +-- ----------------------------------------------------------------------------- +-- The 'Amode' type: Memory addressing modes passed up the tree. + +data Amode + = Amode AddrMode InstrBlock + +{- +Now, given a tree (the argument to a CmmLoad) that references memory, +produce a suitable addressing mode. + +A Rule of the Game (tm) for Amodes: use of the addr bit must +immediately follow use of the code part, since the code part puts +values in registers which the addr then refers to. So you can't put +anything in between, lest it overwrite some of those registers. If +you need to do some other computation between the code part and use of +the addr bit, first store the effective address from the amode in a +temporary, then do the other computation, and then use the temporary: + + code + LEA amode, tmp + ... other computation ... + ... (tmp) ... +-} + +{- Note [Power instruction format] +In some instructions the 16 bit offset must be a multiple of 4, i.e. +the two least significant bits must be zero. The "Power ISA" specification +calls these instruction formats "DS-FORM" and the instructions with +arbitrary 16 bit offsets are "D-FORM". + +The Power ISA specification document can be obtained from www.power.org. +-} +data InstrForm = D | DS + +getAmode :: InstrForm -> CmmExpr -> NatM Amode +getAmode inf tree@(CmmRegOff _ _) + = do dflags <- getDynFlags + getAmode inf (mangleIndexTree dflags tree) + +getAmode _ (CmmMachOp (MO_Sub W32) [x, CmmLit (CmmInt i _)]) + | Just off <- makeImmediate W32 True (-i) + = do + (reg, code) <- getSomeReg x + return (Amode (AddrRegImm reg off) code) + + +getAmode _ (CmmMachOp (MO_Add W32) [x, CmmLit (CmmInt i _)]) + | Just off <- makeImmediate W32 True i + = do + (reg, code) <- getSomeReg x + return (Amode (AddrRegImm reg off) code) + +getAmode D (CmmMachOp (MO_Sub W64) [x, CmmLit (CmmInt i _)]) + | Just off <- makeImmediate W64 True (-i) + = do + (reg, code) <- getSomeReg x + return (Amode (AddrRegImm reg off) code) + + +getAmode D (CmmMachOp (MO_Add W64) [x, CmmLit (CmmInt i _)]) + | Just off <- makeImmediate W64 True i + = do + (reg, code) <- getSomeReg x + return (Amode (AddrRegImm reg off) code) + +getAmode DS (CmmMachOp (MO_Sub W64) [x, CmmLit (CmmInt i _)]) + | Just off <- makeImmediate W64 True (-i) + = do + (reg, code) <- getSomeReg x + (reg', off', code') <- + if i `mod` 4 == 0 + then do return (reg, off, code) + else do + tmp <- getNewRegNat II64 + return (tmp, ImmInt 0, + code `snocOL` ADD tmp reg (RIImm off)) + return (Amode (AddrRegImm reg' off') code') + +getAmode DS (CmmMachOp (MO_Add W64) [x, CmmLit (CmmInt i _)]) + | Just off <- makeImmediate W64 True i + = do + (reg, code) <- getSomeReg x + (reg', off', code') <- + if i `mod` 4 == 0 + then do return (reg, off, code) + else do + tmp <- getNewRegNat II64 + return (tmp, ImmInt 0, + code `snocOL` ADD tmp reg (RIImm off)) + return (Amode (AddrRegImm reg' off') code') + + -- optimize addition with 32-bit immediate + -- (needed for PIC) +getAmode _ (CmmMachOp (MO_Add W32) [x, CmmLit lit]) + = do + dflags <- getDynFlags + (src, srcCode) <- getSomeReg x + let imm = litToImm lit + case () of + _ | OSAIX <- platformOS (targetPlatform dflags) + , isCmmLabelType lit -> + -- HA16/LO16 relocations on labels not supported on AIX + return (Amode (AddrRegImm src imm) srcCode) + | otherwise -> do + tmp <- getNewRegNat II32 + let code = srcCode `snocOL` ADDIS tmp src (HA imm) + return (Amode (AddrRegImm tmp (LO imm)) code) + where + isCmmLabelType (CmmLabel {}) = True + isCmmLabelType (CmmLabelOff {}) 
= True + isCmmLabelType (CmmLabelDiffOff {}) = True + isCmmLabelType _ = False + +getAmode _ (CmmLit lit) + = do + dflags <- getDynFlags + case platformArch $ targetPlatform dflags of + ArchPPC -> do + tmp <- getNewRegNat II32 + let imm = litToImm lit + code = unitOL (LIS tmp (HA imm)) + return (Amode (AddrRegImm tmp (LO imm)) code) + _ -> do -- TODO: Load from TOC, + -- see getRegister' _ (CmmLit lit) + tmp <- getNewRegNat II64 + let imm = litToImm lit + code = toOL [ + LIS tmp (HIGHESTA imm), + OR tmp tmp (RIImm (HIGHERA imm)), + SL II64 tmp tmp (RIImm (ImmInt 32)), + ORIS tmp tmp (HA imm) + ] + return (Amode (AddrRegImm tmp (LO imm)) code) + +getAmode _ (CmmMachOp (MO_Add W32) [x, y]) + = do + (regX, codeX) <- getSomeReg x + (regY, codeY) <- getSomeReg y + return (Amode (AddrRegReg regX regY) (codeX `appOL` codeY)) + +getAmode _ (CmmMachOp (MO_Add W64) [x, y]) + = do + (regX, codeX) <- getSomeReg x + (regY, codeY) <- getSomeReg y + return (Amode (AddrRegReg regX regY) (codeX `appOL` codeY)) + +getAmode _ other + = do + (reg, code) <- getSomeReg other + let + off = ImmInt 0 + return (Amode (AddrRegImm reg off) code) + + +-- The 'CondCode' type: Condition codes passed up the tree. +data CondCode + = CondCode Bool Cond InstrBlock + +-- Set up a condition code for a conditional branch. + +getCondCode :: CmmExpr -> NatM CondCode + +-- almost the same as everywhere else - but we need to +-- extend small integers to 32 bit or 64 bit first + +getCondCode (CmmMachOp mop [x, y]) + = do + case mop of + MO_F_Eq W32 -> condFltCode EQQ x y + MO_F_Ne W32 -> condFltCode NE x y + MO_F_Gt W32 -> condFltCode GTT x y + MO_F_Ge W32 -> condFltCode GE x y + MO_F_Lt W32 -> condFltCode LTT x y + MO_F_Le W32 -> condFltCode LE x y + + MO_F_Eq W64 -> condFltCode EQQ x y + MO_F_Ne W64 -> condFltCode NE x y + MO_F_Gt W64 -> condFltCode GTT x y + MO_F_Ge W64 -> condFltCode GE x y + MO_F_Lt W64 -> condFltCode LTT x y + MO_F_Le W64 -> condFltCode LE x y + + MO_Eq rep -> condIntCode EQQ rep x y + MO_Ne rep -> condIntCode NE rep x y + + MO_S_Gt rep -> condIntCode GTT rep x y + MO_S_Ge rep -> condIntCode GE rep x y + MO_S_Lt rep -> condIntCode LTT rep x y + MO_S_Le rep -> condIntCode LE rep x y + + MO_U_Gt rep -> condIntCode GU rep x y + MO_U_Ge rep -> condIntCode GEU rep x y + MO_U_Lt rep -> condIntCode LU rep x y + MO_U_Le rep -> condIntCode LEU rep x y + + _ -> pprPanic "getCondCode(powerpc)" (pprMachOp mop) + +getCondCode _ = panic "getCondCode(2)(powerpc)" + + +-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be +-- passed back up the tree. 
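+
+-- For instance, a comparison such as (CmmMachOp (MO_S_Lt W32) [x, y]),
+-- where x and y stand for arbitrary operands, goes through condIntCode:
+-- the operands are sign-extended to the operation length and compared
+-- with CMP, the resulting CondCode carries the LTT condition, and
+-- genCondJump finally turns that into `BCC LTT target prediction`.
+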
+ +condIntCode :: Cond -> Width -> CmmExpr -> CmmExpr -> NatM CondCode +condIntCode cond width x y = do + dflags <- getDynFlags + condIntCode' (target32Bit (targetPlatform dflags)) cond width x y + +condIntCode' :: Bool -> Cond -> Width -> CmmExpr -> CmmExpr -> NatM CondCode + +-- simple code for 64-bit on 32-bit platforms +condIntCode' True cond W64 x y + | condUnsigned cond + = do + ChildCode64 code_x x_lo <- iselExpr64 x + ChildCode64 code_y y_lo <- iselExpr64 y + let x_hi = getHiVRegFromLo x_lo + y_hi = getHiVRegFromLo y_lo + end_lbl <- getBlockIdNat + let code = code_x `appOL` code_y `appOL` toOL + [ CMPL II32 x_hi (RIReg y_hi) + , BCC NE end_lbl Nothing + , CMPL II32 x_lo (RIReg y_lo) + , BCC ALWAYS end_lbl Nothing + + , NEWBLOCK end_lbl + ] + return (CondCode False cond code) + | otherwise + = do + ChildCode64 code_x x_lo <- iselExpr64 x + ChildCode64 code_y y_lo <- iselExpr64 y + let x_hi = getHiVRegFromLo x_lo + y_hi = getHiVRegFromLo y_lo + end_lbl <- getBlockIdNat + cmp_lo <- getBlockIdNat + let code = code_x `appOL` code_y `appOL` toOL + [ CMP II32 x_hi (RIReg y_hi) + , BCC NE end_lbl Nothing + , CMP II32 x_hi (RIImm (ImmInt 0)) + , BCC LE cmp_lo Nothing + , CMPL II32 x_lo (RIReg y_lo) + , BCC ALWAYS end_lbl Nothing + , NEWBLOCK cmp_lo + , CMPL II32 y_lo (RIReg x_lo) + , BCC ALWAYS end_lbl Nothing + + , NEWBLOCK end_lbl + ] + return (CondCode False cond code) + +-- optimize pointer tag checks. Operation andi. sets condition register +-- so cmpi ..., 0 is redundant. +condIntCode' _ cond _ (CmmMachOp (MO_And _) [x, CmmLit (CmmInt imm rep)]) + (CmmLit (CmmInt 0 _)) + | not $ condUnsigned cond, + Just src2 <- makeImmediate rep False imm + = do + (src1, code) <- getSomeReg x + let code' = code `snocOL` AND r0 src1 (RIImm src2) + return (CondCode False cond code') + +condIntCode' _ cond width x (CmmLit (CmmInt y rep)) + | Just src2 <- makeImmediate rep (not $ condUnsigned cond) y + = do + let op_len = max W32 width + let extend = extendSExpr width op_len + (src1, code) <- getSomeReg (extend x) + let format = intFormat op_len + code' = code `snocOL` + (if condUnsigned cond then CMPL else CMP) format src1 (RIImm src2) + return (CondCode False cond code') + +condIntCode' _ cond width x y = do + let op_len = max W32 width + let extend = if condUnsigned cond then extendUExpr width op_len + else extendSExpr width op_len + (src1, code1) <- getSomeReg (extend x) + (src2, code2) <- getSomeReg (extend y) + let format = intFormat op_len + code' = code1 `appOL` code2 `snocOL` + (if condUnsigned cond then CMPL else CMP) format src1 (RIReg src2) + return (CondCode False cond code') + +condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode +condFltCode cond x y = do + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + let + code' = code1 `appOL` code2 `snocOL` FCMP src1 src2 + code'' = case cond of -- twiddle CR to handle unordered case + GE -> code' `snocOL` CRNOR ltbit eqbit gtbit + LE -> code' `snocOL` CRNOR gtbit eqbit ltbit + _ -> code' + where + ltbit = 0 ; eqbit = 2 ; gtbit = 1 + return (CondCode True cond code'') + + + +-- ----------------------------------------------------------------------------- +-- Generating assignments + +-- Assignments are really at the heart of the whole code generation +-- business. Almost all top-level nodes of any real importance are +-- assignments, which correspond to loads, stores, or register +-- transfers. 
If we're really lucky, some of the register transfers +-- will go away, because we can use the destination register to +-- complete the code generation for the right hand side. This only +-- fails when the right hand side is forced into a fixed register +-- (e.g. the result of a call). + +assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock +assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock + +assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock +assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock + +assignMem_IntCode pk addr src = do + (srcReg, code) <- getSomeReg src + Amode dstAddr addr_code <- case pk of + II64 -> getAmode DS addr + _ -> getAmode D addr + return $ code `appOL` addr_code `snocOL` ST pk srcReg dstAddr + +-- dst is a reg, but src could be anything +assignReg_IntCode _ reg src + = do + dflags <- getDynFlags + let dst = getRegisterReg (targetPlatform dflags) reg + r <- getRegister src + return $ case r of + Any _ code -> code dst + Fixed _ freg fcode -> fcode `snocOL` MR dst freg + + + +-- Easy, isn't it? +assignMem_FltCode = assignMem_IntCode +assignReg_FltCode = assignReg_IntCode + + + +genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock + +genJump (CmmLit (CmmLabel lbl)) regs + = return (unitOL $ JMP lbl regs) + +genJump tree gregs + = do + dflags <- getDynFlags + genJump' tree (platformToGCP (targetPlatform dflags)) gregs + +genJump' :: CmmExpr -> GenCCallPlatform -> [Reg] -> NatM InstrBlock + +genJump' tree (GCP64ELF 1) regs + = do + (target,code) <- getSomeReg tree + return (code + `snocOL` LD II64 r11 (AddrRegImm target (ImmInt 0)) + `snocOL` LD II64 toc (AddrRegImm target (ImmInt 8)) + `snocOL` MTCTR r11 + `snocOL` LD II64 r11 (AddrRegImm target (ImmInt 16)) + `snocOL` BCTR [] Nothing regs) + +genJump' tree (GCP64ELF 2) regs + = do + (target,code) <- getSomeReg tree + return (code + `snocOL` MR r12 target + `snocOL` MTCTR r12 + `snocOL` BCTR [] Nothing regs) + +genJump' tree _ regs + = do + (target,code) <- getSomeReg tree + return (code `snocOL` MTCTR target `snocOL` BCTR [] Nothing regs) + +-- ----------------------------------------------------------------------------- +-- Unconditional branches +genBranch :: BlockId -> NatM InstrBlock +genBranch = return . toOL . mkJumpInstr + + +-- ----------------------------------------------------------------------------- +-- Conditional jumps + +{- +Conditional jumps are always to local labels, so we can use branch +instructions. We peek at the arguments to decide what kind of +comparison to do. +-} + + +genCondJump + :: BlockId -- the branch target + -> CmmExpr -- the condition on which to branch + -> Maybe Bool + -> NatM InstrBlock + +genCondJump id bool prediction = do + CondCode _ cond code <- getCondCode bool + return (code `snocOL` BCC cond id prediction) + + + +-- ----------------------------------------------------------------------------- +-- Generating C calls + +-- Now the biggest nightmare---calls. Most of the nastiness is buried in +-- @get_arg@, which moves the arguments to the correct registers/stack +-- locations. Apart from that, the code is easy. 
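+
+-- (In this implementation the argument marshalling is done by the
+-- passArguments helper inside genCCall' below.)
+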
+ +genCCall :: ForeignTarget -- function to call + -> [CmmFormal] -- where to put the result + -> [CmmActual] -- arguments (of mixed type) + -> NatM InstrBlock +genCCall (PrimTarget MO_ReadBarrier) _ _ + = return $ unitOL LWSYNC +genCCall (PrimTarget MO_WriteBarrier) _ _ + = return $ unitOL LWSYNC + +genCCall (PrimTarget MO_Touch) _ _ + = return $ nilOL + +genCCall (PrimTarget (MO_Prefetch_Data _)) _ _ + = return $ nilOL + +genCCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] + = do dflags <- getDynFlags + let platform = targetPlatform dflags + fmt = intFormat width + reg_dst = getRegisterReg platform (CmmLocal dst) + (instr, n_code) <- case amop of + AMO_Add -> getSomeRegOrImm ADD True reg_dst + AMO_Sub -> case n of + CmmLit (CmmInt i _) + | Just imm <- makeImmediate width True (-i) + -> return (ADD reg_dst reg_dst (RIImm imm), nilOL) + _ + -> do + (n_reg, n_code) <- getSomeReg n + return (SUBF reg_dst n_reg reg_dst, n_code) + AMO_And -> getSomeRegOrImm AND False reg_dst + AMO_Nand -> do (n_reg, n_code) <- getSomeReg n + return (NAND reg_dst reg_dst n_reg, n_code) + AMO_Or -> getSomeRegOrImm OR False reg_dst + AMO_Xor -> getSomeRegOrImm XOR False reg_dst + Amode addr_reg addr_code <- getAmodeIndex addr + lbl_retry <- getBlockIdNat + return $ n_code `appOL` addr_code + `appOL` toOL [ HWSYNC + , BCC ALWAYS lbl_retry Nothing + + , NEWBLOCK lbl_retry + , LDR fmt reg_dst addr_reg + , instr + , STC fmt reg_dst addr_reg + , BCC NE lbl_retry (Just False) + , ISYNC + ] + where + getAmodeIndex (CmmMachOp (MO_Add _) [x, y]) + = do + (regX, codeX) <- getSomeReg x + (regY, codeY) <- getSomeReg y + return (Amode (AddrRegReg regX regY) (codeX `appOL` codeY)) + getAmodeIndex other + = do + (reg, code) <- getSomeReg other + return (Amode (AddrRegReg r0 reg) code) -- NB: r0 is 0 here! + getSomeRegOrImm op sign dst + = case n of + CmmLit (CmmInt i _) | Just imm <- makeImmediate width sign i + -> return (op dst dst (RIImm imm), nilOL) + _ + -> do + (n_reg, n_code) <- getSomeReg n + return (op dst dst (RIReg n_reg), n_code) + +genCCall (PrimTarget (MO_AtomicRead width)) [dst] [addr] + = do dflags <- getDynFlags + let platform = targetPlatform dflags + fmt = intFormat width + reg_dst = getRegisterReg platform (CmmLocal dst) + form = if widthInBits width == 64 then DS else D + Amode addr_reg addr_code <- getAmode form addr + lbl_end <- getBlockIdNat + return $ addr_code `appOL` toOL [ HWSYNC + , LD fmt reg_dst addr_reg + , CMP fmt reg_dst (RIReg reg_dst) + , BCC NE lbl_end (Just False) + , BCC ALWAYS lbl_end Nothing + -- See Note [Seemingly useless cmp and bne] + , NEWBLOCK lbl_end + , ISYNC + ] + +-- Note [Seemingly useless cmp and bne] +-- In Power ISA, Book II, Section 4.4.1, Instruction Synchronize Instruction +-- the second paragraph says that isync may complete before storage accesses +-- "associated" with a preceding instruction have been performed. The cmp +-- operation and the following bne introduce a data and control dependency +-- on the load instruction (See also Power ISA, Book II, Appendix B.2.3, Safe +-- Fetch). +-- This is also what gcc does. 
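+
+-- Concretely, for a word-sized MO_AtomicRead the code above emits roughly
+-- (register and label names here are illustrative; the exact mnemonics
+-- depend on the word size):
+--
+--          hwsync
+--          ld      rD, 0(rAddr)
+--          cmp     rD, rD
+--          bne-    Lend
+--          b       Lend
+--      Lend:
+--          isync
+--
+-- The compare always succeeds and the branch is never taken; they only
+-- introduce a data and control dependency on the load, which the isync
+-- then enforces.
+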
+ + +genCCall (PrimTarget (MO_AtomicWrite width)) [] [addr, val] = do + code <- assignMem_IntCode (intFormat width) addr val + return $ unitOL(HWSYNC) `appOL` code + +genCCall (PrimTarget (MO_Clz width)) [dst] [src] + = do dflags <- getDynFlags + let platform = targetPlatform dflags + reg_dst = getRegisterReg platform (CmmLocal dst) + if target32Bit platform && width == W64 + then do + ChildCode64 code vr_lo <- iselExpr64 src + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + lbl3 <- getBlockIdNat + let vr_hi = getHiVRegFromLo vr_lo + cntlz = toOL [ CMPL II32 vr_hi (RIImm (ImmInt 0)) + , BCC NE lbl2 Nothing + , BCC ALWAYS lbl1 Nothing + + , NEWBLOCK lbl1 + , CNTLZ II32 reg_dst vr_lo + , ADD reg_dst reg_dst (RIImm (ImmInt 32)) + , BCC ALWAYS lbl3 Nothing + + , NEWBLOCK lbl2 + , CNTLZ II32 reg_dst vr_hi + , BCC ALWAYS lbl3 Nothing + + , NEWBLOCK lbl3 + ] + return $ code `appOL` cntlz + else do + let format = if width == W64 then II64 else II32 + (s_reg, s_code) <- getSomeReg src + (pre, reg , post) <- + case width of + W64 -> return (nilOL, s_reg, nilOL) + W32 -> return (nilOL, s_reg, nilOL) + W16 -> do + reg_tmp <- getNewRegNat format + return + ( unitOL $ AND reg_tmp s_reg (RIImm (ImmInt 65535)) + , reg_tmp + , unitOL $ ADD reg_dst reg_dst (RIImm (ImmInt (-16))) + ) + W8 -> do + reg_tmp <- getNewRegNat format + return + ( unitOL $ AND reg_tmp s_reg (RIImm (ImmInt 255)) + , reg_tmp + , unitOL $ ADD reg_dst reg_dst (RIImm (ImmInt (-24))) + ) + _ -> panic "genCall: Clz wrong format" + let cntlz = unitOL (CNTLZ format reg_dst reg) + return $ s_code `appOL` pre `appOL` cntlz `appOL` post + +genCCall (PrimTarget (MO_Ctz width)) [dst] [src] + = do dflags <- getDynFlags + let platform = targetPlatform dflags + reg_dst = getRegisterReg platform (CmmLocal dst) + if target32Bit platform && width == W64 + then do + let format = II32 + ChildCode64 code vr_lo <- iselExpr64 src + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + lbl3 <- getBlockIdNat + x' <- getNewRegNat format + x'' <- getNewRegNat format + r' <- getNewRegNat format + cnttzlo <- cnttz format reg_dst vr_lo + let vr_hi = getHiVRegFromLo vr_lo + cnttz64 = toOL [ CMPL format vr_lo (RIImm (ImmInt 0)) + , BCC NE lbl2 Nothing + , BCC ALWAYS lbl1 Nothing + + , NEWBLOCK lbl1 + , ADD x' vr_hi (RIImm (ImmInt (-1))) + , ANDC x'' x' vr_hi + , CNTLZ format r' x'' + -- 32 + (32 - clz(x'')) + , SUBFC reg_dst r' (RIImm (ImmInt 64)) + , BCC ALWAYS lbl3 Nothing + + , NEWBLOCK lbl2 + ] + `appOL` cnttzlo `appOL` + toOL [ BCC ALWAYS lbl3 Nothing + + , NEWBLOCK lbl3 + ] + return $ code `appOL` cnttz64 + else do + let format = if width == W64 then II64 else II32 + (s_reg, s_code) <- getSomeReg src + (reg_ctz, pre_code) <- + case width of + W64 -> return (s_reg, nilOL) + W32 -> return (s_reg, nilOL) + W16 -> do + reg_tmp <- getNewRegNat format + return (reg_tmp, unitOL $ ORIS reg_tmp s_reg (ImmInt 1)) + W8 -> do + reg_tmp <- getNewRegNat format + return (reg_tmp, unitOL $ OR reg_tmp s_reg (RIImm (ImmInt 256))) + _ -> panic "genCall: Ctz wrong format" + ctz_code <- cnttz format reg_dst reg_ctz + return $ s_code `appOL` pre_code `appOL` ctz_code + where + -- cnttz(x) = sizeof(x) - cntlz(~x & (x - 1)) + -- see Henry S. 
Warren, Hacker's Delight, p 107 + cnttz format dst src = do + let format_bits = 8 * formatInBytes format + x' <- getNewRegNat format + x'' <- getNewRegNat format + r' <- getNewRegNat format + return $ toOL [ ADD x' src (RIImm (ImmInt (-1))) + , ANDC x'' x' src + , CNTLZ format r' x'' + , SUBFC dst r' (RIImm (ImmInt (format_bits))) + ] + +genCCall target dest_regs argsAndHints + = do dflags <- getDynFlags + let platform = targetPlatform dflags + case target of + PrimTarget (MO_S_QuotRem width) -> divOp1 platform True width + dest_regs argsAndHints + PrimTarget (MO_U_QuotRem width) -> divOp1 platform False width + dest_regs argsAndHints + PrimTarget (MO_U_QuotRem2 width) -> divOp2 platform width dest_regs + argsAndHints + PrimTarget (MO_U_Mul2 width) -> multOp2 platform width dest_regs + argsAndHints + PrimTarget (MO_Add2 _) -> add2Op platform dest_regs argsAndHints + PrimTarget (MO_AddWordC _) -> addcOp platform dest_regs argsAndHints + PrimTarget (MO_SubWordC _) -> subcOp platform dest_regs argsAndHints + PrimTarget (MO_AddIntC width) -> addSubCOp ADDO platform width + dest_regs argsAndHints + PrimTarget (MO_SubIntC width) -> addSubCOp SUBFO platform width + dest_regs argsAndHints + PrimTarget MO_F64_Fabs -> fabs platform dest_regs argsAndHints + PrimTarget MO_F32_Fabs -> fabs platform dest_regs argsAndHints + _ -> genCCall' dflags (platformToGCP platform) + target dest_regs argsAndHints + where divOp1 platform signed width [res_q, res_r] [arg_x, arg_y] + = do let reg_q = getRegisterReg platform (CmmLocal res_q) + reg_r = getRegisterReg platform (CmmLocal res_r) + remainderCode width signed reg_q arg_x arg_y + <*> pure reg_r + + divOp1 _ _ _ _ _ + = panic "genCCall: Wrong number of arguments for divOp1" + divOp2 platform width [res_q, res_r] + [arg_x_high, arg_x_low, arg_y] + = do let reg_q = getRegisterReg platform (CmmLocal res_q) + reg_r = getRegisterReg platform (CmmLocal res_r) + fmt = intFormat width + half = 4 * (formatInBytes fmt) + (xh_reg, xh_code) <- getSomeReg arg_x_high + (xl_reg, xl_code) <- getSomeReg arg_x_low + (y_reg, y_code) <- getSomeReg arg_y + s <- getNewRegNat fmt + b <- getNewRegNat fmt + v <- getNewRegNat fmt + vn1 <- getNewRegNat fmt + vn0 <- getNewRegNat fmt + un32 <- getNewRegNat fmt + tmp <- getNewRegNat fmt + un10 <- getNewRegNat fmt + un1 <- getNewRegNat fmt + un0 <- getNewRegNat fmt + q1 <- getNewRegNat fmt + rhat <- getNewRegNat fmt + tmp1 <- getNewRegNat fmt + q0 <- getNewRegNat fmt + un21 <- getNewRegNat fmt + again1 <- getBlockIdNat + no1 <- getBlockIdNat + then1 <- getBlockIdNat + endif1 <- getBlockIdNat + again2 <- getBlockIdNat + no2 <- getBlockIdNat + then2 <- getBlockIdNat + endif2 <- getBlockIdNat + return $ y_code `appOL` xl_code `appOL` xh_code `appOL` + -- see Hacker's Delight p 196 Figure 9-3 + toOL [ -- b = 2 ^ (bits_in_word / 2) + LI b (ImmInt 1) + , SL fmt b b (RIImm (ImmInt half)) + -- s = clz(y) + , CNTLZ fmt s y_reg + -- v = y << s + , SL fmt v y_reg (RIReg s) + -- vn1 = upper half of v + , SR fmt vn1 v (RIImm (ImmInt half)) + -- vn0 = lower half of v + , CLRLI fmt vn0 v half + -- un32 = (u1 << s) + -- | (u0 >> (bits_in_word - s)) + , SL fmt un32 xh_reg (RIReg s) + , SUBFC tmp s + (RIImm (ImmInt (8 * formatInBytes fmt))) + , SR fmt tmp xl_reg (RIReg tmp) + , OR un32 un32 (RIReg tmp) + -- un10 = u0 << s + , SL fmt un10 xl_reg (RIReg s) + -- un1 = upper half of un10 + , SR fmt un1 un10 (RIImm (ImmInt half)) + -- un0 = lower half of un10 + , CLRLI fmt un0 un10 half + -- q1 = un32/vn1 + , DIV fmt False q1 un32 vn1 + -- rhat = un32 - q1*vn1 + 
, MULL fmt tmp q1 (RIReg vn1) + , SUBF rhat tmp un32 + , BCC ALWAYS again1 Nothing + + , NEWBLOCK again1 + -- if (q1 >= b || q1*vn0 > b*rhat + un1) + , CMPL fmt q1 (RIReg b) + , BCC GEU then1 Nothing + , BCC ALWAYS no1 Nothing + + , NEWBLOCK no1 + , MULL fmt tmp q1 (RIReg vn0) + , SL fmt tmp1 rhat (RIImm (ImmInt half)) + , ADD tmp1 tmp1 (RIReg un1) + , CMPL fmt tmp (RIReg tmp1) + , BCC LEU endif1 Nothing + , BCC ALWAYS then1 Nothing + + , NEWBLOCK then1 + -- q1 = q1 - 1 + , ADD q1 q1 (RIImm (ImmInt (-1))) + -- rhat = rhat + vn1 + , ADD rhat rhat (RIReg vn1) + -- if (rhat < b) goto again1 + , CMPL fmt rhat (RIReg b) + , BCC LTT again1 Nothing + , BCC ALWAYS endif1 Nothing + + , NEWBLOCK endif1 + -- un21 = un32*b + un1 - q1*v + , SL fmt un21 un32 (RIImm (ImmInt half)) + , ADD un21 un21 (RIReg un1) + , MULL fmt tmp q1 (RIReg v) + , SUBF un21 tmp un21 + -- compute second quotient digit + -- q0 = un21/vn1 + , DIV fmt False q0 un21 vn1 + -- rhat = un21- q0*vn1 + , MULL fmt tmp q0 (RIReg vn1) + , SUBF rhat tmp un21 + , BCC ALWAYS again2 Nothing + + , NEWBLOCK again2 + -- if (q0>b || q0*vn0 > b*rhat + un0) + , CMPL fmt q0 (RIReg b) + , BCC GEU then2 Nothing + , BCC ALWAYS no2 Nothing + + , NEWBLOCK no2 + , MULL fmt tmp q0 (RIReg vn0) + , SL fmt tmp1 rhat (RIImm (ImmInt half)) + , ADD tmp1 tmp1 (RIReg un0) + , CMPL fmt tmp (RIReg tmp1) + , BCC LEU endif2 Nothing + , BCC ALWAYS then2 Nothing + + , NEWBLOCK then2 + -- q0 = q0 - 1 + , ADD q0 q0 (RIImm (ImmInt (-1))) + -- rhat = rhat + vn1 + , ADD rhat rhat (RIReg vn1) + -- if (rhat<b) goto again2 + , CMPL fmt rhat (RIReg b) + , BCC LTT again2 Nothing + , BCC ALWAYS endif2 Nothing + + , NEWBLOCK endif2 + -- compute remainder + -- r = (un21*b + un0 - q0*v) >> s + , SL fmt reg_r un21 (RIImm (ImmInt half)) + , ADD reg_r reg_r (RIReg un0) + , MULL fmt tmp q0 (RIReg v) + , SUBF reg_r tmp reg_r + , SR fmt reg_r reg_r (RIReg s) + -- compute quotient + -- q = q1*b + q0 + , SL fmt reg_q q1 (RIImm (ImmInt half)) + , ADD reg_q reg_q (RIReg q0) + ] + divOp2 _ _ _ _ + = panic "genCCall: Wrong number of arguments for divOp2" + multOp2 platform width [res_h, res_l] [arg_x, arg_y] + = do let reg_h = getRegisterReg platform (CmmLocal res_h) + reg_l = getRegisterReg platform (CmmLocal res_l) + fmt = intFormat width + (x_reg, x_code) <- getSomeReg arg_x + (y_reg, y_code) <- getSomeReg arg_y + return $ y_code `appOL` x_code + `appOL` toOL [ MULL fmt reg_l x_reg (RIReg y_reg) + , MULHU fmt reg_h x_reg y_reg + ] + multOp2 _ _ _ _ + = panic "genCall: Wrong number of arguments for multOp2" + add2Op platform [res_h, res_l] [arg_x, arg_y] + = do let reg_h = getRegisterReg platform (CmmLocal res_h) + reg_l = getRegisterReg platform (CmmLocal res_l) + (x_reg, x_code) <- getSomeReg arg_x + (y_reg, y_code) <- getSomeReg arg_y + return $ y_code `appOL` x_code + `appOL` toOL [ LI reg_h (ImmInt 0) + , ADDC reg_l x_reg y_reg + , ADDZE reg_h reg_h + ] + add2Op _ _ _ + = panic "genCCall: Wrong number of arguments/results for add2" + + addcOp platform [res_r, res_c] [arg_x, arg_y] + = add2Op platform [res_c {-hi-}, res_r {-lo-}] [arg_x, arg_y] + addcOp _ _ _ + = panic "genCCall: Wrong number of arguments/results for addc" + + -- PowerPC subfc sets the carry for rT = ~(rA) + rB + 1, + -- which is 0 for borrow and 1 otherwise. We need 1 and 0 + -- so xor with 1. 
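+        -- A worked example of that convention, at W32 and with made-up
+        -- operand values:
+        --   x = 2, y = 1:  subfc computes ~1 + 2 + 1 = 0x1_0000_0001,
+        --                  so CA = 1 (no borrow); addze yields 1, xor 1 -> 0
+        --   x = 1, y = 2:  subfc computes ~2 + 1 + 1 = 0xFFFF_FFFF,
+        --                  so CA = 0 (borrow);    addze yields 0, xor 1 -> 1
+        -- i.e. after the xor, the carry result is 1 exactly when the
+        -- subtraction borrowed.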
+ subcOp platform [res_r, res_c] [arg_x, arg_y] + = do let reg_r = getRegisterReg platform (CmmLocal res_r) + reg_c = getRegisterReg platform (CmmLocal res_c) + (x_reg, x_code) <- getSomeReg arg_x + (y_reg, y_code) <- getSomeReg arg_y + return $ y_code `appOL` x_code + `appOL` toOL [ LI reg_c (ImmInt 0) + , SUBFC reg_r y_reg (RIReg x_reg) + , ADDZE reg_c reg_c + , XOR reg_c reg_c (RIImm (ImmInt 1)) + ] + subcOp _ _ _ + = panic "genCCall: Wrong number of arguments/results for subc" + addSubCOp instr platform width [res_r, res_c] [arg_x, arg_y] + = do let reg_r = getRegisterReg platform (CmmLocal res_r) + reg_c = getRegisterReg platform (CmmLocal res_c) + (x_reg, x_code) <- getSomeReg arg_x + (y_reg, y_code) <- getSomeReg arg_y + return $ y_code `appOL` x_code + `appOL` toOL [ instr reg_r y_reg x_reg, + -- SUBFO argument order reversed! + MFOV (intFormat width) reg_c + ] + addSubCOp _ _ _ _ _ + = panic "genCall: Wrong number of arguments/results for addC" + fabs platform [res] [arg] + = do let res_r = getRegisterReg platform (CmmLocal res) + (arg_reg, arg_code) <- getSomeReg arg + return $ arg_code `snocOL` FABS res_r arg_reg + fabs _ _ _ + = panic "genCall: Wrong number of arguments/results for fabs" + +-- TODO: replace 'Int' by an enum such as 'PPC_64ABI' +data GenCCallPlatform = GCP32ELF | GCP64ELF !Int | GCPAIX + +platformToGCP :: Platform -> GenCCallPlatform +platformToGCP platform + = case platformOS platform of + OSAIX -> GCPAIX + _ -> case platformArch platform of + ArchPPC -> GCP32ELF + ArchPPC_64 ELF_V1 -> GCP64ELF 1 + ArchPPC_64 ELF_V2 -> GCP64ELF 2 + _ -> panic "platformToGCP: Not PowerPC" + + +genCCall' + :: DynFlags + -> GenCCallPlatform + -> ForeignTarget -- function to call + -> [CmmFormal] -- where to put the result + -> [CmmActual] -- arguments (of mixed type) + -> NatM InstrBlock + +{- + PowerPC Linux uses the System V Release 4 Calling Convention + for PowerPC. It is described in the + "System V Application Binary Interface PowerPC Processor Supplement". + + PowerPC 64 Linux uses the System V Release 4 Calling Convention for + 64-bit PowerPC. It is specified in + "64-bit PowerPC ELF Application Binary Interface Supplement 1.9" + (PPC64 ELF v1.9). + + PowerPC 64 Linux in little endian mode uses the "Power Architecture 64-Bit + ELF V2 ABI Specification -- OpenPOWER ABI for Linux Supplement" + (PPC64 ELF v2). + + AIX follows the "PowerOpen ABI: Application Binary Interface Big-Endian + 32-Bit Hardware Implementation" + + All four conventions are similar: + Parameters may be passed in general-purpose registers starting at r3, in + floating point registers starting at f1, or on the stack. + + But there are substantial differences: + * The number of registers used for parameter passing and the exact set of + nonvolatile registers differs (see MachRegs.hs). + * On AIX and 64-bit ELF, stack space is always reserved for parameters, + even if they are passed in registers. The called routine may choose to + save parameters from registers to the corresponding space on the stack. + * On AIX and 64-bit ELF, a corresponding amount of GPRs is skipped when + a floating point parameter is passed in an FPR. + * SysV insists on either passing I64 arguments on the stack, or in two GPRs, + starting with an odd-numbered GPR. It may skip a GPR to achieve this. + AIX just treats an I64 likt two separate I32s (high word first). + * I64 and FF64 arguments are 8-byte aligned on the stack for SysV, but only + 4-byte aligned like everything else on AIX. 
+ * The SysV spec claims that FF32 is represented as FF64 on the stack. GCC on + PowerPC Linux does not agree, so neither do we. + + According to all conventions, the parameter area should be part of the + caller's stack frame, allocated in the caller's prologue code (large enough + to hold the parameter lists for all called routines). The NCG already + uses the stack for register spilling, leaving 64 bytes free at the top. + If we need a larger parameter area than that, we increase the size + of the stack frame just before ccalling. +-} + + +genCCall' dflags gcp target dest_regs args + = do + (finalStack,passArgumentsCode,usedRegs) <- passArguments + (zip3 args argReps argHints) + allArgRegs + (allFPArgRegs platform) + initialStackOffset + nilOL [] + + (labelOrExpr, reduceToFF32) <- case target of + ForeignTarget (CmmLit (CmmLabel lbl)) _ -> do + uses_pic_base_implicitly + return (Left lbl, False) + ForeignTarget expr _ -> do + uses_pic_base_implicitly + return (Right expr, False) + PrimTarget mop -> outOfLineMachOp mop + + let codeBefore = move_sp_down finalStack `appOL` passArgumentsCode + codeAfter = move_sp_up finalStack `appOL` moveResult reduceToFF32 + + case labelOrExpr of + Left lbl -> do -- the linker does all the work for us + return ( codeBefore + `snocOL` BL lbl usedRegs + `appOL` maybeNOP -- some ABI require a NOP after BL + `appOL` codeAfter) + Right dyn -> do -- implement call through function pointer + (dynReg, dynCode) <- getSomeReg dyn + case gcp of + GCP64ELF 1 -> return ( dynCode + `appOL` codeBefore + `snocOL` ST spFormat toc (AddrRegImm sp (ImmInt 40)) + `snocOL` LD II64 r11 (AddrRegImm dynReg (ImmInt 0)) + `snocOL` LD II64 toc (AddrRegImm dynReg (ImmInt 8)) + `snocOL` MTCTR r11 + `snocOL` LD II64 r11 (AddrRegImm dynReg (ImmInt 16)) + `snocOL` BCTRL usedRegs + `snocOL` LD spFormat toc (AddrRegImm sp (ImmInt 40)) + `appOL` codeAfter) + GCP64ELF 2 -> return ( dynCode + `appOL` codeBefore + `snocOL` ST spFormat toc (AddrRegImm sp (ImmInt 24)) + `snocOL` MR r12 dynReg + `snocOL` MTCTR r12 + `snocOL` BCTRL usedRegs + `snocOL` LD spFormat toc (AddrRegImm sp (ImmInt 24)) + `appOL` codeAfter) + GCPAIX -> return ( dynCode + -- AIX/XCOFF follows the PowerOPEN ABI + -- which is quite similar to LinuxPPC64/ELFv1 + `appOL` codeBefore + `snocOL` ST spFormat toc (AddrRegImm sp (ImmInt 20)) + `snocOL` LD II32 r11 (AddrRegImm dynReg (ImmInt 0)) + `snocOL` LD II32 toc (AddrRegImm dynReg (ImmInt 4)) + `snocOL` MTCTR r11 + `snocOL` LD II32 r11 (AddrRegImm dynReg (ImmInt 8)) + `snocOL` BCTRL usedRegs + `snocOL` LD spFormat toc (AddrRegImm sp (ImmInt 20)) + `appOL` codeAfter) + _ -> return ( dynCode + `snocOL` MTCTR dynReg + `appOL` codeBefore + `snocOL` BCTRL usedRegs + `appOL` codeAfter) + where + platform = targetPlatform dflags + + uses_pic_base_implicitly = do + -- See Note [implicit register in PPC PIC code] + -- on why we claim to use PIC register here + when (positionIndependent dflags && target32Bit platform) $ do + _ <- getPicBaseNat $ archWordFormat True + return () + + initialStackOffset = case gcp of + GCPAIX -> 24 + GCP32ELF -> 8 + GCP64ELF 1 -> 48 + GCP64ELF 2 -> 32 + _ -> panic "genCall': unknown calling convention" + -- size of linkage area + size of arguments, in bytes + stackDelta finalStack = case gcp of + GCPAIX -> + roundTo 16 $ (24 +) $ max 32 $ sum $ + map (widthInBytes . typeWidth) argReps + GCP32ELF -> roundTo 16 finalStack + GCP64ELF 1 -> + roundTo 16 $ (48 +) $ max 64 $ sum $ + map (roundTo 8 . widthInBytes . 
typeWidth) + argReps + GCP64ELF 2 -> + roundTo 16 $ (32 +) $ max 64 $ sum $ + map (roundTo 8 . widthInBytes . typeWidth) + argReps + _ -> panic "genCall': unknown calling conv." + + argReps = map (cmmExprType dflags) args + (argHints, _) = foreignTargetHints target + + roundTo a x | x `mod` a == 0 = x + | otherwise = x + a - (x `mod` a) + + spFormat = if target32Bit platform then II32 else II64 + + -- TODO: Do not create a new stack frame if delta is too large. + move_sp_down finalStack + | delta > stackFrameHeaderSize dflags = + toOL [STU spFormat sp (AddrRegImm sp (ImmInt (-delta))), + DELTA (-delta)] + | otherwise = nilOL + where delta = stackDelta finalStack + move_sp_up finalStack + | delta > stackFrameHeaderSize dflags = + toOL [ADD sp sp (RIImm (ImmInt delta)), + DELTA 0] + | otherwise = nilOL + where delta = stackDelta finalStack + + -- A NOP instruction is required after a call (bl instruction) + -- on AIX and 64-Bit Linux. + -- If the call is to a function with a different TOC (r2) the + -- link editor replaces the NOP instruction with a load of the TOC + -- from the stack to restore the TOC. + maybeNOP = case gcp of + GCP32ELF -> nilOL + -- See Section 3.9.4 of OpenPower ABI + GCPAIX -> unitOL NOP + -- See Section 3.5.11 of PPC64 ELF v1.9 + GCP64ELF 1 -> unitOL NOP + -- See Section 2.3.6 of PPC64 ELF v2 + GCP64ELF 2 -> unitOL NOP + _ -> panic "maybeNOP: Unknown PowerPC 64-bit ABI" + + passArguments [] _ _ stackOffset accumCode accumUsed = return (stackOffset, accumCode, accumUsed) + passArguments ((arg,arg_ty,_):args) gprs fprs stackOffset + accumCode accumUsed | isWord64 arg_ty + && target32Bit (targetPlatform dflags) = + do + ChildCode64 code vr_lo <- iselExpr64 arg + let vr_hi = getHiVRegFromLo vr_lo + + case gcp of + GCPAIX -> + do let storeWord vr (gpr:_) _ = MR gpr vr + storeWord vr [] offset + = ST II32 vr (AddrRegImm sp (ImmInt offset)) + passArguments args + (drop 2 gprs) + fprs + (stackOffset+8) + (accumCode `appOL` code + `snocOL` storeWord vr_hi gprs stackOffset + `snocOL` storeWord vr_lo (drop 1 gprs) (stackOffset+4)) + ((take 2 gprs) ++ accumUsed) + GCP32ELF -> + do let stackOffset' = roundTo 8 stackOffset + stackCode = accumCode `appOL` code + `snocOL` ST II32 vr_hi (AddrRegImm sp (ImmInt stackOffset')) + `snocOL` ST II32 vr_lo (AddrRegImm sp (ImmInt (stackOffset'+4))) + regCode hireg loreg = + accumCode `appOL` code + `snocOL` MR hireg vr_hi + `snocOL` MR loreg vr_lo + + case gprs of + hireg : loreg : regs | even (length gprs) -> + passArguments args regs fprs stackOffset + (regCode hireg loreg) (hireg : loreg : accumUsed) + _skipped : hireg : loreg : regs -> + passArguments args regs fprs stackOffset + (regCode hireg loreg) (hireg : loreg : accumUsed) + _ -> -- only one or no regs left + passArguments args [] fprs (stackOffset'+8) + stackCode accumUsed + GCP64ELF _ -> panic "passArguments: 32 bit code" + + passArguments ((arg,rep,hint):args) gprs fprs stackOffset accumCode accumUsed + | reg : _ <- regs = do + register <- getRegister arg_pro + let code = case register of + Fixed _ freg fcode -> fcode `snocOL` MR reg freg + Any _ acode -> acode reg + stackOffsetRes = case gcp of + -- The PowerOpen ABI requires that we + -- reserve stack slots for register + -- parameters + GCPAIX -> stackOffset + stackBytes + -- ... the SysV ABI 32-bit doesn't. + GCP32ELF -> stackOffset + -- ... but SysV ABI 64-bit does. 
+ GCP64ELF _ -> stackOffset + stackBytes + passArguments args + (drop nGprs gprs) + (drop nFprs fprs) + stackOffsetRes + (accumCode `appOL` code) + (reg : accumUsed) + | otherwise = do + (vr, code) <- getSomeReg arg_pro + passArguments args + (drop nGprs gprs) + (drop nFprs fprs) + (stackOffset' + stackBytes) + (accumCode `appOL` code + `snocOL` ST format_pro vr stackSlot) + accumUsed + where + arg_pro + | isBitsType rep = CmmMachOp (conv_op (typeWidth rep) (wordWidth dflags)) [arg] + | otherwise = arg + format_pro + | isBitsType rep = intFormat (wordWidth dflags) + | otherwise = cmmTypeFormat rep + conv_op = case hint of + SignedHint -> MO_SS_Conv + _ -> MO_UU_Conv + + stackOffset' = case gcp of + GCPAIX -> + -- The 32bit PowerOPEN ABI is happy with + -- 32bit-alignment ... + stackOffset + GCP32ELF + -- ... the SysV ABI requires 8-byte + -- alignment for doubles. + | isFloatType rep && typeWidth rep == W64 -> + roundTo 8 stackOffset + | otherwise -> + stackOffset + GCP64ELF _ -> + -- Everything on the stack is mapped to + -- 8-byte aligned doublewords + stackOffset + stackOffset'' + | isFloatType rep && typeWidth rep == W32 = + case gcp of + -- The ELF v1 ABI Section 3.2.3 requires: + -- "Single precision floating point values + -- are mapped to the second word in a single + -- doubleword" + GCP64ELF 1 -> stackOffset' + 4 + _ -> stackOffset' + | otherwise = stackOffset' + + stackSlot = AddrRegImm sp (ImmInt stackOffset'') + (nGprs, nFprs, stackBytes, regs) + = case gcp of + GCPAIX -> + case cmmTypeFormat rep of + II8 -> (1, 0, 4, gprs) + II16 -> (1, 0, 4, gprs) + II32 -> (1, 0, 4, gprs) + -- The PowerOpen ABI requires that we skip a + -- corresponding number of GPRs when we use + -- the FPRs. + -- + -- E.g. for a `double` two GPRs are skipped, + -- whereas for a `float` one GPR is skipped + -- when parameters are assigned to + -- registers. + -- + -- The PowerOpen ABI specification can be found at + -- ftp://www.sourceware.org/pub/binutils/ppc-docs/ppc-poweropen/ + FF32 -> (1, 1, 4, fprs) + FF64 -> (2, 1, 8, fprs) + II64 -> panic "genCCall' passArguments II64" + + GCP32ELF -> + case cmmTypeFormat rep of + II8 -> (1, 0, 4, gprs) + II16 -> (1, 0, 4, gprs) + II32 -> (1, 0, 4, gprs) + -- ... the SysV ABI doesn't. + FF32 -> (0, 1, 4, fprs) + FF64 -> (0, 1, 8, fprs) + II64 -> panic "genCCall' passArguments II64" + GCP64ELF _ -> + case cmmTypeFormat rep of + II8 -> (1, 0, 8, gprs) + II16 -> (1, 0, 8, gprs) + II32 -> (1, 0, 8, gprs) + II64 -> (1, 0, 8, gprs) + -- The ELFv1 ABI requires that we skip a + -- corresponding number of GPRs when we use + -- the FPRs. 
+ FF32 -> (1, 1, 8, fprs) + FF64 -> (1, 1, 8, fprs) + + moveResult reduceToFF32 = + case dest_regs of + [] -> nilOL + [dest] + | reduceToFF32 && isFloat32 rep -> unitOL (FRSP r_dest f1) + | isFloat32 rep || isFloat64 rep -> unitOL (MR r_dest f1) + | isWord64 rep && target32Bit (targetPlatform dflags) + -> toOL [MR (getHiVRegFromLo r_dest) r3, + MR r_dest r4] + | otherwise -> unitOL (MR r_dest r3) + where rep = cmmRegType dflags (CmmLocal dest) + r_dest = getRegisterReg platform (CmmLocal dest) + _ -> panic "genCCall' moveResult: Bad dest_regs" + + outOfLineMachOp mop = + do + dflags <- getDynFlags + mopExpr <- cmmMakeDynamicReference dflags CallReference $ + mkForeignLabel functionName Nothing ForeignLabelInThisPackage IsFunction + let mopLabelOrExpr = case mopExpr of + CmmLit (CmmLabel lbl) -> Left lbl + _ -> Right mopExpr + return (mopLabelOrExpr, reduce) + where + (functionName, reduce) = case mop of + MO_F32_Exp -> (fsLit "exp", True) + MO_F32_ExpM1 -> (fsLit "expm1", True) + MO_F32_Log -> (fsLit "log", True) + MO_F32_Log1P -> (fsLit "log1p", True) + MO_F32_Sqrt -> (fsLit "sqrt", True) + MO_F32_Fabs -> unsupported + + MO_F32_Sin -> (fsLit "sin", True) + MO_F32_Cos -> (fsLit "cos", True) + MO_F32_Tan -> (fsLit "tan", True) + + MO_F32_Asin -> (fsLit "asin", True) + MO_F32_Acos -> (fsLit "acos", True) + MO_F32_Atan -> (fsLit "atan", True) + + MO_F32_Sinh -> (fsLit "sinh", True) + MO_F32_Cosh -> (fsLit "cosh", True) + MO_F32_Tanh -> (fsLit "tanh", True) + MO_F32_Pwr -> (fsLit "pow", True) + + MO_F32_Asinh -> (fsLit "asinh", True) + MO_F32_Acosh -> (fsLit "acosh", True) + MO_F32_Atanh -> (fsLit "atanh", True) + + MO_F64_Exp -> (fsLit "exp", False) + MO_F64_ExpM1 -> (fsLit "expm1", False) + MO_F64_Log -> (fsLit "log", False) + MO_F64_Log1P -> (fsLit "log1p", False) + MO_F64_Sqrt -> (fsLit "sqrt", False) + MO_F64_Fabs -> unsupported + + MO_F64_Sin -> (fsLit "sin", False) + MO_F64_Cos -> (fsLit "cos", False) + MO_F64_Tan -> (fsLit "tan", False) + + MO_F64_Asin -> (fsLit "asin", False) + MO_F64_Acos -> (fsLit "acos", False) + MO_F64_Atan -> (fsLit "atan", False) + + MO_F64_Sinh -> (fsLit "sinh", False) + MO_F64_Cosh -> (fsLit "cosh", False) + MO_F64_Tanh -> (fsLit "tanh", False) + MO_F64_Pwr -> (fsLit "pow", False) + + MO_F64_Asinh -> (fsLit "asinh", False) + MO_F64_Acosh -> (fsLit "acosh", False) + MO_F64_Atanh -> (fsLit "atanh", False) + + MO_UF_Conv w -> (fsLit $ word2FloatLabel w, False) + + MO_Memcpy _ -> (fsLit "memcpy", False) + MO_Memset _ -> (fsLit "memset", False) + MO_Memmove _ -> (fsLit "memmove", False) + MO_Memcmp _ -> (fsLit "memcmp", False) + + MO_BSwap w -> (fsLit $ bSwapLabel w, False) + MO_BRev w -> (fsLit $ bRevLabel w, False) + MO_PopCnt w -> (fsLit $ popCntLabel w, False) + MO_Pdep w -> (fsLit $ pdepLabel w, False) + MO_Pext w -> (fsLit $ pextLabel w, False) + MO_Clz _ -> unsupported + MO_Ctz _ -> unsupported + MO_AtomicRMW {} -> unsupported + MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False) + MO_AtomicRead _ -> unsupported + MO_AtomicWrite _ -> unsupported + + MO_S_Mul2 {} -> unsupported + MO_S_QuotRem {} -> unsupported + MO_U_QuotRem {} -> unsupported + MO_U_QuotRem2 {} -> unsupported + MO_Add2 {} -> unsupported + MO_AddWordC {} -> unsupported + MO_SubWordC {} -> unsupported + MO_AddIntC {} -> unsupported + MO_SubIntC {} -> unsupported + MO_U_Mul2 {} -> unsupported + MO_ReadBarrier -> unsupported + MO_WriteBarrier -> unsupported + MO_Touch -> unsupported + MO_Prefetch_Data _ -> unsupported + unsupported = panic ("outOfLineCmmOp: " ++ show mop + ++ " not supported") + +-- 
----------------------------------------------------------------------------- +-- Generating a table-branch + +genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock +genSwitch dflags expr targets + | OSAIX <- platformOS (targetPlatform dflags) + = do + (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset) + let fmt = archWordFormat $ target32Bit $ targetPlatform dflags + sha = if target32Bit $ targetPlatform dflags then 2 else 3 + tmp <- getNewRegNat fmt + lbl <- getNewLabelNat + dynRef <- cmmMakeDynamicReference dflags DataReference lbl + (tableReg,t_code) <- getSomeReg $ dynRef + let code = e_code `appOL` t_code `appOL` toOL [ + SL fmt tmp reg (RIImm (ImmInt sha)), + LD fmt tmp (AddrRegReg tableReg tmp), + MTCTR tmp, + BCTR ids (Just lbl) [] + ] + return code + + | (positionIndependent dflags) || (not $ target32Bit $ targetPlatform dflags) + = do + (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset) + let fmt = archWordFormat $ target32Bit $ targetPlatform dflags + sha = if target32Bit $ targetPlatform dflags then 2 else 3 + tmp <- getNewRegNat fmt + lbl <- getNewLabelNat + dynRef <- cmmMakeDynamicReference dflags DataReference lbl + (tableReg,t_code) <- getSomeReg $ dynRef + let code = e_code `appOL` t_code `appOL` toOL [ + SL fmt tmp reg (RIImm (ImmInt sha)), + LD fmt tmp (AddrRegReg tableReg tmp), + ADD tmp tmp (RIReg tableReg), + MTCTR tmp, + BCTR ids (Just lbl) [] + ] + return code + | otherwise + = do + (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset) + let fmt = archWordFormat $ target32Bit $ targetPlatform dflags + sha = if target32Bit $ targetPlatform dflags then 2 else 3 + tmp <- getNewRegNat fmt + lbl <- getNewLabelNat + let code = e_code `appOL` toOL [ + SL fmt tmp reg (RIImm (ImmInt sha)), + ADDIS tmp tmp (HA (ImmCLbl lbl)), + LD fmt tmp (AddrRegImm tmp (LO (ImmCLbl lbl))), + MTCTR tmp, + BCTR ids (Just lbl) [] + ] + return code + where (offset, ids) = switchTargetsToTable targets + +generateJumpTableForInstr :: DynFlags -> Instr + -> Maybe (NatCmmDecl RawCmmStatics Instr) +generateJumpTableForInstr dflags (BCTR ids (Just lbl) _) = + let jumpTable + | (positionIndependent dflags) + || (not $ target32Bit $ targetPlatform dflags) + = map jumpTableEntryRel ids + | otherwise = map (jumpTableEntry dflags) ids + where jumpTableEntryRel Nothing + = CmmStaticLit (CmmInt 0 (wordWidth dflags)) + jumpTableEntryRel (Just blockid) + = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 + (wordWidth dflags)) + where blockLabel = blockLbl blockid + in Just (CmmData (Section ReadOnlyData lbl) (RawCmmStatics lbl jumpTable)) +generateJumpTableForInstr _ _ = Nothing + +-- ----------------------------------------------------------------------------- +-- 'condIntReg' and 'condFltReg': condition codes into registers + +-- Turn those condition codes into integers now (when they appear on +-- the right hand side of an assignment). 
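+
+-- A sketch of how this is done: condReg reads the whole condition
+-- register with mfcr and uses rlwinm to rotate the relevant bit of the
+-- comparison's CR field (0 = LT, 1 = GT, 2 = EQ) into the least
+-- significant position of the destination.  Conditions without a bit of
+-- their own are obtained by first negating the complementary bit with
+-- crnor, e.g. LE is "not GT" and GE is "not LT"; that is what the
+-- (bit, do_negate) table in condReg encodes.
+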
+ + + +condReg :: NatM CondCode -> NatM Register +condReg getCond = do + CondCode _ cond cond_code <- getCond + dflags <- getDynFlags + let + code dst = cond_code + `appOL` negate_code + `appOL` toOL [ + MFCR dst, + RLWINM dst dst (bit + 1) 31 31 + ] + + negate_code | do_negate = unitOL (CRNOR bit bit bit) + | otherwise = nilOL + + (bit, do_negate) = case cond of + LTT -> (0, False) + LE -> (1, True) + EQQ -> (2, False) + GE -> (0, True) + GTT -> (1, False) + + NE -> (2, True) + + LU -> (0, False) + LEU -> (1, True) + GEU -> (0, True) + GU -> (1, False) + _ -> panic "PPC.CodeGen.codeReg: no match" + + format = archWordFormat $ target32Bit $ targetPlatform dflags + return (Any format code) + +condIntReg :: Cond -> Width -> CmmExpr -> CmmExpr -> NatM Register +condIntReg cond width x y = condReg (condIntCode cond width x y) +condFltReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register +condFltReg cond x y = condReg (condFltCode cond x y) + + + +-- ----------------------------------------------------------------------------- +-- 'trivial*Code': deal with trivial instructions + +-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode', +-- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions. +-- Only look for constants on the right hand side, because that's +-- where the generic optimizer will have put them. + +-- Similarly, for unary instructions, we don't have to worry about +-- matching an StInt as the argument, because genericOpt will already +-- have handled the constant-folding. + + + +{- +Wolfgang's PowerPC version of The Rules: + +A slightly modified version of The Rules to take advantage of the fact +that PowerPC instructions work on all registers and don't implicitly +clobber any fixed registers. + +* The only expression for which getRegister returns Fixed is (CmmReg reg). + +* If getRegister returns Any, then the code it generates may modify only: + (a) fresh temporaries + (b) the destination register + It may *not* modify global registers, unless the global + register happens to be the destination register. + It may not clobber any other registers. In fact, only ccalls clobber any + fixed registers. + Also, it may not modify the counter register (used by genCCall). + + Corollary: If a getRegister for a subexpression returns Fixed, you need + not move it to a fresh temporary before evaluating the next subexpression. + The Fixed register won't be modified. + Therefore, we don't need a counterpart for the x86's getStableReg on PPC. + +* SDM's First Rule is valid for PowerPC, too: subexpressions can depend on + the value of the destination register. 
+-} + +trivialCode + :: Width + -> Bool + -> (Reg -> Reg -> RI -> Instr) + -> CmmExpr + -> CmmExpr + -> NatM Register + +trivialCode rep signed instr x (CmmLit (CmmInt y _)) + | Just imm <- makeImmediate rep signed y + = do + (src1, code1) <- getSomeReg x + let code dst = code1 `snocOL` instr dst src1 (RIImm imm) + return (Any (intFormat rep) code) + +trivialCode rep _ instr x y = do + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + let code dst = code1 `appOL` code2 `snocOL` instr dst src1 (RIReg src2) + return (Any (intFormat rep) code) + +shiftMulCode + :: Width + -> Bool + -> (Format-> Reg -> Reg -> RI -> Instr) + -> CmmExpr + -> CmmExpr + -> NatM Register +shiftMulCode width sign instr x (CmmLit (CmmInt y _)) + | Just imm <- makeImmediate width sign y + = do + (src1, code1) <- getSomeReg x + let format = intFormat width + let ins_fmt = intFormat (max W32 width) + let code dst = code1 `snocOL` instr ins_fmt dst src1 (RIImm imm) + return (Any format code) + +shiftMulCode width _ instr x y = do + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + let format = intFormat width + let ins_fmt = intFormat (max W32 width) + let code dst = code1 `appOL` code2 + `snocOL` instr ins_fmt dst src1 (RIReg src2) + return (Any format code) + +trivialCodeNoImm' :: Format -> (Reg -> Reg -> Reg -> Instr) + -> CmmExpr -> CmmExpr -> NatM Register +trivialCodeNoImm' format instr x y = do + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + let code dst = code1 `appOL` code2 `snocOL` instr dst src1 src2 + return (Any format code) + +trivialCodeNoImm :: Format -> (Format -> Reg -> Reg -> Reg -> Instr) + -> CmmExpr -> CmmExpr -> NatM Register +trivialCodeNoImm format instr x y + = trivialCodeNoImm' format (instr format) x y + +srCode :: Width -> Bool -> (Format-> Reg -> Reg -> RI -> Instr) + -> CmmExpr -> CmmExpr -> NatM Register +srCode width sgn instr x (CmmLit (CmmInt y _)) + | Just imm <- makeImmediate width sgn y + = do + let op_len = max W32 width + extend = if sgn then extendSExpr else extendUExpr + (src1, code1) <- getSomeReg (extend width op_len x) + let code dst = code1 `snocOL` + instr (intFormat op_len) dst src1 (RIImm imm) + return (Any (intFormat width) code) + +srCode width sgn instr x y = do + let op_len = max W32 width + extend = if sgn then extendSExpr else extendUExpr + (src1, code1) <- getSomeReg (extend width op_len x) + (src2, code2) <- getSomeReg (extendUExpr width op_len y) + -- Note: Shift amount `y` is unsigned + let code dst = code1 `appOL` code2 `snocOL` + instr (intFormat op_len) dst src1 (RIReg src2) + return (Any (intFormat width) code) + +divCode :: Width -> Bool -> CmmExpr -> CmmExpr -> NatM Register +divCode width sgn x y = do + let op_len = max W32 width + extend = if sgn then extendSExpr else extendUExpr + (src1, code1) <- getSomeReg (extend width op_len x) + (src2, code2) <- getSomeReg (extend width op_len y) + let code dst = code1 `appOL` code2 `snocOL` + DIV (intFormat op_len) sgn dst src1 src2 + return (Any (intFormat width) code) + + +trivialUCode :: Format + -> (Reg -> Reg -> Instr) + -> CmmExpr + -> NatM Register +trivialUCode rep instr x = do + (src, code) <- getSomeReg x + let code' dst = code `snocOL` instr dst src + return (Any rep code') + +-- There is no "remainder" instruction on the PPC, so we have to do +-- it the hard way. 
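-- (Editorial sketch, not part of this patch.)  The DIV/MULL/SUBF sequence
-- emitted by remainderCode below derives the remainder from the quotient,
-- i.e.  x `rem` y = x - (x `quot` y) * y.  A direct Haskell model, with the
-- hypothetical name remModel:

remModel :: Int -> Int -> Int
remModel x y = x - (x `quot` y) * y
-- remModel 7 3 == 1,  remModel (-7) 3 == -1, matching Prelude's `rem`
-- (both PPC divw/divd and `quot` truncate toward zero).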
+-- The "sgn" parameter is the signedness for the division instruction + +remainderCode :: Width -> Bool -> Reg -> CmmExpr -> CmmExpr + -> NatM (Reg -> InstrBlock) +remainderCode rep sgn reg_q arg_x arg_y = do + let op_len = max W32 rep + fmt = intFormat op_len + extend = if sgn then extendSExpr else extendUExpr + (x_reg, x_code) <- getSomeReg (extend rep op_len arg_x) + (y_reg, y_code) <- getSomeReg (extend rep op_len arg_y) + return $ \reg_r -> y_code `appOL` x_code + `appOL` toOL [ DIV fmt sgn reg_q x_reg y_reg + , MULL fmt reg_r reg_q (RIReg y_reg) + , SUBF reg_r reg_r x_reg + ] + + +coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register +coerceInt2FP fromRep toRep x = do + dflags <- getDynFlags + let arch = platformArch $ targetPlatform dflags + coerceInt2FP' arch fromRep toRep x + +coerceInt2FP' :: Arch -> Width -> Width -> CmmExpr -> NatM Register +coerceInt2FP' ArchPPC fromRep toRep x = do + (src, code) <- getSomeReg x + lbl <- getNewLabelNat + itmp <- getNewRegNat II32 + ftmp <- getNewRegNat FF64 + dflags <- getDynFlags + dynRef <- cmmMakeDynamicReference dflags DataReference lbl + Amode addr addr_code <- getAmode D dynRef + let + code' dst = code `appOL` maybe_exts `appOL` toOL [ + LDATA (Section ReadOnlyData lbl) $ RawCmmStatics lbl + [CmmStaticLit (CmmInt 0x43300000 W32), + CmmStaticLit (CmmInt 0x80000000 W32)], + XORIS itmp src (ImmInt 0x8000), + ST II32 itmp (spRel dflags 3), + LIS itmp (ImmInt 0x4330), + ST II32 itmp (spRel dflags 2), + LD FF64 ftmp (spRel dflags 2) + ] `appOL` addr_code `appOL` toOL [ + LD FF64 dst addr, + FSUB FF64 dst ftmp dst + ] `appOL` maybe_frsp dst + + maybe_exts = case fromRep of + W8 -> unitOL $ EXTS II8 src src + W16 -> unitOL $ EXTS II16 src src + W32 -> nilOL + _ -> panic "PPC.CodeGen.coerceInt2FP: no match" + + maybe_frsp dst + = case toRep of + W32 -> unitOL $ FRSP dst dst + W64 -> nilOL + _ -> panic "PPC.CodeGen.coerceInt2FP: no match" + + return (Any (floatFormat toRep) code') + +-- On an ELF v1 Linux we use the compiler doubleword in the stack frame +-- this is the TOC pointer doubleword on ELF v2 Linux. The latter is only +-- set right before a call and restored right after return from the call. +-- So it is fine. 
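-- (Editorial sketch, not part of this patch.)  The ArchPPC (32-bit) case
-- above converts an integer to a Double without an fcfid instruction: it
-- stores 0x43300000 over (x XOR 0x80000000) to build the double whose value
-- is 2^52 + 2^31 + x, then subtracts the constant with bit pattern
-- 0x4330000080000000, which is exactly 2^52 + 2^31.  A host-Haskell model
-- of the bit trick, assuming the hypothetical name int32ToDouble:

import Data.Bits (xor, shiftL, (.|.))
import Data.Int (Int32)
import Data.Word (Word32, Word64)
import GHC.Float (castWord64ToDouble)

int32ToDouble :: Int32 -> Double
int32ToDouble x =
  let hi    = (0x43300000 :: Word64) `shiftL` 32
      lo    = fromIntegral (fromIntegral x `xor` (0x80000000 :: Word32))
      magic = castWord64ToDouble (hi .|. 0x80000000)   -- 2^52 + 2^31
  in  castWord64ToDouble (hi .|. lo) - magic           -- == fromIntegral x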
+coerceInt2FP' (ArchPPC_64 _) fromRep toRep x = do + (src, code) <- getSomeReg x + dflags <- getDynFlags + let + code' dst = code `appOL` maybe_exts `appOL` toOL [ + ST II64 src (spRel dflags 3), + LD FF64 dst (spRel dflags 3), + FCFID dst dst + ] `appOL` maybe_frsp dst + + maybe_exts = case fromRep of + W8 -> unitOL $ EXTS II8 src src + W16 -> unitOL $ EXTS II16 src src + W32 -> unitOL $ EXTS II32 src src + W64 -> nilOL + _ -> panic "PPC.CodeGen.coerceInt2FP: no match" + + maybe_frsp dst + = case toRep of + W32 -> unitOL $ FRSP dst dst + W64 -> nilOL + _ -> panic "PPC.CodeGen.coerceInt2FP: no match" + + return (Any (floatFormat toRep) code') + +coerceInt2FP' _ _ _ _ = panic "PPC.CodeGen.coerceInt2FP: unknown arch" + + +coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register +coerceFP2Int fromRep toRep x = do + dflags <- getDynFlags + let arch = platformArch $ targetPlatform dflags + coerceFP2Int' arch fromRep toRep x + +coerceFP2Int' :: Arch -> Width -> Width -> CmmExpr -> NatM Register +coerceFP2Int' ArchPPC _ toRep x = do + dflags <- getDynFlags + -- the reps don't really matter: F*->FF64 and II32->I* are no-ops + (src, code) <- getSomeReg x + tmp <- getNewRegNat FF64 + let + code' dst = code `appOL` toOL [ + -- convert to int in FP reg + FCTIWZ tmp src, + -- store value (64bit) from FP to stack + ST FF64 tmp (spRel dflags 2), + -- read low word of value (high word is undefined) + LD II32 dst (spRel dflags 3)] + return (Any (intFormat toRep) code') + +coerceFP2Int' (ArchPPC_64 _) _ toRep x = do + dflags <- getDynFlags + -- the reps don't really matter: F*->FF64 and II64->I* are no-ops + (src, code) <- getSomeReg x + tmp <- getNewRegNat FF64 + let + code' dst = code `appOL` toOL [ + -- convert to int in FP reg + FCTIDZ tmp src, + -- store value (64bit) from FP to compiler word on stack + ST FF64 tmp (spRel dflags 3), + LD II64 dst (spRel dflags 3)] + return (Any (intFormat toRep) code') + +coerceFP2Int' _ _ _ _ = panic "PPC.CodeGen.coerceFP2Int: unknown arch" + +-- Note [.LCTOC1 in PPC PIC code] +-- The .LCTOC1 label is defined to point 32768 bytes into the GOT table +-- to make the most of the PPC's 16-bit displacements. +-- As 16-bit signed offset is used (usually via addi/lwz instructions) +-- first element will have '-32768' offset against .LCTOC1. 
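-- (Editorial sketch, not part of this patch.)  Why the 32768-byte bias in
-- the note above helps: a signed 16-bit displacement reaches
-- [-32768 .. 32767] around the pointer, so placing .LCTOC1 32768 bytes into
-- the table makes the whole first 64KB addressable; with 4-byte entries on
-- 32-bit PPC, entry 0 sits at offset -32768.  A hypothetical helper showing
-- the arithmetic (lctoc1Offset is not a real GHC function):

lctoc1Offset :: Int -> Int
lctoc1Offset entry = entry * 4 - 32768
-- lctoc1Offset 0 == -32768,  lctoc1Offset 8192 == 0,  lctoc1Offset 16383 == 32764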
+ +-- Note [implicit register in PPC PIC code] +-- PPC generates calls by labels in assembly +-- in form of: +-- bl puts+32768@plt +-- in this form it's not seen directly (by GHC NCG) +-- that r30 (PicBaseReg) is used, +-- but r30 is a required part of PLT code setup: +-- puts+32768@plt: +-- lwz r11,-30484(r30) ; offset in .LCTOC1 +-- mtctr r11 +-- bctr diff --git a/compiler/GHC/CmmToAsm/PPC/Cond.hs b/compiler/GHC/CmmToAsm/PPC/Cond.hs new file mode 100644 index 0000000000..e8efa30064 --- /dev/null +++ b/compiler/GHC/CmmToAsm/PPC/Cond.hs @@ -0,0 +1,63 @@ +module GHC.CmmToAsm.PPC.Cond ( + Cond(..), + condNegate, + condUnsigned, + condToSigned, + condToUnsigned, +) + +where + +import GhcPrelude + +import Panic + +data Cond + = ALWAYS + | EQQ + | GE + | GEU + | GTT + | GU + | LE + | LEU + | LTT + | LU + | NE + deriving Eq + + +condNegate :: Cond -> Cond +condNegate ALWAYS = panic "condNegate: ALWAYS" +condNegate EQQ = NE +condNegate GE = LTT +condNegate GEU = LU +condNegate GTT = LE +condNegate GU = LEU +condNegate LE = GTT +condNegate LEU = GU +condNegate LTT = GE +condNegate LU = GEU +condNegate NE = EQQ + +-- Condition utils +condUnsigned :: Cond -> Bool +condUnsigned GU = True +condUnsigned LU = True +condUnsigned GEU = True +condUnsigned LEU = True +condUnsigned _ = False + +condToSigned :: Cond -> Cond +condToSigned GU = GTT +condToSigned LU = LTT +condToSigned GEU = GE +condToSigned LEU = LE +condToSigned x = x + +condToUnsigned :: Cond -> Cond +condToUnsigned GTT = GU +condToUnsigned LTT = LU +condToUnsigned GE = GEU +condToUnsigned LE = LEU +condToUnsigned x = x diff --git a/compiler/GHC/CmmToAsm/PPC/Instr.hs b/compiler/GHC/CmmToAsm/PPC/Instr.hs new file mode 100644 index 0000000000..b92a952340 --- /dev/null +++ b/compiler/GHC/CmmToAsm/PPC/Instr.hs @@ -0,0 +1,713 @@ +{-# LANGUAGE CPP #-} + +{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} + +----------------------------------------------------------------------------- +-- +-- Machine-dependent assembly language +-- +-- (c) The University of Glasgow 1993-2004 +-- +----------------------------------------------------------------------------- + +#include "HsVersions.h" + +module GHC.CmmToAsm.PPC.Instr ( + archWordFormat, + RI(..), + Instr(..), + stackFrameHeaderSize, + maxSpillSlots, + allocMoreStack, + makeFarBranches +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.PPC.Regs +import GHC.CmmToAsm.PPC.Cond +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.Format +import GHC.CmmToAsm.Reg.Target +import GHC.Platform.Reg.Class +import GHC.Platform.Reg + +import GHC.Platform.Regs +import GHC.Cmm.BlockId +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import GHC.Driver.Session +import GHC.Cmm +import GHC.Cmm.Info +import FastString +import GHC.Cmm.CLabel +import Outputable +import GHC.Platform +import UniqFM (listToUFM, lookupUFM) +import UniqSupply + +import Control.Monad (replicateM) +import Data.Maybe (fromMaybe) + +-------------------------------------------------------------------------------- +-- Format of a PPC memory address. 
+-- +archWordFormat :: Bool -> Format +archWordFormat is32Bit + | is32Bit = II32 + | otherwise = II64 + + +-- | Instruction instance for powerpc +instance Instruction Instr where + regUsageOfInstr = ppc_regUsageOfInstr + patchRegsOfInstr = ppc_patchRegsOfInstr + isJumpishInstr = ppc_isJumpishInstr + jumpDestsOfInstr = ppc_jumpDestsOfInstr + patchJumpInstr = ppc_patchJumpInstr + mkSpillInstr = ppc_mkSpillInstr + mkLoadInstr = ppc_mkLoadInstr + takeDeltaInstr = ppc_takeDeltaInstr + isMetaInstr = ppc_isMetaInstr + mkRegRegMoveInstr _ = ppc_mkRegRegMoveInstr + takeRegRegMoveInstr = ppc_takeRegRegMoveInstr + mkJumpInstr = ppc_mkJumpInstr + mkStackAllocInstr = ppc_mkStackAllocInstr + mkStackDeallocInstr = ppc_mkStackDeallocInstr + + +ppc_mkStackAllocInstr :: Platform -> Int -> [Instr] +ppc_mkStackAllocInstr platform amount + = ppc_mkStackAllocInstr' platform (-amount) + +ppc_mkStackDeallocInstr :: Platform -> Int -> [Instr] +ppc_mkStackDeallocInstr platform amount + = ppc_mkStackAllocInstr' platform amount + +ppc_mkStackAllocInstr' :: Platform -> Int -> [Instr] +ppc_mkStackAllocInstr' platform amount + | fits16Bits amount + = [ LD fmt r0 (AddrRegImm sp zero) + , STU fmt r0 (AddrRegImm sp immAmount) + ] + | otherwise + = [ LD fmt r0 (AddrRegImm sp zero) + , ADDIS tmp sp (HA immAmount) + , ADD tmp tmp (RIImm (LO immAmount)) + , STU fmt r0 (AddrRegReg sp tmp) + ] + where + fmt = intFormat $ widthFromBytes (platformWordSizeInBytes platform) + zero = ImmInt 0 + tmp = tmpReg platform + immAmount = ImmInt amount + +-- +-- See note [extra spill slots] in X86/Instr.hs +-- +allocMoreStack + :: Platform + -> Int + -> NatCmmDecl statics GHC.CmmToAsm.PPC.Instr.Instr + -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.PPC.Instr.Instr, [(BlockId,BlockId)]) + +allocMoreStack _ _ top@(CmmData _ _) = return (top,[]) +allocMoreStack platform slots (CmmProc info lbl live (ListGraph code)) = do + let + infos = mapKeys info + entries = case code of + [] -> infos + BasicBlock entry _ : _ -- first block is the entry point + | entry `elem` infos -> infos + | otherwise -> entry : infos + + uniqs <- replicateM (length entries) getUniqueM + + let + delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up + where x = slots * spillSlotSize -- sp delta + + alloc = mkStackAllocInstr platform delta + dealloc = mkStackDeallocInstr platform delta + + retargetList = (zip entries (map mkBlockId uniqs)) + + new_blockmap :: LabelMap BlockId + new_blockmap = mapFromList retargetList + + insert_stack_insns (BasicBlock id insns) + | Just new_blockid <- mapLookup id new_blockmap + = [ BasicBlock id $ alloc ++ [BCC ALWAYS new_blockid Nothing] + , BasicBlock new_blockid block' + ] + | otherwise + = [ BasicBlock id block' ] + where + block' = foldr insert_dealloc [] insns + + insert_dealloc insn r + -- BCTR might or might not be a non-local jump. For + -- "labeled-goto" we use JMP, and for "computed-goto" we + -- use MTCTR followed by BCTR. See 'PPC.CodeGen.genJump'. + = case insn of + JMP _ _ -> dealloc ++ (insn : r) + BCTR [] Nothing _ -> dealloc ++ (insn : r) + BCTR ids label rs -> BCTR (map (fmap retarget) ids) label rs : r + BCCFAR cond b p -> BCCFAR cond (retarget b) p : r + BCC cond b p -> BCC cond (retarget b) p : r + _ -> insn : r + -- BL and BCTRL are call-like instructions rather than + -- jumps, and are used only for C calls. 
+ + retarget :: BlockId -> BlockId + retarget b + = fromMaybe b (mapLookup b new_blockmap) + + new_code + = concatMap insert_stack_insns code + + -- in + return (CmmProc info lbl live (ListGraph new_code),retargetList) + + +-- ----------------------------------------------------------------------------- +-- Machine's assembly language + +-- We have a few common "instructions" (nearly all the pseudo-ops) but +-- mostly all of 'Instr' is machine-specific. + +-- Register or immediate +data RI + = RIReg Reg + | RIImm Imm + +data Instr + -- comment pseudo-op + = COMMENT FastString + + -- some static data spat out during code + -- generation. Will be extracted before + -- pretty-printing. + | LDATA Section RawCmmStatics + + -- start a new basic block. Useful during + -- codegen, removed later. Preceding + -- instruction should be a jump, as per the + -- invariants for a BasicBlock (see Cmm). + | NEWBLOCK BlockId + + -- specify current stack offset for + -- benefit of subsequent passes + | DELTA Int + + -- Loads and stores. + | LD Format Reg AddrMode -- Load format, dst, src + | LDFAR Format Reg AddrMode -- Load format, dst, src 32 bit offset + | LDR Format Reg AddrMode -- Load and reserve format, dst, src + | LA Format Reg AddrMode -- Load arithmetic format, dst, src + | ST Format Reg AddrMode -- Store format, src, dst + | STFAR Format Reg AddrMode -- Store format, src, dst 32 bit offset + | STU Format Reg AddrMode -- Store with Update format, src, dst + | STC Format Reg AddrMode -- Store conditional format, src, dst + | LIS Reg Imm -- Load Immediate Shifted dst, src + | LI Reg Imm -- Load Immediate dst, src + | MR Reg Reg -- Move Register dst, src -- also for fmr + + | CMP Format Reg RI -- format, src1, src2 + | CMPL Format Reg RI -- format, src1, src2 + + | BCC Cond BlockId (Maybe Bool) -- cond, block, hint + | BCCFAR Cond BlockId (Maybe Bool) -- cond, block, hint + -- hint: + -- Just True: branch likely taken + -- Just False: branch likely not taken + -- Nothing: no hint + | JMP CLabel [Reg] -- same as branch, + -- but with CLabel instead of block ID + -- and live global registers + | MTCTR Reg + | BCTR [Maybe BlockId] (Maybe CLabel) [Reg] + -- with list of local destinations, and + -- jump table location if necessary + | BL CLabel [Reg] -- with list of argument regs + | BCTRL [Reg] + + | ADD Reg Reg RI -- dst, src1, src2 + | ADDO Reg Reg Reg -- add and set overflow + | ADDC Reg Reg Reg -- (carrying) dst, src1, src2 + | ADDE Reg Reg Reg -- (extended) dst, src1, src2 + | ADDZE Reg Reg -- (to zero extended) dst, src + | ADDIS Reg Reg Imm -- Add Immediate Shifted dst, src1, src2 + | SUBF Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1 + | SUBFO Reg Reg Reg -- subtract from and set overflow + | SUBFC Reg Reg RI -- (carrying) dst, src1, src2 ; + -- dst = src2 - src1 + | SUBFE Reg Reg Reg -- (extended) dst, src1, src2 ; + -- dst = src2 - src1 + | MULL Format Reg Reg RI + | MULLO Format Reg Reg Reg -- multiply and set overflow + | MFOV Format Reg -- move overflow bit (1|33) to register + -- pseudo-instruction; pretty printed as + -- mfxer dst + -- extr[w|d]i dst, dst, 1, [1|33] + | MULHU Format Reg Reg Reg + | DIV Format Bool Reg Reg Reg + | AND Reg Reg RI -- dst, src1, src2 + | ANDC Reg Reg Reg -- AND with complement, dst = src1 & ~ src2 + | NAND Reg Reg Reg -- dst, src1, src2 + | OR Reg Reg RI -- dst, src1, src2 + | ORIS Reg Reg Imm -- OR Immediate Shifted dst, src1, src2 + | XOR Reg Reg RI -- dst, src1, src2 + | XORIS Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2 + + | EXTS Format Reg 
Reg + | CNTLZ Format Reg Reg + + | NEG Reg Reg + | NOT Reg Reg + + | SL Format Reg Reg RI -- shift left + | SR Format Reg Reg RI -- shift right + | SRA Format Reg Reg RI -- shift right arithmetic + + | RLWINM Reg Reg Int Int Int -- Rotate Left Word Immediate then AND with Mask + | CLRLI Format Reg Reg Int -- clear left immediate (extended mnemonic) + | CLRRI Format Reg Reg Int -- clear right immediate (extended mnemonic) + + | FADD Format Reg Reg Reg + | FSUB Format Reg Reg Reg + | FMUL Format Reg Reg Reg + | FDIV Format Reg Reg Reg + | FABS Reg Reg -- abs is the same for single and double + | FNEG Reg Reg -- negate is the same for single and double prec. + + | FCMP Reg Reg + + | FCTIWZ Reg Reg -- convert to integer word + | FCTIDZ Reg Reg -- convert to integer double word + | FCFID Reg Reg -- convert from integer double word + | FRSP Reg Reg -- reduce to single precision + -- (but destination is a FP register) + + | CRNOR Int Int Int -- condition register nor + | MFCR Reg -- move from condition register + + | MFLR Reg -- move from link register + | FETCHPC Reg -- pseudo-instruction: + -- bcl to next insn, mflr reg + | HWSYNC -- heavy weight sync + | ISYNC -- instruction synchronize + | LWSYNC -- memory barrier + | NOP -- no operation, PowerPC 64 bit + -- needs this as place holder to + -- reload TOC pointer + +-- | Get the registers that are being used by this instruction. +-- regUsage doesn't need to do any trickery for jumps and such. +-- Just state precisely the regs read and written by that insn. +-- The consequences of control flow transfers, as far as register +-- allocation goes, are taken care of by the register allocator. +-- +ppc_regUsageOfInstr :: Platform -> Instr -> RegUsage +ppc_regUsageOfInstr platform instr + = case instr of + LD _ reg addr -> usage (regAddr addr, [reg]) + LDFAR _ reg addr -> usage (regAddr addr, [reg]) + LDR _ reg addr -> usage (regAddr addr, [reg]) + LA _ reg addr -> usage (regAddr addr, [reg]) + ST _ reg addr -> usage (reg : regAddr addr, []) + STFAR _ reg addr -> usage (reg : regAddr addr, []) + STU _ reg addr -> usage (reg : regAddr addr, []) + STC _ reg addr -> usage (reg : regAddr addr, []) + LIS reg _ -> usage ([], [reg]) + LI reg _ -> usage ([], [reg]) + MR reg1 reg2 -> usage ([reg2], [reg1]) + CMP _ reg ri -> usage (reg : regRI ri,[]) + CMPL _ reg ri -> usage (reg : regRI ri,[]) + BCC _ _ _ -> noUsage + BCCFAR _ _ _ -> noUsage + JMP _ regs -> usage (regs, []) + MTCTR reg -> usage ([reg],[]) + BCTR _ _ regs -> usage (regs, []) + BL _ params -> usage (params, callClobberedRegs platform) + BCTRL params -> usage (params, callClobberedRegs platform) + + ADD reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1]) + ADDO reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1]) + ADDC reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1]) + ADDE reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1]) + ADDZE reg1 reg2 -> usage ([reg2], [reg1]) + ADDIS reg1 reg2 _ -> usage ([reg2], [reg1]) + SUBF reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1]) + SUBFO reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1]) + SUBFC reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1]) + SUBFE reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1]) + MULL _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1]) + MULLO _ reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1]) + MFOV _ reg -> usage ([], [reg]) + MULHU _ reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1]) + DIV _ _ reg1 reg2 reg3 + -> usage ([reg2,reg3], [reg1]) + + AND reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1]) + ANDC reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1]) + NAND reg1 reg2 reg3 -> 
usage ([reg2,reg3], [reg1]) + OR reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1]) + ORIS reg1 reg2 _ -> usage ([reg2], [reg1]) + XOR reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1]) + XORIS reg1 reg2 _ -> usage ([reg2], [reg1]) + EXTS _ reg1 reg2 -> usage ([reg2], [reg1]) + CNTLZ _ reg1 reg2 -> usage ([reg2], [reg1]) + NEG reg1 reg2 -> usage ([reg2], [reg1]) + NOT reg1 reg2 -> usage ([reg2], [reg1]) + SL _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1]) + SR _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1]) + SRA _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1]) + RLWINM reg1 reg2 _ _ _ -> usage ([reg2], [reg1]) + CLRLI _ reg1 reg2 _ -> usage ([reg2], [reg1]) + CLRRI _ reg1 reg2 _ -> usage ([reg2], [reg1]) + + FADD _ r1 r2 r3 -> usage ([r2,r3], [r1]) + FSUB _ r1 r2 r3 -> usage ([r2,r3], [r1]) + FMUL _ r1 r2 r3 -> usage ([r2,r3], [r1]) + FDIV _ r1 r2 r3 -> usage ([r2,r3], [r1]) + FABS r1 r2 -> usage ([r2], [r1]) + FNEG r1 r2 -> usage ([r2], [r1]) + FCMP r1 r2 -> usage ([r1,r2], []) + FCTIWZ r1 r2 -> usage ([r2], [r1]) + FCTIDZ r1 r2 -> usage ([r2], [r1]) + FCFID r1 r2 -> usage ([r2], [r1]) + FRSP r1 r2 -> usage ([r2], [r1]) + MFCR reg -> usage ([], [reg]) + MFLR reg -> usage ([], [reg]) + FETCHPC reg -> usage ([], [reg]) + _ -> noUsage + where + usage (src, dst) = RU (filter (interesting platform) src) + (filter (interesting platform) dst) + regAddr (AddrRegReg r1 r2) = [r1, r2] + regAddr (AddrRegImm r1 _) = [r1] + + regRI (RIReg r) = [r] + regRI _ = [] + +interesting :: Platform -> Reg -> Bool +interesting _ (RegVirtual _) = True +interesting platform (RegReal (RealRegSingle i)) = freeReg platform i +interesting _ (RegReal (RealRegPair{})) + = panic "PPC.Instr.interesting: no reg pairs on this arch" + + + +-- | Apply a given mapping to all the register references in this +-- instruction. 
+ppc_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr +ppc_patchRegsOfInstr instr env + = case instr of + LD fmt reg addr -> LD fmt (env reg) (fixAddr addr) + LDFAR fmt reg addr -> LDFAR fmt (env reg) (fixAddr addr) + LDR fmt reg addr -> LDR fmt (env reg) (fixAddr addr) + LA fmt reg addr -> LA fmt (env reg) (fixAddr addr) + ST fmt reg addr -> ST fmt (env reg) (fixAddr addr) + STFAR fmt reg addr -> STFAR fmt (env reg) (fixAddr addr) + STU fmt reg addr -> STU fmt (env reg) (fixAddr addr) + STC fmt reg addr -> STC fmt (env reg) (fixAddr addr) + LIS reg imm -> LIS (env reg) imm + LI reg imm -> LI (env reg) imm + MR reg1 reg2 -> MR (env reg1) (env reg2) + CMP fmt reg ri -> CMP fmt (env reg) (fixRI ri) + CMPL fmt reg ri -> CMPL fmt (env reg) (fixRI ri) + BCC cond lbl p -> BCC cond lbl p + BCCFAR cond lbl p -> BCCFAR cond lbl p + JMP l regs -> JMP l regs -- global regs will not be remapped + MTCTR reg -> MTCTR (env reg) + BCTR targets lbl rs -> BCTR targets lbl rs + BL imm argRegs -> BL imm argRegs -- argument regs + BCTRL argRegs -> BCTRL argRegs -- cannot be remapped + ADD reg1 reg2 ri -> ADD (env reg1) (env reg2) (fixRI ri) + ADDO reg1 reg2 reg3 -> ADDO (env reg1) (env reg2) (env reg3) + ADDC reg1 reg2 reg3 -> ADDC (env reg1) (env reg2) (env reg3) + ADDE reg1 reg2 reg3 -> ADDE (env reg1) (env reg2) (env reg3) + ADDZE reg1 reg2 -> ADDZE (env reg1) (env reg2) + ADDIS reg1 reg2 imm -> ADDIS (env reg1) (env reg2) imm + SUBF reg1 reg2 reg3 -> SUBF (env reg1) (env reg2) (env reg3) + SUBFO reg1 reg2 reg3 -> SUBFO (env reg1) (env reg2) (env reg3) + SUBFC reg1 reg2 ri -> SUBFC (env reg1) (env reg2) (fixRI ri) + SUBFE reg1 reg2 reg3 -> SUBFE (env reg1) (env reg2) (env reg3) + MULL fmt reg1 reg2 ri + -> MULL fmt (env reg1) (env reg2) (fixRI ri) + MULLO fmt reg1 reg2 reg3 + -> MULLO fmt (env reg1) (env reg2) (env reg3) + MFOV fmt reg -> MFOV fmt (env reg) + MULHU fmt reg1 reg2 reg3 + -> MULHU fmt (env reg1) (env reg2) (env reg3) + DIV fmt sgn reg1 reg2 reg3 + -> DIV fmt sgn (env reg1) (env reg2) (env reg3) + + AND reg1 reg2 ri -> AND (env reg1) (env reg2) (fixRI ri) + ANDC reg1 reg2 reg3 -> ANDC (env reg1) (env reg2) (env reg3) + NAND reg1 reg2 reg3 -> NAND (env reg1) (env reg2) (env reg3) + OR reg1 reg2 ri -> OR (env reg1) (env reg2) (fixRI ri) + ORIS reg1 reg2 imm -> ORIS (env reg1) (env reg2) imm + XOR reg1 reg2 ri -> XOR (env reg1) (env reg2) (fixRI ri) + XORIS reg1 reg2 imm -> XORIS (env reg1) (env reg2) imm + EXTS fmt reg1 reg2 -> EXTS fmt (env reg1) (env reg2) + CNTLZ fmt reg1 reg2 -> CNTLZ fmt (env reg1) (env reg2) + NEG reg1 reg2 -> NEG (env reg1) (env reg2) + NOT reg1 reg2 -> NOT (env reg1) (env reg2) + SL fmt reg1 reg2 ri + -> SL fmt (env reg1) (env reg2) (fixRI ri) + SR fmt reg1 reg2 ri + -> SR fmt (env reg1) (env reg2) (fixRI ri) + SRA fmt reg1 reg2 ri + -> SRA fmt (env reg1) (env reg2) (fixRI ri) + RLWINM reg1 reg2 sh mb me + -> RLWINM (env reg1) (env reg2) sh mb me + CLRLI fmt reg1 reg2 n -> CLRLI fmt (env reg1) (env reg2) n + CLRRI fmt reg1 reg2 n -> CLRRI fmt (env reg1) (env reg2) n + FADD fmt r1 r2 r3 -> FADD fmt (env r1) (env r2) (env r3) + FSUB fmt r1 r2 r3 -> FSUB fmt (env r1) (env r2) (env r3) + FMUL fmt r1 r2 r3 -> FMUL fmt (env r1) (env r2) (env r3) + FDIV fmt r1 r2 r3 -> FDIV fmt (env r1) (env r2) (env r3) + FABS r1 r2 -> FABS (env r1) (env r2) + FNEG r1 r2 -> FNEG (env r1) (env r2) + FCMP r1 r2 -> FCMP (env r1) (env r2) + FCTIWZ r1 r2 -> FCTIWZ (env r1) (env r2) + FCTIDZ r1 r2 -> FCTIDZ (env r1) (env r2) + FCFID r1 r2 -> FCFID (env r1) (env r2) + FRSP r1 r2 -> FRSP (env r1) 
(env r2) + MFCR reg -> MFCR (env reg) + MFLR reg -> MFLR (env reg) + FETCHPC reg -> FETCHPC (env reg) + _ -> instr + where + fixAddr (AddrRegReg r1 r2) = AddrRegReg (env r1) (env r2) + fixAddr (AddrRegImm r1 i) = AddrRegImm (env r1) i + + fixRI (RIReg r) = RIReg (env r) + fixRI other = other + + +-------------------------------------------------------------------------------- +-- | Checks whether this instruction is a jump/branch instruction. +-- One that can change the flow of control in a way that the +-- register allocator needs to worry about. +ppc_isJumpishInstr :: Instr -> Bool +ppc_isJumpishInstr instr + = case instr of + BCC{} -> True + BCCFAR{} -> True + BCTR{} -> True + BCTRL{} -> True + BL{} -> True + JMP{} -> True + _ -> False + + +-- | Checks whether this instruction is a jump/branch instruction. +-- One that can change the flow of control in a way that the +-- register allocator needs to worry about. +ppc_jumpDestsOfInstr :: Instr -> [BlockId] +ppc_jumpDestsOfInstr insn + = case insn of + BCC _ id _ -> [id] + BCCFAR _ id _ -> [id] + BCTR targets _ _ -> [id | Just id <- targets] + _ -> [] + + +-- | Change the destination of this jump instruction. +-- Used in the linear allocator when adding fixup blocks for join +-- points. +ppc_patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr +ppc_patchJumpInstr insn patchF + = case insn of + BCC cc id p -> BCC cc (patchF id) p + BCCFAR cc id p -> BCCFAR cc (patchF id) p + BCTR ids lbl rs -> BCTR (map (fmap patchF) ids) lbl rs + _ -> insn + + +-- ----------------------------------------------------------------------------- + +-- | An instruction to spill a register into a spill slot. +ppc_mkSpillInstr + :: DynFlags + -> Reg -- register to spill + -> Int -- current stack delta + -> Int -- spill slot to use + -> Instr + +ppc_mkSpillInstr dflags reg delta slot + = let platform = targetPlatform dflags + off = spillSlotToOffset dflags slot + arch = platformArch platform + in + let fmt = case targetClassOfReg platform reg of + RcInteger -> case arch of + ArchPPC -> II32 + _ -> II64 + RcDouble -> FF64 + _ -> panic "PPC.Instr.mkSpillInstr: no match" + instr = case makeImmediate W32 True (off-delta) of + Just _ -> ST + Nothing -> STFAR -- pseudo instruction: 32 bit offsets + + in instr fmt reg (AddrRegImm sp (ImmInt (off-delta))) + + +ppc_mkLoadInstr + :: DynFlags + -> Reg -- register to load + -> Int -- current stack delta + -> Int -- spill slot to use + -> Instr + +ppc_mkLoadInstr dflags reg delta slot + = let platform = targetPlatform dflags + off = spillSlotToOffset dflags slot + arch = platformArch platform + in + let fmt = case targetClassOfReg platform reg of + RcInteger -> case arch of + ArchPPC -> II32 + _ -> II64 + RcDouble -> FF64 + _ -> panic "PPC.Instr.mkLoadInstr: no match" + instr = case makeImmediate W32 True (off-delta) of + Just _ -> LD + Nothing -> LDFAR -- pseudo instruction: 32 bit offsets + + in instr fmt reg (AddrRegImm sp (ImmInt (off-delta))) + + +-- | The size of a minimal stackframe header including minimal +-- parameter save area. +stackFrameHeaderSize :: DynFlags -> Int +stackFrameHeaderSize dflags + = case platformOS platform of + OSAIX -> 24 + 8 * 4 + _ -> case platformArch platform of + -- header + parameter save area + ArchPPC -> 64 -- TODO: check ABI spec + ArchPPC_64 ELF_V1 -> 48 + 8 * 8 + ArchPPC_64 ELF_V2 -> 32 + 8 * 8 + _ -> panic "PPC.stackFrameHeaderSize: not defined for this OS" + where platform = targetPlatform dflags + +-- | The maximum number of bytes required to spill a register. 
PPC32 +-- has 32-bit GPRs and 64-bit FPRs, while PPC64 has 64-bit GPRs and +-- 64-bit FPRs. So the maximum is 8 regardless of platforms unlike +-- x86. Note that AltiVec's vector registers are 128-bit wide so we +-- must not use this to spill them. +spillSlotSize :: Int +spillSlotSize = 8 + +-- | The number of spill slots available without allocating more. +maxSpillSlots :: DynFlags -> Int +maxSpillSlots dflags + = ((rESERVED_C_STACK_BYTES dflags - stackFrameHeaderSize dflags) + `div` spillSlotSize) - 1 +-- = 0 -- useful for testing allocMoreStack + +-- | The number of bytes that the stack pointer should be aligned +-- to. This is 16 both on PPC32 and PPC64 ELF (see ELF processor +-- specific supplements). +stackAlign :: Int +stackAlign = 16 + +-- | Convert a spill slot number to a *byte* offset, with no sign. +spillSlotToOffset :: DynFlags -> Int -> Int +spillSlotToOffset dflags slot + = stackFrameHeaderSize dflags + spillSlotSize * slot + + +-------------------------------------------------------------------------------- +-- | See if this instruction is telling us the current C stack delta +ppc_takeDeltaInstr + :: Instr + -> Maybe Int + +ppc_takeDeltaInstr instr + = case instr of + DELTA i -> Just i + _ -> Nothing + + +ppc_isMetaInstr + :: Instr + -> Bool + +ppc_isMetaInstr instr + = case instr of + COMMENT{} -> True + LDATA{} -> True + NEWBLOCK{} -> True + DELTA{} -> True + _ -> False + + +-- | Copy the value in a register to another one. +-- Must work for all register classes. +ppc_mkRegRegMoveInstr + :: Reg + -> Reg + -> Instr + +ppc_mkRegRegMoveInstr src dst + = MR dst src + + +-- | Make an unconditional jump instruction. +ppc_mkJumpInstr + :: BlockId + -> [Instr] + +ppc_mkJumpInstr id + = [BCC ALWAYS id Nothing] + + +-- | Take the source and destination from this reg -> reg move instruction +-- or Nothing if it's not one +ppc_takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg) +ppc_takeRegRegMoveInstr (MR dst src) = Just (src,dst) +ppc_takeRegRegMoveInstr _ = Nothing + +-- ----------------------------------------------------------------------------- +-- Making far branches + +-- Conditional branches on PowerPC are limited to +-32KB; if our Procs get too +-- big, we have to work around this limitation. + +makeFarBranches + :: LabelMap RawCmmStatics + -> [NatBasicBlock Instr] + -> [NatBasicBlock Instr] +makeFarBranches info_env blocks + | last blockAddresses < nearLimit = blocks + | otherwise = zipWith handleBlock blockAddresses blocks + where + blockAddresses = scanl (+) 0 $ map blockLen blocks + blockLen (BasicBlock _ instrs) = length instrs + + handleBlock addr (BasicBlock id instrs) + = BasicBlock id (zipWith makeFar [addr..] 
instrs) + + makeFar _ (BCC ALWAYS tgt _) = BCC ALWAYS tgt Nothing + makeFar addr (BCC cond tgt p) + | abs (addr - targetAddr) >= nearLimit + = BCCFAR cond tgt p + | otherwise + = BCC cond tgt p + where Just targetAddr = lookupUFM blockAddressMap tgt + makeFar _ other = other + + -- 8192 instructions are allowed; let's keep some distance, as + -- we have a few pseudo-insns that are pretty-printed as + -- multiple instructions, and it's just not worth the effort + -- to calculate things exactly + nearLimit = 7000 - mapSize info_env * maxRetInfoTableSizeW + + blockAddressMap = listToUFM $ zip (map blockId blocks) blockAddresses diff --git a/compiler/GHC/CmmToAsm/PPC/Ppr.hs b/compiler/GHC/CmmToAsm/PPC/Ppr.hs new file mode 100644 index 0000000000..550bd618ef --- /dev/null +++ b/compiler/GHC/CmmToAsm/PPC/Ppr.hs @@ -0,0 +1,994 @@ +----------------------------------------------------------------------------- +-- +-- Pretty-printing assembly language +-- +-- (c) The University of Glasgow 1993-2005 +-- +----------------------------------------------------------------------------- + +{-# OPTIONS_GHC -fno-warn-orphans #-} +module GHC.CmmToAsm.PPC.Ppr (pprNatCmmDecl) where + +import GhcPrelude + +import GHC.CmmToAsm.PPC.Regs +import GHC.CmmToAsm.PPC.Instr +import GHC.CmmToAsm.PPC.Cond +import GHC.CmmToAsm.Ppr +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.Format +import GHC.Platform.Reg +import GHC.Platform.Reg.Class +import GHC.CmmToAsm.Reg.Target + +import GHC.Cmm hiding (topInfoTable) +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label + +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import GHC.Cmm.Ppr.Expr () -- For Outputable instances + +import Unique ( pprUniqueAlways, getUnique ) +import GHC.Platform +import FastString +import Outputable +import GHC.Driver.Session + +import Data.Word +import Data.Int +import Data.Bits + +-- ----------------------------------------------------------------------------- +-- Printing this stuff out + +pprNatCmmDecl :: NatCmmDecl RawCmmStatics Instr -> SDoc +pprNatCmmDecl (CmmData section dats) = + pprSectionAlign section $$ pprDatas dats + +pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) = + case topInfoTable proc of + Nothing -> + sdocWithPlatform $ \platform -> + -- special case for code without info table: + pprSectionAlign (Section Text lbl) $$ + (case platformArch platform of + ArchPPC_64 ELF_V1 -> pprFunctionDescriptor lbl + ArchPPC_64 ELF_V2 -> pprFunctionPrologue lbl + _ -> pprLabel lbl) $$ -- blocks guaranteed not null, + -- so label needed + vcat (map (pprBasicBlock top_info) blocks) + + Just (RawCmmStatics info_lbl _) -> + sdocWithPlatform $ \platform -> + pprSectionAlign (Section Text info_lbl) $$ + (if platformHasSubsectionsViaSymbols platform + then ppr (mkDeadStripPreventer info_lbl) <> char ':' + else empty) $$ + vcat (map (pprBasicBlock top_info) blocks) $$ + -- above: Even the first block gets a label, because with branch-chain + -- elimination, it might be the target of a goto. + (if platformHasSubsectionsViaSymbols platform + then + -- See Note [Subsections Via Symbols] in X86/Ppr.hs + text "\t.long " + <+> ppr info_lbl + <+> char '-' + <+> ppr (mkDeadStripPreventer info_lbl) + else empty) + +pprFunctionDescriptor :: CLabel -> SDoc +pprFunctionDescriptor lab = pprGloblDecl lab + $$ text "\t.section \".opd\", \"aw\"" + $$ text "\t.align 3" + $$ ppr lab <> char ':' + $$ text "\t.quad ." 
+ <> ppr lab + <> text ",.TOC.@tocbase,0" + $$ text "\t.previous" + $$ text "\t.type" + <+> ppr lab + <> text ", @function" + $$ char '.' <> ppr lab <> char ':' + +pprFunctionPrologue :: CLabel ->SDoc +pprFunctionPrologue lab = pprGloblDecl lab + $$ text ".type " + <> ppr lab + <> text ", @function" + $$ ppr lab <> char ':' + $$ text "0:\taddis\t" <> pprReg toc + <> text ",12,.TOC.-0b@ha" + $$ text "\taddi\t" <> pprReg toc + <> char ',' <> pprReg toc <> text ",.TOC.-0b@l" + $$ text "\t.localentry\t" <> ppr lab + <> text ",.-" <> ppr lab + +pprBasicBlock :: LabelMap RawCmmStatics -> NatBasicBlock Instr -> SDoc +pprBasicBlock info_env (BasicBlock blockid instrs) + = maybe_infotable $$ + pprLabel (blockLbl blockid) $$ + vcat (map pprInstr instrs) + where + maybe_infotable = case mapLookup blockid info_env of + Nothing -> empty + Just (RawCmmStatics info_lbl info) -> + pprAlignForSection Text $$ + vcat (map pprData info) $$ + pprLabel info_lbl + + + +pprDatas :: RawCmmStatics -> SDoc +-- See note [emit-time elimination of static indirections] in CLabel. +pprDatas (RawCmmStatics alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _]) + | lbl == mkIndStaticInfoLabel + , let labelInd (CmmLabelOff l _) = Just l + labelInd (CmmLabel l) = Just l + labelInd _ = Nothing + , Just ind' <- labelInd ind + , alias `mayRedirectTo` ind' + = pprGloblDecl alias + $$ text ".equiv" <+> ppr alias <> comma <> ppr (CmmLabel ind') +pprDatas (RawCmmStatics lbl dats) = vcat (pprLabel lbl : map pprData dats) + +pprData :: CmmStatic -> SDoc +pprData (CmmString str) = pprBytes str +pprData (CmmUninitialised bytes) = text ".space " <> int bytes +pprData (CmmStaticLit lit) = pprDataItem lit + +pprGloblDecl :: CLabel -> SDoc +pprGloblDecl lbl + | not (externallyVisibleCLabel lbl) = empty + | otherwise = text ".globl " <> ppr lbl + +pprTypeAndSizeDecl :: CLabel -> SDoc +pprTypeAndSizeDecl lbl + = sdocWithPlatform $ \platform -> + if platformOS platform == OSLinux && externallyVisibleCLabel lbl + then text ".type " <> + ppr lbl <> text ", @object" + else empty + +pprLabel :: CLabel -> SDoc +pprLabel lbl = pprGloblDecl lbl + $$ pprTypeAndSizeDecl lbl + $$ (ppr lbl <> char ':') + +-- ----------------------------------------------------------------------------- +-- pprInstr: print an 'Instr' + +instance Outputable Instr where + ppr instr = pprInstr instr + + +pprReg :: Reg -> SDoc + +pprReg r + = case r of + RegReal (RealRegSingle i) -> ppr_reg_no i + RegReal (RealRegPair{}) -> panic "PPC.pprReg: no reg pairs on this arch" + RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u + RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u + RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u + RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u + + where + ppr_reg_no :: Int -> SDoc + ppr_reg_no i + | i <= 31 = int i -- GPRs + | i <= 63 = int (i-32) -- FPRs + | otherwise = text "very naughty powerpc register" + + + +pprFormat :: Format -> SDoc +pprFormat x + = ptext (case x of + II8 -> sLit "b" + II16 -> sLit "h" + II32 -> sLit "w" + II64 -> sLit "d" + FF32 -> sLit "fs" + FF64 -> sLit "fd") + + +pprCond :: Cond -> SDoc +pprCond c + = ptext (case c of { + ALWAYS -> sLit ""; + EQQ -> sLit "eq"; NE -> sLit "ne"; + LTT -> sLit "lt"; GE -> sLit "ge"; + GTT -> sLit "gt"; LE -> sLit "le"; + LU -> sLit "lt"; GEU -> sLit "ge"; + GU -> sLit "gt"; LEU -> sLit "le"; }) + + +pprImm :: Imm -> SDoc + +pprImm (ImmInt i) = int i +pprImm (ImmInteger i) = integer i +pprImm (ImmCLbl l) = ppr l +pprImm 
(ImmIndex l i) = ppr l <> char '+' <> int i +pprImm (ImmLit s) = s + +pprImm (ImmFloat _) = text "naughty float immediate" +pprImm (ImmDouble _) = text "naughty double immediate" + +pprImm (ImmConstantSum a b) = pprImm a <> char '+' <> pprImm b +pprImm (ImmConstantDiff a b) = pprImm a <> char '-' + <> lparen <> pprImm b <> rparen + +pprImm (LO (ImmInt i)) = pprImm (LO (ImmInteger (toInteger i))) +pprImm (LO (ImmInteger i)) = pprImm (ImmInteger (toInteger lo16)) + where + lo16 = fromInteger (i .&. 0xffff) :: Int16 + +pprImm (LO i) + = pprImm i <> text "@l" + +pprImm (HI i) + = pprImm i <> text "@h" + +pprImm (HA (ImmInt i)) = pprImm (HA (ImmInteger (toInteger i))) +pprImm (HA (ImmInteger i)) = pprImm (ImmInteger ha16) + where + ha16 = if lo16 >= 0x8000 then hi16+1 else hi16 + hi16 = (i `shiftR` 16) + lo16 = i .&. 0xffff + +pprImm (HA i) + = pprImm i <> text "@ha" + +pprImm (HIGHERA i) + = pprImm i <> text "@highera" + +pprImm (HIGHESTA i) + = pprImm i <> text "@highesta" + + +pprAddr :: AddrMode -> SDoc +pprAddr (AddrRegReg r1 r2) + = pprReg r1 <> char ',' <+> pprReg r2 +pprAddr (AddrRegImm r1 (ImmInt i)) + = hcat [ int i, char '(', pprReg r1, char ')' ] +pprAddr (AddrRegImm r1 (ImmInteger i)) + = hcat [ integer i, char '(', pprReg r1, char ')' ] +pprAddr (AddrRegImm r1 imm) + = hcat [ pprImm imm, char '(', pprReg r1, char ')' ] + + +pprSectionAlign :: Section -> SDoc +pprSectionAlign sec@(Section seg _) = + sdocWithPlatform $ \platform -> + pprSectionHeader platform sec $$ + pprAlignForSection seg + +-- | Print appropriate alignment for the given section type. +pprAlignForSection :: SectionType -> SDoc +pprAlignForSection seg = + sdocWithPlatform $ \platform -> + let ppc64 = not $ target32Bit platform + in ptext $ case seg of + Text -> sLit ".align 2" + Data + | ppc64 -> sLit ".align 3" + | otherwise -> sLit ".align 2" + ReadOnlyData + | ppc64 -> sLit ".align 3" + | otherwise -> sLit ".align 2" + RelocatableReadOnlyData + | ppc64 -> sLit ".align 3" + | otherwise -> sLit ".align 2" + UninitialisedData + | ppc64 -> sLit ".align 3" + | otherwise -> sLit ".align 2" + ReadOnlyData16 -> sLit ".align 4" + -- TODO: This is copied from the ReadOnlyData case, but it can likely be + -- made more efficient. 
+ CString + | ppc64 -> sLit ".align 3" + | otherwise -> sLit ".align 2" + OtherSection _ -> panic "PprMach.pprSectionAlign: unknown section" + +pprDataItem :: CmmLit -> SDoc +pprDataItem lit + = sdocWithDynFlags $ \dflags -> + vcat (ppr_item (cmmTypeFormat $ cmmLitType dflags lit) lit dflags) + where + imm = litToImm lit + archPPC_64 dflags = not $ target32Bit $ targetPlatform dflags + + ppr_item II8 _ _ = [text "\t.byte\t" <> pprImm imm] + + ppr_item II32 _ _ = [text "\t.long\t" <> pprImm imm] + + ppr_item II64 _ dflags + | archPPC_64 dflags = [text "\t.quad\t" <> pprImm imm] + + + ppr_item FF32 (CmmFloat r _) _ + = let bs = floatToBytes (fromRational r) + in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs + + ppr_item FF64 (CmmFloat r _) _ + = let bs = doubleToBytes (fromRational r) + in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs + + ppr_item II16 _ _ = [text "\t.short\t" <> pprImm imm] + + ppr_item II64 (CmmInt x _) dflags + | not(archPPC_64 dflags) = + [text "\t.long\t" + <> int (fromIntegral + (fromIntegral (x `shiftR` 32) :: Word32)), + text "\t.long\t" + <> int (fromIntegral (fromIntegral x :: Word32))] + + ppr_item _ _ _ + = panic "PPC.Ppr.pprDataItem: no match" + + +pprInstr :: Instr -> SDoc + +pprInstr (COMMENT _) = empty -- nuke 'em +{- +pprInstr (COMMENT s) = + if platformOS platform == OSLinux + then text "# " <> ftext s + else text "; " <> ftext s +-} +pprInstr (DELTA d) + = pprInstr (COMMENT (mkFastString ("\tdelta = " ++ show d))) + +pprInstr (NEWBLOCK _) + = panic "PprMach.pprInstr: NEWBLOCK" + +pprInstr (LDATA _ _) + = panic "PprMach.pprInstr: LDATA" + +{- +pprInstr (SPILL reg slot) + = hcat [ + text "\tSPILL", + char '\t', + pprReg reg, + comma, + text "SLOT" <> parens (int slot)] + +pprInstr (RELOAD slot reg) + = hcat [ + text "\tRELOAD", + char '\t', + text "SLOT" <> parens (int slot), + comma, + pprReg reg] +-} + +pprInstr (LD fmt reg addr) = hcat [ + char '\t', + text "l", + ptext (case fmt of + II8 -> sLit "bz" + II16 -> sLit "hz" + II32 -> sLit "wz" + II64 -> sLit "d" + FF32 -> sLit "fs" + FF64 -> sLit "fd" + ), + case addr of AddrRegImm _ _ -> empty + AddrRegReg _ _ -> char 'x', + char '\t', + pprReg reg, + text ", ", + pprAddr addr + ] + +pprInstr (LDFAR fmt reg (AddrRegImm source off)) = + sdocWithPlatform $ \platform -> vcat [ + pprInstr (ADDIS (tmpReg platform) source (HA off)), + pprInstr (LD fmt reg (AddrRegImm (tmpReg platform) (LO off))) + ] +pprInstr (LDFAR _ _ _) = + panic "PPC.Ppr.pprInstr LDFAR: no match" + +pprInstr (LDR fmt reg1 addr) = hcat [ + text "\tl", + case fmt of + II32 -> char 'w' + II64 -> char 'd' + _ -> panic "PPC.Ppr.Instr LDR: no match", + text "arx\t", + pprReg reg1, + text ", ", + pprAddr addr + ] + +pprInstr (LA fmt reg addr) = hcat [ + char '\t', + text "l", + ptext (case fmt of + II8 -> sLit "ba" + II16 -> sLit "ha" + II32 -> sLit "wa" + II64 -> sLit "d" + FF32 -> sLit "fs" + FF64 -> sLit "fd" + ), + case addr of AddrRegImm _ _ -> empty + AddrRegReg _ _ -> char 'x', + char '\t', + pprReg reg, + text ", ", + pprAddr addr + ] +pprInstr (ST fmt reg addr) = hcat [ + char '\t', + text "st", + pprFormat fmt, + case addr of AddrRegImm _ _ -> empty + AddrRegReg _ _ -> char 'x', + char '\t', + pprReg reg, + text ", ", + pprAddr addr + ] +pprInstr (STFAR fmt reg (AddrRegImm source off)) = + sdocWithPlatform $ \platform -> vcat [ + pprInstr (ADDIS (tmpReg platform) source (HA off)), + pprInstr (ST fmt reg (AddrRegImm (tmpReg platform) (LO off))) + ] +pprInstr (STFAR _ _ _) = + panic "PPC.Ppr.pprInstr STFAR: no match" +pprInstr 
(STU fmt reg addr) = hcat [ + char '\t', + text "st", + pprFormat fmt, + char 'u', + case addr of AddrRegImm _ _ -> empty + AddrRegReg _ _ -> char 'x', + char '\t', + pprReg reg, + text ", ", + pprAddr addr + ] +pprInstr (STC fmt reg1 addr) = hcat [ + text "\tst", + case fmt of + II32 -> char 'w' + II64 -> char 'd' + _ -> panic "PPC.Ppr.Instr STC: no match", + text "cx.\t", + pprReg reg1, + text ", ", + pprAddr addr + ] +pprInstr (LIS reg imm) = hcat [ + char '\t', + text "lis", + char '\t', + pprReg reg, + text ", ", + pprImm imm + ] +pprInstr (LI reg imm) = hcat [ + char '\t', + text "li", + char '\t', + pprReg reg, + text ", ", + pprImm imm + ] +pprInstr (MR reg1 reg2) + | reg1 == reg2 = empty + | otherwise = hcat [ + char '\t', + sdocWithPlatform $ \platform -> + case targetClassOfReg platform reg1 of + RcInteger -> text "mr" + _ -> text "fmr", + char '\t', + pprReg reg1, + text ", ", + pprReg reg2 + ] +pprInstr (CMP fmt reg ri) = hcat [ + char '\t', + op, + char '\t', + pprReg reg, + text ", ", + pprRI ri + ] + where + op = hcat [ + text "cmp", + pprFormat fmt, + case ri of + RIReg _ -> empty + RIImm _ -> char 'i' + ] +pprInstr (CMPL fmt reg ri) = hcat [ + char '\t', + op, + char '\t', + pprReg reg, + text ", ", + pprRI ri + ] + where + op = hcat [ + text "cmpl", + pprFormat fmt, + case ri of + RIReg _ -> empty + RIImm _ -> char 'i' + ] +pprInstr (BCC cond blockid prediction) = hcat [ + char '\t', + text "b", + pprCond cond, + pprPrediction prediction, + char '\t', + ppr lbl + ] + where lbl = mkLocalBlockLabel (getUnique blockid) + pprPrediction p = case p of + Nothing -> empty + Just True -> char '+' + Just False -> char '-' + +pprInstr (BCCFAR cond blockid prediction) = vcat [ + hcat [ + text "\tb", + pprCond (condNegate cond), + neg_prediction, + text "\t$+8" + ], + hcat [ + text "\tb\t", + ppr lbl + ] + ] + where lbl = mkLocalBlockLabel (getUnique blockid) + neg_prediction = case prediction of + Nothing -> empty + Just True -> char '-' + Just False -> char '+' + +pprInstr (JMP lbl _) + -- We never jump to ForeignLabels; if we ever do, c.f. handling for "BL" + | isForeignLabel lbl = panic "PPC.Ppr.pprInstr: JMP to ForeignLabel" + | otherwise = + hcat [ -- an alias for b that takes a CLabel + char '\t', + text "b", + char '\t', + ppr lbl + ] + +pprInstr (MTCTR reg) = hcat [ + char '\t', + text "mtctr", + char '\t', + pprReg reg + ] +pprInstr (BCTR _ _ _) = hcat [ + char '\t', + text "bctr" + ] +pprInstr (BL lbl _) = do + sdocWithPlatform $ \platform -> case platformOS platform of + OSAIX -> + -- On AIX, "printf" denotes a function-descriptor (for use + -- by function pointers), whereas the actual entry-code + -- address is denoted by the dot-prefixed ".printf" label. + -- Moreover, the PPC NCG only ever emits a BL instruction + -- for calling C ABI functions. Most of the time these calls + -- originate from FFI imports and have a 'ForeignLabel', + -- but when profiling the codegen inserts calls via + -- 'emitRtsCallGen' which are 'CmmLabel's even though + -- they'd technically be more like 'ForeignLabel's. 
+ hcat [ + text "\tbl\t.", + ppr lbl + ] + _ -> + hcat [ + text "\tbl\t", + ppr lbl + ] +pprInstr (BCTRL _) = hcat [ + char '\t', + text "bctrl" + ] +pprInstr (ADD reg1 reg2 ri) = pprLogic (sLit "add") reg1 reg2 ri +pprInstr (ADDIS reg1 reg2 imm) = hcat [ + char '\t', + text "addis", + char '\t', + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprImm imm + ] + +pprInstr (ADDO reg1 reg2 reg3) = pprLogic (sLit "addo") reg1 reg2 (RIReg reg3) +pprInstr (ADDC reg1 reg2 reg3) = pprLogic (sLit "addc") reg1 reg2 (RIReg reg3) +pprInstr (ADDE reg1 reg2 reg3) = pprLogic (sLit "adde") reg1 reg2 (RIReg reg3) +pprInstr (ADDZE reg1 reg2) = pprUnary (sLit "addze") reg1 reg2 +pprInstr (SUBF reg1 reg2 reg3) = pprLogic (sLit "subf") reg1 reg2 (RIReg reg3) +pprInstr (SUBFO reg1 reg2 reg3) = pprLogic (sLit "subfo") reg1 reg2 (RIReg reg3) +pprInstr (SUBFC reg1 reg2 ri) = hcat [ + char '\t', + text "subf", + case ri of + RIReg _ -> empty + RIImm _ -> char 'i', + text "c\t", + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprRI ri + ] +pprInstr (SUBFE reg1 reg2 reg3) = pprLogic (sLit "subfe") reg1 reg2 (RIReg reg3) +pprInstr (MULL fmt reg1 reg2 ri) = pprMul fmt reg1 reg2 ri +pprInstr (MULLO fmt reg1 reg2 reg3) = hcat [ + char '\t', + text "mull", + case fmt of + II32 -> char 'w' + II64 -> char 'd' + _ -> panic "PPC: illegal format", + text "o\t", + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprReg reg3 + ] +pprInstr (MFOV fmt reg) = vcat [ + hcat [ + char '\t', + text "mfxer", + char '\t', + pprReg reg + ], + hcat [ + char '\t', + text "extr", + case fmt of + II32 -> char 'w' + II64 -> char 'd' + _ -> panic "PPC: illegal format", + text "i\t", + pprReg reg, + text ", ", + pprReg reg, + text ", 1, ", + case fmt of + II32 -> text "1" + II64 -> text "33" + _ -> panic "PPC: illegal format" + ] + ] + +pprInstr (MULHU fmt reg1 reg2 reg3) = hcat [ + char '\t', + text "mulh", + case fmt of + II32 -> char 'w' + II64 -> char 'd' + _ -> panic "PPC: illegal format", + text "u\t", + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprReg reg3 + ] + +pprInstr (DIV fmt sgn reg1 reg2 reg3) = pprDiv fmt sgn reg1 reg2 reg3 + + -- for some reason, "andi" doesn't exist. + -- we'll use "andi." instead. 
+pprInstr (AND reg1 reg2 (RIImm imm)) = hcat [ + char '\t', + text "andi.", + char '\t', + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprImm imm + ] +pprInstr (AND reg1 reg2 ri) = pprLogic (sLit "and") reg1 reg2 ri +pprInstr (ANDC reg1 reg2 reg3) = pprLogic (sLit "andc") reg1 reg2 (RIReg reg3) +pprInstr (NAND reg1 reg2 reg3) = pprLogic (sLit "nand") reg1 reg2 (RIReg reg3) + +pprInstr (OR reg1 reg2 ri) = pprLogic (sLit "or") reg1 reg2 ri +pprInstr (XOR reg1 reg2 ri) = pprLogic (sLit "xor") reg1 reg2 ri + +pprInstr (ORIS reg1 reg2 imm) = hcat [ + char '\t', + text "oris", + char '\t', + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprImm imm + ] + +pprInstr (XORIS reg1 reg2 imm) = hcat [ + char '\t', + text "xoris", + char '\t', + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprImm imm + ] + +pprInstr (EXTS fmt reg1 reg2) = hcat [ + char '\t', + text "exts", + pprFormat fmt, + char '\t', + pprReg reg1, + text ", ", + pprReg reg2 + ] +pprInstr (CNTLZ fmt reg1 reg2) = hcat [ + char '\t', + text "cntlz", + case fmt of + II32 -> char 'w' + II64 -> char 'd' + _ -> panic "PPC: illegal format", + char '\t', + pprReg reg1, + text ", ", + pprReg reg2 + ] + +pprInstr (NEG reg1 reg2) = pprUnary (sLit "neg") reg1 reg2 +pprInstr (NOT reg1 reg2) = pprUnary (sLit "not") reg1 reg2 + +pprInstr (SR II32 reg1 reg2 (RIImm (ImmInt i))) | i < 0 || i > 31 = + -- Handle the case where we are asked to shift a 32 bit register by + -- less than zero or more than 31 bits. We convert this into a clear + -- of the destination register. + -- Fixes ticket https://gitlab.haskell.org/ghc/ghc/issues/5900 + pprInstr (XOR reg1 reg2 (RIReg reg2)) + +pprInstr (SL II32 reg1 reg2 (RIImm (ImmInt i))) | i < 0 || i > 31 = + -- As above for SR, but for left shifts. + -- Fixes ticket https://gitlab.haskell.org/ghc/ghc/issues/10870 + pprInstr (XOR reg1 reg2 (RIReg reg2)) + +pprInstr (SRA II32 reg1 reg2 (RIImm (ImmInt i))) | i > 31 = + -- PT: I don't know what to do for negative shift amounts: + -- For now just panic. + -- + -- For shift amounts greater than 31 set all bit to the + -- value of the sign bit, this also what sraw does. 
+ pprInstr (SRA II32 reg1 reg2 (RIImm (ImmInt 31))) + +pprInstr (SL fmt reg1 reg2 ri) = + let op = case fmt of + II32 -> "slw" + II64 -> "sld" + _ -> panic "PPC.Ppr.pprInstr: shift illegal size" + in pprLogic (sLit op) reg1 reg2 (limitShiftRI fmt ri) + +pprInstr (SR fmt reg1 reg2 ri) = + let op = case fmt of + II32 -> "srw" + II64 -> "srd" + _ -> panic "PPC.Ppr.pprInstr: shift illegal size" + in pprLogic (sLit op) reg1 reg2 (limitShiftRI fmt ri) + +pprInstr (SRA fmt reg1 reg2 ri) = + let op = case fmt of + II32 -> "sraw" + II64 -> "srad" + _ -> panic "PPC.Ppr.pprInstr: shift illegal size" + in pprLogic (sLit op) reg1 reg2 (limitShiftRI fmt ri) + +pprInstr (RLWINM reg1 reg2 sh mb me) = hcat [ + text "\trlwinm\t", + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + int sh, + text ", ", + int mb, + text ", ", + int me + ] + +pprInstr (CLRLI fmt reg1 reg2 n) = hcat [ + text "\tclrl", + pprFormat fmt, + text "i ", + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + int n + ] +pprInstr (CLRRI fmt reg1 reg2 n) = hcat [ + text "\tclrr", + pprFormat fmt, + text "i ", + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + int n + ] + +pprInstr (FADD fmt reg1 reg2 reg3) = pprBinaryF (sLit "fadd") fmt reg1 reg2 reg3 +pprInstr (FSUB fmt reg1 reg2 reg3) = pprBinaryF (sLit "fsub") fmt reg1 reg2 reg3 +pprInstr (FMUL fmt reg1 reg2 reg3) = pprBinaryF (sLit "fmul") fmt reg1 reg2 reg3 +pprInstr (FDIV fmt reg1 reg2 reg3) = pprBinaryF (sLit "fdiv") fmt reg1 reg2 reg3 +pprInstr (FABS reg1 reg2) = pprUnary (sLit "fabs") reg1 reg2 +pprInstr (FNEG reg1 reg2) = pprUnary (sLit "fneg") reg1 reg2 + +pprInstr (FCMP reg1 reg2) = hcat [ + char '\t', + text "fcmpu\t0, ", + -- Note: we're using fcmpu, not fcmpo + -- The difference is with fcmpo, compare with NaN is an invalid operation. + -- We don't handle invalid fp ops, so we don't care. + -- Moreover, we use `fcmpu 0, ...` rather than `fcmpu cr0, ...` for + -- better portability since some non-GNU assembler (such as + -- IBM's `as`) tend not to support the symbolic register name cr0. + -- This matches the syntax that GCC seems to emit for PPC targets. 
+ pprReg reg1, + text ", ", + pprReg reg2 + ] + +pprInstr (FCTIWZ reg1 reg2) = pprUnary (sLit "fctiwz") reg1 reg2 +pprInstr (FCTIDZ reg1 reg2) = pprUnary (sLit "fctidz") reg1 reg2 +pprInstr (FCFID reg1 reg2) = pprUnary (sLit "fcfid") reg1 reg2 +pprInstr (FRSP reg1 reg2) = pprUnary (sLit "frsp") reg1 reg2 + +pprInstr (CRNOR dst src1 src2) = hcat [ + text "\tcrnor\t", + int dst, + text ", ", + int src1, + text ", ", + int src2 + ] + +pprInstr (MFCR reg) = hcat [ + char '\t', + text "mfcr", + char '\t', + pprReg reg + ] + +pprInstr (MFLR reg) = hcat [ + char '\t', + text "mflr", + char '\t', + pprReg reg + ] + +pprInstr (FETCHPC reg) = vcat [ + text "\tbcl\t20,31,1f", + hcat [ text "1:\tmflr\t", pprReg reg ] + ] + +pprInstr HWSYNC = text "\tsync" + +pprInstr ISYNC = text "\tisync" + +pprInstr LWSYNC = text "\tlwsync" + +pprInstr NOP = text "\tnop" + + +pprLogic :: PtrString -> Reg -> Reg -> RI -> SDoc +pprLogic op reg1 reg2 ri = hcat [ + char '\t', + ptext op, + case ri of + RIReg _ -> empty + RIImm _ -> char 'i', + char '\t', + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprRI ri + ] + + +pprMul :: Format -> Reg -> Reg -> RI -> SDoc +pprMul fmt reg1 reg2 ri = hcat [ + char '\t', + text "mull", + case ri of + RIReg _ -> case fmt of + II32 -> char 'w' + II64 -> char 'd' + _ -> panic "PPC: illegal format" + RIImm _ -> char 'i', + char '\t', + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprRI ri + ] + + +pprDiv :: Format -> Bool -> Reg -> Reg -> Reg -> SDoc +pprDiv fmt sgn reg1 reg2 reg3 = hcat [ + char '\t', + text "div", + case fmt of + II32 -> char 'w' + II64 -> char 'd' + _ -> panic "PPC: illegal format", + if sgn then empty else char 'u', + char '\t', + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprReg reg3 + ] + + +pprUnary :: PtrString -> Reg -> Reg -> SDoc +pprUnary op reg1 reg2 = hcat [ + char '\t', + ptext op, + char '\t', + pprReg reg1, + text ", ", + pprReg reg2 + ] + + +pprBinaryF :: PtrString -> Format -> Reg -> Reg -> Reg -> SDoc +pprBinaryF op fmt reg1 reg2 reg3 = hcat [ + char '\t', + ptext op, + pprFFormat fmt, + char '\t', + pprReg reg1, + text ", ", + pprReg reg2, + text ", ", + pprReg reg3 + ] + +pprRI :: RI -> SDoc +pprRI (RIReg r) = pprReg r +pprRI (RIImm r) = pprImm r + + +pprFFormat :: Format -> SDoc +pprFFormat FF64 = empty +pprFFormat FF32 = char 's' +pprFFormat _ = panic "PPC.Ppr.pprFFormat: no match" + + -- limit immediate argument for shift instruction to range 0..63 + -- for 64 bit size and 0..32 otherwise +limitShiftRI :: Format -> RI -> RI +limitShiftRI II64 (RIImm (ImmInt i)) | i > 63 || i < 0 = + panic $ "PPC.Ppr: Shift by " ++ show i ++ " bits is not allowed." +limitShiftRI II32 (RIImm (ImmInt i)) | i > 31 || i < 0 = + panic $ "PPC.Ppr: 32 bit: Shift by " ++ show i ++ " bits is not allowed." 
+limitShiftRI _ x = x diff --git a/compiler/GHC/CmmToAsm/PPC/RegInfo.hs b/compiler/GHC/CmmToAsm/PPC/RegInfo.hs new file mode 100644 index 0000000000..a75040d703 --- /dev/null +++ b/compiler/GHC/CmmToAsm/PPC/RegInfo.hs @@ -0,0 +1,80 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Machine-specific parts of the register allocator +-- +-- (c) The University of Glasgow 1996-2004 +-- +----------------------------------------------------------------------------- +module GHC.CmmToAsm.PPC.RegInfo ( + JumpDest( DestBlockId ), getJumpDestBlockId, + canShortcut, + shortcutJump, + + shortcutStatics +) + +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.CmmToAsm.PPC.Instr + +import GHC.Cmm.BlockId +import GHC.Cmm +import GHC.Cmm.CLabel + +import Unique +import Outputable (ppr, text, Outputable, (<>)) + +data JumpDest = DestBlockId BlockId + +-- Debug Instance +instance Outputable JumpDest where + ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid + +getJumpDestBlockId :: JumpDest -> Maybe BlockId +getJumpDestBlockId (DestBlockId bid) = Just bid + +canShortcut :: Instr -> Maybe JumpDest +canShortcut _ = Nothing + +shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr +shortcutJump _ other = other + + +-- Here because it knows about JumpDest +shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics +shortcutStatics fn (RawCmmStatics lbl statics) + = RawCmmStatics lbl $ map (shortcutStatic fn) statics + -- we need to get the jump tables, so apply the mapping to the entries + -- of a CmmData too. + +shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel +shortcutLabel fn lab + | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn blkId + | otherwise = lab + +shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic +shortcutStatic fn (CmmStaticLit (CmmLabel lab)) + = CmmStaticLit (CmmLabel (shortcutLabel fn lab)) +shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w)) + = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w) + -- slightly dodgy, we're ignoring the second label, but this + -- works with the way we use CmmLabelDiffOff for jump tables now. 
+shortcutStatic _ other_static + = other_static + +shortBlockId + :: (BlockId -> Maybe JumpDest) + -> BlockId + -> CLabel + +shortBlockId fn blockid = + case fn blockid of + Nothing -> mkLocalBlockLabel uq + Just (DestBlockId blockid') -> shortBlockId fn blockid' + where uq = getUnique blockid diff --git a/compiler/GHC/CmmToAsm/PPC/Regs.hs b/compiler/GHC/CmmToAsm/PPC/Regs.hs new file mode 100644 index 0000000000..8a9a859665 --- /dev/null +++ b/compiler/GHC/CmmToAsm/PPC/Regs.hs @@ -0,0 +1,333 @@ +{-# LANGUAGE CPP #-} + +-- ----------------------------------------------------------------------------- +-- +-- (c) The University of Glasgow 1994-2004 +-- +-- ----------------------------------------------------------------------------- + +module GHC.CmmToAsm.PPC.Regs ( + -- squeeze functions + virtualRegSqueeze, + realRegSqueeze, + + mkVirtualReg, + regDotColor, + + -- immediates + Imm(..), + strImmLit, + litToImm, + + -- addressing modes + AddrMode(..), + addrOffset, + + -- registers + spRel, + argRegs, + allArgRegs, + callClobberedRegs, + allMachRegNos, + classOfRealReg, + showReg, + + -- machine specific + allFPArgRegs, + fits16Bits, + makeImmediate, + fReg, + r0, sp, toc, r3, r4, r11, r12, r30, + tmpReg, + f1, + + allocatableRegs + +) + +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.Platform.Reg +import GHC.Platform.Reg.Class +import GHC.CmmToAsm.Format + +import GHC.Cmm +import GHC.Cmm.CLabel ( CLabel ) +import Unique + +import GHC.Platform.Regs +import GHC.Driver.Session +import Outputable +import GHC.Platform + +import Data.Word ( Word8, Word16, Word32, Word64 ) +import Data.Int ( Int8, Int16, Int32, Int64 ) + + +-- squeese functions for the graph allocator ----------------------------------- + +-- | regSqueeze_class reg +-- Calculate the maximum number of register colors that could be +-- denied to a node of this class due to having this reg +-- as a neighbour. +-- +{-# INLINE virtualRegSqueeze #-} +virtualRegSqueeze :: RegClass -> VirtualReg -> Int +virtualRegSqueeze cls vr + = case cls of + RcInteger + -> case vr of + VirtualRegI{} -> 1 + VirtualRegHi{} -> 1 + _other -> 0 + + RcDouble + -> case vr of + VirtualRegD{} -> 1 + VirtualRegF{} -> 0 + _other -> 0 + + _other -> 0 + +{-# INLINE realRegSqueeze #-} +realRegSqueeze :: RegClass -> RealReg -> Int +realRegSqueeze cls rr + = case cls of + RcInteger + -> case rr of + RealRegSingle regNo + | regNo < 32 -> 1 -- first fp reg is 32 + | otherwise -> 0 + + RealRegPair{} -> 0 + + RcDouble + -> case rr of + RealRegSingle regNo + | regNo < 32 -> 0 + | otherwise -> 1 + + RealRegPair{} -> 0 + + _other -> 0 + +mkVirtualReg :: Unique -> Format -> VirtualReg +mkVirtualReg u format + | not (isFloatFormat format) = VirtualRegI u + | otherwise + = case format of + FF32 -> VirtualRegD u + FF64 -> VirtualRegD u + _ -> panic "mkVirtualReg" + +regDotColor :: RealReg -> SDoc +regDotColor reg + = case classOfRealReg reg of + RcInteger -> text "blue" + RcFloat -> text "red" + RcDouble -> text "green" + + + +-- immediates ------------------------------------------------------------------ +data Imm + = ImmInt Int + | ImmInteger Integer -- Sigh. 
+ | ImmCLbl CLabel -- AbstractC Label (with baggage) + | ImmLit SDoc -- Simple string + | ImmIndex CLabel Int + | ImmFloat Rational + | ImmDouble Rational + | ImmConstantSum Imm Imm + | ImmConstantDiff Imm Imm + | LO Imm + | HI Imm + | HA Imm {- high halfword adjusted -} + | HIGHERA Imm + | HIGHESTA Imm + + +strImmLit :: String -> Imm +strImmLit s = ImmLit (text s) + + +litToImm :: CmmLit -> Imm +litToImm (CmmInt i w) = ImmInteger (narrowS w i) + -- narrow to the width: a CmmInt might be out of + -- range, but we assume that ImmInteger only contains + -- in-range values. A signed value should be fine here. +litToImm (CmmFloat f W32) = ImmFloat f +litToImm (CmmFloat f W64) = ImmDouble f +litToImm (CmmLabel l) = ImmCLbl l +litToImm (CmmLabelOff l off) = ImmIndex l off +litToImm (CmmLabelDiffOff l1 l2 off _) + = ImmConstantSum + (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2)) + (ImmInt off) +litToImm _ = panic "PPC.Regs.litToImm: no match" + + +-- addressing modes ------------------------------------------------------------ + +data AddrMode + = AddrRegReg Reg Reg + | AddrRegImm Reg Imm + + +addrOffset :: AddrMode -> Int -> Maybe AddrMode +addrOffset addr off + = case addr of + AddrRegImm r (ImmInt n) + | fits16Bits n2 -> Just (AddrRegImm r (ImmInt n2)) + | otherwise -> Nothing + where n2 = n + off + + AddrRegImm r (ImmInteger n) + | fits16Bits n2 -> Just (AddrRegImm r (ImmInt (fromInteger n2))) + | otherwise -> Nothing + where n2 = n + toInteger off + + _ -> Nothing + + +-- registers ------------------------------------------------------------------- +-- @spRel@ gives us a stack relative addressing mode for volatile +-- temporaries and for excess call arguments. @fpRel@, where +-- applicable, is the same but for the frame pointer. + +spRel :: DynFlags + -> Int -- desired stack offset in words, positive or negative + -> AddrMode + +spRel dflags n = AddrRegImm sp (ImmInt (n * wORD_SIZE dflags)) + + +-- argRegs is the set of regs which are read for an n-argument call to C. +-- For archs which pass all args on the stack (x86), is empty. +-- Sparc passes up to the first 6 args in regs. +argRegs :: RegNo -> [Reg] +argRegs 0 = [] +argRegs 1 = map regSingle [3] +argRegs 2 = map regSingle [3,4] +argRegs 3 = map regSingle [3..5] +argRegs 4 = map regSingle [3..6] +argRegs 5 = map regSingle [3..7] +argRegs 6 = map regSingle [3..8] +argRegs 7 = map regSingle [3..9] +argRegs 8 = map regSingle [3..10] +argRegs _ = panic "MachRegs.argRegs(powerpc): don't know about >8 arguments!" + + +allArgRegs :: [Reg] +allArgRegs = map regSingle [3..10] + + +-- these are the regs which we cannot assume stay alive over a C call. +callClobberedRegs :: Platform -> [Reg] +callClobberedRegs _platform + = map regSingle (0:[2..12] ++ map fReg [0..13]) + + +allMachRegNos :: [RegNo] +allMachRegNos = [0..63] + + +{-# INLINE classOfRealReg #-} +classOfRealReg :: RealReg -> RegClass +classOfRealReg (RealRegSingle i) + | i < 32 = RcInteger + | otherwise = RcDouble + +classOfRealReg (RealRegPair{}) + = panic "regClass(ppr): no reg pairs on this architecture" + +showReg :: RegNo -> String +showReg n + | n >= 0 && n <= 31 = "%r" ++ show n + | n >= 32 && n <= 63 = "%f" ++ show (n - 32) + | otherwise = "%unknown_powerpc_real_reg_" ++ show n + + + +-- machine specific ------------------------------------------------------------ + +allFPArgRegs :: Platform -> [Reg] +allFPArgRegs platform + = case platformOS platform of + OSAIX -> map (regSingle . fReg) [1..13] + _ -> case platformArch platform of + ArchPPC -> map (regSingle . 
fReg) [1..8] + ArchPPC_64 _ -> map (regSingle . fReg) [1..13] + _ -> panic "PPC.Regs.allFPArgRegs: unknown PPC Linux" + +fits16Bits :: Integral a => a -> Bool +fits16Bits x = x >= -32768 && x < 32768 + +makeImmediate :: Integral a => Width -> Bool -> a -> Maybe Imm +makeImmediate rep signed x = fmap ImmInt (toI16 rep signed) + where + narrow W64 False = fromIntegral (fromIntegral x :: Word64) + narrow W32 False = fromIntegral (fromIntegral x :: Word32) + narrow W16 False = fromIntegral (fromIntegral x :: Word16) + narrow W8 False = fromIntegral (fromIntegral x :: Word8) + narrow W64 True = fromIntegral (fromIntegral x :: Int64) + narrow W32 True = fromIntegral (fromIntegral x :: Int32) + narrow W16 True = fromIntegral (fromIntegral x :: Int16) + narrow W8 True = fromIntegral (fromIntegral x :: Int8) + narrow _ _ = panic "PPC.Regs.narrow: no match" + + narrowed = narrow rep signed + + toI16 W32 True + | narrowed >= -32768 && narrowed < 32768 = Just narrowed + | otherwise = Nothing + toI16 W32 False + | narrowed >= 0 && narrowed < 65536 = Just narrowed + | otherwise = Nothing + toI16 W64 True + | narrowed >= -32768 && narrowed < 32768 = Just narrowed + | otherwise = Nothing + toI16 W64 False + | narrowed >= 0 && narrowed < 65536 = Just narrowed + | otherwise = Nothing + toI16 _ _ = Just narrowed + + +{- +The PowerPC has 64 registers of interest; 32 integer registers and 32 floating +point registers. +-} + +fReg :: Int -> RegNo +fReg x = (32 + x) + +r0, sp, toc, r3, r4, r11, r12, r30, f1 :: Reg +r0 = regSingle 0 +sp = regSingle 1 +toc = regSingle 2 +r3 = regSingle 3 +r4 = regSingle 4 +r11 = regSingle 11 +r12 = regSingle 12 +r30 = regSingle 30 +f1 = regSingle $ fReg 1 + +-- allocatableRegs is allMachRegNos with the fixed-use regs removed. +-- i.e., these are the regs for which we are prepared to allow the +-- register allocator to attempt to map VRegs to. 
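As a quick sanity check on the 16-bit immediate helpers above, here is an illustrative sketch (not part of this patch; it assumes the GHC.CmmToAsm.PPC.Regs module introduced here and the Width constructors re-exported by GHC.Cmm are importable, and every other name is hypothetical):

import Data.Maybe (isJust, isNothing)
import GHC.Cmm (Width (..))
import GHC.CmmToAsm.PPC.Regs (fits16Bits, makeImmediate)

-- Values that fit a signed 16-bit field can be folded straight into the
-- instruction; anything wider has to be built up in a register first.
immediateChecks :: [Bool]
immediateChecks =
  [ fits16Bits (32767 :: Int)                          -- largest signed 16-bit value
  , not (fits16Bits (32768 :: Int))                    -- one past the range
  , isJust    (makeImmediate W32 True ((-5)   :: Int)) -- small signed value folds
  , isNothing (makeImmediate W32 True (70000  :: Int)) -- too wide, must be loaded
  ]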
+allocatableRegs :: Platform -> [RealReg] +allocatableRegs platform + = let isFree i = freeReg platform i + in map RealRegSingle $ filter isFree allMachRegNos + +-- temporary register for compiler use +tmpReg :: Platform -> Reg +tmpReg platform = + case platformArch platform of + ArchPPC -> regSingle 13 + ArchPPC_64 _ -> regSingle 30 + _ -> panic "PPC.Regs.tmpReg: unknown arch" diff --git a/compiler/GHC/CmmToAsm/Ppr.hs b/compiler/GHC/CmmToAsm/Ppr.hs new file mode 100644 index 0000000000..636d2e4e3a --- /dev/null +++ b/compiler/GHC/CmmToAsm/Ppr.hs @@ -0,0 +1,275 @@ +{-# LANGUAGE MagicHash #-} + +----------------------------------------------------------------------------- +-- +-- Pretty-printing assembly language +-- +-- (c) The University of Glasgow 1993-2005 +-- +----------------------------------------------------------------------------- + +module GHC.CmmToAsm.Ppr ( + castFloatToWord8Array, + castDoubleToWord8Array, + floatToBytes, + doubleToBytes, + pprASCII, + pprBytes, + pprSectionHeader +) + +where + +import GhcPrelude + +import AsmUtils +import GHC.Cmm.CLabel +import GHC.Cmm +import GHC.Driver.Session +import FastString +import Outputable +import GHC.Platform +import FileCleanup + +import qualified Data.Array.Unsafe as U ( castSTUArray ) +import Data.Array.ST + +import Control.Monad.ST + +import Data.Word +import Data.Bits +import Data.ByteString (ByteString) +import qualified Data.ByteString as BS +import GHC.Exts +import GHC.Word +import System.IO.Unsafe + + + +-- ----------------------------------------------------------------------------- +-- Converting floating-point literals to integrals for printing + +castFloatToWord8Array :: STUArray s Int Float -> ST s (STUArray s Int Word8) +castFloatToWord8Array = U.castSTUArray + +castDoubleToWord8Array :: STUArray s Int Double -> ST s (STUArray s Int Word8) +castDoubleToWord8Array = U.castSTUArray + +-- floatToBytes and doubleToBytes convert to the host's byte +-- order. Providing that we're not cross-compiling for a +-- target with the opposite endianness, this should work ok +-- on all targets. + +-- ToDo: this stuff is very similar to the shenanigans in PprAbs, +-- could they be merged? + +floatToBytes :: Float -> [Int] +floatToBytes f + = runST (do + arr <- newArray_ ((0::Int),3) + writeArray arr 0 f + arr <- castFloatToWord8Array arr + i0 <- readArray arr 0 + i1 <- readArray arr 1 + i2 <- readArray arr 2 + i3 <- readArray arr 3 + return (map fromIntegral [i0,i1,i2,i3]) + ) + +doubleToBytes :: Double -> [Int] +doubleToBytes d + = runST (do + arr <- newArray_ ((0::Int),7) + writeArray arr 0 d + arr <- castDoubleToWord8Array arr + i0 <- readArray arr 0 + i1 <- readArray arr 1 + i2 <- readArray arr 2 + i3 <- readArray arr 3 + i4 <- readArray arr 4 + i5 <- readArray arr 5 + i6 <- readArray arr 6 + i7 <- readArray arr 7 + return (map fromIntegral [i0,i1,i2,i3,i4,i5,i6,i7]) + ) + +-- --------------------------------------------------------------------------- +-- Printing ASCII strings. +-- +-- Print as a string and escape non-printable characters. +-- This is similar to charToC in Utils. + +pprASCII :: ByteString -> SDoc +pprASCII str + -- Transform this given literal bytestring to escaped string and construct + -- the literal SDoc directly. 
+ -- See #14741 + -- and Note [Pretty print ASCII when AsmCodeGen] + = text $ BS.foldr (\w s -> do1 w ++ s) "" str + where + do1 :: Word8 -> String + do1 w | 0x09 == w = "\\t" + | 0x0A == w = "\\n" + | 0x22 == w = "\\\"" + | 0x5C == w = "\\\\" + -- ASCII printable characters range + | w >= 0x20 && w <= 0x7E = [chr' w] + | otherwise = '\\' : octal w + + -- we know that the Chars we create are in the ASCII range + -- so we bypass the check in "chr" + chr' :: Word8 -> Char + chr' (W8# w#) = C# (chr# (word2Int# w#)) + + octal :: Word8 -> String + octal w = [ chr' (ord0 + (w `unsafeShiftR` 6) .&. 0x07) + , chr' (ord0 + (w `unsafeShiftR` 3) .&. 0x07) + , chr' (ord0 + w .&. 0x07) + ] + ord0 = 0x30 -- = ord '0' + +-- | Pretty print binary data. +-- +-- Use either the ".string" directive or a ".incbin" directive. +-- See Note [Embedding large binary blobs] +-- +-- A NULL byte is added after the binary data. +-- +pprBytes :: ByteString -> SDoc +pprBytes bs = sdocWithDynFlags $ \dflags -> + if binBlobThreshold dflags == 0 + || fromIntegral (BS.length bs) <= binBlobThreshold dflags + then text "\t.string " <> doubleQuotes (pprASCII bs) + else unsafePerformIO $ do + bFile <- newTempName dflags TFL_CurrentModule ".dat" + BS.writeFile bFile bs + return $ text "\t.incbin " + <> pprFilePathString bFile -- proper escape (see #16389) + <> text "\n\t.byte 0" + +{- +Note [Embedding large binary blobs] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To embed a blob of binary data (e.g. an UTF-8 encoded string) into the generated +code object, we have several options: + + 1. Generate a ".byte" directive for each byte. This is what was done in the past + (see Note [Pretty print ASCII when AsmCodeGen]). + + 2. Generate a single ".string"/".asciz" directive for the whole sequence of + bytes. Bytes in the ASCII printable range are rendered as characters and + other values are escaped (e.g., "\t", "\077", etc.). + + 3. Create a temporary file into which we dump the binary data and generate a + single ".incbin" directive. The assembler will include the binary file for + us in the generated output object. + +Now the code generator uses either (2) or (3), depending on the binary blob +size. Using (3) for small blobs adds too much overhead (see benchmark results +in #16190), so we only do it when the size is above a threshold (500K at the +time of writing). + +The threshold is configurable via the `-fbinary-blob-threshold` flag. + +-} + + +{- +Note [Pretty print ASCII when AsmCodeGen] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Previously, when generating assembly code, we created SDoc with +`(ptext . sLit)` for every bytes in literal bytestring, then +combine them using `hcat`. + +When handling literal bytestrings with millions of bytes, +millions of SDoc would be created and to combine, leading to +high memory usage. + +Now we escape the given bytestring to string directly and construct +SDoc only once. This improvement could dramatically decrease the +memory allocation from 4.7GB to 1.3GB when embedding a 3MB literal +string in source code. See #14741 for profiling results. +-} + +-- ---------------------------------------------------------------------------- +-- Printing section headers. +-- +-- If -split-section was specified, include the suffix label, otherwise just +-- print the section type. For Darwin, where subsections-for-symbols are +-- used instead, only print section type. +-- +-- For string literals, additional flags are specified to enable merging of +-- identical strings in the linker. 
With -split-sections each string also gets +-- a unique section to allow strings from unused code to be GC'd. + +pprSectionHeader :: Platform -> Section -> SDoc +pprSectionHeader platform (Section t suffix) = + case platformOS platform of + OSAIX -> pprXcoffSectionHeader t + OSDarwin -> pprDarwinSectionHeader t + OSMinGW32 -> pprGNUSectionHeader (char '$') t suffix + _ -> pprGNUSectionHeader (char '.') t suffix + +pprGNUSectionHeader :: SDoc -> SectionType -> CLabel -> SDoc +pprGNUSectionHeader sep t suffix = sdocWithDynFlags $ \dflags -> + let splitSections = gopt Opt_SplitSections dflags + subsection | splitSections = sep <> ppr suffix + | otherwise = empty + in text ".section " <> ptext (header dflags) <> subsection <> + flags dflags + where + header dflags = case t of + Text -> sLit ".text" + Data -> sLit ".data" + ReadOnlyData | OSMinGW32 <- platformOS (targetPlatform dflags) + -> sLit ".rdata" + | otherwise -> sLit ".rodata" + RelocatableReadOnlyData | OSMinGW32 <- platformOS (targetPlatform dflags) + -- Concept does not exist on Windows, + -- So map these to R/O data. + -> sLit ".rdata$rel.ro" + | otherwise -> sLit ".data.rel.ro" + UninitialisedData -> sLit ".bss" + ReadOnlyData16 | OSMinGW32 <- platformOS (targetPlatform dflags) + -> sLit ".rdata$cst16" + | otherwise -> sLit ".rodata.cst16" + CString + | OSMinGW32 <- platformOS (targetPlatform dflags) + -> sLit ".rdata" + | otherwise -> sLit ".rodata.str" + OtherSection _ -> + panic "PprBase.pprGNUSectionHeader: unknown section type" + flags dflags = case t of + CString + | OSMinGW32 <- platformOS (targetPlatform dflags) + -> empty + | otherwise -> text ",\"aMS\"," <> sectionType "progbits" <> text ",1" + _ -> empty + +-- XCOFF doesn't support relocating label-differences, so we place all +-- RO sections into .text[PR] sections +pprXcoffSectionHeader :: SectionType -> SDoc +pprXcoffSectionHeader t = text $ case t of + Text -> ".csect .text[PR]" + Data -> ".csect .data[RW]" + ReadOnlyData -> ".csect .text[PR] # ReadOnlyData" + RelocatableReadOnlyData -> ".csect .text[PR] # RelocatableReadOnlyData" + ReadOnlyData16 -> ".csect .text[PR] # ReadOnlyData16" + CString -> ".csect .text[PR] # CString" + UninitialisedData -> ".csect .data[BS]" + OtherSection _ -> + panic "PprBase.pprXcoffSectionHeader: unknown section type" + +pprDarwinSectionHeader :: SectionType -> SDoc +pprDarwinSectionHeader t = + ptext $ case t of + Text -> sLit ".text" + Data -> sLit ".data" + ReadOnlyData -> sLit ".const" + RelocatableReadOnlyData -> sLit ".const_data" + UninitialisedData -> sLit ".data" + ReadOnlyData16 -> sLit ".const" + CString -> sLit ".section\t__TEXT,__cstring,cstring_literals" + OtherSection _ -> + panic "PprBase.pprDarwinSectionHeader: unknown section type" diff --git a/compiler/GHC/CmmToAsm/Reg/Graph.hs b/compiler/GHC/CmmToAsm/Reg/Graph.hs new file mode 100644 index 0000000000..6dfe84cf95 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Graph.hs @@ -0,0 +1,472 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE ScopedTypeVariables #-} + +-- | Graph coloring register allocator. 
+module GHC.CmmToAsm.Reg.Graph ( + regAlloc +) where +import GhcPrelude + +import qualified GraphColor as Color +import GHC.CmmToAsm.Reg.Liveness +import GHC.CmmToAsm.Reg.Graph.Spill +import GHC.CmmToAsm.Reg.Graph.SpillClean +import GHC.CmmToAsm.Reg.Graph.SpillCost +import GHC.CmmToAsm.Reg.Graph.Stats +import GHC.CmmToAsm.Reg.Graph.TrivColorable +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.Reg.Target +import GHC.Platform.Reg.Class +import GHC.Platform.Reg + +import Bag +import GHC.Driver.Session +import Outputable +import GHC.Platform +import UniqFM +import UniqSet +import UniqSupply +import Util (seqList) +import GHC.CmmToAsm.CFG + +import Data.Maybe +import Control.Monad + + +-- | The maximum number of build\/spill cycles we'll allow. +-- +-- It should only take 3 or 4 cycles for the allocator to converge. +-- If it takes any longer than this it's probably in an infinite loop, +-- so it's better just to bail out and report a bug. +maxSpinCount :: Int +maxSpinCount = 10 + + +-- | The top level of the graph coloring register allocator. +regAlloc + :: (Outputable statics, Outputable instr, Instruction instr) + => DynFlags + -> UniqFM (UniqSet RealReg) -- ^ registers we can use for allocation + -> UniqSet Int -- ^ set of available spill slots. + -> Int -- ^ current number of spill slots + -> [LiveCmmDecl statics instr] -- ^ code annotated with liveness information. + -> Maybe CFG -- ^ CFG of basic blocks if available + -> UniqSM ( [NatCmmDecl statics instr] + , Maybe Int, [RegAllocStats statics instr] ) + -- ^ code with registers allocated, additional stacks required + -- and stats for each stage of allocation + +regAlloc dflags regsFree slotsFree slotsCount code cfg + = do + -- TODO: the regClass function is currently hard coded to the default + -- target architecture. Would prefer to determine this from dflags. + -- There are other uses of targetRegClass later in this module. + let platform = targetPlatform dflags + triv = trivColorable platform + (targetVirtualRegSqueeze platform) + (targetRealRegSqueeze platform) + + (code_final, debug_codeGraphs, slotsCount', _) + <- regAlloc_spin dflags 0 + triv + regsFree slotsFree slotsCount [] code cfg + + let needStack + | slotsCount == slotsCount' + = Nothing + | otherwise + = Just slotsCount' + + return ( code_final + , needStack + , reverse debug_codeGraphs ) + + +-- | Perform solver iterations for the graph coloring allocator. +-- +-- We extract a register conflict graph from the provided cmm code, +-- and try to colour it. If that works then we use the solution rewrite +-- the code with real hregs. If coloring doesn't work we add spill code +-- and try to colour it again. After `maxSpinCount` iterations we give up. +-- +regAlloc_spin + :: forall instr statics. + (Instruction instr, + Outputable instr, + Outputable statics) + => DynFlags + -> Int -- ^ Number of solver iterations we've already performed. + -> Color.Triv VirtualReg RegClass RealReg + -- ^ Function for calculating whether a register is trivially + -- colourable. + -> UniqFM (UniqSet RealReg) -- ^ Free registers that we can allocate. + -> UniqSet Int -- ^ Free stack slots that we can use. + -> Int -- ^ Number of spill slots in use + -> [RegAllocStats statics instr] -- ^ Current regalloc stats to add to. + -> [LiveCmmDecl statics instr] -- ^ Liveness annotated code to allocate. 
+ -> Maybe CFG + -> UniqSM ( [NatCmmDecl statics instr] + , [RegAllocStats statics instr] + , Int -- Slots in use + , Color.Graph VirtualReg RegClass RealReg) + +regAlloc_spin dflags spinCount triv regsFree slotsFree slotsCount debug_codeGraphs code cfg + = do + let platform = targetPlatform dflags + + -- If any of these dump flags are turned on we want to hang on to + -- intermediate structures in the allocator - otherwise tell the + -- allocator to ditch them early so we don't end up creating space leaks. + let dump = or + [ dopt Opt_D_dump_asm_regalloc_stages dflags + , dopt Opt_D_dump_asm_stats dflags + , dopt Opt_D_dump_asm_conflicts dflags ] + + -- Check that we're not running off down the garden path. + when (spinCount > maxSpinCount) + $ pprPanic "regAlloc_spin: max build/spill cycle count exceeded." + ( text "It looks like the register allocator is stuck in an infinite loop." + $$ text "max cycles = " <> int maxSpinCount + $$ text "regsFree = " <> (hcat $ punctuate space $ map ppr + $ nonDetEltsUniqSet $ unionManyUniqSets + $ nonDetEltsUFM regsFree) + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + $$ text "slotsFree = " <> ppr (sizeUniqSet slotsFree)) + + -- Build the register conflict graph from the cmm code. + (graph :: Color.Graph VirtualReg RegClass RealReg) + <- {-# SCC "BuildGraph" #-} buildGraph code + + -- VERY IMPORTANT: + -- We really do want the graph to be fully evaluated _before_ we + -- start coloring. If we don't do this now then when the call to + -- Color.colorGraph forces bits of it, the heap will be filled with + -- half evaluated pieces of graph and zillions of apply thunks. + seqGraph graph `seq` return () + + -- Build a map of the cost of spilling each instruction. + -- This is a lazy binding, so the map will only be computed if we + -- actually have to spill to the stack. + let spillCosts = foldl' plusSpillCostInfo zeroSpillCostInfo + $ map (slurpSpillCostInfo platform cfg) code + + -- The function to choose regs to leave uncolored. + let spill = chooseSpill spillCosts + + -- Record startup state in our log. + let stat1 + = if spinCount == 0 + then Just $ RegAllocStatsStart + { raLiveCmm = code + , raGraph = graph + , raSpillCosts = spillCosts } + else Nothing + + -- Try and color the graph. + let (graph_colored, rsSpill, rmCoalesce) + = {-# SCC "ColorGraph" #-} + Color.colorGraph + (gopt Opt_RegsIterative dflags) + spinCount + regsFree triv spill graph + + -- Rewrite registers in the code that have been coalesced. + let patchF reg + | RegVirtual vr <- reg + = case lookupUFM rmCoalesce vr of + Just vr' -> patchF (RegVirtual vr') + Nothing -> reg + + | otherwise + = reg + + let (code_coalesced :: [LiveCmmDecl statics instr]) + = map (patchEraseLive patchF) code + + -- Check whether we've found a coloring. + if isEmptyUniqSet rsSpill + + -- Coloring was successful because no registers needed to be spilled. + then do + -- if -fasm-lint is turned on then validate the graph. + -- This checks for bugs in the graph allocator itself. + let graph_colored_lint = + if gopt Opt_DoAsmLinting dflags + then Color.validateGraph (text "") + True -- Require all nodes to be colored. + graph_colored + else graph_colored + + -- Rewrite the code to use real hregs, using the colored graph. + let code_patched + = map (patchRegsFromGraph platform graph_colored_lint) + code_coalesced + + -- Clean out unneeded SPILL/RELOAD meta instructions. 
+ -- The spill code generator just spills the entire live range + -- of a vreg, but it might not need to be on the stack for + -- its entire lifetime. + let code_spillclean + = map (cleanSpills platform) code_patched + + -- Strip off liveness information from the allocated code. + -- Also rewrite SPILL/RELOAD meta instructions into real machine + -- instructions along the way + let code_final + = map (stripLive dflags) code_spillclean + + -- Record what happened in this stage for debugging + let stat + = RegAllocStatsColored + { raCode = code + , raGraph = graph + , raGraphColored = graph_colored_lint + , raCoalesced = rmCoalesce + , raCodeCoalesced = code_coalesced + , raPatched = code_patched + , raSpillClean = code_spillclean + , raFinal = code_final + , raSRMs = foldl' addSRM (0, 0, 0) + $ map countSRMs code_spillclean } + + -- Bundle up all the register allocator statistics. + -- .. but make sure to drop them on the floor if they're not + -- needed, otherwise we'll get a space leak. + let statList = + if dump then [stat] ++ maybeToList stat1 ++ debug_codeGraphs + else [] + + -- Ensure all the statistics are evaluated, to avoid space leaks. + seqList statList (return ()) + + return ( code_final + , statList + , slotsCount + , graph_colored_lint) + + -- Coloring was unsuccessful. We need to spill some register to the + -- stack, make a new graph, and try to color it again. + else do + -- if -fasm-lint is turned on then validate the graph + let graph_colored_lint = + if gopt Opt_DoAsmLinting dflags + then Color.validateGraph (text "") + False -- don't require nodes to be colored + graph_colored + else graph_colored + + -- Spill uncolored regs to the stack. + (code_spilled, slotsFree', slotsCount', spillStats) + <- regSpill platform code_coalesced slotsFree slotsCount rsSpill + + -- Recalculate liveness information. + -- NOTE: we have to reverse the SCCs here to get them back into + -- the reverse-dependency order required by computeLiveness. + -- If they're not in the correct order that function will panic. + code_relive <- mapM (regLiveness platform . reverseBlocksInTops) + code_spilled + + -- Record what happened in this stage for debugging. + let stat = + RegAllocStatsSpill + { raCode = code + , raGraph = graph_colored_lint + , raCoalesced = rmCoalesce + , raSpillStats = spillStats + , raSpillCosts = spillCosts + , raSpilled = code_spilled } + + -- Bundle up all the register allocator statistics. + -- .. but make sure to drop them on the floor if they're not + -- needed, otherwise we'll get a space leak. + let statList = + if dump + then [stat] ++ maybeToList stat1 ++ debug_codeGraphs + else [] + + -- Ensure all the statistics are evaluated, to avoid space leaks. + seqList statList (return ()) + + regAlloc_spin dflags (spinCount + 1) triv regsFree slotsFree' + slotsCount' statList code_relive cfg + + +-- | Build a graph from the liveness and coalesce information in this code. +buildGraph + :: Instruction instr + => [LiveCmmDecl statics instr] + -> UniqSM (Color.Graph VirtualReg RegClass RealReg) + +buildGraph code + = do + -- Slurp out the conflicts and reg->reg moves from this code. + let (conflictList, moveList) = + unzip $ map slurpConflicts code + + -- Slurp out the spill/reload coalesces. + let moveList2 = map slurpReloadCoalesce code + + -- Add the reg-reg conflicts to the graph. + let conflictBag = unionManyBags conflictList + let graph_conflict + = foldr graphAddConflictSet Color.initGraph conflictBag + + -- Add the coalescences edges to the graph. 
+ let moveBag + = unionBags (unionManyBags moveList2) + (unionManyBags moveList) + + let graph_coalesce + = foldr graphAddCoalesce graph_conflict moveBag + + return graph_coalesce + + +-- | Add some conflict edges to the graph. +-- Conflicts between virtual and real regs are recorded as exclusions. +graphAddConflictSet + :: UniqSet Reg + -> Color.Graph VirtualReg RegClass RealReg + -> Color.Graph VirtualReg RegClass RealReg + +graphAddConflictSet set graph + = let virtuals = mkUniqSet + [ vr | RegVirtual vr <- nonDetEltsUniqSet set ] + + graph1 = Color.addConflicts virtuals classOfVirtualReg graph + + graph2 = foldr (\(r1, r2) -> Color.addExclusion r1 classOfVirtualReg r2) + graph1 + [ (vr, rr) + | RegVirtual vr <- nonDetEltsUniqSet set + , RegReal rr <- nonDetEltsUniqSet set] + -- See Note [Unique Determinism and code generation] + + in graph2 + + +-- | Add some coalesence edges to the graph +-- Coalesences between virtual and real regs are recorded as preferences. +graphAddCoalesce + :: (Reg, Reg) + -> Color.Graph VirtualReg RegClass RealReg + -> Color.Graph VirtualReg RegClass RealReg + +graphAddCoalesce (r1, r2) graph + | RegReal rr <- r1 + , RegVirtual vr <- r2 + = Color.addPreference (vr, classOfVirtualReg vr) rr graph + + | RegReal rr <- r2 + , RegVirtual vr <- r1 + = Color.addPreference (vr, classOfVirtualReg vr) rr graph + + | RegVirtual vr1 <- r1 + , RegVirtual vr2 <- r2 + = Color.addCoalesce + (vr1, classOfVirtualReg vr1) + (vr2, classOfVirtualReg vr2) + graph + + -- We can't coalesce two real regs, but there could well be existing + -- hreg,hreg moves in the input code. We'll just ignore these + -- for coalescing purposes. + | RegReal _ <- r1 + , RegReal _ <- r2 + = graph + +#if __GLASGOW_HASKELL__ <= 810 + | otherwise + = panic "graphAddCoalesce" +#endif + + +-- | Patch registers in code using the reg -> reg mapping in this graph. +patchRegsFromGraph + :: (Outputable statics, Outputable instr, Instruction instr) + => Platform -> Color.Graph VirtualReg RegClass RealReg + -> LiveCmmDecl statics instr -> LiveCmmDecl statics instr + +patchRegsFromGraph platform graph code + = patchEraseLive patchF code + where + -- Function to lookup the hardreg for a virtual reg from the graph. + patchF reg + -- leave real regs alone. + | RegReal{} <- reg + = reg + + -- this virtual has a regular node in the graph. + | RegVirtual vr <- reg + , Just node <- Color.lookupNode graph vr + = case Color.nodeColor node of + Just color -> RegReal color + Nothing -> RegVirtual vr + + -- no node in the graph for this virtual, bad news. + | otherwise + = pprPanic "patchRegsFromGraph: register mapping failed." + ( text "There is no node in the graph for register " + <> ppr reg + $$ ppr code + $$ Color.dotGraph + (\_ -> text "white") + (trivColorable platform + (targetVirtualRegSqueeze platform) + (targetRealRegSqueeze platform)) + graph) + + +----- +-- for when laziness just isn't what you wanted... +-- We need to deepSeq the whole graph before trying to colour it to avoid +-- space leaks. 
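The comment above is the important part: force the graph all the way down before Color.colorGraph starts demanding pieces of it. A tiny stand-alone illustration of the same pattern, in plain Haskell rather than GHC types (names here are illustrative only):

-- A hand-written deep seq, the same shape as seqGraph/seqNodes below:
-- walk the structure and force every field before the consumer runs.
forcePairs :: [(Int, [Int])] -> ()
forcePairs []             = ()
forcePairs ((k, vs) : xs) = k `seq` forceList vs `seq` forcePairs xs
  where
    forceList []       = ()
    forceList (y : ys) = y `seq` forceList ys

-- The consumer then works over fully evaluated data instead of a pile of
-- half-built thunks.
consume :: [(Int, [Int])] -> Int
consume xs = forcePairs xs `seq` sum (map fst xs)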
+seqGraph :: Color.Graph VirtualReg RegClass RealReg -> () +seqGraph graph = seqNodes (nonDetEltsUFM (Color.graphMap graph)) + -- See Note [Unique Determinism and code generation] + +seqNodes :: [Color.Node VirtualReg RegClass RealReg] -> () +seqNodes ns + = case ns of + [] -> () + (n : ns) -> seqNode n `seq` seqNodes ns + +seqNode :: Color.Node VirtualReg RegClass RealReg -> () +seqNode node + = seqVirtualReg (Color.nodeId node) + `seq` seqRegClass (Color.nodeClass node) + `seq` seqMaybeRealReg (Color.nodeColor node) + `seq` (seqVirtualRegList (nonDetEltsUniqSet (Color.nodeConflicts node))) + `seq` (seqRealRegList (nonDetEltsUniqSet (Color.nodeExclusions node))) + `seq` (seqRealRegList (Color.nodePreference node)) + `seq` (seqVirtualRegList (nonDetEltsUniqSet (Color.nodeCoalesce node))) + -- It's OK to use nonDetEltsUniqSet for seq + +seqVirtualReg :: VirtualReg -> () +seqVirtualReg reg = reg `seq` () + +seqRealReg :: RealReg -> () +seqRealReg reg = reg `seq` () + +seqRegClass :: RegClass -> () +seqRegClass c = c `seq` () + +seqMaybeRealReg :: Maybe RealReg -> () +seqMaybeRealReg mr + = case mr of + Nothing -> () + Just r -> seqRealReg r + +seqVirtualRegList :: [VirtualReg] -> () +seqVirtualRegList rs + = case rs of + [] -> () + (r : rs) -> seqVirtualReg r `seq` seqVirtualRegList rs + +seqRealRegList :: [RealReg] -> () +seqRealRegList rs + = case rs of + [] -> () + (r : rs) -> seqRealReg r `seq` seqRealRegList rs diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Base.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Base.hs new file mode 100644 index 0000000000..95fa174415 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Graph/Base.hs @@ -0,0 +1,163 @@ + +-- | Utils for calculating general worst, bound, squeese and free, functions. +-- +-- as per: "A Generalized Algorithm for Graph-Coloring Register Allocation" +-- Michael Smith, Normal Ramsey, Glenn Holloway. +-- PLDI 2004 +-- +-- These general versions are not used in GHC proper because they are too slow. +-- Instead, hand written optimised versions are provided for each architecture +-- in MachRegs*.hs +-- +-- This code is here because we can test the architecture specific code against +-- it. +-- +module GHC.CmmToAsm.Reg.Graph.Base ( + RegClass(..), + Reg(..), + RegSub(..), + + worst, + bound, + squeese +) where + +import GhcPrelude + +import UniqSet +import UniqFM +import Unique +import MonadUtils (concatMapM) + + +-- Some basic register classes. +-- These aren't necessarily in 1-to-1 correspondence with the allocatable +-- RegClasses in MachRegs.hs +data RegClass + -- general purpose regs + = ClassG32 -- 32 bit GPRs + | ClassG16 -- 16 bit GPRs + | ClassG8 -- 8 bit GPRs + + -- floating point regs + | ClassF64 -- 64 bit FPRs + deriving (Show, Eq, Enum) + + +-- | A register of some class +data Reg + -- a register of some class + = Reg RegClass Int + + -- a sub-component of one of the other regs + | RegSub RegSub Reg + deriving (Show, Eq) + + +-- | so we can put regs in UniqSets +instance Uniquable Reg where + getUnique (Reg c i) + = mkRegSingleUnique + $ fromEnum c * 1000 + i + + getUnique (RegSub s (Reg c i)) + = mkRegSubUnique + $ fromEnum s * 10000 + fromEnum c * 1000 + i + + getUnique (RegSub _ (RegSub _ _)) + = error "RegArchBase.getUnique: can't have a sub-reg of a sub-reg." 
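To make the worst-case displacement calculation from the module header concrete, here is an illustrative sketch of calling the general worst function (defined further down in this file) on a toy architecture with no register aliasing. The wrapper module and its names are hypothetical; the imported module names are the ones introduced by this patch:

module ToyWorstDemo where

import GHC.CmmToAsm.Reg.Graph.Base (Reg (..), RegClass (..), worst)
import UniqSet (UniqSet, mkUniqSet, unitUniqSet)

-- Eight registers in each class, and no aliasing between registers.
toyRegsOfClass :: RegClass -> UniqSet Reg
toyRegsOfClass c = mkUniqSet [ Reg c i | i <- [0 .. 7] ]

toyRegAlias :: Reg -> UniqSet Reg
toyRegAlias r = unitUniqSet r

-- With no aliasing each of the N neighbours can deny at most one colour,
-- so the worst-case displacement is just N (here: 2). Register aliasing
-- is what makes the real calculation, and the hand-written
-- per-architecture versions, interesting.
demo :: Int
demo = worst toyRegsOfClass toyRegAlias 2 ClassG32 ClassG32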
+ + +-- | A subcomponent of another register +data RegSub + = SubL16 -- lowest 16 bits + | SubL8 -- lowest 8 bits + | SubL8H -- second lowest 8 bits + deriving (Show, Enum, Ord, Eq) + + +-- | Worst case displacement +-- +-- a node N of classN has some number of neighbors, +-- all of which are from classC. +-- +-- (worst neighbors classN classC) is the maximum number of potential +-- colors for N that can be lost by coloring its neighbors. +-- +-- This should be hand coded/cached for each particular architecture, +-- because the compute time is very long.. +worst :: (RegClass -> UniqSet Reg) + -> (Reg -> UniqSet Reg) + -> Int -> RegClass -> RegClass -> Int + +worst regsOfClass regAlias neighbors classN classC + = let regAliasS regs = unionManyUniqSets + $ map regAlias + $ nonDetEltsUniqSet regs + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + + -- all the regs in classes N, C + regsN = regsOfClass classN + regsC = regsOfClass classC + + -- all the possible subsets of c which have size < m + regsS = filter (\s -> sizeUniqSet s >= 1 + && sizeUniqSet s <= neighbors) + $ powersetLS regsC + + -- for each of the subsets of C, the regs which conflict + -- with posiblities for N + regsS_conflict + = map (\s -> intersectUniqSets regsN (regAliasS s)) regsS + + in maximum $ map sizeUniqSet $ regsS_conflict + + +-- | For a node N of classN and neighbors of classesC +-- (bound classN classesC) is the maximum number of potential +-- colors for N that can be lost by coloring its neighbors. +bound :: (RegClass -> UniqSet Reg) + -> (Reg -> UniqSet Reg) + -> RegClass -> [RegClass] -> Int + +bound regsOfClass regAlias classN classesC + = let regAliasS regs = unionManyUniqSets + $ map regAlias + $ nonDetEltsUFM regs + -- See Note [Unique Determinism and code generation] + + regsC_aliases + = unionManyUniqSets + $ map (regAliasS . getUniqSet . regsOfClass) classesC + + overlap = intersectUniqSets (regsOfClass classN) regsC_aliases + + in sizeUniqSet overlap + + +-- | The total squeese on a particular node with a list of neighbors. +-- +-- A version of this should be constructed for each particular architecture, +-- possibly including uses of bound, so that alised registers don't get +-- counted twice, as per the paper. +squeese :: (RegClass -> UniqSet Reg) + -> (Reg -> UniqSet Reg) + -> RegClass -> [(Int, RegClass)] -> Int + +squeese regsOfClass regAlias classN countCs + = sum + $ map (\(i, classC) -> worst regsOfClass regAlias i classN classC) + $ countCs + + +-- | powerset (for lists) +powersetL :: [a] -> [[a]] +powersetL = concatMapM (\x -> [[],[x]]) + + +-- | powersetLS (list of sets) +powersetLS :: Uniquable a => UniqSet a -> [UniqSet a] +powersetLS s = map mkUniqSet $ powersetL $ nonDetEltsUniqSet s + -- See Note [Unique Determinism and code generation] diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs new file mode 100644 index 0000000000..d223137dd0 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs @@ -0,0 +1,99 @@ +-- | Register coalescing. 
+module GHC.CmmToAsm.Reg.Graph.Coalesce ( + regCoalesce, + slurpJoinMovs +) where +import GhcPrelude + +import GHC.CmmToAsm.Reg.Liveness +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg + +import GHC.Cmm +import Bag +import Digraph +import UniqFM +import UniqSet +import UniqSupply + + +-- | Do register coalescing on this top level thing +-- +-- For Reg -> Reg moves, if the first reg dies at the same time the +-- second reg is born then the mov only serves to join live ranges. +-- The two regs can be renamed to be the same and the move instruction +-- safely erased. +regCoalesce + :: Instruction instr + => [LiveCmmDecl statics instr] + -> UniqSM [LiveCmmDecl statics instr] + +regCoalesce code + = do + let joins = foldl' unionBags emptyBag + $ map slurpJoinMovs code + + let alloc = foldl' buildAlloc emptyUFM + $ bagToList joins + + let patched = map (patchEraseLive (sinkReg alloc)) code + + return patched + + +-- | Add a v1 = v2 register renaming to the map. +-- The register with the lowest lexical name is set as the +-- canonical version. +buildAlloc :: UniqFM Reg -> (Reg, Reg) -> UniqFM Reg +buildAlloc fm (r1, r2) + = let rmin = min r1 r2 + rmax = max r1 r2 + in addToUFM fm rmax rmin + + +-- | Determine the canonical name for a register by following +-- v1 = v2 renamings in this map. +sinkReg :: UniqFM Reg -> Reg -> Reg +sinkReg fm r + = case lookupUFM fm r of + Nothing -> r + Just r' -> sinkReg fm r' + + +-- | Slurp out mov instructions that only serve to join live ranges. +-- +-- During a mov, if the source reg dies and the destination reg is +-- born then we can rename the two regs to the same thing and +-- eliminate the move. +slurpJoinMovs + :: Instruction instr + => LiveCmmDecl statics instr + -> Bag (Reg, Reg) + +slurpJoinMovs live + = slurpCmm emptyBag live + where + slurpCmm rs CmmData{} + = rs + + slurpCmm rs (CmmProc _ _ _ sccs) + = foldl' slurpBlock rs (flattenSCCs sccs) + + slurpBlock rs (BasicBlock _ instrs) + = foldl' slurpLI rs instrs + + slurpLI rs (LiveInstr _ Nothing) = rs + slurpLI rs (LiveInstr instr (Just live)) + | Just (r1, r2) <- takeRegRegMoveInstr instr + , elementOfUniqSet r1 $ liveDieRead live + , elementOfUniqSet r2 $ liveBorn live + + -- only coalesce movs between two virtuals for now, + -- else we end up with allocatable regs in the live + -- regs list.. + , isVirtualReg r1 && isVirtualReg r2 + = consBag (r1, r2) rs + + | otherwise + = rs + diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs new file mode 100644 index 0000000000..a0e11433f7 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs @@ -0,0 +1,382 @@ + +-- | When there aren't enough registers to hold all the vregs we have to spill +-- some of those vregs to slots on the stack. This module is used modify the +-- code to use those slots. +module GHC.CmmToAsm.Reg.Graph.Spill ( + regSpill, + SpillStats(..), + accSpillSL +) where +import GhcPrelude + +import GHC.CmmToAsm.Reg.Liveness +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg +import GHC.Cmm hiding (RegSet) +import GHC.Cmm.BlockId +import GHC.Cmm.Dataflow.Collections + +import MonadUtils +import State +import Unique +import UniqFM +import UniqSet +import UniqSupply +import Outputable +import GHC.Platform + +import Data.List +import Data.Maybe +import Data.IntSet (IntSet) +import qualified Data.IntSet as IntSet + + +-- | Spill all these virtual regs to stack slots. +-- +-- Bumps the number of required stack slots if required. 
+-- +-- +-- TODO: See if we can split some of the live ranges instead of just globally +-- spilling the virtual reg. This might make the spill cleaner's job easier. +-- +-- TODO: On CISCy x86 and x86_64 we don't necessarily have to add a mov instruction +-- when making spills. If an instr is using a spilled virtual we may be able to +-- address the spill slot directly. +-- +regSpill + :: Instruction instr + => Platform + -> [LiveCmmDecl statics instr] -- ^ the code + -> UniqSet Int -- ^ available stack slots + -> Int -- ^ current number of spill slots. + -> UniqSet VirtualReg -- ^ the regs to spill + -> UniqSM + ([LiveCmmDecl statics instr] + -- code with SPILL and RELOAD meta instructions added. + , UniqSet Int -- left over slots + , Int -- slot count in use now. + , SpillStats ) -- stats about what happened during spilling + +regSpill platform code slotsFree slotCount regs + + -- Not enough slots to spill these regs. + | sizeUniqSet slotsFree < sizeUniqSet regs + = -- pprTrace "Bumping slot count:" (ppr slotCount <> text " -> " <> ppr (slotCount+512)) $ + let slotsFree' = (addListToUniqSet slotsFree [slotCount+1 .. slotCount+512]) + in regSpill platform code slotsFree' (slotCount+512) regs + + | otherwise + = do + -- Allocate a slot for each of the spilled regs. + let slots = take (sizeUniqSet regs) $ nonDetEltsUniqSet slotsFree + let regSlotMap = listToUFM + $ zip (nonDetEltsUniqSet regs) slots + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + + -- Grab the unique supply from the monad. + us <- getUniqueSupplyM + + -- Run the spiller on all the blocks. + let (code', state') = + runState (mapM (regSpill_top platform regSlotMap) code) + (initSpillS us) + + return ( code' + , minusUniqSet slotsFree (mkUniqSet slots) + , slotCount + , makeSpillStats state') + + +-- | Spill some registers to stack slots in a top-level thing. +regSpill_top + :: Instruction instr + => Platform + -> RegMap Int + -- ^ map of vregs to slots they're being spilled to. + -> LiveCmmDecl statics instr + -- ^ the top level thing. + -> SpillM (LiveCmmDecl statics instr) + +regSpill_top platform regSlotMap cmm + = case cmm of + CmmData{} + -> return cmm + + CmmProc info label live sccs + | LiveInfo static firstId liveVRegsOnEntry liveSlotsOnEntry <- info + -> do + -- The liveVRegsOnEntry contains the set of vregs that are live + -- on entry to each basic block. If we spill one of those vregs + -- we remove it from that set and add the corresponding slot + -- number to the liveSlotsOnEntry set. The spill cleaner needs + -- this information to erase unneeded spill and reload instructions + -- after we've done a successful allocation. + let liveSlotsOnEntry' :: BlockMap IntSet + liveSlotsOnEntry' + = mapFoldlWithKey patchLiveSlot + liveSlotsOnEntry liveVRegsOnEntry + + let info' + = LiveInfo static firstId + liveVRegsOnEntry + liveSlotsOnEntry' + + -- Apply the spiller to all the basic blocks in the CmmProc. + sccs' <- mapM (mapSCCM (regSpill_block platform regSlotMap)) sccs + + return $ CmmProc info' label live sccs' + + where -- Given a BlockId and the set of registers live in it, + -- if registers in this block are being spilled to stack slots, + -- then record the fact that these slots are now live in those blocks + -- in the given slotmap. 
+ patchLiveSlot + :: BlockMap IntSet -> BlockId -> RegSet -> BlockMap IntSet + + patchLiveSlot slotMap blockId regsLive + = let + -- Slots that are already recorded as being live. + curSlotsLive = fromMaybe IntSet.empty + $ mapLookup blockId slotMap + + moreSlotsLive = IntSet.fromList + $ catMaybes + $ map (lookupUFM regSlotMap) + $ nonDetEltsUniqSet regsLive + -- See Note [Unique Determinism and code generation] + + slotMap' + = mapInsert blockId (IntSet.union curSlotsLive moreSlotsLive) + slotMap + + in slotMap' + + +-- | Spill some registers to stack slots in a basic block. +regSpill_block + :: Instruction instr + => Platform + -> UniqFM Int -- ^ map of vregs to slots they're being spilled to. + -> LiveBasicBlock instr + -> SpillM (LiveBasicBlock instr) + +regSpill_block platform regSlotMap (BasicBlock i instrs) + = do instrss' <- mapM (regSpill_instr platform regSlotMap) instrs + return $ BasicBlock i (concat instrss') + + +-- | Spill some registers to stack slots in a single instruction. +-- If the instruction uses registers that need to be spilled, then it is +-- prefixed (or postfixed) with the appropriate RELOAD or SPILL meta +-- instructions. +regSpill_instr + :: Instruction instr + => Platform + -> UniqFM Int -- ^ map of vregs to slots they're being spilled to. + -> LiveInstr instr + -> SpillM [LiveInstr instr] + +regSpill_instr _ _ li@(LiveInstr _ Nothing) + = do return [li] + +regSpill_instr platform regSlotMap + (LiveInstr instr (Just _)) + = do + -- work out which regs are read and written in this instr + let RU rlRead rlWritten = regUsageOfInstr platform instr + + -- sometimes a register is listed as being read more than once, + -- nub this so we don't end up inserting two lots of spill code. + let rsRead_ = nub rlRead + let rsWritten_ = nub rlWritten + + -- if a reg is modified, it appears in both lists, want to undo this.. + let rsRead = rsRead_ \\ rsWritten_ + let rsWritten = rsWritten_ \\ rsRead_ + let rsModify = intersect rsRead_ rsWritten_ + + -- work out if any of the regs being used are currently being spilled. + let rsSpillRead = filter (\r -> elemUFM r regSlotMap) rsRead + let rsSpillWritten = filter (\r -> elemUFM r regSlotMap) rsWritten + let rsSpillModify = filter (\r -> elemUFM r regSlotMap) rsModify + + -- rewrite the instr and work out spill code. + (instr1, prepost1) <- mapAccumLM (spillRead regSlotMap) instr rsSpillRead + (instr2, prepost2) <- mapAccumLM (spillWrite regSlotMap) instr1 rsSpillWritten + (instr3, prepost3) <- mapAccumLM (spillModify regSlotMap) instr2 rsSpillModify + + let (mPrefixes, mPostfixes) = unzip (prepost1 ++ prepost2 ++ prepost3) + let prefixes = concat mPrefixes + let postfixes = concat mPostfixes + + -- final code + let instrs' = prefixes + ++ [LiveInstr instr3 Nothing] + ++ postfixes + + return $ instrs' + + +-- | Add a RELOAD met a instruction to load a value for an instruction that +-- writes to a vreg that is being spilled. 
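The read/written/modified split computed in regSpill_instr above decides whether a use gets a RELOAD before it, a SPILL after it, or both. The set arithmetic is sketched stand-alone below, with toy String registers in place of the real Reg type:

import Data.List (intersect, nub, (\\))

-- Mirror of the partition in regSpill_instr: registers only read, only
-- written, or both read and written (modified) by a single instruction.
partitionUse :: [String] -> [String] -> ([String], [String], [String])
partitionUse rlRead rlWritten =
  let rsRead_    = nub rlRead
      rsWritten_ = nub rlWritten
  in ( rsRead_    \\ rsWritten_           -- read only:    RELOAD before
     , rsWritten_ \\ rsRead_              -- written only: SPILL after
     , rsRead_ `intersect` rsWritten_ )   -- modified:     both

-- e.g. partitionUse ["v1","v2"] ["v2","v3"] == (["v1"], ["v3"], ["v2"])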
+spillRead + :: Instruction instr + => UniqFM Int + -> instr + -> Reg + -> SpillM (instr, ([LiveInstr instr'], [LiveInstr instr'])) + +spillRead regSlotMap instr reg + | Just slot <- lookupUFM regSlotMap reg + = do (instr', nReg) <- patchInstr reg instr + + modify $ \s -> s + { stateSpillSL = addToUFM_C accSpillSL (stateSpillSL s) reg (reg, 0, 1) } + + return ( instr' + , ( [LiveInstr (RELOAD slot nReg) Nothing] + , []) ) + + | otherwise = panic "RegSpill.spillRead: no slot defined for spilled reg" + + +-- | Add a SPILL meta instruction to store a value for an instruction that +-- writes to a vreg that is being spilled. +spillWrite + :: Instruction instr + => UniqFM Int + -> instr + -> Reg + -> SpillM (instr, ([LiveInstr instr'], [LiveInstr instr'])) + +spillWrite regSlotMap instr reg + | Just slot <- lookupUFM regSlotMap reg + = do (instr', nReg) <- patchInstr reg instr + + modify $ \s -> s + { stateSpillSL = addToUFM_C accSpillSL (stateSpillSL s) reg (reg, 1, 0) } + + return ( instr' + , ( [] + , [LiveInstr (SPILL nReg slot) Nothing])) + + | otherwise = panic "RegSpill.spillWrite: no slot defined for spilled reg" + + +-- | Add both RELOAD and SPILL meta instructions for an instruction that +-- both reads and writes to a vreg that is being spilled. +spillModify + :: Instruction instr + => UniqFM Int + -> instr + -> Reg + -> SpillM (instr, ([LiveInstr instr'], [LiveInstr instr'])) + +spillModify regSlotMap instr reg + | Just slot <- lookupUFM regSlotMap reg + = do (instr', nReg) <- patchInstr reg instr + + modify $ \s -> s + { stateSpillSL = addToUFM_C accSpillSL (stateSpillSL s) reg (reg, 1, 1) } + + return ( instr' + , ( [LiveInstr (RELOAD slot nReg) Nothing] + , [LiveInstr (SPILL nReg slot) Nothing])) + + | otherwise = panic "RegSpill.spillModify: no slot defined for spilled reg" + + +-- | Rewrite uses of this virtual reg in an instr to use a different +-- virtual reg. +patchInstr + :: Instruction instr + => Reg -> instr -> SpillM (instr, Reg) + +patchInstr reg instr + = do nUnique <- newUnique + + -- The register we're rewriting is supposed to be virtual. + -- If it's not then something has gone horribly wrong. + let nReg + = case reg of + RegVirtual vr + -> RegVirtual (renameVirtualReg nUnique vr) + + RegReal{} + -> panic "RegAlloc.Graph.Spill.patchIntr: not patching real reg" + + let instr' = patchReg1 reg nReg instr + return (instr', nReg) + + +patchReg1 + :: Instruction instr + => Reg -> Reg -> instr -> instr + +patchReg1 old new instr + = let patchF r + | r == old = new + | otherwise = r + in patchRegsOfInstr instr patchF + + +-- Spiller monad -------------------------------------------------------------- +-- | State monad for the spill code generator. +type SpillM a + = State SpillS a + +-- | Spill code generator state. +data SpillS + = SpillS + { -- | Unique supply for generating fresh vregs. + stateUS :: UniqSupply + + -- | Spilled vreg vs the number of times it was loaded, stored. + , stateSpillSL :: UniqFM (Reg, Int, Int) } + + +-- | Create a new spiller state. +initSpillS :: UniqSupply -> SpillS +initSpillS uniqueSupply + = SpillS + { stateUS = uniqueSupply + , stateSpillSL = emptyUFM } + + +-- | Allocate a new unique in the spiller monad. +newUnique :: SpillM Unique +newUnique + = do us <- gets stateUS + case takeUniqFromSupply us of + (uniq, us') + -> do modify $ \s -> s { stateUS = us' } + return uniq + + +-- | Add a spill/reload count to a stats record for a register. 
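The accSpillSL function that follows is the combining function used with addToUFM_C to keep a running (stores, loads) count per spilled register. The bookkeeping is sketched here with Data.Map standing in for UniqFM and String keys standing in for Reg, purely for illustration:

import qualified Data.Map as Map

type Stat = (String, Int, Int)   -- (register name, stores, loads)

accStat :: Stat -> Stat -> Stat
accStat (r, s1, l1) (_, s2, l2) = (r, s1 + s2, l1 + l2)

-- One entry is recorded per SPILL (a store) or RELOAD (a load) inserted
-- for a register.
record :: Map.Map String Stat -> Stat -> Map.Map String Stat
record m stat@(r, _, _) = Map.insertWith accStat r stat m

-- e.g. foldl record Map.empty [("v1",0,1), ("v1",1,0), ("v1",0,1)]
--      gives fromList [("v1",("v1",1,2))]: one store and two loads of v1.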
+accSpillSL :: (Reg, Int, Int) -> (Reg, Int, Int) -> (Reg, Int, Int) +accSpillSL (r1, s1, l1) (_, s2, l2) + = (r1, s1 + s2, l1 + l2) + + +-- Spiller stats -------------------------------------------------------------- +-- | Spiller statistics. +-- Tells us what registers were spilled. +data SpillStats + = SpillStats + { spillStoreLoad :: UniqFM (Reg, Int, Int) } + + +-- | Extract spiller statistics from the spiller state. +makeSpillStats :: SpillS -> SpillStats +makeSpillStats s + = SpillStats + { spillStoreLoad = stateSpillSL s } + + +instance Outputable SpillStats where + ppr stats + = pprUFM (spillStoreLoad stats) + (vcat . map (\(r, s, l) -> ppr r <+> int s <+> int l)) diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs b/compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs new file mode 100644 index 0000000000..6d14c7194b --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs @@ -0,0 +1,616 @@ +{-# LANGUAGE CPP #-} + +-- | Clean out unneeded spill\/reload instructions. +-- +-- Handling of join points +-- ~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- B1: B2: +-- ... ... +-- RELOAD SLOT(0), %r1 RELOAD SLOT(0), %r1 +-- ... A ... ... B ... +-- jump B3 jump B3 +-- +-- B3: ... C ... +-- RELOAD SLOT(0), %r1 +-- ... +-- +-- The Plan +-- ~~~~~~~~ +-- As long as %r1 hasn't been written to in A, B or C then we don't need +-- the reload in B3. +-- +-- What we really care about here is that on the entry to B3, %r1 will +-- always have the same value that is in SLOT(0) (ie, %r1 is _valid_) +-- +-- This also works if the reloads in B1\/B2 were spills instead, because +-- spilling %r1 to a slot makes that slot have the same value as %r1. +-- +module GHC.CmmToAsm.Reg.Graph.SpillClean ( + cleanSpills +) where +import GhcPrelude + +import GHC.CmmToAsm.Reg.Liveness +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg + +import GHC.Cmm.BlockId +import GHC.Cmm +import UniqSet +import UniqFM +import Unique +import State +import Outputable +import GHC.Platform +import GHC.Cmm.Dataflow.Collections + +import Data.List +import Data.Maybe +import Data.IntSet (IntSet) +import qualified Data.IntSet as IntSet + + +-- | The identification number of a spill slot. +-- A value is stored in a spill slot when we don't have a free +-- register to hold it. +type Slot = Int + + +-- | Clean out unneeded spill\/reloads from this top level thing. +cleanSpills + :: Instruction instr + => Platform + -> LiveCmmDecl statics instr + -> LiveCmmDecl statics instr + +cleanSpills platform cmm + = evalState (cleanSpin platform 0 cmm) initCleanS + + +-- | Do one pass of cleaning. +cleanSpin + :: Instruction instr + => Platform + -> Int -- ^ Iteration number for the cleaner. + -> LiveCmmDecl statics instr -- ^ Liveness annotated code to clean. + -> CleanM (LiveCmmDecl statics instr) + +cleanSpin platform spinCount code + = do + -- Initialise count of cleaned spill and reload instructions. + modify $ \s -> s + { sCleanedSpillsAcc = 0 + , sCleanedReloadsAcc = 0 + , sReloadedBy = emptyUFM } + + code_forward <- mapBlockTopM (cleanBlockForward platform) code + code_backward <- cleanTopBackward code_forward + + -- During the cleaning of each block we collected information about + -- what regs were valid across each jump. Based on this, work out + -- whether it will be safe to erase reloads after join points for + -- the next pass. + collateJoinPoints + + -- Remember how many spill and reload instructions we cleaned in this pass. 
+ spills <- gets sCleanedSpillsAcc + reloads <- gets sCleanedReloadsAcc + modify $ \s -> s + { sCleanedCount = (spills, reloads) : sCleanedCount s } + + -- If nothing was cleaned in this pass or the last one + -- then we're done and it's time to bail out. + cleanedCount <- gets sCleanedCount + if take 2 cleanedCount == [(0, 0), (0, 0)] + then return code + + -- otherwise go around again + else cleanSpin platform (spinCount + 1) code_backward + + +------------------------------------------------------------------------------- +-- | Clean out unneeded reload instructions, +-- while walking forward over the code. +cleanBlockForward + :: Instruction instr + => Platform + -> LiveBasicBlock instr + -> CleanM (LiveBasicBlock instr) + +cleanBlockForward platform (BasicBlock blockId instrs) + = do + -- See if we have a valid association for the entry to this block. + jumpValid <- gets sJumpValid + let assoc = case lookupUFM jumpValid blockId of + Just assoc -> assoc + Nothing -> emptyAssoc + + instrs_reload <- cleanForward platform blockId assoc [] instrs + return $ BasicBlock blockId instrs_reload + + + +-- | Clean out unneeded reload instructions. +-- +-- Walking forwards across the code +-- On a reload, if we know a reg already has the same value as a slot +-- then we don't need to do the reload. +-- +cleanForward + :: Instruction instr + => Platform + -> BlockId -- ^ the block that we're currently in + -> Assoc Store -- ^ two store locations are associated if + -- they have the same value + -> [LiveInstr instr] -- ^ acc + -> [LiveInstr instr] -- ^ instrs to clean (in backwards order) + -> CleanM [LiveInstr instr] -- ^ cleaned instrs (in forward order) + +cleanForward _ _ _ acc [] + = return acc + +-- Rewrite live range joins via spill slots to just a spill and a reg-reg move +-- hopefully the spill will be also be cleaned in the next pass +cleanForward platform blockId assoc acc (li1 : li2 : instrs) + + | LiveInstr (SPILL reg1 slot1) _ <- li1 + , LiveInstr (RELOAD slot2 reg2) _ <- li2 + , slot1 == slot2 + = do + modify $ \s -> s { sCleanedReloadsAcc = sCleanedReloadsAcc s + 1 } + cleanForward platform blockId assoc acc + $ li1 : LiveInstr (mkRegRegMoveInstr platform reg1 reg2) Nothing + : instrs + +cleanForward platform blockId assoc acc (li@(LiveInstr i1 _) : instrs) + | Just (r1, r2) <- takeRegRegMoveInstr i1 + = if r1 == r2 + -- Erase any left over nop reg reg moves while we're here + -- this will also catch any nop moves that the previous case + -- happens to add. + then cleanForward platform blockId assoc acc instrs + + -- If r1 has the same value as some slots and we copy r1 to r2, + -- then r2 is now associated with those slots instead + else do let assoc' = addAssoc (SReg r1) (SReg r2) + $ delAssoc (SReg r2) + $ assoc + + cleanForward platform blockId assoc' (li : acc) instrs + + +cleanForward platform blockId assoc acc (li : instrs) + + -- Update association due to the spill. + | LiveInstr (SPILL reg slot) _ <- li + = let assoc' = addAssoc (SReg reg) (SSlot slot) + $ delAssoc (SSlot slot) + $ assoc + in cleanForward platform blockId assoc' (li : acc) instrs + + -- Clean a reload instr. + | LiveInstr (RELOAD{}) _ <- li + = do (assoc', mli) <- cleanReload platform blockId assoc li + case mli of + Nothing -> cleanForward platform blockId assoc' acc + instrs + + Just li' -> cleanForward platform blockId assoc' (li' : acc) + instrs + + -- Remember the association over a jump. 
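+ -- For the join-point example in the module header: B1 and B2 both jump
+ -- to B3 with (%r1, SLOT(0)) in their association, so once
+ -- collateJoinPoints has intersected the per-edge associations, a later
+ -- pass over B3 starts its forward walk already knowing %r1 is valid and
+ -- can erase B3's RELOAD. If only one predecessor had established the
+ -- fact, the intersection would drop it and the reload would be kept.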
+ | LiveInstr instr _ <- li + , targets <- jumpDestsOfInstr instr + , not $ null targets + = do mapM_ (accJumpValid assoc) targets + cleanForward platform blockId assoc (li : acc) instrs + + -- Writing to a reg changes its value. + | LiveInstr instr _ <- li + , RU _ written <- regUsageOfInstr platform instr + = let assoc' = foldr delAssoc assoc (map SReg $ nub written) + in cleanForward platform blockId assoc' (li : acc) instrs + + + +-- | Try and rewrite a reload instruction to something more pleasing +cleanReload + :: Instruction instr + => Platform + -> BlockId + -> Assoc Store + -> LiveInstr instr + -> CleanM (Assoc Store, Maybe (LiveInstr instr)) + +cleanReload platform blockId assoc li@(LiveInstr (RELOAD slot reg) _) + + -- If the reg we're reloading already has the same value as the slot + -- then we can erase the instruction outright. + | elemAssoc (SSlot slot) (SReg reg) assoc + = do modify $ \s -> s { sCleanedReloadsAcc = sCleanedReloadsAcc s + 1 } + return (assoc, Nothing) + + -- If we can find another reg with the same value as this slot then + -- do a move instead of a reload. + | Just reg2 <- findRegOfSlot assoc slot + = do modify $ \s -> s { sCleanedReloadsAcc = sCleanedReloadsAcc s + 1 } + + let assoc' = addAssoc (SReg reg) (SReg reg2) + $ delAssoc (SReg reg) + $ assoc + + return ( assoc' + , Just $ LiveInstr (mkRegRegMoveInstr platform reg2 reg) Nothing) + + -- Gotta keep this instr. + | otherwise + = do -- Update the association. + let assoc' + = addAssoc (SReg reg) (SSlot slot) + -- doing the reload makes reg and slot the same value + $ delAssoc (SReg reg) + -- reg value changes on reload + $ assoc + + -- Remember that this block reloads from this slot. + accBlockReloadsSlot blockId slot + + return (assoc', Just li) + +cleanReload _ _ _ _ + = panic "RegSpillClean.cleanReload: unhandled instr" + + +------------------------------------------------------------------------------- +-- | Clean out unneeded spill instructions, +-- while walking backwards over the code. +-- +-- If there were no reloads from a slot between a spill and the last one +-- then the slot was never read and we don't need the spill. +-- +-- SPILL r0 -> s1 +-- RELOAD s1 -> r2 +-- SPILL r3 -> s1 <--- don't need this spill +-- SPILL r4 -> s1 +-- RELOAD s1 -> r5 +-- +-- Maintain a set of +-- "slots which were spilled to but not reloaded from yet" +-- +-- Walking backwards across the code: +-- a) On a reload from a slot, remove it from the set. +-- +-- a) On a spill from a slot +-- If the slot is in set then we can erase the spill, +-- because it won't be reloaded from until after the next spill. +-- +-- otherwise +-- keep the spill and add the slot to the set +-- +-- TODO: This is mostly inter-block +-- we should really be updating the noReloads set as we cross jumps also. 
+-- +-- TODO: generate noReloads from liveSlotsOnEntry +-- +cleanTopBackward + :: Instruction instr + => LiveCmmDecl statics instr + -> CleanM (LiveCmmDecl statics instr) + +cleanTopBackward cmm + = case cmm of + CmmData{} + -> return cmm + + CmmProc info label live sccs + | LiveInfo _ _ _ liveSlotsOnEntry <- info + -> do sccs' <- mapM (mapSCCM (cleanBlockBackward liveSlotsOnEntry)) sccs + return $ CmmProc info label live sccs' + + +cleanBlockBackward + :: Instruction instr + => BlockMap IntSet + -> LiveBasicBlock instr + -> CleanM (LiveBasicBlock instr) + +cleanBlockBackward liveSlotsOnEntry (BasicBlock blockId instrs) + = do instrs_spill <- cleanBackward liveSlotsOnEntry emptyUniqSet [] instrs + return $ BasicBlock blockId instrs_spill + + + +cleanBackward + :: Instruction instr + => BlockMap IntSet -- ^ Slots live on entry to each block + -> UniqSet Int -- ^ Slots that have been spilled, but not reloaded from + -> [LiveInstr instr] -- ^ acc + -> [LiveInstr instr] -- ^ Instrs to clean (in forwards order) + -> CleanM [LiveInstr instr] -- ^ Cleaned instrs (in backwards order) + +cleanBackward liveSlotsOnEntry noReloads acc lis + = do reloadedBy <- gets sReloadedBy + cleanBackward' liveSlotsOnEntry reloadedBy noReloads acc lis + + +cleanBackward' + :: Instruction instr + => BlockMap IntSet + -> UniqFM [BlockId] + -> UniqSet Int + -> [LiveInstr instr] + -> [LiveInstr instr] + -> State CleanS [LiveInstr instr] + +cleanBackward' _ _ _ acc [] + = return acc + +cleanBackward' liveSlotsOnEntry reloadedBy noReloads acc (li : instrs) + + -- If nothing ever reloads from this slot then we don't need the spill. + | LiveInstr (SPILL _ slot) _ <- li + , Nothing <- lookupUFM reloadedBy (SSlot slot) + = do modify $ \s -> s { sCleanedSpillsAcc = sCleanedSpillsAcc s + 1 } + cleanBackward liveSlotsOnEntry noReloads acc instrs + + | LiveInstr (SPILL _ slot) _ <- li + = if elementOfUniqSet slot noReloads + + -- We can erase this spill because the slot won't be read until + -- after the next one + then do + modify $ \s -> s { sCleanedSpillsAcc = sCleanedSpillsAcc s + 1 } + cleanBackward liveSlotsOnEntry noReloads acc instrs + + else do + -- This slot is being spilled to, but we haven't seen any reloads yet. + let noReloads' = addOneToUniqSet noReloads slot + cleanBackward liveSlotsOnEntry noReloads' (li : acc) instrs + + -- if we reload from a slot then it's no longer unused + | LiveInstr (RELOAD slot _) _ <- li + , noReloads' <- delOneFromUniqSet noReloads slot + = cleanBackward liveSlotsOnEntry noReloads' (li : acc) instrs + + -- If a slot is live in a jump target then assume it's reloaded there. + -- + -- TODO: A real dataflow analysis would do a better job here. + -- If the target block _ever_ used the slot then we assume + -- it always does, but if those reloads are cleaned the slot + -- liveness map doesn't get updated. + | LiveInstr instr _ <- li + , targets <- jumpDestsOfInstr instr + = do + let slotsReloadedByTargets + = IntSet.unions + $ catMaybes + $ map (flip mapLookup liveSlotsOnEntry) + $ targets + + let noReloads' + = foldl' delOneFromUniqSet noReloads + $ IntSet.toList slotsReloadedByTargets + + cleanBackward liveSlotsOnEntry noReloads' (li : acc) instrs + +#if __GLASGOW_HASKELL__ <= 810 + -- some other instruction + | otherwise + = cleanBackward liveSlotsOnEntry noReloads (li : acc) instrs +#endif + + +-- | Combine the associations from all the inward control flow edges. 
+-- +collateJoinPoints :: CleanM () +collateJoinPoints + = modify $ \s -> s + { sJumpValid = mapUFM intersects (sJumpValidAcc s) + , sJumpValidAcc = emptyUFM } + +intersects :: [Assoc Store] -> Assoc Store +intersects [] = emptyAssoc +intersects assocs = foldl1' intersectAssoc assocs + + +-- | See if we have a reg with the same value as this slot in the association table. +findRegOfSlot :: Assoc Store -> Int -> Maybe Reg +findRegOfSlot assoc slot + | close <- closeAssoc (SSlot slot) assoc + , Just (SReg reg) <- find isStoreReg $ nonDetEltsUniqSet close + -- See Note [Unique Determinism and code generation] + = Just reg + + | otherwise + = Nothing + + +------------------------------------------------------------------------------- +-- | Cleaner monad. +type CleanM + = State CleanS + +-- | Cleaner state. +data CleanS + = CleanS + { -- | Regs which are valid at the start of each block. + sJumpValid :: UniqFM (Assoc Store) + + -- | Collecting up what regs were valid across each jump. + -- in the next pass we can collate these and write the results + -- to sJumpValid. + , sJumpValidAcc :: UniqFM [Assoc Store] + + -- | Map of (slot -> blocks which reload from this slot) + -- used to decide if whether slot spilled to will ever be + -- reloaded from on this path. + , sReloadedBy :: UniqFM [BlockId] + + -- | Spills and reloads cleaned each pass (latest at front) + , sCleanedCount :: [(Int, Int)] + + -- | Spills and reloads that have been cleaned in this pass so far. + , sCleanedSpillsAcc :: Int + , sCleanedReloadsAcc :: Int } + + +-- | Construct the initial cleaner state. +initCleanS :: CleanS +initCleanS + = CleanS + { sJumpValid = emptyUFM + , sJumpValidAcc = emptyUFM + + , sReloadedBy = emptyUFM + + , sCleanedCount = [] + + , sCleanedSpillsAcc = 0 + , sCleanedReloadsAcc = 0 } + + +-- | Remember the associations before a jump. +accJumpValid :: Assoc Store -> BlockId -> CleanM () +accJumpValid assocs target + = modify $ \s -> s { + sJumpValidAcc = addToUFM_C (++) + (sJumpValidAcc s) + target + [assocs] } + + +accBlockReloadsSlot :: BlockId -> Slot -> CleanM () +accBlockReloadsSlot blockId slot + = modify $ \s -> s { + sReloadedBy = addToUFM_C (++) + (sReloadedBy s) + (SSlot slot) + [blockId] } + + +------------------------------------------------------------------------------- +-- A store location can be a stack slot or a register +data Store + = SSlot Int + | SReg Reg + + +-- | Check if this is a reg store. +isStoreReg :: Store -> Bool +isStoreReg ss + = case ss of + SSlot _ -> False + SReg _ -> True + + +-- Spill cleaning is only done once all virtuals have been allocated to realRegs +instance Uniquable Store where + getUnique (SReg r) + | RegReal (RealRegSingle i) <- r + = mkRegSingleUnique i + + | RegReal (RealRegPair r1 r2) <- r + = mkRegPairUnique (r1 * 65535 + r2) + + | otherwise + = error $ "RegSpillClean.getUnique: found virtual reg during spill clean," + ++ "only real regs expected." + + getUnique (SSlot i) = mkRegSubUnique i -- [SLPJ] I hope "SubUnique" is ok + + +instance Outputable Store where + ppr (SSlot i) = text "slot" <> int i + ppr (SReg r) = ppr r + + +------------------------------------------------------------------------------- +-- Association graphs. +-- In the spill cleaner, two store locations are associated if they are known +-- to hold the same value. +-- +type Assoc a = UniqFM (UniqSet a) + +-- | An empty association +emptyAssoc :: Assoc a +emptyAssoc = emptyUFM + + +-- | Add an association between these two things. 
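+--
+-- A rough illustration of how the forward pass uses these graphs
+-- (register and slot names invented): after
+--
+--      SPILL r1 -> SLOT(2)
+--      <reg-reg move r1 -> r2>
+--
+-- the association holds the edges r1 <-> SLOT(2) and r1 <-> r2, so
+-- closeAssoc (SSlot 2) gives {SLOT(2), r1, r2}. A later
+-- RELOAD SLOT(2) -> r2 then satisfies elemAssoc (SSlot 2) (SReg r2)
+-- and can be erased.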
+addAssoc :: Uniquable a + => a -> a -> Assoc a -> Assoc a + +addAssoc a b m + = let m1 = addToUFM_C unionUniqSets m a (unitUniqSet b) + m2 = addToUFM_C unionUniqSets m1 b (unitUniqSet a) + in m2 + + +-- | Delete all associations to a node. +delAssoc :: (Uniquable a) + => a -> Assoc a -> Assoc a + +delAssoc a m + | Just aSet <- lookupUFM m a + , m1 <- delFromUFM m a + = nonDetFoldUniqSet (\x m -> delAssoc1 x a m) m1 aSet + -- It's OK to use nonDetFoldUFM here because deletion is commutative + + | otherwise = m + + +-- | Delete a single association edge (a -> b). +delAssoc1 :: Uniquable a + => a -> a -> Assoc a -> Assoc a + +delAssoc1 a b m + | Just aSet <- lookupUFM m a + = addToUFM m a (delOneFromUniqSet aSet b) + + | otherwise = m + + +-- | Check if these two things are associated. +elemAssoc :: (Uniquable a) + => a -> a -> Assoc a -> Bool + +elemAssoc a b m + = elementOfUniqSet b (closeAssoc a m) + + +-- | Find the refl. trans. closure of the association from this point. +closeAssoc :: (Uniquable a) + => a -> Assoc a -> UniqSet a + +closeAssoc a assoc + = closeAssoc' assoc emptyUniqSet (unitUniqSet a) + where + closeAssoc' assoc visited toVisit + = case nonDetEltsUniqSet toVisit of + -- See Note [Unique Determinism and code generation] + + -- nothing else to visit, we're done + [] -> visited + + (x:_) + -- we've already seen this node + | elementOfUniqSet x visited + -> closeAssoc' assoc visited (delOneFromUniqSet toVisit x) + + -- haven't seen this node before, + -- remember to visit all its neighbors + | otherwise + -> let neighbors + = case lookupUFM assoc x of + Nothing -> emptyUniqSet + Just set -> set + + in closeAssoc' assoc + (addOneToUniqSet visited x) + (unionUniqSets toVisit neighbors) + +-- | Intersect two associations. +intersectAssoc :: Assoc a -> Assoc a -> Assoc a +intersectAssoc a b + = intersectUFM_C (intersectUniqSets) a b diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs b/compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs new file mode 100644 index 0000000000..e3e456e98d --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs @@ -0,0 +1,317 @@ +{-# LANGUAGE ScopedTypeVariables, GADTs, BangPatterns #-} +module GHC.CmmToAsm.Reg.Graph.SpillCost ( + SpillCostRecord, + plusSpillCostRecord, + pprSpillCostRecord, + + SpillCostInfo, + zeroSpillCostInfo, + plusSpillCostInfo, + + slurpSpillCostInfo, + chooseSpill, + + lifeMapFromSpillCostInfo +) where +import GhcPrelude + +import GHC.CmmToAsm.Reg.Liveness +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg.Class +import GHC.Platform.Reg + +import GraphBase + +import GHC.Cmm.Dataflow.Collections (mapLookup) +import GHC.Cmm.Dataflow.Label +import GHC.Cmm +import UniqFM +import UniqSet +import Digraph (flattenSCCs) +import Outputable +import GHC.Platform +import State +import GHC.CmmToAsm.CFG + +import Data.List (nub, minimumBy) +import Data.Maybe +import Control.Monad (join) + + +-- | Records the expected cost to spill some register. +type SpillCostRecord + = ( VirtualReg -- register name + , Int -- number of writes to this reg + , Int -- number of reads from this reg + , Int) -- number of instrs this reg was live on entry to + + +-- | Map of `SpillCostRecord` +type SpillCostInfo + = UniqFM SpillCostRecord + +type SpillCostState = State (UniqFM SpillCostRecord) () + +-- | An empty map of spill costs. +zeroSpillCostInfo :: SpillCostInfo +zeroSpillCostInfo = emptyUFM + + +-- | Add two spill cost infos. 
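+--
+-- For example, merging the records gathered for the same vreg from two
+-- procedures, say (v, 1, 3, 40) and (v, 0, 2, 15) (writes, reads and
+-- live-on-entry counts respectively), yields (v, 1, 5, 55).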
+plusSpillCostInfo :: SpillCostInfo -> SpillCostInfo -> SpillCostInfo +plusSpillCostInfo sc1 sc2 + = plusUFM_C plusSpillCostRecord sc1 sc2 + + +-- | Add two spill cost records. +plusSpillCostRecord :: SpillCostRecord -> SpillCostRecord -> SpillCostRecord +plusSpillCostRecord (r1, a1, b1, c1) (r2, a2, b2, c2) + | r1 == r2 = (r1, a1 + a2, b1 + b2, c1 + c2) + | otherwise = error "RegSpillCost.plusRegInt: regs don't match" + + +-- | Slurp out information used for determining spill costs. +-- +-- For each vreg, the number of times it was written to, read from, +-- and the number of instructions it was live on entry to (lifetime) +-- +slurpSpillCostInfo :: forall instr statics. (Outputable instr, Instruction instr) + => Platform + -> Maybe CFG + -> LiveCmmDecl statics instr + -> SpillCostInfo + +slurpSpillCostInfo platform cfg cmm + = execState (countCmm cmm) zeroSpillCostInfo + where + countCmm CmmData{} = return () + countCmm (CmmProc info _ _ sccs) + = mapM_ (countBlock info freqMap) + $ flattenSCCs sccs + where + LiveInfo _ entries _ _ = info + freqMap = (fst . mkGlobalWeights (head entries)) <$> cfg + + -- Lookup the regs that are live on entry to this block in + -- the info table from the CmmProc. + countBlock info freqMap (BasicBlock blockId instrs) + | LiveInfo _ _ blockLive _ <- info + , Just rsLiveEntry <- mapLookup blockId blockLive + , rsLiveEntry_virt <- takeVirtuals rsLiveEntry + = countLIs (ceiling $ blockFreq freqMap blockId) rsLiveEntry_virt instrs + + | otherwise + = error "RegAlloc.SpillCost.slurpSpillCostInfo: bad block" + + + countLIs :: Int -> UniqSet VirtualReg -> [LiveInstr instr] -> SpillCostState + countLIs _ _ [] + = return () + + -- Skip over comment and delta pseudo instrs. + countLIs scale rsLive (LiveInstr instr Nothing : lis) + | isMetaInstr instr + = countLIs scale rsLive lis + + | otherwise + = pprPanic "RegSpillCost.slurpSpillCostInfo" + $ text "no liveness information on instruction " <> ppr instr + + countLIs scale rsLiveEntry (LiveInstr instr (Just live) : lis) + = do + -- Increment the lifetime counts for regs live on entry to this instr. + mapM_ incLifetime $ nonDetEltsUniqSet rsLiveEntry + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + + -- Increment counts for what regs were read/written from. + let (RU read written) = regUsageOfInstr platform instr + mapM_ (incUses scale) $ catMaybes $ map takeVirtualReg $ nub read + mapM_ (incDefs scale) $ catMaybes $ map takeVirtualReg $ nub written + + -- Compute liveness for entry to next instruction. + let liveDieRead_virt = takeVirtuals (liveDieRead live) + let liveDieWrite_virt = takeVirtuals (liveDieWrite live) + let liveBorn_virt = takeVirtuals (liveBorn live) + + let rsLiveAcross + = rsLiveEntry `minusUniqSet` liveDieRead_virt + + let rsLiveNext + = (rsLiveAcross `unionUniqSets` liveBorn_virt) + `minusUniqSet` liveDieWrite_virt + + countLIs scale rsLiveNext lis + + incDefs count reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, count, 0, 0) + incUses count reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, 0, count, 0) + incLifetime reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, 0, 0, 1) + + blockFreq :: Maybe (LabelMap Double) -> Label -> Double + blockFreq freqs bid + | Just freq <- join (mapLookup bid <$> freqs) + = max 1.0 (10000 * freq) + | otherwise + = 1.0 -- Only if no cfg given + +-- | Take all the virtual registers from this set. 
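+--
+-- (A note on the counts slurped above: when a CFG is available, the read
+-- and write counts are weighted by the estimated execution frequency of
+-- the enclosing block. For instance a frequency of 0.002 gives a scale
+-- factor of ceiling (max 1.0 (10000 * 0.002)) = 20 per use, while blocks
+-- with no frequency information count 1 per use. The live-on-entry count
+-- is not scaled.)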
+takeVirtuals :: UniqSet Reg -> UniqSet VirtualReg +takeVirtuals set = mkUniqSet + [ vr | RegVirtual vr <- nonDetEltsUniqSet set ] + -- See Note [Unique Determinism and code generation] + + +-- | Choose a node to spill from this graph +chooseSpill + :: SpillCostInfo + -> Graph VirtualReg RegClass RealReg + -> VirtualReg + +chooseSpill info graph + = let cost = spillCost_length info graph + node = minimumBy (\n1 n2 -> compare (cost $ nodeId n1) (cost $ nodeId n2)) + $ nonDetEltsUFM $ graphMap graph + -- See Note [Unique Determinism and code generation] + + in nodeId node + + +------------------------------------------------------------------------------- +-- | Chaitins spill cost function is: +-- +-- cost = sum loadCost * freq (u) + sum storeCost * freq (d) +-- u <- uses (v) d <- defs (v) +-- +-- There are no loops in our code at the moment, so we can set the freq's to 1. +-- +-- If we don't have live range splitting then Chaitins function performs badly +-- if we have lots of nested live ranges and very few registers. +-- +-- v1 v2 v3 +-- def v1 . +-- use v1 . +-- def v2 . . +-- def v3 . . . +-- use v1 . . . +-- use v3 . . . +-- use v2 . . +-- use v1 . +-- +-- defs uses degree cost +-- v1: 1 3 3 1.5 +-- v2: 1 2 3 1.0 +-- v3: 1 1 3 0.666 +-- +-- v3 has the lowest cost, but if we only have 2 hardregs and we insert +-- spill code for v3 then this isn't going to improve the colorability of +-- the graph. +-- +-- When compiling SHA1, which as very long basic blocks and some vregs +-- with very long live ranges the allocator seems to try and spill from +-- the inside out and eventually run out of stack slots. +-- +-- Without live range splitting, its's better to spill from the outside +-- in so set the cost of very long live ranges to zero +-- + +-- spillCost_chaitin +-- :: SpillCostInfo +-- -> Graph VirtualReg RegClass RealReg +-- -> VirtualReg +-- -> Float + +-- spillCost_chaitin info graph reg +-- -- Spilling a live range that only lives for 1 instruction +-- -- isn't going to help us at all - and we definitely want to avoid +-- -- trying to re-spill previously inserted spill code. +-- | lifetime <= 1 = 1/0 + +-- -- It's unlikely that we'll find a reg for a live range this long +-- -- better to spill it straight up and not risk trying to keep it around +-- -- and have to go through the build/color cycle again. + +-- -- To facility this we scale down the spill cost of long ranges. +-- -- This makes sure long ranges are still spilled first. +-- -- But this way spill cost remains relevant for long live +-- -- ranges. +-- | lifetime >= 128 +-- = (spillCost / conflicts) / 10.0 + + +-- -- Otherwise revert to chaitin's regular cost function. +-- | otherwise = (spillCost / conflicts) +-- where +-- !spillCost = fromIntegral (uses + defs) :: Float +-- conflicts = fromIntegral (nodeDegree classOfVirtualReg graph reg) +-- (_, defs, uses, lifetime) +-- = fromMaybe (reg, 0, 0, 0) $ lookupUFM info reg + + +-- Just spill the longest live range. +spillCost_length + :: SpillCostInfo + -> Graph VirtualReg RegClass RealReg + -> VirtualReg + -> Float + +spillCost_length info _ reg + | lifetime <= 1 = 1/0 + | otherwise = 1 / fromIntegral lifetime + where (_, _, _, lifetime) + = fromMaybe (reg, 0, 0, 0) + $ lookupUFM info reg + + +-- | Extract a map of register lifetimes from a `SpillCostInfo`. 
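+--
+-- (On the example above, spillCost_length gives v1 the lowest cost: its
+-- live range is the longest, roughly 8 instructions against roughly 5
+-- for v2 and 3 for v3, so chooseSpill picks v1, the outermost range,
+-- where Chaitin's function would have picked v3. Ranges that live for a
+-- single instruction get an infinite cost, so freshly inserted spill
+-- code is not picked again.)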
+lifeMapFromSpillCostInfo :: SpillCostInfo -> UniqFM (VirtualReg, Int) +lifeMapFromSpillCostInfo info + = listToUFM + $ map (\(r, _, _, life) -> (r, (r, life))) + $ nonDetEltsUFM info + -- See Note [Unique Determinism and code generation] + + +-- | Determine the degree (number of neighbors) of this node which +-- have the same class. +nodeDegree + :: (VirtualReg -> RegClass) + -> Graph VirtualReg RegClass RealReg + -> VirtualReg + -> Int + +nodeDegree classOfVirtualReg graph reg + | Just node <- lookupUFM (graphMap graph) reg + + , virtConflicts + <- length + $ filter (\r -> classOfVirtualReg r == classOfVirtualReg reg) + $ nonDetEltsUniqSet + -- See Note [Unique Determinism and code generation] + $ nodeConflicts node + + = virtConflicts + sizeUniqSet (nodeExclusions node) + + | otherwise + = 0 + + +-- | Show a spill cost record, including the degree from the graph +-- and final calculated spill cost. +pprSpillCostRecord + :: (VirtualReg -> RegClass) + -> (Reg -> SDoc) + -> Graph VirtualReg RegClass RealReg + -> SpillCostRecord + -> SDoc + +pprSpillCostRecord regClass pprReg graph (reg, uses, defs, life) + = hsep + [ pprReg (RegVirtual reg) + , ppr uses + , ppr defs + , ppr life + , ppr $ nodeDegree regClass graph reg + , text $ show $ (fromIntegral (uses + defs) + / fromIntegral (nodeDegree regClass graph reg) :: Float) ] + diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs new file mode 100644 index 0000000000..05d2e814af --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs @@ -0,0 +1,346 @@ +{-# LANGUAGE BangPatterns, CPP #-} + +{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} + +-- | Carries interesting info for debugging / profiling of the +-- graph coloring register allocator. +module GHC.CmmToAsm.Reg.Graph.Stats ( + RegAllocStats (..), + + pprStats, + pprStatsSpills, + pprStatsLifetimes, + pprStatsConflict, + pprStatsLifeConflict, + + countSRMs, addSRM +) where + +import GhcPrelude + +import qualified GraphColor as Color +import GHC.CmmToAsm.Reg.Liveness +import GHC.CmmToAsm.Reg.Graph.Spill +import GHC.CmmToAsm.Reg.Graph.SpillCost +import GHC.CmmToAsm.Reg.Graph.TrivColorable +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg.Class +import GHC.Platform.Reg +import GHC.CmmToAsm.Reg.Target + +import Outputable +import UniqFM +import UniqSet +import State + +-- | Holds interesting statistics from the register allocator. +data RegAllocStats statics instr + + -- Information about the initial conflict graph. + = RegAllocStatsStart + { -- | Initial code, with liveness. + raLiveCmm :: [LiveCmmDecl statics instr] + + -- | The initial, uncolored graph. + , raGraph :: Color.Graph VirtualReg RegClass RealReg + + -- | Information to help choose which regs to spill. + , raSpillCosts :: SpillCostInfo } + + + -- Information about an intermediate graph. + -- This is one that we couldn't color, so had to insert spill code + -- instruction stream. + | RegAllocStatsSpill + { -- | Code we tried to allocate registers for. + raCode :: [LiveCmmDecl statics instr] + + -- | Partially colored graph. + , raGraph :: Color.Graph VirtualReg RegClass RealReg + + -- | The regs that were coalesced. + , raCoalesced :: UniqFM VirtualReg + + -- | Spiller stats. + , raSpillStats :: SpillStats + + -- | Number of instructions each reg lives for. + , raSpillCosts :: SpillCostInfo + + -- | Code with spill instructions added. 
+ , raSpilled :: [LiveCmmDecl statics instr] } + + + -- a successful coloring + | RegAllocStatsColored + { -- | Code we tried to allocate registers for. + raCode :: [LiveCmmDecl statics instr] + + -- | Uncolored graph. + , raGraph :: Color.Graph VirtualReg RegClass RealReg + + -- | Coalesced and colored graph. + , raGraphColored :: Color.Graph VirtualReg RegClass RealReg + + -- | Regs that were coalesced. + , raCoalesced :: UniqFM VirtualReg + + -- | Code with coalescings applied. + , raCodeCoalesced :: [LiveCmmDecl statics instr] + + -- | Code with vregs replaced by hregs. + , raPatched :: [LiveCmmDecl statics instr] + + -- | Code with unneeded spill\/reloads cleaned out. + , raSpillClean :: [LiveCmmDecl statics instr] + + -- | Final code. + , raFinal :: [NatCmmDecl statics instr] + + -- | Spill\/reload\/reg-reg moves present in this code. + , raSRMs :: (Int, Int, Int) } + + +instance (Outputable statics, Outputable instr) + => Outputable (RegAllocStats statics instr) where + + ppr (s@RegAllocStatsStart{}) = sdocWithPlatform $ \platform -> + text "# Start" + $$ text "# Native code with liveness information." + $$ ppr (raLiveCmm s) + $$ text "" + $$ text "# Initial register conflict graph." + $$ Color.dotGraph + (targetRegDotColor platform) + (trivColorable platform + (targetVirtualRegSqueeze platform) + (targetRealRegSqueeze platform)) + (raGraph s) + + + ppr (s@RegAllocStatsSpill{}) = + text "# Spill" + + $$ text "# Code with liveness information." + $$ ppr (raCode s) + $$ text "" + + $$ (if (not $ isNullUFM $ raCoalesced s) + then text "# Registers coalesced." + $$ pprUFMWithKeys (raCoalesced s) (vcat . map ppr) + $$ text "" + else empty) + + $$ text "# Spills inserted." + $$ ppr (raSpillStats s) + $$ text "" + + $$ text "# Code with spills inserted." + $$ ppr (raSpilled s) + + + ppr (s@RegAllocStatsColored { raSRMs = (spills, reloads, moves) }) + = sdocWithPlatform $ \platform -> + text "# Colored" + + $$ text "# Code with liveness information." + $$ ppr (raCode s) + $$ text "" + + $$ text "# Register conflict graph (colored)." + $$ Color.dotGraph + (targetRegDotColor platform) + (trivColorable platform + (targetVirtualRegSqueeze platform) + (targetRealRegSqueeze platform)) + (raGraphColored s) + $$ text "" + + $$ (if (not $ isNullUFM $ raCoalesced s) + then text "# Registers coalesced." + $$ pprUFMWithKeys (raCoalesced s) (vcat . map ppr) + $$ text "" + else empty) + + $$ text "# Native code after coalescings applied." + $$ ppr (raCodeCoalesced s) + $$ text "" + + $$ text "# Native code after register allocation." + $$ ppr (raPatched s) + $$ text "" + + $$ text "# Clean out unneeded spill/reloads." + $$ ppr (raSpillClean s) + $$ text "" + + $$ text "# Final code, after rewriting spill/rewrite pseudo instrs." + $$ ppr (raFinal s) + $$ text "" + $$ text "# Score:" + $$ (text "# spills inserted: " <> int spills) + $$ (text "# reloads inserted: " <> int reloads) + $$ (text "# reg-reg moves remaining: " <> int moves) + $$ text "" + + +-- | Do all the different analysis on this list of RegAllocStats +pprStats + :: [RegAllocStats statics instr] + -> Color.Graph VirtualReg RegClass RealReg + -> SDoc + +pprStats stats graph + = let outSpills = pprStatsSpills stats + outLife = pprStatsLifetimes stats + outConflict = pprStatsConflict stats + outScatter = pprStatsLifeConflict stats graph + + in vcat [outSpills, outLife, outConflict, outScatter] + + +-- | Dump a table of how many spill loads \/ stores were inserted for each vreg. 
+pprStatsSpills + :: [RegAllocStats statics instr] -> SDoc + +pprStatsSpills stats + = let + finals = [ s | s@RegAllocStatsColored{} <- stats] + + -- sum up how many stores\/loads\/reg-reg-moves were left in the code + total = foldl' addSRM (0, 0, 0) + $ map raSRMs finals + + in ( text "-- spills-added-total" + $$ text "-- (stores, loads, reg_reg_moves_remaining)" + $$ ppr total + $$ text "") + + +-- | Dump a table of how long vregs tend to live for in the initial code. +pprStatsLifetimes + :: [RegAllocStats statics instr] -> SDoc + +pprStatsLifetimes stats + = let info = foldl' plusSpillCostInfo zeroSpillCostInfo + [ raSpillCosts s + | s@RegAllocStatsStart{} <- stats ] + + lifeBins = binLifetimeCount $ lifeMapFromSpillCostInfo info + + in ( text "-- vreg-population-lifetimes" + $$ text "-- (instruction_count, number_of_vregs_that_lived_that_long)" + $$ pprUFM lifeBins (vcat . map ppr) + $$ text "\n") + + +binLifetimeCount :: UniqFM (VirtualReg, Int) -> UniqFM (Int, Int) +binLifetimeCount fm + = let lifes = map (\l -> (l, (l, 1))) + $ map snd + $ nonDetEltsUFM fm + -- See Note [Unique Determinism and code generation] + + in addListToUFM_C + (\(l1, c1) (_, c2) -> (l1, c1 + c2)) + emptyUFM + lifes + + +-- | Dump a table of how many conflicts vregs tend to have in the initial code. +pprStatsConflict + :: [RegAllocStats statics instr] -> SDoc + +pprStatsConflict stats + = let confMap = foldl' (plusUFM_C (\(c1, n1) (_, n2) -> (c1, n1 + n2))) + emptyUFM + $ map Color.slurpNodeConflictCount + [ raGraph s | s@RegAllocStatsStart{} <- stats ] + + in ( text "-- vreg-conflicts" + $$ text "-- (conflict_count, number_of_vregs_that_had_that_many_conflicts)" + $$ pprUFM confMap (vcat . map ppr) + $$ text "\n") + + +-- | For every vreg, dump how many conflicts it has, and its lifetime. +-- Good for making a scatter plot. +pprStatsLifeConflict + :: [RegAllocStats statics instr] + -> Color.Graph VirtualReg RegClass RealReg -- ^ global register conflict graph + -> SDoc + +pprStatsLifeConflict stats graph + = let lifeMap = lifeMapFromSpillCostInfo + $ foldl' plusSpillCostInfo zeroSpillCostInfo + $ [ raSpillCosts s | s@RegAllocStatsStart{} <- stats ] + + scatter = map (\r -> let lifetime = case lookupUFM lifeMap r of + Just (_, l) -> l + Nothing -> 0 + Just node = Color.lookupNode graph r + in parens $ hcat $ punctuate (text ", ") + [ doubleQuotes $ ppr $ Color.nodeId node + , ppr $ sizeUniqSet (Color.nodeConflicts node) + , ppr $ lifetime ]) + $ map Color.nodeId + $ nonDetEltsUFM + -- See Note [Unique Determinism and code generation] + $ Color.graphMap graph + + in ( text "-- vreg-conflict-lifetime" + $$ text "-- (vreg, vreg_conflicts, vreg_lifetime)" + $$ (vcat scatter) + $$ text "\n") + + +-- | Count spill/reload/reg-reg moves. +-- Lets us see how well the register allocator has done. 
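+-- For instance, a procedure still containing two SPILL metas, one RELOAD
+-- meta and three reg-reg moves counts as (2, 1, 3); this is the shape of
+-- triple reported in the "# Score" section of RegAllocStatsColored above.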
+countSRMs + :: Instruction instr + => LiveCmmDecl statics instr -> (Int, Int, Int) + +countSRMs cmm + = execState (mapBlockTopM countSRM_block cmm) (0, 0, 0) + + +countSRM_block + :: Instruction instr + => GenBasicBlock (LiveInstr instr) + -> State (Int, Int, Int) (GenBasicBlock (LiveInstr instr)) + +countSRM_block (BasicBlock i instrs) + = do instrs' <- mapM countSRM_instr instrs + return $ BasicBlock i instrs' + + +countSRM_instr + :: Instruction instr + => LiveInstr instr -> State (Int, Int, Int) (LiveInstr instr) + +countSRM_instr li + | LiveInstr SPILL{} _ <- li + = do modify $ \(s, r, m) -> (s + 1, r, m) + return li + + | LiveInstr RELOAD{} _ <- li + = do modify $ \(s, r, m) -> (s, r + 1, m) + return li + + | LiveInstr instr _ <- li + , Just _ <- takeRegRegMoveInstr instr + = do modify $ \(s, r, m) -> (s, r, m + 1) + return li + + | otherwise + = return li + + +-- sigh.. +addSRM :: (Int, Int, Int) -> (Int, Int, Int) -> (Int, Int, Int) +addSRM (s1, r1, m1) (s2, r2, m2) + = let !s = s1 + s2 + !r = r1 + r2 + !m = m1 + m2 + in (s, r, m) + diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs new file mode 100644 index 0000000000..ec7c5ad13e --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs @@ -0,0 +1,274 @@ +{-# LANGUAGE CPP #-} + +module GHC.CmmToAsm.Reg.Graph.TrivColorable ( + trivColorable, +) + +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.Platform.Reg.Class +import GHC.Platform.Reg + +import GraphBase + +import UniqSet +import GHC.Platform +import Panic + +-- trivColorable --------------------------------------------------------------- + +-- trivColorable function for the graph coloring allocator +-- +-- This gets hammered by scanGraph during register allocation, +-- so needs to be fairly efficient. +-- +-- NOTE: This only works for architectures with just RcInteger and RcDouble +-- (which are disjoint) ie. x86, x86_64 and ppc +-- +-- The number of allocatable regs is hard coded in here so we can do +-- a fast comparison in trivColorable. +-- +-- It's ok if these numbers are _less_ than the actual number of free +-- regs, but they can't be more or the register conflict +-- graph won't color. +-- +-- If the graph doesn't color then the allocator will panic, but it won't +-- generate bad object code or anything nasty like that. +-- +-- There is an allocatableRegsInClass :: RegClass -> Int, but doing +-- the unboxing is too slow for us here. +-- TODO: Is that still true? Could we use allocatableRegsInClass +-- without losing performance now? +-- +-- Look at includes/stg/MachRegs.h to get the numbers. +-- + + +-- Disjoint registers ---------------------------------------------------------- +-- +-- The definition has been unfolded into individual cases for speed. +-- Each architecture has a different register setup, so we use a +-- different regSqueeze function for each. +-- +accSqueeze + :: Int + -> Int + -> (reg -> Int) + -> UniqSet reg + -> Int + +accSqueeze count maxCount squeeze us = acc count (nonDetEltsUniqSet us) + -- See Note [Unique Determinism and code generation] + where acc count [] = count + acc count _ | count >= maxCount = count + acc count (r:rs) = acc (count + squeeze r) rs + +{- Note [accSqueeze] +~~~~~~~~~~~~~~~~~~~~ +BL 2007/09 +Doing a nice fold over the UniqSet makes trivColorable use +32% of total compile time and 42% of total alloc when compiling SHA1.hs from darcs. +Therefore the UniqFM is made non-abstract and we use custom fold. 
+ +MS 2010/04 +When converting UniqFM to use Data.IntMap, the fold cannot use UniqFM internal +representation any more. But it is imperative that the accSqueeze stops +the folding if the count gets greater or equal to maxCount. We thus convert +UniqFM to a (lazy) list, do the fold and stops if necessary, which was +the most efficient variant tried. Benchmark compiling 10-times SHA1.hs follows. +(original = previous implementation, folding = fold of the whole UFM, + lazyFold = the current implementation, + hackFold = using internal representation of Data.IntMap) + + original folding hackFold lazyFold + -O -fasm (used everywhere) 31.509s 30.387s 30.791s 30.603s + 100.00% 96.44% 97.72% 97.12% + -fregs-graph 67.938s 74.875s 62.673s 64.679s + 100.00% 110.21% 92.25% 95.20% + -fregs-iterative 89.761s 143.913s 81.075s 86.912s + 100.00% 160.33% 90.32% 96.83% + -fnew-codegen 38.225s 37.142s 37.551s 37.119s + 100.00% 97.17% 98.24% 97.11% + -fnew-codegen -fregs-graph 91.786s 91.51s 87.368s 86.88s + 100.00% 99.70% 95.19% 94.65% + -fnew-codegen -fregs-iterative 206.72s 343.632s 194.694s 208.677s + 100.00% 166.23% 94.18% 100.95% +-} + +trivColorable + :: Platform + -> (RegClass -> VirtualReg -> Int) + -> (RegClass -> RealReg -> Int) + -> Triv VirtualReg RegClass RealReg + +trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts exclusions + | let cALLOCATABLE_REGS_INTEGER + = (case platformArch platform of + ArchX86 -> 3 + ArchX86_64 -> 5 + ArchPPC -> 16 + ArchSPARC -> 14 + ArchSPARC64 -> panic "trivColorable ArchSPARC64" + ArchPPC_64 _ -> 15 + ArchARM _ _ _ -> panic "trivColorable ArchARM" + ArchARM64 -> panic "trivColorable ArchARM64" + ArchAlpha -> panic "trivColorable ArchAlpha" + ArchMipseb -> panic "trivColorable ArchMipseb" + ArchMipsel -> panic "trivColorable ArchMipsel" + ArchS390X -> panic "trivColorable ArchS390X" + ArchJavaScript-> panic "trivColorable ArchJavaScript" + ArchUnknown -> panic "trivColorable ArchUnknown") + , count2 <- accSqueeze 0 cALLOCATABLE_REGS_INTEGER + (virtualRegSqueeze RcInteger) + conflicts + + , count3 <- accSqueeze count2 cALLOCATABLE_REGS_INTEGER + (realRegSqueeze RcInteger) + exclusions + + = count3 < cALLOCATABLE_REGS_INTEGER + +trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions + | let cALLOCATABLE_REGS_FLOAT + = (case platformArch platform of + -- On x86_64 and x86, Float and RcDouble + -- use the same registers, + -- so we only use RcDouble to represent the + -- register allocation problem on those types. + ArchX86 -> 0 + ArchX86_64 -> 0 + ArchPPC -> 0 + ArchSPARC -> 22 + ArchSPARC64 -> panic "trivColorable ArchSPARC64" + ArchPPC_64 _ -> 0 + ArchARM _ _ _ -> panic "trivColorable ArchARM" + ArchARM64 -> panic "trivColorable ArchARM64" + ArchAlpha -> panic "trivColorable ArchAlpha" + ArchMipseb -> panic "trivColorable ArchMipseb" + ArchMipsel -> panic "trivColorable ArchMipsel" + ArchS390X -> panic "trivColorable ArchS390X" + ArchJavaScript-> panic "trivColorable ArchJavaScript" + ArchUnknown -> panic "trivColorable ArchUnknown") + , count2 <- accSqueeze 0 cALLOCATABLE_REGS_FLOAT + (virtualRegSqueeze RcFloat) + conflicts + + , count3 <- accSqueeze count2 cALLOCATABLE_REGS_FLOAT + (realRegSqueeze RcFloat) + exclusions + + = count3 < cALLOCATABLE_REGS_FLOAT + +trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions + | let cALLOCATABLE_REGS_DOUBLE + = (case platformArch platform of + ArchX86 -> 8 + -- in x86 32bit mode sse2 there are only + -- 8 XMM registers xmm0 ... 
xmm7 + ArchX86_64 -> 10 + -- in x86_64 there are 16 XMM registers + -- xmm0 .. xmm15, here 10 is a + -- "dont need to solve conflicts" count that + -- was chosen at some point in the past. + ArchPPC -> 26 + ArchSPARC -> 11 + ArchSPARC64 -> panic "trivColorable ArchSPARC64" + ArchPPC_64 _ -> 20 + ArchARM _ _ _ -> panic "trivColorable ArchARM" + ArchARM64 -> panic "trivColorable ArchARM64" + ArchAlpha -> panic "trivColorable ArchAlpha" + ArchMipseb -> panic "trivColorable ArchMipseb" + ArchMipsel -> panic "trivColorable ArchMipsel" + ArchS390X -> panic "trivColorable ArchS390X" + ArchJavaScript-> panic "trivColorable ArchJavaScript" + ArchUnknown -> panic "trivColorable ArchUnknown") + , count2 <- accSqueeze 0 cALLOCATABLE_REGS_DOUBLE + (virtualRegSqueeze RcDouble) + conflicts + + , count3 <- accSqueeze count2 cALLOCATABLE_REGS_DOUBLE + (realRegSqueeze RcDouble) + exclusions + + = count3 < cALLOCATABLE_REGS_DOUBLE + + + + +-- Specification Code ---------------------------------------------------------- +-- +-- The trivColorable function for each particular architecture should +-- implement the following function, but faster. +-- + +{- +trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool +trivColorable classN conflicts exclusions + = let + + acc :: Reg -> (Int, Int) -> (Int, Int) + acc r (cd, cf) + = case regClass r of + RcInteger -> (cd+1, cf) + RcFloat -> (cd, cf+1) + _ -> panic "Regs.trivColorable: reg class not handled" + + tmp = nonDetFoldUFM acc (0, 0) conflicts + (countInt, countFloat) = nonDetFoldUFM acc tmp exclusions + + squeese = worst countInt classN RcInteger + + worst countFloat classN RcFloat + + in squeese < allocatableRegsInClass classN + +-- | Worst case displacement +-- node N of classN has n neighbors of class C. +-- +-- We currently only have RcInteger and RcDouble, which don't conflict at all. +-- This is a bit boring compared to what's in RegArchX86. +-- +worst :: Int -> RegClass -> RegClass -> Int +worst n classN classC + = case classN of + RcInteger + -> case classC of + RcInteger -> min n (allocatableRegsInClass RcInteger) + RcFloat -> 0 + + RcDouble + -> case classC of + RcFloat -> min n (allocatableRegsInClass RcFloat) + RcInteger -> 0 + +-- allocatableRegs is allMachRegNos with the fixed-use regs removed. +-- i.e., these are the regs for which we are prepared to allow the +-- register allocator to attempt to map VRegs to. +allocatableRegs :: [RegNo] +allocatableRegs + = let isFree i = freeReg i + in filter isFree allMachRegNos + + +-- | The number of regs in each class. +-- We go via top level CAFs to ensure that we're not recomputing +-- the length of these lists each time the fn is called. +allocatableRegsInClass :: RegClass -> Int +allocatableRegsInClass cls + = case cls of + RcInteger -> allocatableRegsInteger + RcFloat -> allocatableRegsDouble + +allocatableRegsInteger :: Int +allocatableRegsInteger + = length $ filter (\r -> regClass r == RcInteger) + $ map RealReg allocatableRegs + +allocatableRegsFloat :: Int +allocatableRegsFloat + = length $ filter (\r -> regClass r == RcFloat + $ map RealReg allocatableRegs +-} diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/X86.hs b/compiler/GHC/CmmToAsm/Reg/Graph/X86.hs new file mode 100644 index 0000000000..0d4c56ba21 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Graph/X86.hs @@ -0,0 +1,161 @@ + +-- | A description of the register set of the X86. +-- +-- This isn't used directly in GHC proper. +-- +-- See RegArchBase.hs for the reference. +-- See MachRegs.hs for the actual trivColorable function used in GHC. 
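+--
+-- As a rough illustration of the "worst case displacement" idea this
+-- module spells out: a node of class ClassG8 with three neighbours of
+-- class ClassG32 can lose at most min (3*2) 8 = 6 of its 8 candidate
+-- registers, because only eax..edx have 8-bit subregisters and each such
+-- 32-bit neighbour aliases both the low and the high byte. The squeeze
+-- functions used by trivColorable, together with its hard-coded
+-- per-architecture register counts, approximate the same kind of
+-- pessimistic counting.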
+-- +module GHC.CmmToAsm.Reg.Graph.X86 ( + classOfReg, + regsOfClass, + regName, + regAlias, + worst, + squeese, +) where + +import GhcPrelude + +import GHC.CmmToAsm.Reg.Graph.Base (Reg(..), RegSub(..), RegClass(..)) +import UniqSet + +import qualified Data.Array as A + + +-- | Determine the class of a register +classOfReg :: Reg -> RegClass +classOfReg reg + = case reg of + Reg c _ -> c + + RegSub SubL16 _ -> ClassG16 + RegSub SubL8 _ -> ClassG8 + RegSub SubL8H _ -> ClassG8 + + +-- | Determine all the regs that make up a certain class. +regsOfClass :: RegClass -> UniqSet Reg +regsOfClass c + = case c of + ClassG32 + -> mkUniqSet [ Reg ClassG32 i + | i <- [0..7] ] + + ClassG16 + -> mkUniqSet [ RegSub SubL16 (Reg ClassG32 i) + | i <- [0..7] ] + + ClassG8 + -> unionUniqSets + (mkUniqSet [ RegSub SubL8 (Reg ClassG32 i) | i <- [0..3] ]) + (mkUniqSet [ RegSub SubL8H (Reg ClassG32 i) | i <- [0..3] ]) + + ClassF64 + -> mkUniqSet [ Reg ClassF64 i + | i <- [0..5] ] + + +-- | Determine the common name of a reg +-- returns Nothing if this reg is not part of the machine. +regName :: Reg -> Maybe String +regName reg + = case reg of + Reg ClassG32 i + | i <= 7 -> + let names = A.listArray (0,8) + [ "eax", "ebx", "ecx", "edx" + , "ebp", "esi", "edi", "esp" ] + in Just $ names A.! i + + RegSub SubL16 (Reg ClassG32 i) + | i <= 7 -> + let names = A.listArray (0,8) + [ "ax", "bx", "cx", "dx" + , "bp", "si", "di", "sp"] + in Just $ names A.! i + + RegSub SubL8 (Reg ClassG32 i) + | i <= 3 -> + let names = A.listArray (0,4) [ "al", "bl", "cl", "dl"] + in Just $ names A.! i + + RegSub SubL8H (Reg ClassG32 i) + | i <= 3 -> + let names = A.listArray (0,4) [ "ah", "bh", "ch", "dh"] + in Just $ names A.! i + + _ -> Nothing + + +-- | Which regs alias what other regs. +regAlias :: Reg -> UniqSet Reg +regAlias reg + = case reg of + + -- 32 bit regs alias all of the subregs + Reg ClassG32 i + + -- for eax, ebx, ecx, eds + | i <= 3 + -> mkUniqSet + $ [ Reg ClassG32 i, RegSub SubL16 reg + , RegSub SubL8 reg, RegSub SubL8H reg ] + + -- for esi, edi, esp, ebp + | 4 <= i && i <= 7 + -> mkUniqSet + $ [ Reg ClassG32 i, RegSub SubL16 reg ] + + -- 16 bit subregs alias the whole reg + RegSub SubL16 r@(Reg ClassG32 _) + -> regAlias r + + -- 8 bit subregs alias the 32 and 16, but not the other 8 bit subreg + RegSub SubL8 r@(Reg ClassG32 _) + -> mkUniqSet $ [ r, RegSub SubL16 r, RegSub SubL8 r ] + + RegSub SubL8H r@(Reg ClassG32 _) + -> mkUniqSet $ [ r, RegSub SubL16 r, RegSub SubL8H r ] + + -- fp + Reg ClassF64 _ + -> unitUniqSet reg + + _ -> error "regAlias: invalid register" + + +-- | Optimised versions of RegColorBase.{worst, squeese} specific to x86 +worst :: Int -> RegClass -> RegClass -> Int +worst n classN classC + = case classN of + ClassG32 + -> case classC of + ClassG32 -> min n 8 + ClassG16 -> min n 8 + ClassG8 -> min n 4 + ClassF64 -> 0 + + ClassG16 + -> case classC of + ClassG32 -> min n 8 + ClassG16 -> min n 8 + ClassG8 -> min n 4 + ClassF64 -> 0 + + ClassG8 + -> case classC of + ClassG32 -> min (n*2) 8 + ClassG16 -> min (n*2) 8 + ClassG8 -> min n 8 + ClassF64 -> 0 + + ClassF64 + -> case classC of + ClassF64 -> min n 6 + _ -> 0 + +squeese :: RegClass -> [(Int, RegClass)] -> Int +squeese classN countCs + = sum (map (\(i, classC) -> worst i classN classC) countCs) + diff --git a/compiler/GHC/CmmToAsm/Reg/Linear.hs b/compiler/GHC/CmmToAsm/Reg/Linear.hs new file mode 100644 index 0000000000..9b263889d8 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear.hs @@ -0,0 +1,920 @@ +{-# LANGUAGE BangPatterns, CPP, 
ScopedTypeVariables #-} + +{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} + +----------------------------------------------------------------------------- +-- +-- The register allocator +-- +-- (c) The University of Glasgow 2004 +-- +----------------------------------------------------------------------------- + +{- +The algorithm is roughly: + + 1) Compute strongly connected components of the basic block list. + + 2) Compute liveness (mapping from pseudo register to + point(s) of death?). + + 3) Walk instructions in each basic block. We keep track of + (a) Free real registers (a bitmap?) + (b) Current assignment of temporaries to machine registers and/or + spill slots (call this the "assignment"). + (c) Partial mapping from basic block ids to a virt-to-loc mapping. + When we first encounter a branch to a basic block, + we fill in its entry in this table with the current mapping. + + For each instruction: + (a) For each temporary *read* by the instruction: + If the temporary does not have a real register allocation: + - Allocate a real register from the free list. If + the list is empty: + - Find a temporary to spill. Pick one that is + not used in this instruction (ToDo: not + used for a while...) + - generate a spill instruction + - If the temporary was previously spilled, + generate an instruction to read the temp from its spill loc. + (optimisation: if we can see that a real register is going to + be used soon, then don't use it for allocation). + + (b) For each real register clobbered by this instruction: + If a temporary resides in it, + If the temporary is live after this instruction, + Move the temporary to another (non-clobbered & free) reg, + or spill it to memory. Mark the temporary as residing + in both memory and a register if it was spilled (it might + need to be read by this instruction). + + (ToDo: this is wrong for jump instructions?) + + We do this after step (a), because if we start with + movq v1, %rsi + which is an instruction that clobbers %rsi, if v1 currently resides + in %rsi we want to get + movq %rsi, %freereg + movq %rsi, %rsi -- will disappear + instead of + movq %rsi, %freereg + movq %freereg, %rsi + + (c) Update the current assignment + + (d) If the instruction is a branch: + if the destination block already has a register assignment, + Generate a new block with fixup code and redirect the + jump to the new block. + else, + Update the block id->assignment mapping with the current + assignment. + + (e) Delete all register assignments for temps which are read + (only) and die here. Update the free register list. + + (f) Mark all registers clobbered by this instruction as not free, + and mark temporaries which have been spilled due to clobbering + as in memory (step (a) marks then as in both mem & reg). + + (g) For each temporary *written* by this instruction: + Allocate a real register as for (b), spilling something + else if necessary. + - except when updating the assignment, drop any memory + locations that the temporary was previously in, since + they will be no longer valid after this instruction. + + (h) Delete all register assignments for temps which are + written and die here (there should rarely be any). Update + the free register list. + + (i) Rewrite the instruction with the new mapping. + + (j) For each spilled reg known to be now dead, re-add its stack slot + to the free list. 
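+
+ As a small illustrative trace of (a)-(j) (names invented), suppose we
+ reach
+
+ v3 = v1 + v2 -- reads v1 and v2, writes v3; v2 dies here
+
+ with assignment { v1 -> %rbx, v2 -> spill slot 4 } and %rcx, %rdx free:
+
+ (a) v1 already has a register; v2 lives only in a slot, so we
+ allocate %rcx for it and emit a reload from slot 4 into %rcx.
+ (e) v2 is read (only) and dies here, so its assignment is dropped
+ and %rcx goes back on the free list.
+ (g) v3 is written, so it is given a free register, say %rdx.
+ (i) the instruction is rewritten with the mapping
+ { v1 -> %rbx, v2 -> %rcx, v3 -> %rdx }.
+
+ Nothing is clobbered, the instruction is not a branch, and no spilled
+ reg dies, so steps (b), (d), (f), (h) and (j) have nothing to do here,
+ and (c) simply records the new assignment.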
+ +-} + +module GHC.CmmToAsm.Reg.Linear ( + regAlloc, + module GHC.CmmToAsm.Reg.Linear.Base, + module GHC.CmmToAsm.Reg.Linear.Stats + ) where + +#include "HsVersions.h" + + +import GhcPrelude + +import GHC.CmmToAsm.Reg.Linear.State +import GHC.CmmToAsm.Reg.Linear.Base +import GHC.CmmToAsm.Reg.Linear.StackMap +import GHC.CmmToAsm.Reg.Linear.FreeRegs +import GHC.CmmToAsm.Reg.Linear.Stats +import GHC.CmmToAsm.Reg.Linear.JoinToTargets +import qualified GHC.CmmToAsm.Reg.Linear.PPC as PPC +import qualified GHC.CmmToAsm.Reg.Linear.SPARC as SPARC +import qualified GHC.CmmToAsm.Reg.Linear.X86 as X86 +import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64 +import GHC.CmmToAsm.Reg.Target +import GHC.CmmToAsm.Reg.Liveness +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg + +import GHC.Cmm.BlockId +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm hiding (RegSet) + +import Digraph +import GHC.Driver.Session +import Unique +import UniqSet +import UniqFM +import UniqSupply +import Outputable +import GHC.Platform + +import Data.Maybe +import Data.List +import Control.Monad + +-- ----------------------------------------------------------------------------- +-- Top level of the register allocator + +-- Allocate registers +regAlloc + :: (Outputable instr, Instruction instr) + => DynFlags + -> LiveCmmDecl statics instr + -> UniqSM ( NatCmmDecl statics instr + , Maybe Int -- number of extra stack slots required, + -- beyond maxSpillSlots + , Maybe RegAllocStats + ) + +regAlloc _ (CmmData sec d) + = return + ( CmmData sec d + , Nothing + , Nothing ) + +regAlloc _ (CmmProc (LiveInfo info _ _ _) lbl live []) + = return ( CmmProc info lbl live (ListGraph []) + , Nothing + , Nothing ) + +regAlloc dflags (CmmProc static lbl live sccs) + | LiveInfo info entry_ids@(first_id:_) block_live _ <- static + = do + -- do register allocation on each component. + (final_blocks, stats, stack_use) + <- linearRegAlloc dflags entry_ids block_live sccs + + -- make sure the block that was first in the input list + -- stays at the front of the output + let ((first':_), rest') + = partition ((== first_id) . blockId) final_blocks + + let max_spill_slots = maxSpillSlots dflags + extra_stack + | stack_use > max_spill_slots + = Just (stack_use - max_spill_slots) + | otherwise + = Nothing + + return ( CmmProc info lbl live (ListGraph (first' : rest')) + , extra_stack + , Just stats) + +-- bogus. to make non-exhaustive match warning go away. +regAlloc _ (CmmProc _ _ _ _) + = panic "RegAllocLinear.regAlloc: no match" + + +-- ----------------------------------------------------------------------------- +-- Linear sweep to allocate registers + + +-- | Do register allocation on some basic blocks. +-- But be careful to allocate a block in an SCC only if it has +-- an entry in the block map or it is the first block. 
+-- +linearRegAlloc + :: (Outputable instr, Instruction instr) + => DynFlags + -> [BlockId] -- ^ entry points + -> BlockMap RegSet + -- ^ live regs on entry to each basic block + -> [SCC (LiveBasicBlock instr)] + -- ^ instructions annotated with "deaths" + -> UniqSM ([NatBasicBlock instr], RegAllocStats, Int) + +linearRegAlloc dflags entry_ids block_live sccs + = case platformArch platform of + ArchX86 -> go $ (frInitFreeRegs platform :: X86.FreeRegs) + ArchX86_64 -> go $ (frInitFreeRegs platform :: X86_64.FreeRegs) + ArchS390X -> panic "linearRegAlloc ArchS390X" + ArchSPARC -> go $ (frInitFreeRegs platform :: SPARC.FreeRegs) + ArchSPARC64 -> panic "linearRegAlloc ArchSPARC64" + ArchPPC -> go $ (frInitFreeRegs platform :: PPC.FreeRegs) + ArchARM _ _ _ -> panic "linearRegAlloc ArchARM" + ArchARM64 -> panic "linearRegAlloc ArchARM64" + ArchPPC_64 _ -> go $ (frInitFreeRegs platform :: PPC.FreeRegs) + ArchAlpha -> panic "linearRegAlloc ArchAlpha" + ArchMipseb -> panic "linearRegAlloc ArchMipseb" + ArchMipsel -> panic "linearRegAlloc ArchMipsel" + ArchJavaScript -> panic "linearRegAlloc ArchJavaScript" + ArchUnknown -> panic "linearRegAlloc ArchUnknown" + where + go f = linearRegAlloc' dflags f entry_ids block_live sccs + platform = targetPlatform dflags + +linearRegAlloc' + :: (FR freeRegs, Outputable instr, Instruction instr) + => DynFlags + -> freeRegs + -> [BlockId] -- ^ entry points + -> BlockMap RegSet -- ^ live regs on entry to each basic block + -> [SCC (LiveBasicBlock instr)] -- ^ instructions annotated with "deaths" + -> UniqSM ([NatBasicBlock instr], RegAllocStats, Int) + +linearRegAlloc' dflags initFreeRegs entry_ids block_live sccs + = do us <- getUniqueSupplyM + let (_, stack, stats, blocks) = + runR dflags mapEmpty initFreeRegs emptyRegMap (emptyStackMap dflags) us + $ linearRA_SCCs entry_ids block_live [] sccs + return (blocks, stats, getStackUse stack) + + +linearRA_SCCs :: (FR freeRegs, Instruction instr, Outputable instr) + => [BlockId] + -> BlockMap RegSet + -> [NatBasicBlock instr] + -> [SCC (LiveBasicBlock instr)] + -> RegM freeRegs [NatBasicBlock instr] + +linearRA_SCCs _ _ blocksAcc [] + = return $ reverse blocksAcc + +linearRA_SCCs entry_ids block_live blocksAcc (AcyclicSCC block : sccs) + = do blocks' <- processBlock block_live block + linearRA_SCCs entry_ids block_live + ((reverse blocks') ++ blocksAcc) + sccs + +linearRA_SCCs entry_ids block_live blocksAcc (CyclicSCC blocks : sccs) + = do + blockss' <- process entry_ids block_live blocks [] (return []) False + linearRA_SCCs entry_ids block_live + (reverse (concat blockss') ++ blocksAcc) + sccs + +{- from John Dias's patch 2008/10/16: + The linear-scan allocator sometimes allocates a block + before allocating one of its predecessors, which could lead to + inconsistent allocations. Make it so a block is only allocated + if a predecessor has set the "incoming" assignments for the block, or + if it's the procedure's entry block. + + BL 2009/02: Careful. If the assignment for a block doesn't get set for + some reason then this function will loop. We should probably do some + more sanity checking to guard against this eventuality. 
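+
+   Concretely: in a cyclic SCC [B2, B1] where B1 is the procedure entry
+   and B1 jumps to B2, the first sweep of 'process' defers B2 (it has no
+   incoming assignment yet and is not an entry point), allocates B1,
+   which via joinToTargets records the incoming assignment for B2, and
+   the next round can then allocate B2. The madeProgress flag is what
+   lets us bail out instead of looping when the remaining blocks
+   (typically unreachable ones) can never satisfy that condition.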
+-} + +process :: (FR freeRegs, Instruction instr, Outputable instr) + => [BlockId] + -> BlockMap RegSet + -> [GenBasicBlock (LiveInstr instr)] + -> [GenBasicBlock (LiveInstr instr)] + -> [[NatBasicBlock instr]] + -> Bool + -> RegM freeRegs [[NatBasicBlock instr]] + +process _ _ [] [] accum _ + = return $ reverse accum + +process entry_ids block_live [] next_round accum madeProgress + | not madeProgress + + {- BUGS: There are so many unreachable blocks in the code the warnings are overwhelming. + pprTrace "RegAlloc.Linear.Main.process: no progress made, bailing out." + ( text "Unreachable blocks:" + $$ vcat (map ppr next_round)) -} + = return $ reverse accum + + | otherwise + = process entry_ids block_live + next_round [] accum False + +process entry_ids block_live (b@(BasicBlock id _) : blocks) + next_round accum madeProgress + = do + block_assig <- getBlockAssigR + + if isJust (mapLookup id block_assig) + || id `elem` entry_ids + then do + b' <- processBlock block_live b + process entry_ids block_live blocks + next_round (b' : accum) True + + else process entry_ids block_live blocks + (b : next_round) accum madeProgress + + +-- | Do register allocation on this basic block +-- +processBlock + :: (FR freeRegs, Outputable instr, Instruction instr) + => BlockMap RegSet -- ^ live regs on entry to each basic block + -> LiveBasicBlock instr -- ^ block to do register allocation on + -> RegM freeRegs [NatBasicBlock instr] -- ^ block with registers allocated + +processBlock block_live (BasicBlock id instrs) + = do initBlock id block_live + (instrs', fixups) + <- linearRA block_live [] [] id instrs + return $ BasicBlock id instrs' : fixups + + +-- | Load the freeregs and current reg assignment into the RegM state +-- for the basic block with this BlockId. +initBlock :: FR freeRegs + => BlockId -> BlockMap RegSet -> RegM freeRegs () +initBlock id block_live + = do dflags <- getDynFlags + let platform = targetPlatform dflags + block_assig <- getBlockAssigR + case mapLookup id block_assig of + -- no prior info about this block: we must consider + -- any fixed regs to be allocated, but we can ignore + -- virtual regs (presumably this is part of a loop, + -- and we'll iterate again). The assignment begins + -- empty. + Nothing + -> do -- pprTrace "initFreeRegs" (text $ show initFreeRegs) (return ()) + case mapLookup id block_live of + Nothing -> + setFreeRegsR (frInitFreeRegs platform) + Just live -> + setFreeRegsR $ foldl' (flip $ frAllocateReg platform) (frInitFreeRegs platform) + [ r | RegReal r <- nonDetEltsUniqSet live ] + -- See Note [Unique Determinism and code generation] + setAssigR emptyRegMap + + -- load info about register assignments leading into this block. + Just (freeregs, assig) + -> do setFreeRegsR freeregs + setAssigR assig + + +-- | Do allocation for a sequence of instructions. +linearRA + :: (FR freeRegs, Outputable instr, Instruction instr) + => BlockMap RegSet -- ^ map of what vregs are live on entry to each block. + -> [instr] -- ^ accumulator for instructions already processed. + -> [NatBasicBlock instr] -- ^ accumulator for blocks of fixup code. + -> BlockId -- ^ id of the current block, for debugging. + -> [LiveInstr instr] -- ^ liveness annotated instructions in this block. + + -> RegM freeRegs + ( [instr] -- instructions after register allocation + , [NatBasicBlock instr]) -- fresh blocks of fixup code. + + +linearRA _ accInstr accFixup _ [] + = return + ( reverse accInstr -- instrs need to be returned in the correct order. 
+ , accFixup) -- it doesn't matter what order the fixup blocks are returned in. + + +linearRA block_live accInstr accFixups id (instr:instrs) + = do + (accInstr', new_fixups) <- raInsn block_live accInstr id instr + + linearRA block_live accInstr' (new_fixups ++ accFixups) id instrs + + +-- | Do allocation for a single instruction. +raInsn + :: (FR freeRegs, Outputable instr, Instruction instr) + => BlockMap RegSet -- ^ map of what vregs are love on entry to each block. + -> [instr] -- ^ accumulator for instructions already processed. + -> BlockId -- ^ the id of the current block, for debugging + -> LiveInstr instr -- ^ the instr to have its regs allocated, with liveness info. + -> RegM freeRegs + ( [instr] -- new instructions + , [NatBasicBlock instr]) -- extra fixup blocks + +raInsn _ new_instrs _ (LiveInstr ii Nothing) + | Just n <- takeDeltaInstr ii + = do setDeltaR n + return (new_instrs, []) + +raInsn _ new_instrs _ (LiveInstr ii@(Instr i) Nothing) + | isMetaInstr ii + = return (i : new_instrs, []) + + +raInsn block_live new_instrs id (LiveInstr (Instr instr) (Just live)) + = do + assig <- getAssigR + + -- If we have a reg->reg move between virtual registers, where the + -- src register is not live after this instruction, and the dst + -- register does not already have an assignment, + -- and the source register is assigned to a register, not to a spill slot, + -- then we can eliminate the instruction. + -- (we can't eliminate it if the source register is on the stack, because + -- we do not want to use one spill slot for different virtual registers) + case takeRegRegMoveInstr instr of + Just (src,dst) | src `elementOfUniqSet` (liveDieRead live), + isVirtualReg dst, + not (dst `elemUFM` assig), + isRealReg src || isInReg src assig -> do + case src of + (RegReal rr) -> setAssigR (addToUFM assig dst (InReg rr)) + -- if src is a fixed reg, then we just map dest to this + -- reg in the assignment. src must be an allocatable reg, + -- otherwise it wouldn't be in r_dying. + _virt -> case lookupUFM assig src of + Nothing -> panic "raInsn" + Just loc -> + setAssigR (addToUFM (delFromUFM assig src) dst loc) + + -- we have eliminated this instruction + {- + freeregs <- getFreeRegsR + assig <- getAssigR + pprTrace "raInsn" (text "ELIMINATED: " <> docToSDoc (pprInstr instr) + $$ ppr r_dying <+> ppr w_dying $$ text (show freeregs) $$ ppr assig) $ do + -} + return (new_instrs, []) + + _ -> genRaInsn block_live new_instrs id instr + (nonDetEltsUniqSet $ liveDieRead live) + (nonDetEltsUniqSet $ liveDieWrite live) + -- See Note [Unique Determinism and code generation] + +raInsn _ _ _ instr + = pprPanic "raInsn" (text "no match for:" <> ppr instr) + +-- ToDo: what can we do about +-- +-- R1 = x +-- jump I64[x] // [R1] +-- +-- where x is mapped to the same reg as R1. We want to coalesce x and +-- R1, but the register allocator doesn't know whether x will be +-- assigned to again later, in which case x and R1 should be in +-- different registers. Right now we assume the worst, and the +-- assignment to R1 will clobber x, so we'll spill x into another reg, +-- generating another reg->reg move. 
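-- A small standalone sketch of the copy-elimination rule used by raInsn above,
-- under simplified assumptions: Data.Map stands in for GHC's UniqFM, and Vreg,
-- Rreg, Loc' and elimMove are illustrative names, not definitions from this
-- patch. Only the virtual-to-virtual case is covered; the real code also
-- handles a fixed (real) source register by mapping dst directly to it.

import qualified Data.Map as Map

type Vreg = Int
type Rreg = Int
data Loc' = InReg' Rreg | InMem' Int    -- where a vreg currently lives
  deriving (Eq, Show)

-- If the move src -> dst can be erased, return the updated assignment:
-- src must be read for the last time at this instruction, dst must have no
-- assignment yet, and src must be in a register rather than only in a slot.
elimMove :: Map.Map Vreg Loc' -> [Vreg] -> Vreg -> Vreg -> Maybe (Map.Map Vreg Loc')
elimMove assig dyingReads src dst
  | src `elem` dyingReads
  , dst `Map.notMember` assig
  , Just loc@(InReg' _) <- Map.lookup src assig
  = Just (Map.insert dst loc (Map.delete src assig))
  | otherwise
  = Nothing

-- For example:
--   elimMove (Map.fromList [(1, InReg' 3)]) [1] 1 2
--     == Just (Map.fromList [(2, InReg' 3)])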
+ + +isInReg :: Reg -> RegMap Loc -> Bool +isInReg src assig | Just (InReg _) <- lookupUFM assig src = True + | otherwise = False + + +genRaInsn :: (FR freeRegs, Instruction instr, Outputable instr) + => BlockMap RegSet + -> [instr] + -> BlockId + -> instr + -> [Reg] + -> [Reg] + -> RegM freeRegs ([instr], [NatBasicBlock instr]) + +genRaInsn block_live new_instrs block_id instr r_dying w_dying = do + dflags <- getDynFlags + let platform = targetPlatform dflags + case regUsageOfInstr platform instr of { RU read written -> + do + let real_written = [ rr | (RegReal rr) <- written ] + let virt_written = [ vr | (RegVirtual vr) <- written ] + + -- we don't need to do anything with real registers that are + -- only read by this instr. (the list is typically ~2 elements, + -- so using nub isn't a problem). + let virt_read = nub [ vr | (RegVirtual vr) <- read ] + + -- debugging +{- freeregs <- getFreeRegsR + assig <- getAssigR + pprDebugAndThen (defaultDynFlags Settings{ sTargetPlatform=platform } undefined) trace "genRaInsn" + (ppr instr + $$ text "r_dying = " <+> ppr r_dying + $$ text "w_dying = " <+> ppr w_dying + $$ text "virt_read = " <+> ppr virt_read + $$ text "virt_written = " <+> ppr virt_written + $$ text "freeregs = " <+> text (show freeregs) + $$ text "assig = " <+> ppr assig) + $ do +-} + + -- (a), (b) allocate real regs for all regs read by this instruction. + (r_spills, r_allocd) <- + allocateRegsAndSpill True{-reading-} virt_read [] [] virt_read + + -- (c) save any temporaries which will be clobbered by this instruction + clobber_saves <- saveClobberedTemps real_written r_dying + + -- (d) Update block map for new destinations + -- NB. do this before removing dead regs from the assignment, because + -- these dead regs might in fact be live in the jump targets (they're + -- only dead in the code that follows in the current basic block). + (fixup_blocks, adjusted_instr) + <- joinToTargets block_live block_id instr + + -- Debugging - show places where the reg alloc inserted + -- assignment fixup blocks. + -- when (not $ null fixup_blocks) $ + -- pprTrace "fixup_blocks" (ppr fixup_blocks) (return ()) + + -- (e) Delete all register assignments for temps which are read + -- (only) and die here. Update the free register list. + releaseRegs r_dying + + -- (f) Mark regs which are clobbered as unallocatable + clobberRegs real_written + + -- (g) Allocate registers for temporaries *written* (only) + (w_spills, w_allocd) <- + allocateRegsAndSpill False{-writing-} virt_written [] [] virt_written + + -- (h) Release registers for temps which are written here and not + -- used again. + releaseRegs w_dying + + let + -- (i) Patch the instruction + patch_map + = listToUFM + [ (t, RegReal r) + | (t, r) <- zip virt_read r_allocd + ++ zip virt_written w_allocd ] + + patched_instr + = patchRegsOfInstr adjusted_instr patchLookup + + patchLookup x + = case lookupUFM patch_map x of + Nothing -> x + Just y -> y + + + -- (j) free up stack slots for dead spilled regs + -- TODO (can't be bothered right now) + + -- erase reg->reg moves where the source and destination are the same. + -- If the src temp didn't die in this instr but happened to be allocated + -- to the same real reg as the destination, then we can erase the move anyway. + let squashed_instr = case takeRegRegMoveInstr patched_instr of + Just (src, dst) + | src == dst -> [] + _ -> [patched_instr] + + let code = squashed_instr ++ w_spills ++ reverse r_spills + ++ clobber_saves ++ new_instrs + +-- pprTrace "patched-code" ((vcat $ map (docToSDoc . 
pprInstr) code)) $ do +-- pprTrace "pached-fixup" ((ppr fixup_blocks)) $ do + + return (code, fixup_blocks) + + } + +-- ----------------------------------------------------------------------------- +-- releaseRegs + +releaseRegs :: FR freeRegs => [Reg] -> RegM freeRegs () +releaseRegs regs = do + dflags <- getDynFlags + let platform = targetPlatform dflags + assig <- getAssigR + free <- getFreeRegsR + let loop assig !free [] = do setAssigR assig; setFreeRegsR free; return () + loop assig !free (RegReal rr : rs) = loop assig (frReleaseReg platform rr free) rs + loop assig !free (r:rs) = + case lookupUFM assig r of + Just (InBoth real _) -> loop (delFromUFM assig r) + (frReleaseReg platform real free) rs + Just (InReg real) -> loop (delFromUFM assig r) + (frReleaseReg platform real free) rs + _ -> loop (delFromUFM assig r) free rs + loop assig free regs + + +-- ----------------------------------------------------------------------------- +-- Clobber real registers + +-- For each temp in a register that is going to be clobbered: +-- - if the temp dies after this instruction, do nothing +-- - otherwise, put it somewhere safe (another reg if possible, +-- otherwise spill and record InBoth in the assignment). +-- - for allocateRegs on the temps *read*, +-- - clobbered regs are allocatable. +-- +-- for allocateRegs on the temps *written*, +-- - clobbered regs are not allocatable. +-- + +saveClobberedTemps + :: (Instruction instr, FR freeRegs) + => [RealReg] -- real registers clobbered by this instruction + -> [Reg] -- registers which are no longer live after this insn + -> RegM freeRegs [instr] -- return: instructions to spill any temps that will + -- be clobbered. + +saveClobberedTemps [] _ + = return [] + +saveClobberedTemps clobbered dying + = do + assig <- getAssigR + let to_spill + = [ (temp,reg) + | (temp, InReg reg) <- nonDetUFMToList assig + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + , any (realRegsAlias reg) clobbered + , temp `notElem` map getUnique dying ] + + (instrs,assig') <- clobber assig [] to_spill + setAssigR assig' + return instrs + + where + clobber assig instrs [] + = return (instrs, assig) + + clobber assig instrs ((temp, reg) : rest) + = do dflags <- getDynFlags + let platform = targetPlatform dflags + + freeRegs <- getFreeRegsR + let regclass = targetClassOfRealReg platform reg + freeRegs_thisClass = frGetFreeRegs platform regclass freeRegs + + case filter (`notElem` clobbered) freeRegs_thisClass of + + -- (1) we have a free reg of the right class that isn't + -- clobbered by this instruction; use it to save the + -- clobbered value. + (my_reg : _) -> do + setFreeRegsR (frAllocateReg platform my_reg freeRegs) + + let new_assign = addToUFM assig temp (InReg my_reg) + let instr = mkRegRegMoveInstr platform + (RegReal reg) (RegReal my_reg) + + clobber new_assign (instr : instrs) rest + + -- (2) no free registers: spill the value + [] -> do + (spill, slot) <- spillR (RegReal reg) temp + + -- record why this reg was spilled for profiling + recordSpill (SpillClobber temp) + + let new_assign = addToUFM assig temp (InBoth reg slot) + + clobber new_assign (spill : instrs) rest + + + +-- | Mark all these real regs as allocated, +-- and kick out their vreg assignments. 
+-- +clobberRegs :: FR freeRegs => [RealReg] -> RegM freeRegs () +clobberRegs [] + = return () + +clobberRegs clobbered + = do dflags <- getDynFlags + let platform = targetPlatform dflags + + freeregs <- getFreeRegsR + setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs clobbered + + assig <- getAssigR + setAssigR $! clobber assig (nonDetUFMToList assig) + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + + where + -- if the temp was InReg and clobbered, then we will have + -- saved it in saveClobberedTemps above. So the only case + -- we have to worry about here is InBoth. Note that this + -- also catches temps which were loaded up during allocation + -- of read registers, not just those saved in saveClobberedTemps. + + clobber assig [] + = assig + + clobber assig ((temp, InBoth reg slot) : rest) + | any (realRegsAlias reg) clobbered + = clobber (addToUFM assig temp (InMem slot)) rest + + clobber assig (_:rest) + = clobber assig rest + +-- ----------------------------------------------------------------------------- +-- allocateRegsAndSpill + +-- Why are we performing a spill? +data SpillLoc = ReadMem StackSlot -- reading from register only in memory + | WriteNew -- writing to a new variable + | WriteMem -- writing to register only in memory +-- Note that ReadNew is not valid, since you don't want to be reading +-- from an uninitialized register. We also don't need the location of +-- the register in memory, since that will be invalidated by the write. +-- Technically, we could coalesce WriteNew and WriteMem into a single +-- entry as well. -- EZY + +-- This function does several things: +-- For each temporary referred to by this instruction, +-- we allocate a real register (spilling another temporary if necessary). +-- We load the temporary up from memory if necessary. +-- We also update the register assignment in the process, and +-- the list of free registers and free stack slots. + +allocateRegsAndSpill + :: (FR freeRegs, Outputable instr, Instruction instr) + => Bool -- True <=> reading (load up spilled regs) + -> [VirtualReg] -- don't push these out + -> [instr] -- spill insns + -> [RealReg] -- real registers allocated (accum.) + -> [VirtualReg] -- temps to allocate + -> RegM freeRegs ( [instr] , [RealReg]) + +allocateRegsAndSpill _ _ spills alloc [] + = return (spills, reverse alloc) + +allocateRegsAndSpill reading keep spills alloc (r:rs) + = do assig <- getAssigR + let doSpill = allocRegsAndSpill_spill reading keep spills alloc r rs assig + case lookupUFM assig r of + -- case (1a): already in a register + Just (InReg my_reg) -> + allocateRegsAndSpill reading keep spills (my_reg:alloc) rs + + -- case (1b): already in a register (and memory) + -- NB1. if we're writing this register, update its assignment to be + -- InReg, because the memory value is no longer valid. + -- NB2. This is why we must process written registers here, even if they + -- are also read by the same instruction. + Just (InBoth my_reg _) + -> do when (not reading) (setAssigR (addToUFM assig r (InReg my_reg))) + allocateRegsAndSpill reading keep spills (my_reg:alloc) rs + + -- Not already in a register, so we need to find a free one... 
+ Just (InMem slot) | reading -> doSpill (ReadMem slot) + | otherwise -> doSpill WriteMem + Nothing | reading -> + pprPanic "allocateRegsAndSpill: Cannot read from uninitialized register" (ppr r) + -- NOTE: if the input to the NCG contains some + -- unreachable blocks with junk code, this panic + -- might be triggered. Make sure you only feed + -- sensible code into the NCG. In GHC.Cmm.Pipeline we + -- call removeUnreachableBlocks at the end for this + -- reason. + + | otherwise -> doSpill WriteNew + + +-- reading is redundant with reason, but we keep it around because it's +-- convenient and it maintains the recursive structure of the allocator. -- EZY +allocRegsAndSpill_spill :: (FR freeRegs, Instruction instr, Outputable instr) + => Bool + -> [VirtualReg] + -> [instr] + -> [RealReg] + -> VirtualReg + -> [VirtualReg] + -> UniqFM Loc + -> SpillLoc + -> RegM freeRegs ([instr], [RealReg]) +allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc + = do dflags <- getDynFlags + let platform = targetPlatform dflags + freeRegs <- getFreeRegsR + let freeRegs_thisClass = frGetFreeRegs platform (classOfVirtualReg r) freeRegs + + case freeRegs_thisClass of + + -- case (2): we have a free register + (my_reg : _) -> + do spills' <- loadTemp r spill_loc my_reg spills + + setAssigR (addToUFM assig r $! newLocation spill_loc my_reg) + setFreeRegsR $ frAllocateReg platform my_reg freeRegs + + allocateRegsAndSpill reading keep spills' (my_reg : alloc) rs + + + -- case (3): we need to push something out to free up a register + [] -> + do let inRegOrBoth (InReg _) = True + inRegOrBoth (InBoth _ _) = True + inRegOrBoth _ = False + let candidates' = + flip delListFromUFM keep $ + filterUFM inRegOrBoth $ + assig + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + let candidates = nonDetUFMToList candidates' + + -- the vregs we could kick out that are already in a slot + let candidates_inBoth + = [ (temp, reg, mem) + | (temp, InBoth reg mem) <- candidates + , targetClassOfRealReg platform reg == classOfVirtualReg r ] + + -- the vregs we could kick out that are only in a reg + -- this would require writing the reg to a new slot before using it. + let candidates_inReg + = [ (temp, reg) + | (temp, InReg reg) <- candidates + , targetClassOfRealReg platform reg == classOfVirtualReg r ] + + let result + + -- we have a temporary that is in both register and mem, + -- just free up its register for use. + | (temp, my_reg, slot) : _ <- candidates_inBoth + = do spills' <- loadTemp r spill_loc my_reg spills + let assig1 = addToUFM assig temp (InMem slot) + let assig2 = addToUFM assig1 r $! newLocation spill_loc my_reg + + setAssigR assig2 + allocateRegsAndSpill reading keep spills' (my_reg:alloc) rs + + -- otherwise, we need to spill a temporary that currently + -- resides in a register. + | (temp_to_push_out, (my_reg :: RealReg)) : _ + <- candidates_inReg + = do + (spill_insn, slot) <- spillR (RegReal my_reg) temp_to_push_out + let spill_store = (if reading then id else reverse) + [ -- COMMENT (fsLit "spill alloc") + spill_insn ] + + -- record that this temp was spilled + recordSpill (SpillAlloc temp_to_push_out) + + -- update the register assignment + let assig1 = addToUFM assig temp_to_push_out (InMem slot) + let assig2 = addToUFM assig1 r $! newLocation spill_loc my_reg + setAssigR assig2 + + -- if need be, load up a spilled temp into the reg we've just freed up. 
+ spills' <- loadTemp r spill_loc my_reg spills + + allocateRegsAndSpill reading keep + (spill_store ++ spills') + (my_reg:alloc) rs + + + -- there wasn't anything to spill, so we're screwed. + | otherwise + = pprPanic ("RegAllocLinear.allocRegsAndSpill: no spill candidates\n") + $ vcat + [ text "allocating vreg: " <> text (show r) + , text "assignment: " <> ppr assig + , text "freeRegs: " <> text (show freeRegs) + , text "initFreeRegs: " <> text (show (frInitFreeRegs platform `asTypeOf` freeRegs)) ] + + result + + +-- | Calculate a new location after a register has been loaded. +newLocation :: SpillLoc -> RealReg -> Loc +-- if the tmp was read from a slot, then now its in a reg as well +newLocation (ReadMem slot) my_reg = InBoth my_reg slot +-- writes will always result in only the register being available +newLocation _ my_reg = InReg my_reg + +-- | Load up a spilled temporary if we need to (read from memory). +loadTemp + :: (Instruction instr) + => VirtualReg -- the temp being loaded + -> SpillLoc -- the current location of this temp + -> RealReg -- the hreg to load the temp into + -> [instr] + -> RegM freeRegs [instr] + +loadTemp vreg (ReadMem slot) hreg spills + = do + insn <- loadR (RegReal hreg) slot + recordSpill (SpillLoad $ getUnique vreg) + return $ {- COMMENT (fsLit "spill load") : -} insn : spills + +loadTemp _ _ _ spills = + return spills + diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs b/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs new file mode 100644 index 0000000000..43dbab843b --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs @@ -0,0 +1,141 @@ + +-- | Put common type definitions here to break recursive module dependencies. + +module GHC.CmmToAsm.Reg.Linear.Base ( + BlockAssignment, + + Loc(..), + regsOfLoc, + + -- for stats + SpillReason(..), + RegAllocStats(..), + + -- the allocator monad + RA_State(..), +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.Reg.Linear.StackMap +import GHC.CmmToAsm.Reg.Liveness +import GHC.Platform.Reg + +import GHC.Driver.Session +import Outputable +import Unique +import UniqFM +import UniqSupply +import GHC.Cmm.BlockId + + +-- | Used to store the register assignment on entry to a basic block. +-- We use this to handle join points, where multiple branch instructions +-- target a particular label. We have to insert fixup code to make +-- the register assignments from the different sources match up. +-- +type BlockAssignment freeRegs + = BlockMap (freeRegs, RegMap Loc) + + +-- | Where a vreg is currently stored +-- A temporary can be marked as living in both a register and memory +-- (InBoth), for example if it was recently loaded from a spill location. +-- This makes it cheap to spill (no save instruction required), but we +-- have to be careful to turn this into InReg if the value in the +-- register is changed. + +-- This is also useful when a temporary is about to be clobbered. We +-- save it in a spill location, but mark it as InBoth because the current +-- instruction might still want to read it. +-- +data Loc + -- | vreg is in a register + = InReg !RealReg + + -- | vreg is held in a stack slot + | InMem {-# UNPACK #-} !StackSlot + + + -- | vreg is held in both a register and a stack slot + | InBoth !RealReg + {-# UNPACK #-} !StackSlot + deriving (Eq, Show, Ord) + +instance Outputable Loc where + ppr l = text (show l) + + +-- | Get the reg numbers stored in this Loc. 
+regsOfLoc :: Loc -> [RealReg] +regsOfLoc (InReg r) = [r] +regsOfLoc (InBoth r _) = [r] +regsOfLoc (InMem _) = [] + + +-- | Reasons why instructions might be inserted by the spiller. +-- Used when generating stats for -ddrop-asm-stats. +-- +data SpillReason + -- | vreg was spilled to a slot so we could use its + -- current hreg for another vreg + = SpillAlloc !Unique + + -- | vreg was moved because its hreg was clobbered + | SpillClobber !Unique + + -- | vreg was loaded from a spill slot + | SpillLoad !Unique + + -- | reg-reg move inserted during join to targets + | SpillJoinRR !Unique + + -- | reg-mem move inserted during join to targets + | SpillJoinRM !Unique + + +-- | Used to carry interesting stats out of the register allocator. +data RegAllocStats + = RegAllocStats + { ra_spillInstrs :: UniqFM [Int] + , ra_fixupList :: [(BlockId,BlockId,BlockId)] + -- ^ (from,fixup,to) : We inserted fixup code between from and to + } + + +-- | The register allocator state +data RA_State freeRegs + = RA_State + + { + -- | the current mapping from basic blocks to + -- the register assignments at the beginning of that block. + ra_blockassig :: BlockAssignment freeRegs + + -- | free machine registers + , ra_freeregs :: !freeRegs + + -- | assignment of temps to locations + , ra_assig :: RegMap Loc + + -- | current stack delta + , ra_delta :: Int + + -- | free stack slots for spilling + , ra_stack :: StackMap + + -- | unique supply for generating names for join point fixup blocks. + , ra_us :: UniqSupply + + -- | Record why things were spilled, for -ddrop-asm-stats. + -- Just keep a list here instead of a map of regs -> reasons. + -- We don't want to slow down the allocator if we're not going to emit the stats. + , ra_spills :: [SpillReason] + , ra_DynFlags :: DynFlags + + -- | (from,fixup,to) : We inserted fixup code between from and to + , ra_fixups :: [(BlockId,BlockId,BlockId)] } + + diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs new file mode 100644 index 0000000000..0d72d8b6e9 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs @@ -0,0 +1,89 @@ +{-# LANGUAGE CPP #-} + +module GHC.CmmToAsm.Reg.Linear.FreeRegs ( + FR(..), + maxSpillSlots +) + +#include "HsVersions.h" + +where + +import GhcPrelude + +import GHC.Platform.Reg +import GHC.Platform.Reg.Class + +import GHC.Driver.Session +import Panic +import GHC.Platform + +-- ----------------------------------------------------------------------------- +-- The free register set +-- This needs to be *efficient* +-- Here's an inefficient 'executable specification' of the FreeRegs data type: +-- +-- type FreeRegs = [RegNo] +-- noFreeRegs = 0 +-- releaseReg n f = if n `elem` f then f else (n : f) +-- initFreeRegs = allocatableRegs +-- getFreeRegs cls f = filter ( (==cls) . regClass . 
RealReg ) f +-- allocateReg f r = filter (/= r) f + +import qualified GHC.CmmToAsm.Reg.Linear.PPC as PPC +import qualified GHC.CmmToAsm.Reg.Linear.SPARC as SPARC +import qualified GHC.CmmToAsm.Reg.Linear.X86 as X86 +import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64 + +import qualified GHC.CmmToAsm.PPC.Instr as PPC.Instr +import qualified GHC.CmmToAsm.SPARC.Instr as SPARC.Instr +import qualified GHC.CmmToAsm.X86.Instr as X86.Instr + +class Show freeRegs => FR freeRegs where + frAllocateReg :: Platform -> RealReg -> freeRegs -> freeRegs + frGetFreeRegs :: Platform -> RegClass -> freeRegs -> [RealReg] + frInitFreeRegs :: Platform -> freeRegs + frReleaseReg :: Platform -> RealReg -> freeRegs -> freeRegs + +instance FR X86.FreeRegs where + frAllocateReg = \_ -> X86.allocateReg + frGetFreeRegs = X86.getFreeRegs + frInitFreeRegs = X86.initFreeRegs + frReleaseReg = \_ -> X86.releaseReg + +instance FR X86_64.FreeRegs where + frAllocateReg = \_ -> X86_64.allocateReg + frGetFreeRegs = X86_64.getFreeRegs + frInitFreeRegs = X86_64.initFreeRegs + frReleaseReg = \_ -> X86_64.releaseReg + +instance FR PPC.FreeRegs where + frAllocateReg = \_ -> PPC.allocateReg + frGetFreeRegs = \_ -> PPC.getFreeRegs + frInitFreeRegs = PPC.initFreeRegs + frReleaseReg = \_ -> PPC.releaseReg + +instance FR SPARC.FreeRegs where + frAllocateReg = SPARC.allocateReg + frGetFreeRegs = \_ -> SPARC.getFreeRegs + frInitFreeRegs = SPARC.initFreeRegs + frReleaseReg = SPARC.releaseReg + +maxSpillSlots :: DynFlags -> Int +maxSpillSlots dflags + = case platformArch (targetPlatform dflags) of + ArchX86 -> X86.Instr.maxSpillSlots dflags + ArchX86_64 -> X86.Instr.maxSpillSlots dflags + ArchPPC -> PPC.Instr.maxSpillSlots dflags + ArchS390X -> panic "maxSpillSlots ArchS390X" + ArchSPARC -> SPARC.Instr.maxSpillSlots dflags + ArchSPARC64 -> panic "maxSpillSlots ArchSPARC64" + ArchARM _ _ _ -> panic "maxSpillSlots ArchARM" + ArchARM64 -> panic "maxSpillSlots ArchARM64" + ArchPPC_64 _ -> PPC.Instr.maxSpillSlots dflags + ArchAlpha -> panic "maxSpillSlots ArchAlpha" + ArchMipseb -> panic "maxSpillSlots ArchMipseb" + ArchMipsel -> panic "maxSpillSlots ArchMipsel" + ArchJavaScript-> panic "maxSpillSlots ArchJavaScript" + ArchUnknown -> panic "maxSpillSlots ArchUnknown" + diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs new file mode 100644 index 0000000000..b4ad1b948c --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs @@ -0,0 +1,378 @@ +{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} + +-- | Handles joining of a jump instruction to its targets. + +-- The first time we encounter a jump to a particular basic block, we +-- record the assignment of temporaries. The next time we encounter a +-- jump to the same block, we compare our current assignment to the +-- stored one. They might be different if spilling has occurred in one +-- branch; so some fixup code will be required to match up the assignments. 
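-- A rough, commented sketch of that comparison (not code from this module; it
-- ignores the cycle handling and the InBoth locations dealt with further down,
-- and fixupMoves is an illustrative name): viewing an assignment as a map from
-- vreg to location, the fixup code for a repeated jump is one move per vreg
-- whose location on this incoming edge differs from the location recorded the
-- first time the target block was reached.
--
--   fixupMoves :: (Ord vreg, Eq loc) => Map vreg loc -> Map vreg loc -> [(vreg, loc, loc)]
--   fixupMoves here recorded =
--       [ (v, src, dst)
--       | (v, dst) <- Map.toList recorded   -- location expected on entry to the target
--       , Just src <- [Map.lookup v here]   -- location along this incoming edge
--       , src /= dst ]                      -- a move is needed only where they differ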
+-- +module GHC.CmmToAsm.Reg.Linear.JoinToTargets (joinToTargets) where + +import GhcPrelude + +import GHC.CmmToAsm.Reg.Linear.State +import GHC.CmmToAsm.Reg.Linear.Base +import GHC.CmmToAsm.Reg.Linear.FreeRegs +import GHC.CmmToAsm.Reg.Liveness +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg + +import GHC.Cmm.BlockId +import GHC.Cmm.Dataflow.Collections +import Digraph +import GHC.Driver.Session +import Outputable +import Unique +import UniqFM +import UniqSet + +-- | For a jump instruction at the end of a block, generate fixup code so its +-- vregs are in the correct regs for its destination. +-- +joinToTargets + :: (FR freeRegs, Instruction instr, Outputable instr) + => BlockMap RegSet -- ^ maps the unique of the blockid to the set of vregs + -- that are known to be live on the entry to each block. + + -> BlockId -- ^ id of the current block + -> instr -- ^ branch instr on the end of the source block. + + -> RegM freeRegs ([NatBasicBlock instr] -- fresh blocks of fixup code. + , instr) -- the original branch + -- instruction, but maybe + -- patched to jump + -- to a fixup block first. + +joinToTargets block_live id instr + + -- we only need to worry about jump instructions. + | not $ isJumpishInstr instr + = return ([], instr) + + | otherwise + = joinToTargets' block_live [] id instr (jumpDestsOfInstr instr) + +----- +joinToTargets' + :: (FR freeRegs, Instruction instr, Outputable instr) + => BlockMap RegSet -- ^ maps the unique of the blockid to the set of vregs + -- that are known to be live on the entry to each block. + + -> [NatBasicBlock instr] -- ^ acc blocks of fixup code. + + -> BlockId -- ^ id of the current block + -> instr -- ^ branch instr on the end of the source block. + + -> [BlockId] -- ^ branch destinations still to consider. + + -> RegM freeRegs ([NatBasicBlock instr], instr) + +-- no more targets to consider. all done. +joinToTargets' _ new_blocks _ instr [] + = return (new_blocks, instr) + +-- handle a branch target. +joinToTargets' block_live new_blocks block_id instr (dest:dests) + = do + -- get the map of where the vregs are stored on entry to each basic block. + block_assig <- getBlockAssigR + + -- get the assignment on entry to the branch instruction. + assig <- getAssigR + + -- adjust the current assignment to remove any vregs that are not live + -- on entry to the destination block. + let Just live_set = mapLookup dest block_live + let still_live uniq _ = uniq `elemUniqSet_Directly` live_set + let adjusted_assig = filterUFM_Directly still_live assig + + -- and free up those registers which are now free. + let to_free = + [ r | (reg, loc) <- nonDetUFMToList assig + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + , not (elemUniqSet_Directly reg live_set) + , r <- regsOfLoc loc ] + + case mapLookup dest block_assig of + Nothing + -> joinToTargets_first + block_live new_blocks block_id instr dest dests + block_assig adjusted_assig to_free + + Just (_, dest_assig) + -> joinToTargets_again + block_live new_blocks block_id instr dest dests + adjusted_assig dest_assig + + +-- this is the first time we jumped to this block. 
+joinToTargets_first :: (FR freeRegs, Instruction instr, Outputable instr) + => BlockMap RegSet + -> [NatBasicBlock instr] + -> BlockId + -> instr + -> BlockId + -> [BlockId] + -> BlockAssignment freeRegs + -> RegMap Loc + -> [RealReg] + -> RegM freeRegs ([NatBasicBlock instr], instr) +joinToTargets_first block_live new_blocks block_id instr dest dests + block_assig src_assig + to_free + + = do dflags <- getDynFlags + let platform = targetPlatform dflags + + -- free up the regs that are not live on entry to this block. + freeregs <- getFreeRegsR + let freeregs' = foldl' (flip $ frReleaseReg platform) freeregs to_free + + -- remember the current assignment on entry to this block. + setBlockAssigR (mapInsert dest (freeregs', src_assig) block_assig) + + joinToTargets' block_live new_blocks block_id instr dests + + +-- we've jumped to this block before +joinToTargets_again :: (Instruction instr, FR freeRegs, Outputable instr) + => BlockMap RegSet + -> [NatBasicBlock instr] + -> BlockId + -> instr + -> BlockId + -> [BlockId] + -> UniqFM Loc + -> UniqFM Loc + -> RegM freeRegs ([NatBasicBlock instr], instr) +joinToTargets_again + block_live new_blocks block_id instr dest dests + src_assig dest_assig + + -- the assignments already match, no problem. + | nonDetUFMToList dest_assig == nonDetUFMToList src_assig + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + = joinToTargets' block_live new_blocks block_id instr dests + + -- assignments don't match, need fixup code + | otherwise + = do + + -- make a graph of what things need to be moved where. + let graph = makeRegMovementGraph src_assig dest_assig + + -- look for cycles in the graph. This can happen if regs need to be swapped. + -- Note that we depend on the fact that this function does a + -- bottom up traversal of the tree-like portions of the graph. + -- + -- eg, if we have + -- R1 -> R2 -> R3 + -- + -- ie move value in R1 to R2 and value in R2 to R3. + -- + -- We need to do the R2 -> R3 move before R1 -> R2. + -- + let sccs = stronglyConnCompFromEdgedVerticesOrdR graph + + -- debugging + {- + pprTrace + ("joinToTargets: making fixup code") + (vcat [ text " in block: " <> ppr block_id + , text " jmp instruction: " <> ppr instr + , text " src assignment: " <> ppr src_assig + , text " dest assignment: " <> ppr dest_assig + , text " movement graph: " <> ppr graph + , text " sccs of graph: " <> ppr sccs + , text ""]) + (return ()) + -} + delta <- getDeltaR + fixUpInstrs_ <- mapM (handleComponent delta instr) sccs + let fixUpInstrs = concat fixUpInstrs_ + + -- make a new basic block containing the fixup code. + -- A the end of the current block we will jump to the fixup one, + -- then that will jump to our original destination. + fixup_block_id <- mkBlockId <$> getUniqueR + let block = BasicBlock fixup_block_id + $ fixUpInstrs ++ mkJumpInstr dest + + -- if we didn't need any fixups, then don't include the block + case fixUpInstrs of + [] -> joinToTargets' block_live new_blocks block_id instr dests + + -- patch the original branch instruction so it goes to our + -- fixup block instead. + _ -> let instr' = patchJumpInstr instr + (\bid -> if bid == dest + then fixup_block_id + else bid) -- no change! 
+ + in do + {- --debugging + pprTrace "FixUpEdge info:" + ( + text "inBlock:" <> ppr block_id $$ + text "instr:" <> ppr instr $$ + text "instr':" <> ppr instr' $$ + text "fixup_block_id':" <> + ppr fixup_block_id $$ + text "dest:" <> ppr dest + ) (return ()) + -} + recordFixupBlock block_id fixup_block_id dest + joinToTargets' block_live (block : new_blocks) + block_id instr' dests + + +-- | Construct a graph of register\/spill movements. +-- +-- Cyclic components seem to occur only very rarely. +-- +-- We cut some corners by not handling memory-to-memory moves. +-- This shouldn't happen because every temporary gets its own stack slot. +-- +makeRegMovementGraph :: RegMap Loc -> RegMap Loc -> [Node Loc Unique] +makeRegMovementGraph adjusted_assig dest_assig + = [ node | (vreg, src) <- nonDetUFMToList adjusted_assig + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + -- source reg might not be needed at the dest: + , Just loc <- [lookupUFM_Directly dest_assig vreg] + , node <- expandNode vreg src loc ] + + +-- | Expand out the destination, so InBoth destinations turn into +-- a combination of InReg and InMem. + +-- The InBoth handling is a little tricky here. If the destination is +-- InBoth, then we must ensure that the value ends up in both locations. +-- An InBoth destination must conflict with an InReg or InMem source, so +-- we expand an InBoth destination as necessary. +-- +-- An InBoth source is slightly different: we only care about the register +-- that the source value is in, so that we can move it to the destinations. +-- +expandNode + :: a + -> Loc -- ^ source of move + -> Loc -- ^ destination of move + -> [Node Loc a ] + +expandNode vreg loc@(InReg src) (InBoth dst mem) + | src == dst = [DigraphNode vreg loc [InMem mem]] + | otherwise = [DigraphNode vreg loc [InReg dst, InMem mem]] + +expandNode vreg loc@(InMem src) (InBoth dst mem) + | src == mem = [DigraphNode vreg loc [InReg dst]] + | otherwise = [DigraphNode vreg loc [InReg dst, InMem mem]] + +expandNode _ (InBoth _ src) (InMem dst) + | src == dst = [] -- guaranteed to be true + +expandNode _ (InBoth src _) (InReg dst) + | src == dst = [] + +expandNode vreg (InBoth src _) dst + = expandNode vreg (InReg src) dst + +expandNode vreg src dst + | src == dst = [] + | otherwise = [DigraphNode vreg src [dst]] + + +-- | Generate fixup code for a particular component in the move graph +-- This component tells us what values need to be moved to what +-- destinations. We have eliminated any possibility of single-node +-- cycles in expandNode above. +-- +handleComponent + :: Instruction instr + => Int -> instr -> SCC (Node Loc Unique) + -> RegM freeRegs [instr] + +-- If the graph is acyclic then we won't get the swapping problem below. +-- In this case we can just do the moves directly, and avoid having to +-- go via a spill slot. +-- +handleComponent delta _ (AcyclicSCC (DigraphNode vreg src dsts)) + = mapM (makeMove delta vreg src) dsts + + +-- Handle some cyclic moves. +-- This can happen if we have two regs that need to be swapped. +-- eg: +-- vreg source loc dest loc +-- (vreg1, InReg r1, [InReg r2]) +-- (vreg2, InReg r2, [InReg r1]) +-- +-- To avoid needing temp register, we just spill all the source regs, then +-- reaload them into their destination regs. 
+-- +-- Note that we can not have cycles that involve memory locations as +-- sources as single destination because memory locations (stack slots) +-- are allocated exclusively for a virtual register and therefore can not +-- require a fixup. +-- +handleComponent delta instr + (CyclicSCC ((DigraphNode vreg (InReg sreg) ((InReg dreg: _))) : rest)) + -- dest list may have more than one element, if the reg is also InMem. + = do + -- spill the source into its slot + (instrSpill, slot) + <- spillR (RegReal sreg) vreg + + -- reload into destination reg + instrLoad <- loadR (RegReal dreg) slot + + remainingFixUps <- mapM (handleComponent delta instr) + (stronglyConnCompFromEdgedVerticesOrdR rest) + + -- make sure to do all the reloads after all the spills, + -- so we don't end up clobbering the source values. + return ([instrSpill] ++ concat remainingFixUps ++ [instrLoad]) + +handleComponent _ _ (CyclicSCC _) + = panic "Register Allocator: handleComponent cyclic" + + +-- | Move a vreg between these two locations. +-- +makeMove + :: Instruction instr + => Int -- ^ current C stack delta. + -> Unique -- ^ unique of the vreg that we're moving. + -> Loc -- ^ source location. + -> Loc -- ^ destination location. + -> RegM freeRegs instr -- ^ move instruction. + +makeMove delta vreg src dst + = do dflags <- getDynFlags + let platform = targetPlatform dflags + + case (src, dst) of + (InReg s, InReg d) -> + do recordSpill (SpillJoinRR vreg) + return $ mkRegRegMoveInstr platform (RegReal s) (RegReal d) + (InMem s, InReg d) -> + do recordSpill (SpillJoinRM vreg) + return $ mkLoadInstr dflags (RegReal d) delta s + (InReg s, InMem d) -> + do recordSpill (SpillJoinRM vreg) + return $ mkSpillInstr dflags (RegReal s) delta d + _ -> + -- we don't handle memory to memory moves. + -- they shouldn't happen because we don't share + -- stack slots between vregs. + panic ("makeMove " ++ show vreg ++ " (" ++ show src ++ ") (" + ++ show dst ++ ")" + ++ " we don't handle mem->mem moves.") + diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs b/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs new file mode 100644 index 0000000000..ce0a187647 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs @@ -0,0 +1,60 @@ +-- | Free regs map for PowerPC +module GHC.CmmToAsm.Reg.Linear.PPC where + +import GhcPrelude + +import GHC.CmmToAsm.PPC.Regs +import GHC.Platform.Reg.Class +import GHC.Platform.Reg + +import Outputable +import GHC.Platform + +import Data.Word +import Data.Bits + +-- The PowerPC has 32 integer and 32 floating point registers. +-- This is 32bit PowerPC, so Word64 is inefficient - two Word32s are much +-- better. +-- Note that when getFreeRegs scans for free registers, it starts at register +-- 31 and counts down. This is a hack for the PowerPC - the higher-numbered +-- registers are callee-saves, while the lower regs are caller-saves, so it +-- makes sense to start at the high end. +-- Apart from that, the code does nothing PowerPC-specific, so feel free to +-- add your favourite platform to the #if (if you have 64 registers but only +-- 32-bit words). + +data FreeRegs = FreeRegs !Word32 !Word32 + deriving( Show ) -- The Show is used in an ASSERT + +noFreeRegs :: FreeRegs +noFreeRegs = FreeRegs 0 0 + +releaseReg :: RealReg -> FreeRegs -> FreeRegs +releaseReg (RealRegSingle r) (FreeRegs g f) + | r > 31 = FreeRegs g (f .|. (1 `shiftL` (r - 32))) + | otherwise = FreeRegs (g .|. 
(1 `shiftL` r)) f + +releaseReg _ _ + = panic "RegAlloc.Linear.PPC.releaseReg: bad reg" + +initFreeRegs :: Platform -> FreeRegs +initFreeRegs platform = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform) + +getFreeRegs :: RegClass -> FreeRegs -> [RealReg] -- lazily +getFreeRegs cls (FreeRegs g f) + | RcDouble <- cls = go f (0x80000000) 63 + | RcInteger <- cls = go g (0x80000000) 31 + | otherwise = pprPanic "RegAllocLinear.getFreeRegs: Bad register class" (ppr cls) + where + go _ 0 _ = [] + go x m i | x .&. m /= 0 = RealRegSingle i : (go x (m `shiftR` 1) $! i-1) + | otherwise = go x (m `shiftR` 1) $! i-1 + +allocateReg :: RealReg -> FreeRegs -> FreeRegs +allocateReg (RealRegSingle r) (FreeRegs g f) + | r > 31 = FreeRegs g (f .&. complement (1 `shiftL` (r - 32))) + | otherwise = FreeRegs (g .&. complement (1 `shiftL` r)) f + +allocateReg _ _ + = panic "RegAlloc.Linear.PPC.allocateReg: bad reg" diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs b/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs new file mode 100644 index 0000000000..7fa85f0913 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs @@ -0,0 +1,188 @@ +{-# LANGUAGE CPP #-} + +-- | Free regs map for SPARC +module GHC.CmmToAsm.Reg.Linear.SPARC where + +import GhcPrelude + +import GHC.CmmToAsm.SPARC.Regs +import GHC.Platform.Reg.Class +import GHC.Platform.Reg + +import GHC.Platform.Regs +import Outputable +import GHC.Platform + +import Data.Word +import Data.Bits + + +-------------------------------------------------------------------------------- +-- SPARC is like PPC, except for twinning of floating point regs. +-- When we allocate a double reg we must take an even numbered +-- float reg, as well as the one after it. + + +-- Holds bitmaps showing what registers are currently allocated. +-- The float and double reg bitmaps overlap, but we only alloc +-- float regs into the float map, and double regs into the double map. +-- +-- Free regs have a bit set in the corresponding bitmap. +-- +data FreeRegs + = FreeRegs + !Word32 -- int reg bitmap regs 0..31 + !Word32 -- float reg bitmap regs 32..63 + !Word32 -- double reg bitmap regs 32..63 + +instance Show FreeRegs where + show = showFreeRegs + +-- | A reg map where no regs are free to be allocated. +noFreeRegs :: FreeRegs +noFreeRegs = FreeRegs 0 0 0 + + +-- | The initial set of free regs. +initFreeRegs :: Platform -> FreeRegs +initFreeRegs platform + = foldl' (flip $ releaseReg platform) noFreeRegs allocatableRegs + + +-- | Get all the free registers of this class. +getFreeRegs :: RegClass -> FreeRegs -> [RealReg] -- lazily +getFreeRegs cls (FreeRegs g f d) + | RcInteger <- cls = map RealRegSingle $ go 1 g 1 0 + | RcFloat <- cls = map RealRegSingle $ go 1 f 1 32 + | RcDouble <- cls = map (\i -> RealRegPair i (i+1)) $ go 2 d 1 32 +#if __GLASGOW_HASKELL__ <= 810 + | otherwise = pprPanic "RegAllocLinear.getFreeRegs: Bad register class " (ppr cls) +#endif + where + go _ _ 0 _ + = [] + + go step bitmap mask ix + | bitmap .&. mask /= 0 + = ix : (go step bitmap (mask `shiftL` step) $! ix + step) + + | otherwise + = go step bitmap (mask `shiftL` step) $! ix + step + + +-- | Grab a register. +allocateReg :: Platform -> RealReg -> FreeRegs -> FreeRegs +allocateReg platform + reg@(RealRegSingle r) + (FreeRegs g f d) + + -- can't allocate free regs + | not $ freeReg platform r + = pprPanic "SPARC.FreeRegs.allocateReg: not allocating pinned reg" (ppr reg) + + -- a general purpose reg + | r <= 31 + = let mask = complement (bitMask r) + in FreeRegs + (g .&. 
mask) + f + d + + -- a float reg + | r >= 32, r <= 63 + = let mask = complement (bitMask (r - 32)) + + -- the mask of the double this FP reg aliases + maskLow = if r `mod` 2 == 0 + then complement (bitMask (r - 32)) + else complement (bitMask (r - 32 - 1)) + in FreeRegs + g + (f .&. mask) + (d .&. maskLow) + + | otherwise + = pprPanic "SPARC.FreeRegs.releaseReg: not allocating bad reg" (ppr reg) + +allocateReg _ + reg@(RealRegPair r1 r2) + (FreeRegs g f d) + + | r1 >= 32, r1 <= 63, r1 `mod` 2 == 0 + , r2 >= 32, r2 <= 63 + = let mask1 = complement (bitMask (r1 - 32)) + mask2 = complement (bitMask (r2 - 32)) + in + FreeRegs + g + ((f .&. mask1) .&. mask2) + (d .&. mask1) + + | otherwise + = pprPanic "SPARC.FreeRegs.releaseReg: not allocating bad reg" (ppr reg) + + + +-- | Release a register from allocation. +-- The register liveness information says that most regs die after a C call, +-- but we still don't want to allocate to some of them. +-- +releaseReg :: Platform -> RealReg -> FreeRegs -> FreeRegs +releaseReg platform + reg@(RealRegSingle r) + regs@(FreeRegs g f d) + + -- don't release pinned reg + | not $ freeReg platform r + = regs + + -- a general purpose reg + | r <= 31 + = let mask = bitMask r + in FreeRegs (g .|. mask) f d + + -- a float reg + | r >= 32, r <= 63 + = let mask = bitMask (r - 32) + + -- the mask of the double this FP reg aliases + maskLow = if r `mod` 2 == 0 + then bitMask (r - 32) + else bitMask (r - 32 - 1) + in FreeRegs + g + (f .|. mask) + (d .|. maskLow) + + | otherwise + = pprPanic "SPARC.FreeRegs.releaseReg: not releasing bad reg" (ppr reg) + +releaseReg _ + reg@(RealRegPair r1 r2) + (FreeRegs g f d) + + | r1 >= 32, r1 <= 63, r1 `mod` 2 == 0 + , r2 >= 32, r2 <= 63 + = let mask1 = bitMask (r1 - 32) + mask2 = bitMask (r2 - 32) + in + FreeRegs + g + ((f .|. mask1) .|. mask2) + (d .|. mask1) + + | otherwise + = pprPanic "SPARC.FreeRegs.releaseReg: not releasing bad reg" (ppr reg) + + + +bitMask :: Int -> Word32 +bitMask n = 1 `shiftL` n + + +showFreeRegs :: FreeRegs -> String +showFreeRegs regs + = "FreeRegs\n" + ++ " integer: " ++ (show $ getFreeRegs RcInteger regs) ++ "\n" + ++ " float: " ++ (show $ getFreeRegs RcFloat regs) ++ "\n" + ++ " double: " ++ (show $ getFreeRegs RcDouble regs) ++ "\n" diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs b/compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs new file mode 100644 index 0000000000..630b101fc7 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs @@ -0,0 +1,61 @@ + +-- | The assignment of virtual registers to stack slots + +-- We have lots of stack slots. Memory-to-memory moves are a pain on most +-- architectures. Therefore, we avoid having to generate memory-to-memory moves +-- by simply giving every virtual register its own stack slot. + +-- The StackMap stack map keeps track of virtual register - stack slot +-- associations and of which stack slots are still free. Once it has been +-- associated, a stack slot is never "freed" or removed from the StackMap again, +-- it remains associated until we are done with the current CmmProc. +-- +module GHC.CmmToAsm.Reg.Linear.StackMap ( + StackSlot, + StackMap(..), + emptyStackMap, + getStackSlotFor, + getStackUse +) + +where + +import GhcPrelude + +import GHC.Driver.Session +import UniqFM +import Unique + + +-- | Identifier for a stack slot. +type StackSlot = Int + +data StackMap + = StackMap + { -- | The slots that are still available to be allocated. + stackMapNextFreeSlot :: !Int + + -- | Assignment of vregs to stack slots. 
+ , stackMapAssignment :: UniqFM StackSlot } + + +-- | An empty stack map, with all slots available. +emptyStackMap :: DynFlags -> StackMap +emptyStackMap _ = StackMap 0 emptyUFM + + +-- | If this vreg unique already has a stack assignment then return the slot number, +-- otherwise allocate a new slot, and update the map. +-- +getStackSlotFor :: StackMap -> Unique -> (StackMap, Int) + +getStackSlotFor fs@(StackMap _ reserved) reg + | Just slot <- lookupUFM reserved reg = (fs, slot) + +getStackSlotFor (StackMap freeSlot reserved) reg = + (StackMap (freeSlot+1) (addToUFM reserved reg freeSlot), freeSlot) + +-- | Return the number of stack slots that were allocated +getStackUse :: StackMap -> Int +getStackUse (StackMap freeSlot _) = freeSlot + diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs new file mode 100644 index 0000000000..a167cc7e00 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs @@ -0,0 +1,184 @@ +{-# LANGUAGE CPP, PatternSynonyms, DeriveFunctor #-} + +#if !defined(GHC_LOADED_INTO_GHCI) +{-# LANGUAGE UnboxedTuples #-} +#endif + +-- | State monad for the linear register allocator. + +-- Here we keep all the state that the register allocator keeps track +-- of as it walks the instructions in a basic block. + +module GHC.CmmToAsm.Reg.Linear.State ( + RA_State(..), + RegM, + runR, + + spillR, + loadR, + + getFreeRegsR, + setFreeRegsR, + + getAssigR, + setAssigR, + + getBlockAssigR, + setBlockAssigR, + + setDeltaR, + getDeltaR, + + getUniqueR, + + recordSpill, + recordFixupBlock +) +where + +import GhcPrelude + +import GHC.CmmToAsm.Reg.Linear.Stats +import GHC.CmmToAsm.Reg.Linear.StackMap +import GHC.CmmToAsm.Reg.Linear.Base +import GHC.CmmToAsm.Reg.Liveness +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg +import GHC.Cmm.BlockId + +import GHC.Driver.Session +import Unique +import UniqSupply + +import Control.Monad (ap) + +-- Avoids using unboxed tuples when loading into GHCi +#if !defined(GHC_LOADED_INTO_GHCI) + +type RA_Result freeRegs a = (# RA_State freeRegs, a #) + +pattern RA_Result :: a -> b -> (# a, b #) +pattern RA_Result a b = (# a, b #) +{-# COMPLETE RA_Result #-} +#else + +data RA_Result freeRegs a = RA_Result {-# UNPACK #-} !(RA_State freeRegs) !a + deriving (Functor) + +#endif + +-- | The register allocator monad type. +newtype RegM freeRegs a + = RegM { unReg :: RA_State freeRegs -> RA_Result freeRegs a } + deriving (Functor) + +instance Applicative (RegM freeRegs) where + pure a = RegM $ \s -> RA_Result s a + (<*>) = ap + +instance Monad (RegM freeRegs) where + m >>= k = RegM $ \s -> case unReg m s of { RA_Result s a -> unReg (k a) s } + +instance HasDynFlags (RegM a) where + getDynFlags = RegM $ \s -> RA_Result s (ra_DynFlags s) + + +-- | Run a computation in the RegM register allocator monad. +runR :: DynFlags + -> BlockAssignment freeRegs + -> freeRegs + -> RegMap Loc + -> StackMap + -> UniqSupply + -> RegM freeRegs a + -> (BlockAssignment freeRegs, StackMap, RegAllocStats, a) + +runR dflags block_assig freeregs assig stack us thing = + case unReg thing + (RA_State + { ra_blockassig = block_assig + , ra_freeregs = freeregs + , ra_assig = assig + , ra_delta = 0{-???-} + , ra_stack = stack + , ra_us = us + , ra_spills = [] + , ra_DynFlags = dflags + , ra_fixups = [] }) + of + RA_Result state returned_thing + -> (ra_blockassig state, ra_stack state, makeRAStats state, returned_thing) + + +-- | Make register allocator stats from its final state. 
+makeRAStats :: RA_State freeRegs -> RegAllocStats +makeRAStats state + = RegAllocStats + { ra_spillInstrs = binSpillReasons (ra_spills state) + , ra_fixupList = ra_fixups state } + + +spillR :: Instruction instr + => Reg -> Unique -> RegM freeRegs (instr, Int) + +spillR reg temp = RegM $ \ s@RA_State{ra_delta=delta, ra_stack=stack0} -> + let dflags = ra_DynFlags s + (stack1,slot) = getStackSlotFor stack0 temp + instr = mkSpillInstr dflags reg delta slot + in + RA_Result s{ra_stack=stack1} (instr,slot) + + +loadR :: Instruction instr + => Reg -> Int -> RegM freeRegs instr + +loadR reg slot = RegM $ \ s@RA_State{ra_delta=delta} -> + let dflags = ra_DynFlags s + in RA_Result s (mkLoadInstr dflags reg delta slot) + +getFreeRegsR :: RegM freeRegs freeRegs +getFreeRegsR = RegM $ \ s@RA_State{ra_freeregs = freeregs} -> + RA_Result s freeregs + +setFreeRegsR :: freeRegs -> RegM freeRegs () +setFreeRegsR regs = RegM $ \ s -> + RA_Result s{ra_freeregs = regs} () + +getAssigR :: RegM freeRegs (RegMap Loc) +getAssigR = RegM $ \ s@RA_State{ra_assig = assig} -> + RA_Result s assig + +setAssigR :: RegMap Loc -> RegM freeRegs () +setAssigR assig = RegM $ \ s -> + RA_Result s{ra_assig=assig} () + +getBlockAssigR :: RegM freeRegs (BlockAssignment freeRegs) +getBlockAssigR = RegM $ \ s@RA_State{ra_blockassig = assig} -> + RA_Result s assig + +setBlockAssigR :: BlockAssignment freeRegs -> RegM freeRegs () +setBlockAssigR assig = RegM $ \ s -> + RA_Result s{ra_blockassig = assig} () + +setDeltaR :: Int -> RegM freeRegs () +setDeltaR n = RegM $ \ s -> + RA_Result s{ra_delta = n} () + +getDeltaR :: RegM freeRegs Int +getDeltaR = RegM $ \s -> RA_Result s (ra_delta s) + +getUniqueR :: RegM freeRegs Unique +getUniqueR = RegM $ \s -> + case takeUniqFromSupply (ra_us s) of + (uniq, us) -> RA_Result s{ra_us = us} uniq + + +-- | Record that a spill instruction was inserted, for profiling. +recordSpill :: SpillReason -> RegM freeRegs () +recordSpill spill + = RegM $ \s -> RA_Result (s { ra_spills = spill : ra_spills s }) () + +-- | Record a created fixup block +recordFixupBlock :: BlockId -> BlockId -> BlockId -> RegM freeRegs () +recordFixupBlock from between to + = RegM $ \s -> RA_Result (s { ra_fixups = (from,between,to) : ra_fixups s }) () diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs b/compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs new file mode 100644 index 0000000000..1176b220a3 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs @@ -0,0 +1,87 @@ +module GHC.CmmToAsm.Reg.Linear.Stats ( + binSpillReasons, + countRegRegMovesNat, + pprStats +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.Reg.Linear.Base +import GHC.CmmToAsm.Reg.Liveness +import GHC.CmmToAsm.Instr + +import UniqFM +import Outputable + +import State + +-- | Build a map of how many times each reg was alloced, clobbered, loaded etc. +binSpillReasons + :: [SpillReason] -> UniqFM [Int] + +binSpillReasons reasons + = addListToUFM_C + (zipWith (+)) + emptyUFM + (map (\reason -> case reason of + SpillAlloc r -> (r, [1, 0, 0, 0, 0]) + SpillClobber r -> (r, [0, 1, 0, 0, 0]) + SpillLoad r -> (r, [0, 0, 1, 0, 0]) + SpillJoinRR r -> (r, [0, 0, 0, 1, 0]) + SpillJoinRM r -> (r, [0, 0, 0, 0, 1])) reasons) + + +-- | Count reg-reg moves remaining in this code. 
+countRegRegMovesNat + :: Instruction instr + => NatCmmDecl statics instr -> Int + +countRegRegMovesNat cmm + = execState (mapGenBlockTopM countBlock cmm) 0 + where + countBlock b@(BasicBlock _ instrs) + = do mapM_ countInstr instrs + return b + + countInstr instr + | Just _ <- takeRegRegMoveInstr instr + = do modify (+ 1) + return instr + + | otherwise + = return instr + + +-- | Pretty print some RegAllocStats +pprStats + :: Instruction instr + => [NatCmmDecl statics instr] -> [RegAllocStats] -> SDoc + +pprStats code statss + = let -- sum up all the instrs inserted by the spiller + spills = foldl' (plusUFM_C (zipWith (+))) + emptyUFM + $ map ra_spillInstrs statss + + spillTotals = foldl' (zipWith (+)) + [0, 0, 0, 0, 0] + $ nonDetEltsUFM spills + -- See Note [Unique Determinism and code generation] + + -- count how many reg-reg-moves remain in the code + moves = sum $ map countRegRegMovesNat code + + pprSpill (reg, spills) + = parens $ (hcat $ punctuate (text ", ") (doubleQuotes (ppr reg) : map ppr spills)) + + in ( text "-- spills-added-total" + $$ text "-- (allocs, clobbers, loads, joinRR, joinRM, reg_reg_moves_remaining)" + $$ (parens $ (hcat $ punctuate (text ", ") (map ppr spillTotals ++ [ppr moves]))) + $$ text "" + $$ text "-- spills-added" + $$ text "-- (reg_name, allocs, clobbers, loads, joinRR, joinRM)" + $$ (pprUFMWithKeys spills (vcat . map pprSpill)) + $$ text "") + diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs b/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs new file mode 100644 index 0000000000..ce103bd6b2 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs @@ -0,0 +1,52 @@ + +-- | Free regs map for i386 +module GHC.CmmToAsm.Reg.Linear.X86 where + +import GhcPrelude + +import GHC.CmmToAsm.X86.Regs +import GHC.Platform.Reg.Class +import GHC.Platform.Reg +import Panic +import GHC.Platform + +import Data.Word +import Data.Bits + +newtype FreeRegs = FreeRegs Word32 + deriving Show + +noFreeRegs :: FreeRegs +noFreeRegs = FreeRegs 0 + +releaseReg :: RealReg -> FreeRegs -> FreeRegs +releaseReg (RealRegSingle n) (FreeRegs f) + = FreeRegs (f .|. (1 `shiftL` n)) + +releaseReg _ _ + = panic "RegAlloc.Linear.X86.FreeRegs.releaseReg: no reg" + +initFreeRegs :: Platform -> FreeRegs +initFreeRegs platform + = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform) + +getFreeRegs :: Platform -> RegClass -> FreeRegs -> [RealReg] -- lazily +getFreeRegs platform cls (FreeRegs f) = go f 0 + + where go 0 _ = [] + go n m + | n .&. 1 /= 0 && classOfRealReg platform (RealRegSingle m) == cls + = RealRegSingle m : (go (n `shiftR` 1) $! (m+1)) + + | otherwise + = go (n `shiftR` 1) $! (m+1) + -- ToDo: there's no point looking through all the integer registers + -- in order to find a floating-point one. + +allocateReg :: RealReg -> FreeRegs -> FreeRegs +allocateReg (RealRegSingle r) (FreeRegs f) + = FreeRegs (f .&. 
complement (1 `shiftL` r)) + +allocateReg _ _ + = panic "RegAlloc.Linear.X86.FreeRegs.allocateReg: no reg" + diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs b/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs new file mode 100644 index 0000000000..322ddd6bdd --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs @@ -0,0 +1,53 @@ + +-- | Free regs map for x86_64 +module GHC.CmmToAsm.Reg.Linear.X86_64 where + +import GhcPrelude + +import GHC.CmmToAsm.X86.Regs +import GHC.Platform.Reg.Class +import GHC.Platform.Reg +import Panic +import GHC.Platform + +import Data.Word +import Data.Bits + +newtype FreeRegs = FreeRegs Word64 + deriving Show + +noFreeRegs :: FreeRegs +noFreeRegs = FreeRegs 0 + +releaseReg :: RealReg -> FreeRegs -> FreeRegs +releaseReg (RealRegSingle n) (FreeRegs f) + = FreeRegs (f .|. (1 `shiftL` n)) + +releaseReg _ _ + = panic "RegAlloc.Linear.X86_64.FreeRegs.releaseReg: no reg" + +initFreeRegs :: Platform -> FreeRegs +initFreeRegs platform + = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform) + +getFreeRegs :: Platform -> RegClass -> FreeRegs -> [RealReg] -- lazily +getFreeRegs platform cls (FreeRegs f) = go f 0 + + where go 0 _ = [] + go n m + | n .&. 1 /= 0 && classOfRealReg platform (RealRegSingle m) == cls + = RealRegSingle m : (go (n `shiftR` 1) $! (m+1)) + + | otherwise + = go (n `shiftR` 1) $! (m+1) + -- ToDo: there's no point looking through all the integer registers + -- in order to find a floating-point one. + +allocateReg :: RealReg -> FreeRegs -> FreeRegs +allocateReg (RealRegSingle r) (FreeRegs f) + = FreeRegs (f .&. complement (1 `shiftL` r)) + +allocateReg _ _ + = panic "RegAlloc.Linear.X86_64.FreeRegs.allocateReg: no reg" + + diff --git a/compiler/GHC/CmmToAsm/Reg/Liveness.hs b/compiler/GHC/CmmToAsm/Reg/Liveness.hs new file mode 100644 index 0000000000..03b8123f93 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Liveness.hs @@ -0,0 +1,1025 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeFamilies #-} + +{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} + +----------------------------------------------------------------------------- +-- +-- The register liveness determinator +-- +-- (c) The University of Glasgow 2004-2013 +-- +----------------------------------------------------------------------------- + +module GHC.CmmToAsm.Reg.Liveness ( + RegSet, + RegMap, emptyRegMap, + BlockMap, mapEmpty, + LiveCmmDecl, + InstrSR (..), + LiveInstr (..), + Liveness (..), + LiveInfo (..), + LiveBasicBlock, + + mapBlockTop, mapBlockTopM, mapSCCM, + mapGenBlockTop, mapGenBlockTopM, + stripLive, + stripLiveBlock, + slurpConflicts, + slurpReloadCoalesce, + eraseDeltasLive, + patchEraseLive, + patchRegsLiveInstr, + reverseBlocksInTops, + regLiveness, + cmmTopLiveness + ) where +import GhcPrelude + +import GHC.Platform.Reg +import GHC.CmmToAsm.Instr + +import GHC.Cmm.BlockId +import GHC.CmmToAsm.CFG +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import GHC.Cmm hiding (RegSet, emptyRegSet) + +import Digraph +import GHC.Driver.Session +import MonadUtils +import Outputable +import GHC.Platform +import UniqSet +import UniqFM +import UniqSupply +import Bag +import State + +import Data.List +import Data.Maybe +import Data.IntSet (IntSet) + +----------------------------------------------------------------------------- +type RegSet = UniqSet Reg + +type RegMap a = UniqFM a + +emptyRegMap :: UniqFM a +emptyRegMap = emptyUFM + +emptyRegSet :: RegSet +emptyRegSet = 
emptyUniqSet + +type BlockMap a = LabelMap a + + +-- | A top level thing which carries liveness information. +type LiveCmmDecl statics instr + = GenCmmDecl + statics + LiveInfo + [SCC (LiveBasicBlock instr)] + + +-- | The register allocator also wants to use SPILL/RELOAD meta instructions, +-- so we'll keep those here. +data InstrSR instr + -- | A real machine instruction + = Instr instr + + -- | spill this reg to a stack slot + | SPILL Reg Int + + -- | reload this reg from a stack slot + | RELOAD Int Reg + +instance Instruction instr => Instruction (InstrSR instr) where + regUsageOfInstr platform i + = case i of + Instr instr -> regUsageOfInstr platform instr + SPILL reg _ -> RU [reg] [] + RELOAD _ reg -> RU [] [reg] + + patchRegsOfInstr i f + = case i of + Instr instr -> Instr (patchRegsOfInstr instr f) + SPILL reg slot -> SPILL (f reg) slot + RELOAD slot reg -> RELOAD slot (f reg) + + isJumpishInstr i + = case i of + Instr instr -> isJumpishInstr instr + _ -> False + + jumpDestsOfInstr i + = case i of + Instr instr -> jumpDestsOfInstr instr + _ -> [] + + patchJumpInstr i f + = case i of + Instr instr -> Instr (patchJumpInstr instr f) + _ -> i + + mkSpillInstr = error "mkSpillInstr[InstrSR]: Not making SPILL meta-instr" + mkLoadInstr = error "mkLoadInstr[InstrSR]: Not making LOAD meta-instr" + + takeDeltaInstr i + = case i of + Instr instr -> takeDeltaInstr instr + _ -> Nothing + + isMetaInstr i + = case i of + Instr instr -> isMetaInstr instr + _ -> False + + mkRegRegMoveInstr platform r1 r2 + = Instr (mkRegRegMoveInstr platform r1 r2) + + takeRegRegMoveInstr i + = case i of + Instr instr -> takeRegRegMoveInstr instr + _ -> Nothing + + mkJumpInstr target = map Instr (mkJumpInstr target) + + mkStackAllocInstr platform amount = + Instr <$> mkStackAllocInstr platform amount + + mkStackDeallocInstr platform amount = + Instr <$> mkStackDeallocInstr platform amount + + +-- | An instruction with liveness information. +data LiveInstr instr + = LiveInstr (InstrSR instr) (Maybe Liveness) + +-- | Liveness information. +-- The regs which die are ones which are no longer live in the *next* instruction +-- in this sequence. +-- (NB. if the instruction is a jump, these registers might still be live +-- at the jump target(s) - you have to check the liveness at the destination +-- block to find out). + +data Liveness + = Liveness + { liveBorn :: RegSet -- ^ registers born in this instruction (written to for first time). + , liveDieRead :: RegSet -- ^ registers that died because they were read for the last time. + , liveDieWrite :: RegSet } -- ^ registers that died because they were clobbered by something. + + +-- | Stash regs live on entry to each basic block in the info part of the cmm code. +data LiveInfo + = LiveInfo + (LabelMap RawCmmStatics) -- cmm info table static stuff + [BlockId] -- entry points (first one is the + -- entry point for the proc). + (BlockMap RegSet) -- argument locals live on entry to this block + (BlockMap IntSet) -- stack slots live on entry to this block + + +-- | A basic block with liveness information. 
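+-- Each instruction in the block carries a 'Liveness' annotation.  As an
+-- illustration only (register names invented), the Outputable instances
+-- below render an annotated instruction roughly as:
+--
+-- >    <instr reading %vI_a, writing %vI_b>
+-- >            # born:    %vI_b
+-- >            # r_dying: %vI_a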
+type LiveBasicBlock instr + = GenBasicBlock (LiveInstr instr) + + +instance Outputable instr + => Outputable (InstrSR instr) where + + ppr (Instr realInstr) + = ppr realInstr + + ppr (SPILL reg slot) + = hcat [ + text "\tSPILL", + char ' ', + ppr reg, + comma, + text "SLOT" <> parens (int slot)] + + ppr (RELOAD slot reg) + = hcat [ + text "\tRELOAD", + char ' ', + text "SLOT" <> parens (int slot), + comma, + ppr reg] + +instance Outputable instr + => Outputable (LiveInstr instr) where + + ppr (LiveInstr instr Nothing) + = ppr instr + + ppr (LiveInstr instr (Just live)) + = ppr instr + $$ (nest 8 + $ vcat + [ pprRegs (text "# born: ") (liveBorn live) + , pprRegs (text "# r_dying: ") (liveDieRead live) + , pprRegs (text "# w_dying: ") (liveDieWrite live) ] + $+$ space) + + where pprRegs :: SDoc -> RegSet -> SDoc + pprRegs name regs + | isEmptyUniqSet regs = empty + | otherwise = name <> + (pprUFM (getUniqSet regs) (hcat . punctuate space . map ppr)) + +instance Outputable LiveInfo where + ppr (LiveInfo mb_static entryIds liveVRegsOnEntry liveSlotsOnEntry) + = (ppr mb_static) + $$ text "# entryIds = " <> ppr entryIds + $$ text "# liveVRegsOnEntry = " <> ppr liveVRegsOnEntry + $$ text "# liveSlotsOnEntry = " <> text (show liveSlotsOnEntry) + + + +-- | map a function across all the basic blocks in this code +-- +mapBlockTop + :: (LiveBasicBlock instr -> LiveBasicBlock instr) + -> LiveCmmDecl statics instr -> LiveCmmDecl statics instr + +mapBlockTop f cmm + = evalState (mapBlockTopM (\x -> return $ f x) cmm) () + + +-- | map a function across all the basic blocks in this code (monadic version) +-- +mapBlockTopM + :: Monad m + => (LiveBasicBlock instr -> m (LiveBasicBlock instr)) + -> LiveCmmDecl statics instr -> m (LiveCmmDecl statics instr) + +mapBlockTopM _ cmm@(CmmData{}) + = return cmm + +mapBlockTopM f (CmmProc header label live sccs) + = do sccs' <- mapM (mapSCCM f) sccs + return $ CmmProc header label live sccs' + +mapSCCM :: Monad m => (a -> m b) -> SCC a -> m (SCC b) +mapSCCM f (AcyclicSCC x) + = do x' <- f x + return $ AcyclicSCC x' + +mapSCCM f (CyclicSCC xs) + = do xs' <- mapM f xs + return $ CyclicSCC xs' + + +-- map a function across all the basic blocks in this code +mapGenBlockTop + :: (GenBasicBlock i -> GenBasicBlock i) + -> (GenCmmDecl d h (ListGraph i) -> GenCmmDecl d h (ListGraph i)) + +mapGenBlockTop f cmm + = evalState (mapGenBlockTopM (\x -> return $ f x) cmm) () + + +-- | map a function across all the basic blocks in this code (monadic version) +mapGenBlockTopM + :: Monad m + => (GenBasicBlock i -> m (GenBasicBlock i)) + -> (GenCmmDecl d h (ListGraph i) -> m (GenCmmDecl d h (ListGraph i))) + +mapGenBlockTopM _ cmm@(CmmData{}) + = return cmm + +mapGenBlockTopM f (CmmProc header label live (ListGraph blocks)) + = do blocks' <- mapM f blocks + return $ CmmProc header label live (ListGraph blocks') + + +-- | Slurp out the list of register conflicts and reg-reg moves from this top level thing. +-- Slurping of conflicts and moves is wrapped up together so we don't have +-- to make two passes over the same code when we want to build the graph. 
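+--
+-- As a sketch of the result shape (illustrative only): every element of the
+-- first bag is a set of registers that are live at the same point, and every
+-- element of the second bag is a pair of registers taken from a reg-reg move:
+--
+-- > ( unitBag (mkUniqSet [r1, r2, r3])   -- r1, r2 and r3 conflict
+-- > , unitBag (r1, r4) )                 -- move between r1 and r4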
+-- +slurpConflicts + :: Instruction instr + => LiveCmmDecl statics instr + -> (Bag (UniqSet Reg), Bag (Reg, Reg)) + +slurpConflicts live + = slurpCmm (emptyBag, emptyBag) live + + where slurpCmm rs CmmData{} = rs + slurpCmm rs (CmmProc info _ _ sccs) + = foldl' (slurpSCC info) rs sccs + + slurpSCC info rs (AcyclicSCC b) + = slurpBlock info rs b + + slurpSCC info rs (CyclicSCC bs) + = foldl' (slurpBlock info) rs bs + + slurpBlock info rs (BasicBlock blockId instrs) + | LiveInfo _ _ blockLive _ <- info + , Just rsLiveEntry <- mapLookup blockId blockLive + , (conflicts, moves) <- slurpLIs rsLiveEntry rs instrs + = (consBag rsLiveEntry conflicts, moves) + + | otherwise + = panic "Liveness.slurpConflicts: bad block" + + slurpLIs rsLive (conflicts, moves) [] + = (consBag rsLive conflicts, moves) + + slurpLIs rsLive rs (LiveInstr _ Nothing : lis) + = slurpLIs rsLive rs lis + + slurpLIs rsLiveEntry (conflicts, moves) (LiveInstr instr (Just live) : lis) + = let + -- regs that die because they are read for the last time at the start of an instruction + -- are not live across it. + rsLiveAcross = rsLiveEntry `minusUniqSet` (liveDieRead live) + + -- regs live on entry to the next instruction. + -- be careful of orphans, make sure to delete dying regs _after_ unioning + -- in the ones that are born here. + rsLiveNext = (rsLiveAcross `unionUniqSets` (liveBorn live)) + `minusUniqSet` (liveDieWrite live) + + -- orphan vregs are the ones that die in the same instruction they are born in. + -- these are likely to be results that are never used, but we still + -- need to assign a hreg to them.. + rsOrphans = intersectUniqSets + (liveBorn live) + (unionUniqSets (liveDieWrite live) (liveDieRead live)) + + -- + rsConflicts = unionUniqSets rsLiveNext rsOrphans + + in case takeRegRegMoveInstr instr of + Just rr -> slurpLIs rsLiveNext + ( consBag rsConflicts conflicts + , consBag rr moves) lis + + Nothing -> slurpLIs rsLiveNext + ( consBag rsConflicts conflicts + , moves) lis + + +-- | For spill\/reloads +-- +-- SPILL v1, slot1 +-- ... +-- RELOAD slot1, v2 +-- +-- If we can arrange that v1 and v2 are allocated to the same hreg it's more likely +-- the spill\/reload instrs can be cleaned and replaced by a nop reg-reg move. +-- +-- +slurpReloadCoalesce + :: forall statics instr. Instruction instr + => LiveCmmDecl statics instr + -> Bag (Reg, Reg) + +slurpReloadCoalesce live + = slurpCmm emptyBag live + + where + slurpCmm :: Bag (Reg, Reg) + -> GenCmmDecl t t1 [SCC (LiveBasicBlock instr)] + -> Bag (Reg, Reg) + slurpCmm cs CmmData{} = cs + slurpCmm cs (CmmProc _ _ _ sccs) + = slurpComp cs (flattenSCCs sccs) + + slurpComp :: Bag (Reg, Reg) + -> [LiveBasicBlock instr] + -> Bag (Reg, Reg) + slurpComp cs blocks + = let (moveBags, _) = runState (slurpCompM blocks) emptyUFM + in unionManyBags (cs : moveBags) + + slurpCompM :: [LiveBasicBlock instr] + -> State (UniqFM [UniqFM Reg]) [Bag (Reg, Reg)] + slurpCompM blocks + = do -- run the analysis once to record the mapping across jumps. + mapM_ (slurpBlock False) blocks + + -- run it a second time while using the information from the last pass. + -- We /could/ run this many more times to deal with graphical control + -- flow and propagating info across multiple jumps, but it's probably + -- not worth the trouble. 
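+              -- (Illustrative note: on this second pass, getSlotMap merges the
+              --  slot maps recorded at every in-edge of a block, keeping only
+              --  the slot -> reg entries on which all in-edges agree, so a
+              --  coalescing edge is only suggested when the reloaded slot has
+              --  a single known source register.)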
+ mapM (slurpBlock True) blocks + + slurpBlock :: Bool -> LiveBasicBlock instr + -> State (UniqFM [UniqFM Reg]) (Bag (Reg, Reg)) + slurpBlock propagate (BasicBlock blockId instrs) + = do -- grab the slot map for entry to this block + slotMap <- if propagate + then getSlotMap blockId + else return emptyUFM + + (_, mMoves) <- mapAccumLM slurpLI slotMap instrs + return $ listToBag $ catMaybes mMoves + + slurpLI :: UniqFM Reg -- current slotMap + -> LiveInstr instr + -> State (UniqFM [UniqFM Reg]) -- blockId -> [slot -> reg] + -- for tracking slotMaps across jumps + + ( UniqFM Reg -- new slotMap + , Maybe (Reg, Reg)) -- maybe a new coalesce edge + + slurpLI slotMap li + + -- remember what reg was stored into the slot + | LiveInstr (SPILL reg slot) _ <- li + , slotMap' <- addToUFM slotMap slot reg + = return (slotMap', Nothing) + + -- add an edge between the this reg and the last one stored into the slot + | LiveInstr (RELOAD slot reg) _ <- li + = case lookupUFM slotMap slot of + Just reg2 + | reg /= reg2 -> return (slotMap, Just (reg, reg2)) + | otherwise -> return (slotMap, Nothing) + + Nothing -> return (slotMap, Nothing) + + -- if we hit a jump, remember the current slotMap + | LiveInstr (Instr instr) _ <- li + , targets <- jumpDestsOfInstr instr + , not $ null targets + = do mapM_ (accSlotMap slotMap) targets + return (slotMap, Nothing) + + | otherwise + = return (slotMap, Nothing) + + -- record a slotmap for an in edge to this block + accSlotMap slotMap blockId + = modify (\s -> addToUFM_C (++) s blockId [slotMap]) + + -- work out the slot map on entry to this block + -- if we have slot maps for multiple in-edges then we need to merge them. + getSlotMap blockId + = do map <- get + let slotMaps = fromMaybe [] (lookupUFM map blockId) + return $ foldr mergeSlotMaps emptyUFM slotMaps + + mergeSlotMaps :: UniqFM Reg -> UniqFM Reg -> UniqFM Reg + mergeSlotMaps map1 map2 + = listToUFM + $ [ (k, r1) + | (k, r1) <- nonDetUFMToList map1 + -- This is non-deterministic but we do not + -- currently support deterministic code-generation. + -- See Note [Unique Determinism and code generation] + , case lookupUFM map2 k of + Nothing -> False + Just r2 -> r1 == r2 ] + + +-- | Strip away liveness information, yielding NatCmmDecl +stripLive + :: (Outputable statics, Outputable instr, Instruction instr) + => DynFlags + -> LiveCmmDecl statics instr + -> NatCmmDecl statics instr + +stripLive dflags live + = stripCmm live + + where stripCmm :: (Outputable statics, Outputable instr, Instruction instr) + => LiveCmmDecl statics instr -> NatCmmDecl statics instr + stripCmm (CmmData sec ds) = CmmData sec ds + stripCmm (CmmProc (LiveInfo info (first_id:_) _ _) label live sccs) + = let final_blocks = flattenSCCs sccs + + -- make sure the block that was first in the input list + -- stays at the front of the output. This is the entry point + -- of the proc, and it needs to come first. + ((first':_), rest') + = partition ((== first_id) . blockId) final_blocks + + in CmmProc info label live + (ListGraph $ map (stripLiveBlock dflags) $ first' : rest') + + -- If the proc has blocks but we don't know what the first one was, then we're dead. + stripCmm proc + = pprPanic "RegAlloc.Liveness.stripLive: no first_id on proc" (ppr proc) + +-- | Strip away liveness information from a basic block, +-- and make real spill instructions out of SPILL, RELOAD pseudos along the way. 
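+--
+-- For example (illustrative only), with the current stack delta tracked in
+-- the state below, a pseudo instruction such as
+--
+-- >    SPILL %vI_a, SLOT(2)
+--
+-- is replaced by the real store that 'mkSpillInstr' builds for slot 2 at
+-- that delta.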
+ +stripLiveBlock + :: Instruction instr + => DynFlags + -> LiveBasicBlock instr + -> NatBasicBlock instr + +stripLiveBlock dflags (BasicBlock i lis) + = BasicBlock i instrs' + + where (instrs', _) + = runState (spillNat [] lis) 0 + + spillNat acc [] + = return (reverse acc) + + spillNat acc (LiveInstr (SPILL reg slot) _ : instrs) + = do delta <- get + spillNat (mkSpillInstr dflags reg delta slot : acc) instrs + + spillNat acc (LiveInstr (RELOAD slot reg) _ : instrs) + = do delta <- get + spillNat (mkLoadInstr dflags reg delta slot : acc) instrs + + spillNat acc (LiveInstr (Instr instr) _ : instrs) + | Just i <- takeDeltaInstr instr + = do put i + spillNat acc instrs + + spillNat acc (LiveInstr (Instr instr) _ : instrs) + = spillNat (instr : acc) instrs + + +-- | Erase Delta instructions. + +eraseDeltasLive + :: Instruction instr + => LiveCmmDecl statics instr + -> LiveCmmDecl statics instr + +eraseDeltasLive cmm + = mapBlockTop eraseBlock cmm + where + eraseBlock (BasicBlock id lis) + = BasicBlock id + $ filter (\(LiveInstr i _) -> not $ isJust $ takeDeltaInstr i) + $ lis + + +-- | Patch the registers in this code according to this register mapping. +-- also erase reg -> reg moves when the reg is the same. +-- also erase reg -> reg moves when the destination dies in this instr. +patchEraseLive + :: Instruction instr + => (Reg -> Reg) + -> LiveCmmDecl statics instr -> LiveCmmDecl statics instr + +patchEraseLive patchF cmm + = patchCmm cmm + where + patchCmm cmm@CmmData{} = cmm + + patchCmm (CmmProc info label live sccs) + | LiveInfo static id blockMap mLiveSlots <- info + = let + patchRegSet set = mkUniqSet $ map patchF $ nonDetEltsUFM set + -- See Note [Unique Determinism and code generation] + blockMap' = mapMap (patchRegSet . getUniqSet) blockMap + + info' = LiveInfo static id blockMap' mLiveSlots + in CmmProc info' label live $ map patchSCC sccs + + patchSCC (AcyclicSCC b) = AcyclicSCC (patchBlock b) + patchSCC (CyclicSCC bs) = CyclicSCC (map patchBlock bs) + + patchBlock (BasicBlock id lis) + = BasicBlock id $ patchInstrs lis + + patchInstrs [] = [] + patchInstrs (li : lis) + + | LiveInstr i (Just live) <- li' + , Just (r1, r2) <- takeRegRegMoveInstr i + , eatMe r1 r2 live + = patchInstrs lis + + | otherwise + = li' : patchInstrs lis + + where li' = patchRegsLiveInstr patchF li + + eatMe r1 r2 live + -- source and destination regs are the same + | r1 == r2 = True + + -- destination reg is never used + | elementOfUniqSet r2 (liveBorn live) + , elementOfUniqSet r2 (liveDieRead live) || elementOfUniqSet r2 (liveDieWrite live) + = True + + | otherwise = False + + +-- | Patch registers in this LiveInstr, including the liveness information. 
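+--
+-- A typical call (sketch only; @assignment@ is a hypothetical @UniqFM Reg@
+-- mapping virtual regs to their allocated regs) would be:
+--
+-- > patchRegsLiveInstr (\r -> fromMaybe r (lookupUFM assignment r)) li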
+-- +patchRegsLiveInstr + :: Instruction instr + => (Reg -> Reg) + -> LiveInstr instr -> LiveInstr instr + +patchRegsLiveInstr patchF li + = case li of + LiveInstr instr Nothing + -> LiveInstr (patchRegsOfInstr instr patchF) Nothing + + LiveInstr instr (Just live) + -> LiveInstr + (patchRegsOfInstr instr patchF) + (Just live + { -- WARNING: have to go via lists here because patchF changes the uniq in the Reg + liveBorn = mapUniqSet patchF $ liveBorn live + , liveDieRead = mapUniqSet patchF $ liveDieRead live + , liveDieWrite = mapUniqSet patchF $ liveDieWrite live }) + -- See Note [Unique Determinism and code generation] + + +-------------------------------------------------------------------------------- +-- | Convert a NatCmmDecl to a LiveCmmDecl, with liveness information + +cmmTopLiveness + :: (Outputable instr, Instruction instr) + => Maybe CFG -> Platform + -> NatCmmDecl statics instr + -> UniqSM (LiveCmmDecl statics instr) +cmmTopLiveness cfg platform cmm + = regLiveness platform $ natCmmTopToLive cfg cmm + +natCmmTopToLive + :: (Instruction instr, Outputable instr) + => Maybe CFG -> NatCmmDecl statics instr + -> LiveCmmDecl statics instr + +natCmmTopToLive _ (CmmData i d) + = CmmData i d + +natCmmTopToLive _ (CmmProc info lbl live (ListGraph [])) + = CmmProc (LiveInfo info [] mapEmpty mapEmpty) lbl live [] + +natCmmTopToLive mCfg proc@(CmmProc info lbl live (ListGraph blocks@(first : _))) + = CmmProc (LiveInfo info' (first_id : entry_ids) mapEmpty mapEmpty) + lbl live sccsLive + where + first_id = blockId first + all_entry_ids = entryBlocks proc + sccs = sccBlocks blocks all_entry_ids mCfg + sccsLive = map (fmap (\(BasicBlock l instrs) -> + BasicBlock l (map (\i -> LiveInstr (Instr i) Nothing) instrs))) + $ sccs + + entry_ids = filter (reachable_node) . + filter (/= first_id) $ all_entry_ids + info' = mapFilterWithKey (\node _ -> reachable_node node) info + reachable_node + | Just cfg <- mCfg + = hasNode cfg + | otherwise + = const True + +-- +-- Compute the liveness graph of the set of basic blocks. Important: +-- we also discard any unreachable code here, starting from the entry +-- points (the first block in the list, and any blocks with info +-- tables). Unreachable code arises when code blocks are orphaned in +-- earlier optimisation passes, and may confuse the register allocator +-- by referring to registers that are not initialised. It's easy to +-- discard the unreachable code as part of the SCC pass, so that's +-- exactly what we do. (#7574) +-- +sccBlocks + :: forall instr . Instruction instr + => [NatBasicBlock instr] + -> [BlockId] + -> Maybe CFG + -> [SCC (NatBasicBlock instr)] + +sccBlocks blocks entries mcfg = map (fmap node_payload) sccs + where + nodes :: [ Node BlockId (NatBasicBlock instr) ] + nodes = [ DigraphNode block id (getOutEdges instrs) + | block@(BasicBlock id instrs) <- blocks ] + + g1 = graphFromEdgedVerticesUniq nodes + + reachable :: LabelSet + reachable + | Just cfg <- mcfg + -- Our CFG only contains reachable nodes by construction at this point. + = setFromList $ getCfgNodes cfg + | otherwise + = setFromList $ [ node_key node | node <- reachablesG g1 roots ] + + g2 = graphFromEdgedVerticesUniq [ node | node <- nodes + , node_key node + `setMember` reachable ] + + sccs = stronglyConnCompG g2 + + getOutEdges :: Instruction instr => [instr] -> [BlockId] + getOutEdges instrs = concatMap jumpDestsOfInstr instrs + + -- This is truly ugly, but I don't see a good alternative. + -- Digraph just has the wrong API. 
We want to identify nodes + -- by their keys (BlockId), but Digraph requires the whole + -- node: (NatBasicBlock, BlockId, [BlockId]). This takes + -- advantage of the fact that Digraph only looks at the key, + -- even though it asks for the whole triple. + roots = [DigraphNode (panic "sccBlocks") b (panic "sccBlocks") + | b <- entries ] + +-------------------------------------------------------------------------------- +-- Annotate code with register liveness information +-- + +regLiveness + :: (Outputable instr, Instruction instr) + => Platform + -> LiveCmmDecl statics instr + -> UniqSM (LiveCmmDecl statics instr) + +regLiveness _ (CmmData i d) + = return $ CmmData i d + +regLiveness _ (CmmProc info lbl live []) + | LiveInfo static mFirst _ _ <- info + = return $ CmmProc + (LiveInfo static mFirst mapEmpty mapEmpty) + lbl live [] + +regLiveness platform (CmmProc info lbl live sccs) + | LiveInfo static mFirst _ liveSlotsOnEntry <- info + = let (ann_sccs, block_live) = computeLiveness platform sccs + + in return $ CmmProc (LiveInfo static mFirst block_live liveSlotsOnEntry) + lbl live ann_sccs + + +-- ----------------------------------------------------------------------------- +-- | Check ordering of Blocks +-- The computeLiveness function requires SCCs to be in reverse +-- dependent order. If they're not the liveness information will be +-- wrong, and we'll get a bad allocation. Better to check for this +-- precondition explicitly or some other poor sucker will waste a +-- day staring at bad assembly code.. +-- +checkIsReverseDependent + :: Instruction instr + => [SCC (LiveBasicBlock instr)] -- ^ SCCs of blocks that we're about to run the liveness determinator on. + -> Maybe BlockId -- ^ BlockIds that fail the test (if any) + +checkIsReverseDependent sccs' + = go emptyUniqSet sccs' + + where go _ [] + = Nothing + + go blocksSeen (AcyclicSCC block : sccs) + = let dests = slurpJumpDestsOfBlock block + blocksSeen' = unionUniqSets blocksSeen $ mkUniqSet [blockId block] + badDests = dests `minusUniqSet` blocksSeen' + in case nonDetEltsUniqSet badDests of + -- See Note [Unique Determinism and code generation] + [] -> go blocksSeen' sccs + bad : _ -> Just bad + + go blocksSeen (CyclicSCC blocks : sccs) + = let dests = unionManyUniqSets $ map slurpJumpDestsOfBlock blocks + blocksSeen' = unionUniqSets blocksSeen $ mkUniqSet $ map blockId blocks + badDests = dests `minusUniqSet` blocksSeen' + in case nonDetEltsUniqSet badDests of + -- See Note [Unique Determinism and code generation] + [] -> go blocksSeen' sccs + bad : _ -> Just bad + + slurpJumpDestsOfBlock (BasicBlock _ instrs) + = unionManyUniqSets + $ map (mkUniqSet . jumpDestsOfInstr) + [ i | LiveInstr i _ <- instrs] + + +-- | If we've compute liveness info for this code already we have to reverse +-- the SCCs in each top to get them back to the right order so we can do it again. +reverseBlocksInTops :: LiveCmmDecl statics instr -> LiveCmmDecl statics instr +reverseBlocksInTops top + = case top of + CmmData{} -> top + CmmProc info lbl live sccs -> CmmProc info lbl live (reverse sccs) + + +-- | Computing liveness +-- +-- On entry, the SCCs must be in "reverse" order: later blocks may transfer +-- control to earlier ones only, else `panic`. +-- +-- The SCCs returned are in the *opposite* order, which is exactly what we +-- want for the next pass. 
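+--
+-- For example (illustrative only): if block B jumps to block A, then A's SCC
+-- must appear before B's in the input list, and the annotated result lists
+-- B's SCC before A's.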
+-- +computeLiveness + :: (Outputable instr, Instruction instr) + => Platform + -> [SCC (LiveBasicBlock instr)] + -> ([SCC (LiveBasicBlock instr)], -- instructions annotated with list of registers + -- which are "dead after this instruction". + BlockMap RegSet) -- blocks annotated with set of live registers + -- on entry to the block. + +computeLiveness platform sccs + = case checkIsReverseDependent sccs of + Nothing -> livenessSCCs platform mapEmpty [] sccs + Just bad -> pprPanic "RegAlloc.Liveness.computeLiveness" + (vcat [ text "SCCs aren't in reverse dependent order" + , text "bad blockId" <+> ppr bad + , ppr sccs]) + +livenessSCCs + :: Instruction instr + => Platform + -> BlockMap RegSet + -> [SCC (LiveBasicBlock instr)] -- accum + -> [SCC (LiveBasicBlock instr)] + -> ( [SCC (LiveBasicBlock instr)] + , BlockMap RegSet) + +livenessSCCs _ blockmap done [] + = (done, blockmap) + +livenessSCCs platform blockmap done (AcyclicSCC block : sccs) + = let (blockmap', block') = livenessBlock platform blockmap block + in livenessSCCs platform blockmap' (AcyclicSCC block' : done) sccs + +livenessSCCs platform blockmap done + (CyclicSCC blocks : sccs) = + livenessSCCs platform blockmap' (CyclicSCC blocks':done) sccs + where (blockmap', blocks') + = iterateUntilUnchanged linearLiveness equalBlockMaps + blockmap blocks + + iterateUntilUnchanged + :: (a -> b -> (a,c)) -> (a -> a -> Bool) + -> a -> b + -> (a,c) + + iterateUntilUnchanged f eq a b + = head $ + concatMap tail $ + groupBy (\(a1, _) (a2, _) -> eq a1 a2) $ + iterate (\(a, _) -> f a b) $ + (a, panic "RegLiveness.livenessSCCs") + + + linearLiveness + :: Instruction instr + => BlockMap RegSet -> [LiveBasicBlock instr] + -> (BlockMap RegSet, [LiveBasicBlock instr]) + + linearLiveness = mapAccumL (livenessBlock platform) + + -- probably the least efficient way to compare two + -- BlockMaps for equality. + equalBlockMaps a b + = a' == b' + where a' = map f $ mapToList a + b' = map f $ mapToList b + f (key,elt) = (key, nonDetEltsUniqSet elt) + -- See Note [Unique Determinism and code generation] + + + +-- | Annotate a basic block with register liveness information. +-- +livenessBlock + :: Instruction instr + => Platform + -> BlockMap RegSet + -> LiveBasicBlock instr + -> (BlockMap RegSet, LiveBasicBlock instr) + +livenessBlock platform blockmap (BasicBlock block_id instrs) + = let + (regsLiveOnEntry, instrs1) + = livenessBack platform emptyUniqSet blockmap [] (reverse instrs) + blockmap' = mapInsert block_id regsLiveOnEntry blockmap + + instrs2 = livenessForward platform regsLiveOnEntry instrs1 + + output = BasicBlock block_id instrs2 + + in ( blockmap', output) + +-- | Calculate liveness going forwards, +-- filling in when regs are born + +livenessForward + :: Instruction instr + => Platform + -> RegSet -- regs live on this instr + -> [LiveInstr instr] -> [LiveInstr instr] + +livenessForward _ _ [] = [] +livenessForward platform rsLiveEntry (li@(LiveInstr instr mLive) : lis) + | Just live <- mLive + = let + RU _ written = regUsageOfInstr platform instr + -- Regs that are written to but weren't live on entry to this instruction + -- are recorded as being born here. 
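+                -- (e.g., illustratively: if an instruction reads r1 and writes
+                --  r2, and only r1 was live on entry, then r2 is born here)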
+ rsBorn = mkUniqSet + $ filter (\r -> not $ elementOfUniqSet r rsLiveEntry) written + + rsLiveNext = (rsLiveEntry `unionUniqSets` rsBorn) + `minusUniqSet` (liveDieRead live) + `minusUniqSet` (liveDieWrite live) + + in LiveInstr instr (Just live { liveBorn = rsBorn }) + : livenessForward platform rsLiveNext lis + + | otherwise + = li : livenessForward platform rsLiveEntry lis + + +-- | Calculate liveness going backwards, +-- filling in when regs die, and what regs are live across each instruction + +livenessBack + :: Instruction instr + => Platform + -> RegSet -- regs live on this instr + -> BlockMap RegSet -- regs live on entry to other BBs + -> [LiveInstr instr] -- instructions (accum) + -> [LiveInstr instr] -- instructions + -> (RegSet, [LiveInstr instr]) + +livenessBack _ liveregs _ done [] = (liveregs, done) + +livenessBack platform liveregs blockmap acc (instr : instrs) + = let (liveregs', instr') = liveness1 platform liveregs blockmap instr + in livenessBack platform liveregs' blockmap (instr' : acc) instrs + + +-- don't bother tagging comments or deltas with liveness +liveness1 + :: Instruction instr + => Platform + -> RegSet + -> BlockMap RegSet + -> LiveInstr instr + -> (RegSet, LiveInstr instr) + +liveness1 _ liveregs _ (LiveInstr instr _) + | isMetaInstr instr + = (liveregs, LiveInstr instr Nothing) + +liveness1 platform liveregs blockmap (LiveInstr instr _) + + | not_a_branch + = (liveregs1, LiveInstr instr + (Just $ Liveness + { liveBorn = emptyUniqSet + , liveDieRead = mkUniqSet r_dying + , liveDieWrite = mkUniqSet w_dying })) + + | otherwise + = (liveregs_br, LiveInstr instr + (Just $ Liveness + { liveBorn = emptyUniqSet + , liveDieRead = mkUniqSet r_dying_br + , liveDieWrite = mkUniqSet w_dying })) + + where + !(RU read written) = regUsageOfInstr platform instr + + -- registers that were written here are dead going backwards. + -- registers that were read here are live going backwards. + liveregs1 = (liveregs `delListFromUniqSet` written) + `addListToUniqSet` read + + -- registers that are not live beyond this point, are recorded + -- as dying here. + r_dying = [ reg | reg <- read, reg `notElem` written, + not (elementOfUniqSet reg liveregs) ] + + w_dying = [ reg | reg <- written, + not (elementOfUniqSet reg liveregs) ] + + -- union in the live regs from all the jump destinations of this + -- instruction. + targets = jumpDestsOfInstr instr -- where we go from here + not_a_branch = null targets + + targetLiveRegs target + = case mapLookup target blockmap of + Just ra -> ra + Nothing -> emptyRegSet + + live_from_branch = unionManyUniqSets (map targetLiveRegs targets) + + liveregs_br = liveregs1 `unionUniqSets` live_from_branch + + -- registers that are live only in the branch targets should + -- be listed as dying here. + live_branch_only = live_from_branch `minusUniqSet` liveregs + r_dying_br = nonDetEltsUniqSet (mkUniqSet r_dying `unionUniqSets` + live_branch_only) + -- See Note [Unique Determinism and code generation] diff --git a/compiler/GHC/CmmToAsm/Reg/Target.hs b/compiler/GHC/CmmToAsm/Reg/Target.hs new file mode 100644 index 0000000000..a45d70c826 --- /dev/null +++ b/compiler/GHC/CmmToAsm/Reg/Target.hs @@ -0,0 +1,135 @@ +{-# LANGUAGE CPP #-} +-- | Hard wired things related to registers. +-- This is module is preventing the native code generator being able to +-- emit code for non-host architectures. +-- +-- TODO: Do a better job of the overloading, and eliminate this module. 
+-- We'd probably do better with a Register type class, and hook this to +-- Instruction somehow. +-- +-- TODO: We should also make arch specific versions of RegAlloc.Graph.TrivColorable +module GHC.CmmToAsm.Reg.Target ( + targetVirtualRegSqueeze, + targetRealRegSqueeze, + targetClassOfRealReg, + targetMkVirtualReg, + targetRegDotColor, + targetClassOfReg +) + +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.Platform.Reg +import GHC.Platform.Reg.Class +import GHC.CmmToAsm.Format + +import Outputable +import Unique +import GHC.Platform + +import qualified GHC.CmmToAsm.X86.Regs as X86 +import qualified GHC.CmmToAsm.X86.RegInfo as X86 +import qualified GHC.CmmToAsm.PPC.Regs as PPC +import qualified GHC.CmmToAsm.SPARC.Regs as SPARC + +targetVirtualRegSqueeze :: Platform -> RegClass -> VirtualReg -> Int +targetVirtualRegSqueeze platform + = case platformArch platform of + ArchX86 -> X86.virtualRegSqueeze + ArchX86_64 -> X86.virtualRegSqueeze + ArchPPC -> PPC.virtualRegSqueeze + ArchS390X -> panic "targetVirtualRegSqueeze ArchS390X" + ArchSPARC -> SPARC.virtualRegSqueeze + ArchSPARC64 -> panic "targetVirtualRegSqueeze ArchSPARC64" + ArchPPC_64 _ -> PPC.virtualRegSqueeze + ArchARM _ _ _ -> panic "targetVirtualRegSqueeze ArchARM" + ArchARM64 -> panic "targetVirtualRegSqueeze ArchARM64" + ArchAlpha -> panic "targetVirtualRegSqueeze ArchAlpha" + ArchMipseb -> panic "targetVirtualRegSqueeze ArchMipseb" + ArchMipsel -> panic "targetVirtualRegSqueeze ArchMipsel" + ArchJavaScript-> panic "targetVirtualRegSqueeze ArchJavaScript" + ArchUnknown -> panic "targetVirtualRegSqueeze ArchUnknown" + + +targetRealRegSqueeze :: Platform -> RegClass -> RealReg -> Int +targetRealRegSqueeze platform + = case platformArch platform of + ArchX86 -> X86.realRegSqueeze + ArchX86_64 -> X86.realRegSqueeze + ArchPPC -> PPC.realRegSqueeze + ArchS390X -> panic "targetRealRegSqueeze ArchS390X" + ArchSPARC -> SPARC.realRegSqueeze + ArchSPARC64 -> panic "targetRealRegSqueeze ArchSPARC64" + ArchPPC_64 _ -> PPC.realRegSqueeze + ArchARM _ _ _ -> panic "targetRealRegSqueeze ArchARM" + ArchARM64 -> panic "targetRealRegSqueeze ArchARM64" + ArchAlpha -> panic "targetRealRegSqueeze ArchAlpha" + ArchMipseb -> panic "targetRealRegSqueeze ArchMipseb" + ArchMipsel -> panic "targetRealRegSqueeze ArchMipsel" + ArchJavaScript-> panic "targetRealRegSqueeze ArchJavaScript" + ArchUnknown -> panic "targetRealRegSqueeze ArchUnknown" + +targetClassOfRealReg :: Platform -> RealReg -> RegClass +targetClassOfRealReg platform + = case platformArch platform of + ArchX86 -> X86.classOfRealReg platform + ArchX86_64 -> X86.classOfRealReg platform + ArchPPC -> PPC.classOfRealReg + ArchS390X -> panic "targetClassOfRealReg ArchS390X" + ArchSPARC -> SPARC.classOfRealReg + ArchSPARC64 -> panic "targetClassOfRealReg ArchSPARC64" + ArchPPC_64 _ -> PPC.classOfRealReg + ArchARM _ _ _ -> panic "targetClassOfRealReg ArchARM" + ArchARM64 -> panic "targetClassOfRealReg ArchARM64" + ArchAlpha -> panic "targetClassOfRealReg ArchAlpha" + ArchMipseb -> panic "targetClassOfRealReg ArchMipseb" + ArchMipsel -> panic "targetClassOfRealReg ArchMipsel" + ArchJavaScript-> panic "targetClassOfRealReg ArchJavaScript" + ArchUnknown -> panic "targetClassOfRealReg ArchUnknown" + +targetMkVirtualReg :: Platform -> Unique -> Format -> VirtualReg +targetMkVirtualReg platform + = case platformArch platform of + ArchX86 -> X86.mkVirtualReg + ArchX86_64 -> X86.mkVirtualReg + ArchPPC -> PPC.mkVirtualReg + ArchS390X -> panic "targetMkVirtualReg ArchS390X" + ArchSPARC -> 
SPARC.mkVirtualReg + ArchSPARC64 -> panic "targetMkVirtualReg ArchSPARC64" + ArchPPC_64 _ -> PPC.mkVirtualReg + ArchARM _ _ _ -> panic "targetMkVirtualReg ArchARM" + ArchARM64 -> panic "targetMkVirtualReg ArchARM64" + ArchAlpha -> panic "targetMkVirtualReg ArchAlpha" + ArchMipseb -> panic "targetMkVirtualReg ArchMipseb" + ArchMipsel -> panic "targetMkVirtualReg ArchMipsel" + ArchJavaScript-> panic "targetMkVirtualReg ArchJavaScript" + ArchUnknown -> panic "targetMkVirtualReg ArchUnknown" + +targetRegDotColor :: Platform -> RealReg -> SDoc +targetRegDotColor platform + = case platformArch platform of + ArchX86 -> X86.regDotColor platform + ArchX86_64 -> X86.regDotColor platform + ArchPPC -> PPC.regDotColor + ArchS390X -> panic "targetRegDotColor ArchS390X" + ArchSPARC -> SPARC.regDotColor + ArchSPARC64 -> panic "targetRegDotColor ArchSPARC64" + ArchPPC_64 _ -> PPC.regDotColor + ArchARM _ _ _ -> panic "targetRegDotColor ArchARM" + ArchARM64 -> panic "targetRegDotColor ArchARM64" + ArchAlpha -> panic "targetRegDotColor ArchAlpha" + ArchMipseb -> panic "targetRegDotColor ArchMipseb" + ArchMipsel -> panic "targetRegDotColor ArchMipsel" + ArchJavaScript-> panic "targetRegDotColor ArchJavaScript" + ArchUnknown -> panic "targetRegDotColor ArchUnknown" + + +targetClassOfReg :: Platform -> Reg -> RegClass +targetClassOfReg platform reg + = case reg of + RegVirtual vr -> classOfVirtualReg vr + RegReal rr -> targetClassOfRealReg platform rr diff --git a/compiler/GHC/CmmToAsm/SPARC/AddrMode.hs b/compiler/GHC/CmmToAsm/SPARC/AddrMode.hs new file mode 100644 index 0000000000..6cc660bba9 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/AddrMode.hs @@ -0,0 +1,44 @@ + +module GHC.CmmToAsm.SPARC.AddrMode ( + AddrMode(..), + addrOffset +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.SPARC.Imm +import GHC.CmmToAsm.SPARC.Base +import GHC.Platform.Reg + +-- addressing modes ------------------------------------------------------------ + +-- | Represents a memory address in an instruction. +-- Being a RISC machine, the SPARC addressing modes are very regular. +-- +data AddrMode + = AddrRegReg Reg Reg -- addr = r1 + r2 + | AddrRegImm Reg Imm -- addr = r1 + imm + + +-- | Add an integer offset to the address in an AddrMode. +-- +addrOffset :: AddrMode -> Int -> Maybe AddrMode +addrOffset addr off + = case addr of + AddrRegImm r (ImmInt n) + | fits13Bits n2 -> Just (AddrRegImm r (ImmInt n2)) + | otherwise -> Nothing + where n2 = n + off + + AddrRegImm r (ImmInteger n) + | fits13Bits n2 -> Just (AddrRegImm r (ImmInt (fromInteger n2))) + | otherwise -> Nothing + where n2 = n + toInteger off + + AddrRegReg r (RegReal (RealRegSingle 0)) + | fits13Bits off -> Just (AddrRegImm r (ImmInt off)) + | otherwise -> Nothing + + _ -> Nothing diff --git a/compiler/GHC/CmmToAsm/SPARC/Base.hs b/compiler/GHC/CmmToAsm/SPARC/Base.hs new file mode 100644 index 0000000000..86a897dacb --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/Base.hs @@ -0,0 +1,77 @@ + +-- | Bits and pieces on the bottom of the module dependency tree. +-- Also import the required constants, so we know what we're using. 
+-- +-- In the interests of cross-compilation, we want to free ourselves +-- from the autoconf generated modules like main/Constants + +module GHC.CmmToAsm.SPARC.Base ( + wordLength, + wordLengthInBits, + spillAreaLength, + spillSlotSize, + extraStackArgsHere, + fits13Bits, + is32BitInteger, + largeOffsetError +) + +where + +import GhcPrelude + +import GHC.Driver.Session +import Panic + +import Data.Int + + +-- On 32 bit SPARC, pointers are 32 bits. +wordLength :: Int +wordLength = 4 + +wordLengthInBits :: Int +wordLengthInBits + = wordLength * 8 + +-- Size of the available spill area +spillAreaLength :: DynFlags -> Int +spillAreaLength + = rESERVED_C_STACK_BYTES + +-- | We need 8 bytes because our largest registers are 64 bit. +spillSlotSize :: Int +spillSlotSize = 8 + + +-- | We (allegedly) put the first six C-call arguments in registers; +-- where do we start putting the rest of them? +extraStackArgsHere :: Int +extraStackArgsHere = 23 + + +{-# SPECIALIZE fits13Bits :: Int -> Bool, Integer -> Bool #-} +-- | Check whether an offset is representable with 13 bits. +fits13Bits :: Integral a => a -> Bool +fits13Bits x = x >= -4096 && x < 4096 + +-- | Check whether an integer will fit in 32 bits. +-- A CmmInt is intended to be truncated to the appropriate +-- number of bits, so here we truncate it to Int64. This is +-- important because e.g. -1 as a CmmInt might be either +-- -1 or 18446744073709551615. +-- +is32BitInteger :: Integer -> Bool +is32BitInteger i + = i64 <= 0x7fffffff && i64 >= -0x80000000 + where i64 = fromIntegral i :: Int64 + + +-- | Sadness. +largeOffsetError :: (Show a) => a -> b +largeOffsetError i + = panic ("ERROR: SPARC native-code generator cannot handle large offset (" + ++ show i ++ ");\nprobably because of large constant data structures;" ++ + "\nworkaround: use -fllvm on this module.\n") + + diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen.hs new file mode 100644 index 0000000000..2580ea4014 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen.hs @@ -0,0 +1,700 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Generating machine code (instruction selection) +-- +-- (c) The University of Glasgow 1996-2013 +-- +----------------------------------------------------------------------------- + +{-# LANGUAGE GADTs #-} +module GHC.CmmToAsm.SPARC.CodeGen ( + cmmTopCodeGen, + generateJumpTableForInstr, + InstrBlock +) + +where + +#include "HsVersions.h" + +-- NCG stuff: +import GhcPrelude + +import GHC.CmmToAsm.SPARC.Base +import GHC.CmmToAsm.SPARC.CodeGen.Sanity +import GHC.CmmToAsm.SPARC.CodeGen.Amode +import GHC.CmmToAsm.SPARC.CodeGen.CondCode +import GHC.CmmToAsm.SPARC.CodeGen.Gen64 +import GHC.CmmToAsm.SPARC.CodeGen.Gen32 +import GHC.CmmToAsm.SPARC.CodeGen.Base +import GHC.CmmToAsm.SPARC.Instr +import GHC.CmmToAsm.SPARC.Imm +import GHC.CmmToAsm.SPARC.AddrMode +import GHC.CmmToAsm.SPARC.Regs +import GHC.CmmToAsm.SPARC.Stack +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.Format +import GHC.CmmToAsm.Monad ( NatM, getNewRegNat, getNewLabelNat ) + +-- Our intermediate code: +import GHC.Cmm.BlockId +import GHC.Cmm +import GHC.Cmm.Utils +import GHC.Cmm.Switch +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Graph +import GHC.CmmToAsm.PIC +import GHC.Platform.Reg +import GHC.Cmm.CLabel +import GHC.CmmToAsm.CPrim + +-- The rest: +import BasicTypes +import GHC.Driver.Session +import FastString +import OrdList +import Outputable +import GHC.Platform + +import 
Control.Monad ( mapAndUnzipM ) + +-- | Top level code generation +cmmTopCodeGen :: RawCmmDecl + -> NatM [NatCmmDecl RawCmmStatics Instr] + +cmmTopCodeGen (CmmProc info lab live graph) + = do let blocks = toBlockListEntryFirst graph + (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks + + let proc = CmmProc info lab live (ListGraph $ concat nat_blocks) + let tops = proc : concat statics + + return tops + +cmmTopCodeGen (CmmData sec dat) = do + return [CmmData sec dat] -- no translation, we just use CmmStatic + + +-- | Do code generation on a single block of CMM code. +-- code generation may introduce new basic block boundaries, which +-- are indicated by the NEWBLOCK instruction. We must split up the +-- instruction stream into basic blocks again. Also, we extract +-- LDATAs here too. +basicBlockCodeGen :: CmmBlock + -> NatM ( [NatBasicBlock Instr] + , [NatCmmDecl RawCmmStatics Instr]) + +basicBlockCodeGen block = do + let (_, nodes, tail) = blockSplit block + id = entryLabel block + stmts = blockToList nodes + mid_instrs <- stmtsToInstrs stmts + tail_instrs <- stmtToInstrs tail + let instrs = mid_instrs `appOL` tail_instrs + let + (top,other_blocks,statics) + = foldrOL mkBlocks ([],[],[]) instrs + + mkBlocks (NEWBLOCK id) (instrs,blocks,statics) + = ([], BasicBlock id instrs : blocks, statics) + + mkBlocks (LDATA sec dat) (instrs,blocks,statics) + = (instrs, blocks, CmmData sec dat:statics) + + mkBlocks instr (instrs,blocks,statics) + = (instr:instrs, blocks, statics) + + -- do intra-block sanity checking + blocksChecked + = map (checkBlock block) + $ BasicBlock id top : other_blocks + + return (blocksChecked, statics) + + +-- | Convert some Cmm statements to SPARC instructions. +stmtsToInstrs :: [CmmNode e x] -> NatM InstrBlock +stmtsToInstrs stmts + = do instrss <- mapM stmtToInstrs stmts + return (concatOL instrss) + + +stmtToInstrs :: CmmNode e x -> NatM InstrBlock +stmtToInstrs stmt = do + dflags <- getDynFlags + case stmt of + CmmComment s -> return (unitOL (COMMENT s)) + CmmTick {} -> return nilOL + CmmUnwind {} -> return nilOL + + CmmAssign reg src + | isFloatType ty -> assignReg_FltCode format reg src + | isWord64 ty -> assignReg_I64Code reg src + | otherwise -> assignReg_IntCode format reg src + where ty = cmmRegType dflags reg + format = cmmTypeFormat ty + + CmmStore addr src + | isFloatType ty -> assignMem_FltCode format addr src + | isWord64 ty -> assignMem_I64Code addr src + | otherwise -> assignMem_IntCode format addr src + where ty = cmmExprType dflags src + format = cmmTypeFormat ty + + CmmUnsafeForeignCall target result_regs args + -> genCCall target result_regs args + + CmmBranch id -> genBranch id + CmmCondBranch arg true false _ -> do + b1 <- genCondJump true arg + b2 <- genBranch false + return (b1 `appOL` b2) + CmmSwitch arg ids -> do dflags <- getDynFlags + genSwitch dflags arg ids + CmmCall { cml_target = arg } -> genJump arg + + _ + -> panic "stmtToInstrs: statement should have been cps'd away" + + +{- +Now, given a tree (the argument to a CmmLoad) that references memory, +produce a suitable addressing mode. + +A Rule of the Game (tm) for Amodes: use of the addr bit must +immediately follow use of the code part, since the code part puts +values in registers which the addr then refers to. So you can't put +anything in between, lest it overwrite some of those registers. 
If +you need to do some other computation between the code part and use of +the addr bit, first store the effective address from the amode in a +temporary, then do the other computation, and then use the temporary: + + code + LEA amode, tmp + ... other computation ... + ... (tmp) ... +-} + + + +-- | Convert a BlockId to some CmmStatic data +jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic +jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags)) +jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel) + where blockLabel = blockLbl blockid + + + +-- ----------------------------------------------------------------------------- +-- Generating assignments + +-- Assignments are really at the heart of the whole code generation +-- business. Almost all top-level nodes of any real importance are +-- assignments, which correspond to loads, stores, or register +-- transfers. If we're really lucky, some of the register transfers +-- will go away, because we can use the destination register to +-- complete the code generation for the right hand side. This only +-- fails when the right hand side is forced into a fixed register +-- (e.g. the result of a call). + +assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock +assignMem_IntCode pk addr src = do + (srcReg, code) <- getSomeReg src + Amode dstAddr addr_code <- getAmode addr + return $ code `appOL` addr_code `snocOL` ST pk srcReg dstAddr + + +assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock +assignReg_IntCode _ reg src = do + dflags <- getDynFlags + r <- getRegister src + let dst = getRegisterReg (targetPlatform dflags) reg + return $ case r of + Any _ code -> code dst + Fixed _ freg fcode -> fcode `snocOL` OR False g0 (RIReg freg) dst + + + +-- Floating point assignment to memory +assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock +assignMem_FltCode pk addr src = do + dflags <- getDynFlags + Amode dst__2 code1 <- getAmode addr + (src__2, code2) <- getSomeReg src + tmp1 <- getNewRegNat pk + let + pk__2 = cmmExprType dflags src + code__2 = code1 `appOL` code2 `appOL` + if formatToWidth pk == typeWidth pk__2 + then unitOL (ST pk src__2 dst__2) + else toOL [ FxTOy (cmmTypeFormat pk__2) pk src__2 tmp1 + , ST pk tmp1 dst__2] + return code__2 + +-- Floating point assignment to a register/temporary +assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock +assignReg_FltCode pk dstCmmReg srcCmmExpr = do + dflags <- getDynFlags + let platform = targetPlatform dflags + srcRegister <- getRegister srcCmmExpr + let dstReg = getRegisterReg platform dstCmmReg + + return $ case srcRegister of + Any _ code -> code dstReg + Fixed _ srcFixedReg srcCode -> srcCode `snocOL` FMOV pk srcFixedReg dstReg + + + + +genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock + +genJump (CmmLit (CmmLabel lbl)) + = return (toOL [CALL (Left target) 0 True, NOP]) + where + target = ImmCLbl lbl + +genJump tree + = do + (target, code) <- getSomeReg tree + return (code `snocOL` JMP (AddrRegReg target g0) `snocOL` NOP) + +-- ----------------------------------------------------------------------------- +-- Unconditional branches + +genBranch :: BlockId -> NatM InstrBlock +genBranch = return . toOL . mkJumpInstr + + +-- ----------------------------------------------------------------------------- +-- Conditional jumps + +{- +Conditional jumps are always to local labels, so we can use branch +instructions. We peek at the arguments to decide what kind of +comparison to do. 
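+
+For example (illustrative only): a branch conditional on (MO_S_Lt W32 x y)
+goes through condIntCode and ends in a BI branch on the integer condition
+codes, while (MO_F_Lt W32 x y) goes through condFltCode and uses a BF branch
+instead, with a NOP filling the branch delay slot in both cases.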
+ +SPARC: First, we have to ensure that the condition codes are set +according to the supplied comparison operation. We generate slightly +different code for floating point comparisons, because a floating +point operation cannot directly precede a @BF@. We assume the worst +and fill that slot with a @NOP@. + +SPARC: Do not fill the delay slots here; you will confuse the register +allocator. +-} + + +genCondJump + :: BlockId -- the branch target + -> CmmExpr -- the condition on which to branch + -> NatM InstrBlock + + + +genCondJump bid bool = do + CondCode is_float cond code <- getCondCode bool + return ( + code `appOL` + toOL ( + if is_float + then [NOP, BF cond False bid, NOP] + else [BI cond False bid, NOP] + ) + ) + + + +-- ----------------------------------------------------------------------------- +-- Generating a table-branch + +genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock +genSwitch dflags expr targets + | positionIndependent dflags + = error "MachCodeGen: sparc genSwitch PIC not finished\n" + + | otherwise + = do (e_reg, e_code) <- getSomeReg (cmmOffset dflags expr offset) + + base_reg <- getNewRegNat II32 + offset_reg <- getNewRegNat II32 + dst <- getNewRegNat II32 + + label <- getNewLabelNat + + return $ e_code `appOL` + toOL + [ -- load base of jump table + SETHI (HI (ImmCLbl label)) base_reg + , OR False base_reg (RIImm $ LO $ ImmCLbl label) base_reg + + -- the addrs in the table are 32 bits wide.. + , SLL e_reg (RIImm $ ImmInt 2) offset_reg + + -- load and jump to the destination + , LD II32 (AddrRegReg base_reg offset_reg) dst + , JMP_TBL (AddrRegImm dst (ImmInt 0)) ids label + , NOP ] + where (offset, ids) = switchTargetsToTable targets + +generateJumpTableForInstr :: DynFlags -> Instr + -> Maybe (NatCmmDecl RawCmmStatics Instr) +generateJumpTableForInstr dflags (JMP_TBL _ ids label) = + let jumpTable = map (jumpTableEntry dflags) ids + in Just (CmmData (Section ReadOnlyData label) (RawCmmStatics label jumpTable)) +generateJumpTableForInstr _ _ = Nothing + + + +-- ----------------------------------------------------------------------------- +-- Generating C calls + +{- + Now the biggest nightmare---calls. Most of the nastiness is buried in + @get_arg@, which moves the arguments to the correct registers/stack + locations. Apart from that, the code is easy. + + The SPARC calling convention is an absolute + nightmare. The first 6x32 bits of arguments are mapped into + %o0 through %o5, and the remaining arguments are dumped to the + stack, beginning at [%sp+92]. (Note that %o6 == %sp.) + + If we have to put args on the stack, move %o6==%sp down by + the number of words to go on the stack, to ensure there's enough space. + + According to Fraser and Hanson's lcc book, page 478, fig 17.2, + 16 words above the stack pointer is a word for the address of + a structure return value. I use this as a temporary location + for moving values from float to int regs. Certainly it isn't + safe to put anything in the 16 words starting at %sp, since + this area can get trashed at any time due to window overflows + caused by signal handlers. + + A final complication (if the above isn't enough) is that + we can't blithely calculate the arguments one by one into + %o0 .. %o5. Consider the following nested calls: + + fff a (fff b c) + + Naive code moves a into %o0, and (fff b c) into %o1. Unfortunately + the inner call will itself use %o0, which trashes the value put there + in preparation for the outer call. 
Upshot: we need to calculate the + args into temporary regs, and move those to arg regs or onto the + stack only immediately prior to the call proper. Sigh. +-} + +genCCall + :: ForeignTarget -- function to call + -> [CmmFormal] -- where to put the result + -> [CmmActual] -- arguments (of mixed type) + -> NatM InstrBlock + + + +-- On SPARC under TSO (Total Store Ordering), writes earlier in the instruction stream +-- are guaranteed to take place before writes afterwards (unlike on PowerPC). +-- Ref: Section 8.4 of the SPARC V9 Architecture manual. +-- +-- In the SPARC case we don't need a barrier. +-- +genCCall (PrimTarget MO_ReadBarrier) _ _ + = return $ nilOL +genCCall (PrimTarget MO_WriteBarrier) _ _ + = return $ nilOL + +genCCall (PrimTarget (MO_Prefetch_Data _)) _ _ + = return $ nilOL + +genCCall target dest_regs args + = do -- work out the arguments, and assign them to integer regs + argcode_and_vregs <- mapM arg_to_int_vregs args + let (argcodes, vregss) = unzip argcode_and_vregs + let vregs = concat vregss + + let n_argRegs = length allArgRegs + let n_argRegs_used = min (length vregs) n_argRegs + + + -- deal with static vs dynamic call targets + callinsns <- case target of + ForeignTarget (CmmLit (CmmLabel lbl)) _ -> + return (unitOL (CALL (Left (litToImm (CmmLabel lbl))) n_argRegs_used False)) + + ForeignTarget expr _ + -> do (dyn_c, dyn_rs) <- arg_to_int_vregs expr + let dyn_r = case dyn_rs of + [dyn_r'] -> dyn_r' + _ -> panic "SPARC.CodeGen.genCCall: arg_to_int" + return (dyn_c `snocOL` CALL (Right dyn_r) n_argRegs_used False) + + PrimTarget mop + -> do res <- outOfLineMachOp mop + lblOrMopExpr <- case res of + Left lbl -> do + return (unitOL (CALL (Left (litToImm (CmmLabel lbl))) n_argRegs_used False)) + + Right mopExpr -> do + (dyn_c, dyn_rs) <- arg_to_int_vregs mopExpr + let dyn_r = case dyn_rs of + [dyn_r'] -> dyn_r' + _ -> panic "SPARC.CodeGen.genCCall: arg_to_int" + return (dyn_c `snocOL` CALL (Right dyn_r) n_argRegs_used False) + + return lblOrMopExpr + + let argcode = concatOL argcodes + + let (move_sp_down, move_sp_up) + = let diff = length vregs - n_argRegs + nn = if odd diff then diff + 1 else diff -- keep 8-byte alignment + in if nn <= 0 + then (nilOL, nilOL) + else (unitOL (moveSp (-1*nn)), unitOL (moveSp (1*nn))) + + let transfer_code + = toOL (move_final vregs allArgRegs extraStackArgsHere) + + dflags <- getDynFlags + return + $ argcode `appOL` + move_sp_down `appOL` + transfer_code `appOL` + callinsns `appOL` + unitOL NOP `appOL` + move_sp_up `appOL` + assign_code (targetPlatform dflags) dest_regs + + +-- | Generate code to calculate an argument, and move it into one +-- or two integer vregs. +arg_to_int_vregs :: CmmExpr -> NatM (OrdList Instr, [Reg]) +arg_to_int_vregs arg = do dflags <- getDynFlags + arg_to_int_vregs' dflags arg + +arg_to_int_vregs' :: DynFlags -> CmmExpr -> NatM (OrdList Instr, [Reg]) +arg_to_int_vregs' dflags arg + + -- If the expr produces a 64 bit int, then we can just use iselExpr64 + | isWord64 (cmmExprType dflags arg) + = do (ChildCode64 code r_lo) <- iselExpr64 arg + let r_hi = getHiVRegFromLo r_lo + return (code, [r_hi, r_lo]) + + | otherwise + = do (src, code) <- getSomeReg arg + let pk = cmmExprType dflags arg + + case cmmTypeFormat pk of + + -- Load a 64 bit float return value into two integer regs. 
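+              -- (Illustrative note: each 32-bit half of the double is bounced
+              --  through the scratch word 16 words above %sp, the temporary
+              --  location described in the comment above, and loaded back into
+              --  a fresh integer vreg.)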
+ FF64 -> do + v1 <- getNewRegNat II32 + v2 <- getNewRegNat II32 + + let code2 = + code `snocOL` + FMOV FF64 src f0 `snocOL` + ST FF32 f0 (spRel 16) `snocOL` + LD II32 (spRel 16) v1 `snocOL` + ST FF32 f1 (spRel 16) `snocOL` + LD II32 (spRel 16) v2 + + return (code2, [v1,v2]) + + -- Load a 32 bit float return value into an integer reg + FF32 -> do + v1 <- getNewRegNat II32 + + let code2 = + code `snocOL` + ST FF32 src (spRel 16) `snocOL` + LD II32 (spRel 16) v1 + + return (code2, [v1]) + + -- Move an integer return value into its destination reg. + _ -> do + v1 <- getNewRegNat II32 + + let code2 = + code `snocOL` + OR False g0 (RIReg src) v1 + + return (code2, [v1]) + + +-- | Move args from the integer vregs into which they have been +-- marshalled, into %o0 .. %o5, and the rest onto the stack. +-- +move_final :: [Reg] -> [Reg] -> Int -> [Instr] + +-- all args done +move_final [] _ _ + = [] + +-- out of aregs; move to stack +move_final (v:vs) [] offset + = ST II32 v (spRel offset) + : move_final vs [] (offset+1) + +-- move into an arg (%o[0..5]) reg +move_final (v:vs) (a:az) offset + = OR False g0 (RIReg v) a + : move_final vs az offset + + +-- | Assign results returned from the call into their +-- destination regs. +-- +assign_code :: Platform -> [LocalReg] -> OrdList Instr + +assign_code _ [] = nilOL + +assign_code platform [dest] + = let rep = localRegType dest + width = typeWidth rep + r_dest = getRegisterReg platform (CmmLocal dest) + + result + | isFloatType rep + , W32 <- width + = unitOL $ FMOV FF32 (regSingle $ fReg 0) r_dest + + | isFloatType rep + , W64 <- width + = unitOL $ FMOV FF64 (regSingle $ fReg 0) r_dest + + | not $ isFloatType rep + , W32 <- width + = unitOL $ mkRegRegMoveInstr platform (regSingle $ oReg 0) r_dest + + | not $ isFloatType rep + , W64 <- width + , r_dest_hi <- getHiVRegFromLo r_dest + = toOL [ mkRegRegMoveInstr platform (regSingle $ oReg 0) r_dest_hi + , mkRegRegMoveInstr platform (regSingle $ oReg 1) r_dest] + + | otherwise + = panic "SPARC.CodeGen.GenCCall: no match" + + in result + +assign_code _ _ + = panic "SPARC.CodeGen.GenCCall: no match" + + + +-- | Generate a call to implement an out-of-line floating point operation +outOfLineMachOp + :: CallishMachOp + -> NatM (Either CLabel CmmExpr) + +outOfLineMachOp mop + = do let functionName + = outOfLineMachOp_table mop + + dflags <- getDynFlags + mopExpr <- cmmMakeDynamicReference dflags CallReference + $ mkForeignLabel functionName Nothing ForeignLabelInExternalPackage IsFunction + + let mopLabelOrExpr + = case mopExpr of + CmmLit (CmmLabel lbl) -> Left lbl + _ -> Right mopExpr + + return mopLabelOrExpr + + +-- | Decide what C function to use to implement a CallishMachOp +-- +outOfLineMachOp_table + :: CallishMachOp + -> FastString + +outOfLineMachOp_table mop + = case mop of + MO_F32_Exp -> fsLit "expf" + MO_F32_ExpM1 -> fsLit "expm1f" + MO_F32_Log -> fsLit "logf" + MO_F32_Log1P -> fsLit "log1pf" + MO_F32_Sqrt -> fsLit "sqrtf" + MO_F32_Fabs -> unsupported + MO_F32_Pwr -> fsLit "powf" + + MO_F32_Sin -> fsLit "sinf" + MO_F32_Cos -> fsLit "cosf" + MO_F32_Tan -> fsLit "tanf" + + MO_F32_Asin -> fsLit "asinf" + MO_F32_Acos -> fsLit "acosf" + MO_F32_Atan -> fsLit "atanf" + + MO_F32_Sinh -> fsLit "sinhf" + MO_F32_Cosh -> fsLit "coshf" + MO_F32_Tanh -> fsLit "tanhf" + + MO_F32_Asinh -> fsLit "asinhf" + MO_F32_Acosh -> fsLit "acoshf" + MO_F32_Atanh -> fsLit "atanhf" + + MO_F64_Exp -> fsLit "exp" + MO_F64_ExpM1 -> fsLit "expm1" + MO_F64_Log -> fsLit "log" + MO_F64_Log1P -> fsLit "log1p" + MO_F64_Sqrt -> fsLit 
"sqrt" + MO_F64_Fabs -> unsupported + MO_F64_Pwr -> fsLit "pow" + + MO_F64_Sin -> fsLit "sin" + MO_F64_Cos -> fsLit "cos" + MO_F64_Tan -> fsLit "tan" + + MO_F64_Asin -> fsLit "asin" + MO_F64_Acos -> fsLit "acos" + MO_F64_Atan -> fsLit "atan" + + MO_F64_Sinh -> fsLit "sinh" + MO_F64_Cosh -> fsLit "cosh" + MO_F64_Tanh -> fsLit "tanh" + + MO_F64_Asinh -> fsLit "asinh" + MO_F64_Acosh -> fsLit "acosh" + MO_F64_Atanh -> fsLit "atanh" + + MO_UF_Conv w -> fsLit $ word2FloatLabel w + + MO_Memcpy _ -> fsLit "memcpy" + MO_Memset _ -> fsLit "memset" + MO_Memmove _ -> fsLit "memmove" + MO_Memcmp _ -> fsLit "memcmp" + + MO_BSwap w -> fsLit $ bSwapLabel w + MO_BRev w -> fsLit $ bRevLabel w + MO_PopCnt w -> fsLit $ popCntLabel w + MO_Pdep w -> fsLit $ pdepLabel w + MO_Pext w -> fsLit $ pextLabel w + MO_Clz w -> fsLit $ clzLabel w + MO_Ctz w -> fsLit $ ctzLabel w + MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop + MO_Cmpxchg w -> fsLit $ cmpxchgLabel w + MO_AtomicRead w -> fsLit $ atomicReadLabel w + MO_AtomicWrite w -> fsLit $ atomicWriteLabel w + + MO_S_Mul2 {} -> unsupported + MO_S_QuotRem {} -> unsupported + MO_U_QuotRem {} -> unsupported + MO_U_QuotRem2 {} -> unsupported + MO_Add2 {} -> unsupported + MO_AddWordC {} -> unsupported + MO_SubWordC {} -> unsupported + MO_AddIntC {} -> unsupported + MO_SubIntC {} -> unsupported + MO_U_Mul2 {} -> unsupported + MO_ReadBarrier -> unsupported + MO_WriteBarrier -> unsupported + MO_Touch -> unsupported + (MO_Prefetch_Data _) -> unsupported + where unsupported = panic ("outOfLineCmmOp: " ++ show mop + ++ " not supported here") + diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs new file mode 100644 index 0000000000..d6c9d7b360 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs @@ -0,0 +1,74 @@ +module GHC.CmmToAsm.SPARC.CodeGen.Amode ( + getAmode +) + +where + +import GhcPrelude + +import {-# SOURCE #-} GHC.CmmToAsm.SPARC.CodeGen.Gen32 +import GHC.CmmToAsm.SPARC.CodeGen.Base +import GHC.CmmToAsm.SPARC.AddrMode +import GHC.CmmToAsm.SPARC.Imm +import GHC.CmmToAsm.SPARC.Instr +import GHC.CmmToAsm.SPARC.Regs +import GHC.CmmToAsm.SPARC.Base +import GHC.CmmToAsm.Monad +import GHC.CmmToAsm.Format + +import GHC.Cmm + +import OrdList + + +-- | Generate code to reference a memory address. 
+getAmode + :: CmmExpr -- ^ expr producing an address + -> NatM Amode + +getAmode tree@(CmmRegOff _ _) + = do dflags <- getDynFlags + getAmode (mangleIndexTree dflags tree) + +getAmode (CmmMachOp (MO_Sub _) [x, CmmLit (CmmInt i _)]) + | fits13Bits (-i) + = do + (reg, code) <- getSomeReg x + let + off = ImmInt (-(fromInteger i)) + return (Amode (AddrRegImm reg off) code) + + +getAmode (CmmMachOp (MO_Add _) [x, CmmLit (CmmInt i _)]) + | fits13Bits i + = do + (reg, code) <- getSomeReg x + let + off = ImmInt (fromInteger i) + return (Amode (AddrRegImm reg off) code) + +getAmode (CmmMachOp (MO_Add _) [x, y]) + = do + (regX, codeX) <- getSomeReg x + (regY, codeY) <- getSomeReg y + let + code = codeX `appOL` codeY + return (Amode (AddrRegReg regX regY) code) + +getAmode (CmmLit lit) + = do + let imm__2 = litToImm lit + tmp1 <- getNewRegNat II32 + tmp2 <- getNewRegNat II32 + + let code = toOL [ SETHI (HI imm__2) tmp1 + , OR False tmp1 (RIImm (LO imm__2)) tmp2] + + return (Amode (AddrRegReg tmp2 g0) code) + +getAmode other + = do + (reg, code) <- getSomeReg other + let + off = ImmInt 0 + return (Amode (AddrRegImm reg off) code) diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs new file mode 100644 index 0000000000..cf249303e4 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs @@ -0,0 +1,119 @@ +module GHC.CmmToAsm.SPARC.CodeGen.Base ( + InstrBlock, + CondCode(..), + ChildCode64(..), + Amode(..), + + Register(..), + setFormatOfRegister, + + getRegisterReg, + mangleIndexTree +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.SPARC.Instr +import GHC.CmmToAsm.SPARC.Cond +import GHC.CmmToAsm.SPARC.AddrMode +import GHC.CmmToAsm.SPARC.Regs +import GHC.CmmToAsm.Format +import GHC.Platform.Reg + +import GHC.Platform.Regs +import GHC.Driver.Session +import GHC.Cmm +import GHC.Cmm.Ppr.Expr () -- For Outputable instances +import GHC.Platform + +import Outputable +import OrdList + +-------------------------------------------------------------------------------- +-- | 'InstrBlock's are the insn sequences generated by the insn selectors. +-- They are really trees of insns to facilitate fast appending, where a +-- left-to-right traversal yields the insns in the correct order. +-- +type InstrBlock + = OrdList Instr + + +-- | Condition codes passed up the tree. +-- +data CondCode + = CondCode Bool Cond InstrBlock + + +-- | a.k.a "Register64" +-- Reg is the lower 32-bit temporary which contains the result. +-- Use getHiVRegFromLo to find the other VRegUnique. +-- +-- Rules of this simplified insn selection game are therefore that +-- the returned Reg may be modified +-- +data ChildCode64 + = ChildCode64 + InstrBlock + Reg + + +-- | Holds code that references a memory address. +data Amode + = Amode + -- the AddrMode we can use in the instruction + -- that does the real load\/store. + AddrMode + + -- other setup code we have to run first before we can use the + -- above AddrMode. + InstrBlock + + + +-------------------------------------------------------------------------------- +-- | Code to produce a result into a register. +-- If the result must go in a specific register, it comes out as Fixed. +-- Otherwise, the parent can decide which register to put it in. +-- +data Register + = Fixed Format Reg InstrBlock + | Any Format (Reg -> InstrBlock) + + +-- | Change the format field in a Register. 
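The Fixed/Any split above is the central convention of this instruction selector: a Fixed result is already in a particular register, while an Any result hands the caller a function from the destination register to the code that fills it. A minimal sketch of the consuming side, assuming the Instruction class's mkRegRegMoveInstr as used elsewhere in this patch (the helper name is hypothetical):

    -- Hypothetical helper: force a Register result into a chosen destination reg.
    assignTo :: Platform -> Reg -> Register -> InstrBlock
    assignTo platform dst register
      = case register of
          Any   _ mkCode   -> mkCode dst                 -- we get to pick dst
          Fixed _ src code -> code `snocOL` mkRegRegMoveInstr platform src dst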
+setFormatOfRegister + :: Register -> Format -> Register + +setFormatOfRegister reg format + = case reg of + Fixed _ reg code -> Fixed format reg code + Any _ codefn -> Any format codefn + + +-------------------------------------------------------------------------------- +-- | Grab the Reg for a CmmReg +getRegisterReg :: Platform -> CmmReg -> Reg + +getRegisterReg _ (CmmLocal (LocalReg u pk)) + = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk) + +getRegisterReg platform (CmmGlobal mid) + = case globalRegMaybe platform mid of + Just reg -> RegReal reg + Nothing -> pprPanic + "SPARC.CodeGen.Base.getRegisterReg: global is in memory" + (ppr $ CmmGlobal mid) + + +-- Expand CmmRegOff. ToDo: should we do it this way around, or convert +-- CmmExprs into CmmRegOff? +mangleIndexTree :: DynFlags -> CmmExpr -> CmmExpr + +mangleIndexTree dflags (CmmRegOff reg off) + = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)] + where width = typeWidth (cmmRegType dflags reg) + +mangleIndexTree _ _ + = panic "SPARC.CodeGen.Base.mangleIndexTree: no match" diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs new file mode 100644 index 0000000000..e501d799f2 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs @@ -0,0 +1,110 @@ +module GHC.CmmToAsm.SPARC.CodeGen.CondCode ( + getCondCode, + condIntCode, + condFltCode +) + +where + +import GhcPrelude + +import {-# SOURCE #-} GHC.CmmToAsm.SPARC.CodeGen.Gen32 +import GHC.CmmToAsm.SPARC.CodeGen.Base +import GHC.CmmToAsm.SPARC.Instr +import GHC.CmmToAsm.SPARC.Regs +import GHC.CmmToAsm.SPARC.Cond +import GHC.CmmToAsm.SPARC.Imm +import GHC.CmmToAsm.SPARC.Base +import GHC.CmmToAsm.Monad +import GHC.CmmToAsm.Format + +import GHC.Cmm + +import OrdList +import Outputable + + +getCondCode :: CmmExpr -> NatM CondCode +getCondCode (CmmMachOp mop [x, y]) + = + case mop of + MO_F_Eq W32 -> condFltCode EQQ x y + MO_F_Ne W32 -> condFltCode NE x y + MO_F_Gt W32 -> condFltCode GTT x y + MO_F_Ge W32 -> condFltCode GE x y + MO_F_Lt W32 -> condFltCode LTT x y + MO_F_Le W32 -> condFltCode LE x y + + MO_F_Eq W64 -> condFltCode EQQ x y + MO_F_Ne W64 -> condFltCode NE x y + MO_F_Gt W64 -> condFltCode GTT x y + MO_F_Ge W64 -> condFltCode GE x y + MO_F_Lt W64 -> condFltCode LTT x y + MO_F_Le W64 -> condFltCode LE x y + + MO_Eq _ -> condIntCode EQQ x y + MO_Ne _ -> condIntCode NE x y + + MO_S_Gt _ -> condIntCode GTT x y + MO_S_Ge _ -> condIntCode GE x y + MO_S_Lt _ -> condIntCode LTT x y + MO_S_Le _ -> condIntCode LE x y + + MO_U_Gt _ -> condIntCode GU x y + MO_U_Ge _ -> condIntCode GEU x y + MO_U_Lt _ -> condIntCode LU x y + MO_U_Le _ -> condIntCode LEU x y + + _ -> pprPanic "SPARC.CodeGen.CondCode.getCondCode" (ppr (CmmMachOp mop [x,y])) + +getCondCode other = pprPanic "SPARC.CodeGen.CondCode.getCondCode" (ppr other) + + + + + +-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be +-- passed back up the tree. 
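condIntCode and condFltCode below turn a comparison into condition-code state rather than a value: the integer case is a subtract with the condition-code flag set and %g0 as the destination, which is how SPARC spells "cmp". A sketch of that shape using the SUB constructor defined later in this patch (the helper name is invented for the example):

    -- Hypothetical helper mirroring condIntCode's pattern:
    -- "cmp src1, src2" = subtract into %g0, carry off, condition codes on.
    cmpRegs :: Reg -> Reg -> Instr
    cmpRegs src1 src2 = SUB False True src1 (RIReg src2) g0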
+ +condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode +condIntCode cond x (CmmLit (CmmInt y _)) + | fits13Bits y + = do + (src1, code) <- getSomeReg x + let + src2 = ImmInt (fromInteger y) + code' = code `snocOL` SUB False True src1 (RIImm src2) g0 + return (CondCode False cond code') + +condIntCode cond x y = do + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + let + code__2 = code1 `appOL` code2 `snocOL` + SUB False True src1 (RIReg src2) g0 + return (CondCode False cond code__2) + + +condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode +condFltCode cond x y = do + dflags <- getDynFlags + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + tmp <- getNewRegNat FF64 + let + promote x = FxTOy FF32 FF64 x tmp + + pk1 = cmmExprType dflags x + pk2 = cmmExprType dflags y + + code__2 = + if pk1 `cmmEqType` pk2 then + code1 `appOL` code2 `snocOL` + FCMP True (cmmTypeFormat pk1) src1 src2 + else if typeWidth pk1 == W32 then + code1 `snocOL` promote src1 `appOL` code2 `snocOL` + FCMP True FF64 tmp src2 + else + code1 `appOL` code2 `snocOL` promote src2 `snocOL` + FCMP True FF64 src1 tmp + return (CondCode True cond code__2) diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs new file mode 100644 index 0000000000..77732cf70c --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs @@ -0,0 +1,156 @@ +{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} + +-- | Expand out synthetic instructions into single machine instrs. +module GHC.CmmToAsm.SPARC.CodeGen.Expand ( + expandTop +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.SPARC.Instr +import GHC.CmmToAsm.SPARC.Imm +import GHC.CmmToAsm.SPARC.AddrMode +import GHC.CmmToAsm.SPARC.Regs +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg +import GHC.CmmToAsm.Format +import GHC.Cmm + + +import Outputable +import OrdList + +-- | Expand out synthetic instructions in this top level thing +expandTop :: NatCmmDecl RawCmmStatics Instr -> NatCmmDecl RawCmmStatics Instr +expandTop top@(CmmData{}) + = top + +expandTop (CmmProc info lbl live (ListGraph blocks)) + = CmmProc info lbl live (ListGraph $ map expandBlock blocks) + + +-- | Expand out synthetic instructions in this block +expandBlock :: NatBasicBlock Instr -> NatBasicBlock Instr + +expandBlock (BasicBlock label instrs) + = let instrs_ol = expandBlockInstrs instrs + instrs' = fromOL instrs_ol + in BasicBlock label instrs' + + +-- | Expand out some instructions +expandBlockInstrs :: [Instr] -> OrdList Instr +expandBlockInstrs [] = nilOL + +expandBlockInstrs (ii:is) + = let ii_doubleRegs = remapRegPair ii + is_misaligned = expandMisalignedDoubles ii_doubleRegs + + in is_misaligned `appOL` expandBlockInstrs is + + + +-- | In the SPARC instruction set the FP register pairs that are used +-- to hold 64 bit floats are referred to by just the first reg +-- of the pair. Remap our internal reg pairs to the appropriate reg. 
+-- +-- For example: +-- ldd [%l1], (%f0 | %f1) +-- +-- gets mapped to +-- ldd [$l1], %f0 +-- +remapRegPair :: Instr -> Instr +remapRegPair instr + = let patchF reg + = case reg of + RegReal (RealRegSingle _) + -> reg + + RegReal (RealRegPair r1 r2) + + -- sanity checking + | r1 >= 32 + , r1 <= 63 + , r1 `mod` 2 == 0 + , r2 == r1 + 1 + -> RegReal (RealRegSingle r1) + + | otherwise + -> pprPanic "SPARC.CodeGen.Expand: not remapping dodgy looking reg pair " (ppr reg) + + RegVirtual _ + -> pprPanic "SPARC.CodeGen.Expand: not remapping virtual reg " (ppr reg) + + in patchRegsOfInstr instr patchF + + + + +-- Expand out 64 bit load/stores into individual instructions to handle +-- possible double alignment problems. +-- +-- TODO: It'd be better to use a scratch reg instead of the add/sub thing. +-- We might be able to do this faster if we use the UA2007 instr set +-- instead of restricting ourselves to SPARC V9. +-- +expandMisalignedDoubles :: Instr -> OrdList Instr +expandMisalignedDoubles instr + + -- Translate to: + -- add g1,g2,g1 + -- ld [g1],%fn + -- ld [g1+4],%f(n+1) + -- sub g1,g2,g1 -- to restore g1 + | LD FF64 (AddrRegReg r1 r2) fReg <- instr + = toOL [ ADD False False r1 (RIReg r2) r1 + , LD FF32 (AddrRegReg r1 g0) fReg + , LD FF32 (AddrRegImm r1 (ImmInt 4)) (fRegHi fReg) + , SUB False False r1 (RIReg r2) r1 ] + + -- Translate to + -- ld [addr],%fn + -- ld [addr+4],%f(n+1) + | LD FF64 addr fReg <- instr + = let Just addr' = addrOffset addr 4 + in toOL [ LD FF32 addr fReg + , LD FF32 addr' (fRegHi fReg) ] + + -- Translate to: + -- add g1,g2,g1 + -- st %fn,[g1] + -- st %f(n+1),[g1+4] + -- sub g1,g2,g1 -- to restore g1 + | ST FF64 fReg (AddrRegReg r1 r2) <- instr + = toOL [ ADD False False r1 (RIReg r2) r1 + , ST FF32 fReg (AddrRegReg r1 g0) + , ST FF32 (fRegHi fReg) (AddrRegImm r1 (ImmInt 4)) + , SUB False False r1 (RIReg r2) r1 ] + + -- Translate to + -- ld [addr],%fn + -- ld [addr+4],%f(n+1) + | ST FF64 fReg addr <- instr + = let Just addr' = addrOffset addr 4 + in toOL [ ST FF32 fReg addr + , ST FF32 (fRegHi fReg) addr' ] + + -- some other instr + | otherwise + = unitOL instr + + + +-- | The high partner for this float reg. +fRegHi :: Reg -> Reg +fRegHi (RegReal (RealRegSingle r1)) + | r1 >= 32 + , r1 <= 63 + , r1 `mod` 2 == 0 + = (RegReal $ RealRegSingle (r1 + 1)) + +-- Can't take high partner for non-low reg. +fRegHi reg + = pprPanic "SPARC.CodeGen.Expand: can't take fRegHi from " (ppr reg) diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs new file mode 100644 index 0000000000..ee67bd4a9d --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs @@ -0,0 +1,692 @@ +-- | Evaluation of 32 bit values. +module GHC.CmmToAsm.SPARC.CodeGen.Gen32 ( + getSomeReg, + getRegister +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.SPARC.CodeGen.CondCode +import GHC.CmmToAsm.SPARC.CodeGen.Amode +import GHC.CmmToAsm.SPARC.CodeGen.Gen64 +import GHC.CmmToAsm.SPARC.CodeGen.Base +import GHC.CmmToAsm.SPARC.Stack +import GHC.CmmToAsm.SPARC.Instr +import GHC.CmmToAsm.SPARC.Cond +import GHC.CmmToAsm.SPARC.AddrMode +import GHC.CmmToAsm.SPARC.Imm +import GHC.CmmToAsm.SPARC.Regs +import GHC.CmmToAsm.SPARC.Base +import GHC.CmmToAsm.Monad +import GHC.CmmToAsm.Format +import GHC.Platform.Reg + +import GHC.Cmm + +import Control.Monad (liftM) +import GHC.Driver.Session +import OrdList +import Outputable + +-- | The dual to getAnyReg: compute an expression into a register, but +-- we don't mind which one it is. 
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock) +getSomeReg expr = do + r <- getRegister expr + case r of + Any rep code -> do + tmp <- getNewRegNat rep + return (tmp, code tmp) + Fixed _ reg code -> + return (reg, code) + + + +-- | Make code to evaluate a 32 bit expression. +-- +getRegister :: CmmExpr -> NatM Register + +getRegister (CmmReg reg) + = do dflags <- getDynFlags + let platform = targetPlatform dflags + return (Fixed (cmmTypeFormat (cmmRegType dflags reg)) + (getRegisterReg platform reg) nilOL) + +getRegister tree@(CmmRegOff _ _) + = do dflags <- getDynFlags + getRegister (mangleIndexTree dflags tree) + +getRegister (CmmMachOp (MO_UU_Conv W64 W32) + [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 (getHiVRegFromLo rlo) code + +getRegister (CmmMachOp (MO_SS_Conv W64 W32) + [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 (getHiVRegFromLo rlo) code + +getRegister (CmmMachOp (MO_UU_Conv W64 W32) [x]) = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 rlo code + +getRegister (CmmMachOp (MO_SS_Conv W64 W32) [x]) = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 rlo code + + +-- Load a literal float into a float register. +-- The actual literal is stored in a new data area, and we load it +-- at runtime. +getRegister (CmmLit (CmmFloat f W32)) = do + + -- a label for the new data area + lbl <- getNewLabelNat + tmp <- getNewRegNat II32 + + let code dst = toOL [ + -- the data area + LDATA (Section ReadOnlyData lbl) $ RawCmmStatics lbl + [CmmStaticLit (CmmFloat f W32)], + + -- load the literal + SETHI (HI (ImmCLbl lbl)) tmp, + LD II32 (AddrRegImm tmp (LO (ImmCLbl lbl))) dst] + + return (Any FF32 code) + +getRegister (CmmLit (CmmFloat d W64)) = do + lbl <- getNewLabelNat + tmp <- getNewRegNat II32 + let code dst = toOL [ + LDATA (Section ReadOnlyData lbl) $ RawCmmStatics lbl + [CmmStaticLit (CmmFloat d W64)], + SETHI (HI (ImmCLbl lbl)) tmp, + LD II64 (AddrRegImm tmp (LO (ImmCLbl lbl))) dst] + return (Any FF64 code) + + +-- Unary machine ops +getRegister (CmmMachOp mop [x]) + = case mop of + -- Floating point negation ------------------------- + MO_F_Neg W32 -> trivialUFCode FF32 (FNEG FF32) x + MO_F_Neg W64 -> trivialUFCode FF64 (FNEG FF64) x + + + -- Integer negation -------------------------------- + MO_S_Neg rep -> trivialUCode (intFormat rep) (SUB False False g0) x + MO_Not rep -> trivialUCode (intFormat rep) (XNOR False g0) x + + + -- Float word size conversion ---------------------- + MO_FF_Conv W64 W32 -> coerceDbl2Flt x + MO_FF_Conv W32 W64 -> coerceFlt2Dbl x + + + -- Float <-> Signed Int conversion ----------------- + MO_FS_Conv from to -> coerceFP2Int from to x + MO_SF_Conv from to -> coerceInt2FP from to x + + + -- Unsigned integer word size conversions ---------- + + -- If it's the same size, then nothing needs to be done. + MO_UU_Conv from to + | from == to -> conversionNop (intFormat to) x + + -- To narrow an unsigned word, mask out the high bits to simulate what would + -- happen if we copied the value into a smaller register. + MO_UU_Conv W16 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8)) + MO_UU_Conv W32 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8)) + + -- for narrowing 32 bit to 16 bit, don't use a literal mask value like the W16->W8 + -- case because the only way we can load it is via SETHI, which needs 2 ops. + -- Do some shifts to chop out the high bits instead. 
+ MO_UU_Conv W32 W16 + -> do tmpReg <- getNewRegNat II32 + (xReg, xCode) <- getSomeReg x + let code dst + = xCode + `appOL` toOL + [ SLL xReg (RIImm $ ImmInt 16) tmpReg + , SRL tmpReg (RIImm $ ImmInt 16) dst] + + return $ Any II32 code + + -- trivialCode W16 (AND False) x (CmmLit (CmmInt 65535 W16)) + + -- To widen an unsigned word we don't have to do anything. + -- Just leave it in the same register and mark the result as the new size. + MO_UU_Conv W8 W16 -> conversionNop (intFormat W16) x + MO_UU_Conv W8 W32 -> conversionNop (intFormat W32) x + MO_UU_Conv W16 W32 -> conversionNop (intFormat W32) x + + + -- Signed integer word size conversions ------------ + + -- Mask out high bits when narrowing them + MO_SS_Conv W16 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8)) + MO_SS_Conv W32 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8)) + MO_SS_Conv W32 W16 -> trivialCode W16 (AND False) x (CmmLit (CmmInt 65535 W16)) + + -- Sign extend signed words when widening them. + MO_SS_Conv W8 W16 -> integerExtend W8 W16 x + MO_SS_Conv W8 W32 -> integerExtend W8 W32 x + MO_SS_Conv W16 W32 -> integerExtend W16 W32 x + + _ -> panic ("Unknown unary mach op: " ++ show mop) + + +-- Binary machine ops +getRegister (CmmMachOp mop [x, y]) + = case mop of + MO_Eq _ -> condIntReg EQQ x y + MO_Ne _ -> condIntReg NE x y + + MO_S_Gt _ -> condIntReg GTT x y + MO_S_Ge _ -> condIntReg GE x y + MO_S_Lt _ -> condIntReg LTT x y + MO_S_Le _ -> condIntReg LE x y + + MO_U_Gt W32 -> condIntReg GU x y + MO_U_Ge W32 -> condIntReg GEU x y + MO_U_Lt W32 -> condIntReg LU x y + MO_U_Le W32 -> condIntReg LEU x y + + MO_U_Gt W16 -> condIntReg GU x y + MO_U_Ge W16 -> condIntReg GEU x y + MO_U_Lt W16 -> condIntReg LU x y + MO_U_Le W16 -> condIntReg LEU x y + + MO_Add W32 -> trivialCode W32 (ADD False False) x y + MO_Sub W32 -> trivialCode W32 (SUB False False) x y + + MO_S_MulMayOflo rep -> imulMayOflo rep x y + + MO_S_Quot W32 -> idiv True False x y + MO_U_Quot W32 -> idiv False False x y + + MO_S_Rem W32 -> irem True x y + MO_U_Rem W32 -> irem False x y + + MO_F_Eq _ -> condFltReg EQQ x y + MO_F_Ne _ -> condFltReg NE x y + + MO_F_Gt _ -> condFltReg GTT x y + MO_F_Ge _ -> condFltReg GE x y + MO_F_Lt _ -> condFltReg LTT x y + MO_F_Le _ -> condFltReg LE x y + + MO_F_Add w -> trivialFCode w FADD x y + MO_F_Sub w -> trivialFCode w FSUB x y + MO_F_Mul w -> trivialFCode w FMUL x y + MO_F_Quot w -> trivialFCode w FDIV x y + + MO_And rep -> trivialCode rep (AND False) x y + MO_Or rep -> trivialCode rep (OR False) x y + MO_Xor rep -> trivialCode rep (XOR False) x y + + MO_Mul rep -> trivialCode rep (SMUL False) x y + + MO_Shl rep -> trivialCode rep SLL x y + MO_U_Shr rep -> trivialCode rep SRL x y + MO_S_Shr rep -> trivialCode rep SRA x y + + _ -> pprPanic "getRegister(sparc) - binary CmmMachOp (1)" (pprMachOp mop) + +getRegister (CmmLoad mem pk) = do + Amode src code <- getAmode mem + let + code__2 dst = code `snocOL` LD (cmmTypeFormat pk) src dst + return (Any (cmmTypeFormat pk) code__2) + +getRegister (CmmLit (CmmInt i _)) + | fits13Bits i + = let + src = ImmInt (fromInteger i) + code dst = unitOL (OR False g0 (RIImm src) dst) + in + return (Any II32 code) + +getRegister (CmmLit lit) + = let imm = litToImm lit + code dst = toOL [ + SETHI (HI imm) dst, + OR False dst (RIImm (LO imm)) dst] + in return (Any II32 code) + + +getRegister _ + = panic "SPARC.CodeGen.Gen32.getRegister: no match" + + +-- | sign extend and widen +integerExtend + :: Width -- ^ width of source expression + -> Width -- ^ width of result + -> CmmExpr 
-- ^ source expression + -> NatM Register + +integerExtend from to expr + = do -- load the expr into some register + (reg, e_code) <- getSomeReg expr + tmp <- getNewRegNat II32 + let bitCount + = case (from, to) of + (W8, W32) -> 24 + (W16, W32) -> 16 + (W8, W16) -> 24 + _ -> panic "SPARC.CodeGen.Gen32: no match" + let code dst + = e_code + + -- local shift word left to load the sign bit + `snocOL` SLL reg (RIImm (ImmInt bitCount)) tmp + + -- arithmetic shift right to sign extend + `snocOL` SRA tmp (RIImm (ImmInt bitCount)) dst + + return (Any (intFormat to) code) + + +-- | For nop word format conversions we set the resulting value to have the +-- required size, but don't need to generate any actual code. +-- +conversionNop + :: Format -> CmmExpr -> NatM Register + +conversionNop new_rep expr + = do e_code <- getRegister expr + return (setFormatOfRegister e_code new_rep) + + + +-- | Generate an integer division instruction. +idiv :: Bool -> Bool -> CmmExpr -> CmmExpr -> NatM Register + +-- For unsigned division with a 32 bit numerator, +-- we can just clear the Y register. +idiv False cc x y + = do + (a_reg, a_code) <- getSomeReg x + (b_reg, b_code) <- getSomeReg y + + let code dst + = a_code + `appOL` b_code + `appOL` toOL + [ WRY g0 g0 + , UDIV cc a_reg (RIReg b_reg) dst] + + return (Any II32 code) + + +-- For _signed_ division with a 32 bit numerator, +-- we have to sign extend the numerator into the Y register. +idiv True cc x y + = do + (a_reg, a_code) <- getSomeReg x + (b_reg, b_code) <- getSomeReg y + + tmp <- getNewRegNat II32 + + let code dst + = a_code + `appOL` b_code + `appOL` toOL + [ SRA a_reg (RIImm (ImmInt 16)) tmp -- sign extend + , SRA tmp (RIImm (ImmInt 16)) tmp + + , WRY tmp g0 + , SDIV cc a_reg (RIReg b_reg) dst] + + return (Any II32 code) + + +-- | Do an integer remainder. +-- +-- NOTE: The SPARC v8 architecture manual says that integer division +-- instructions _may_ generate a remainder, depending on the implementation. +-- If so it is _recommended_ that the remainder is placed in the Y register. +-- +-- The UltraSparc 2007 manual says Y is _undefined_ after division. +-- +-- The SPARC T2 doesn't store the remainder, not sure about the others. +-- It's probably best not to worry about it, and just generate our own +-- remainders. +-- +irem :: Bool -> CmmExpr -> CmmExpr -> NatM Register + +-- For unsigned operands: +-- Division is between a 64 bit numerator and a 32 bit denominator, +-- so we still have to clear the Y register. +irem False x y + = do + (a_reg, a_code) <- getSomeReg x + (b_reg, b_code) <- getSomeReg y + + tmp_reg <- getNewRegNat II32 + + let code dst + = a_code + `appOL` b_code + `appOL` toOL + [ WRY g0 g0 + , UDIV False a_reg (RIReg b_reg) tmp_reg + , UMUL False tmp_reg (RIReg b_reg) tmp_reg + , SUB False False a_reg (RIReg tmp_reg) dst] + + return (Any II32 code) + + + +-- For signed operands: +-- Make sure to sign extend into the Y register, or the remainder +-- will have the wrong sign when the numerator is negative. +-- +-- TODO: When sign extending, GCC only shifts the a_reg right by 17 bits, +-- not the full 32. Not sure why this is, something to do with overflow? +-- If anyone cares enough about the speed of signed remainder they +-- can work it out themselves (then tell me). 
-- BL 2009/01/20 +irem True x y + = do + (a_reg, a_code) <- getSomeReg x + (b_reg, b_code) <- getSomeReg y + + tmp1_reg <- getNewRegNat II32 + tmp2_reg <- getNewRegNat II32 + + let code dst + = a_code + `appOL` b_code + `appOL` toOL + [ SRA a_reg (RIImm (ImmInt 16)) tmp1_reg -- sign extend + , SRA tmp1_reg (RIImm (ImmInt 16)) tmp1_reg -- sign extend + , WRY tmp1_reg g0 + + , SDIV False a_reg (RIReg b_reg) tmp2_reg + , SMUL False tmp2_reg (RIReg b_reg) tmp2_reg + , SUB False False a_reg (RIReg tmp2_reg) dst] + + return (Any II32 code) + + +imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register +imulMayOflo rep a b + = do + (a_reg, a_code) <- getSomeReg a + (b_reg, b_code) <- getSomeReg b + res_lo <- getNewRegNat II32 + res_hi <- getNewRegNat II32 + + let shift_amt = case rep of + W32 -> 31 + W64 -> 63 + _ -> panic "shift_amt" + + let code dst = a_code `appOL` b_code `appOL` + toOL [ + SMUL False a_reg (RIReg b_reg) res_lo, + RDY res_hi, + SRA res_lo (RIImm (ImmInt shift_amt)) res_lo, + SUB False False res_lo (RIReg res_hi) dst + ] + return (Any II32 code) + + +-- ----------------------------------------------------------------------------- +-- 'trivial*Code': deal with trivial instructions + +-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode', +-- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions. +-- Only look for constants on the right hand side, because that's +-- where the generic optimizer will have put them. + +-- Similarly, for unary instructions, we don't have to worry about +-- matching an StInt as the argument, because genericOpt will already +-- have handled the constant-folding. + +trivialCode + :: Width + -> (Reg -> RI -> Reg -> Instr) + -> CmmExpr + -> CmmExpr + -> NatM Register + +trivialCode _ instr x (CmmLit (CmmInt y _)) + | fits13Bits y + = do + (src1, code) <- getSomeReg x + let + src2 = ImmInt (fromInteger y) + code__2 dst = code `snocOL` instr src1 (RIImm src2) dst + return (Any II32 code__2) + + +trivialCode _ instr x y = do + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + let + code__2 dst = code1 `appOL` code2 `snocOL` + instr src1 (RIReg src2) dst + return (Any II32 code__2) + + +trivialFCode + :: Width + -> (Format -> Reg -> Reg -> Reg -> Instr) + -> CmmExpr + -> CmmExpr + -> NatM Register + +trivialFCode pk instr x y = do + dflags <- getDynFlags + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + tmp <- getNewRegNat FF64 + let + promote x = FxTOy FF32 FF64 x tmp + + pk1 = cmmExprType dflags x + pk2 = cmmExprType dflags y + + code__2 dst = + if pk1 `cmmEqType` pk2 then + code1 `appOL` code2 `snocOL` + instr (floatFormat pk) src1 src2 dst + else if typeWidth pk1 == W32 then + code1 `snocOL` promote src1 `appOL` code2 `snocOL` + instr FF64 tmp src2 dst + else + code1 `appOL` code2 `snocOL` promote src2 `snocOL` + instr FF64 src1 tmp dst + return (Any (cmmTypeFormat $ if pk1 `cmmEqType` pk2 then pk1 else cmmFloat W64) + code__2) + + + +trivialUCode + :: Format + -> (RI -> Reg -> Instr) + -> CmmExpr + -> NatM Register + +trivialUCode format instr x = do + (src, code) <- getSomeReg x + let + code__2 dst = code `snocOL` instr (RIReg src) dst + return (Any format code__2) + + +trivialUFCode + :: Format + -> (Reg -> Reg -> Instr) + -> CmmExpr + -> NatM Register + +trivialUFCode pk instr x = do + (src, code) <- getSomeReg x + let + code__2 dst = code `snocOL` instr src dst + return (Any pk code__2) + + + + +-- Coercions ------------------------------------------------------------------- + +-- | Coerce a 
integer value to floating point +coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register +coerceInt2FP width1 width2 x = do + (src, code) <- getSomeReg x + let + code__2 dst = code `appOL` toOL [ + ST (intFormat width1) src (spRel (-2)), + LD (intFormat width1) (spRel (-2)) dst, + FxTOy (intFormat width1) (floatFormat width2) dst dst] + return (Any (floatFormat $ width2) code__2) + + + +-- | Coerce a floating point value to integer +-- +-- NOTE: On sparc v9 there are no instructions to move a value from an +-- FP register directly to an int register, so we have to use a load/store. +-- +coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register +coerceFP2Int width1 width2 x + = do let fformat1 = floatFormat width1 + fformat2 = floatFormat width2 + + iformat2 = intFormat width2 + + (fsrc, code) <- getSomeReg x + fdst <- getNewRegNat fformat2 + + let code2 dst + = code + `appOL` toOL + -- convert float to int format, leaving it in a float reg. + [ FxTOy fformat1 iformat2 fsrc fdst + + -- store the int into mem, then load it back to move + -- it into an actual int reg. + , ST fformat2 fdst (spRel (-2)) + , LD iformat2 (spRel (-2)) dst] + + return (Any iformat2 code2) + + +-- | Coerce a double precision floating point value to single precision. +coerceDbl2Flt :: CmmExpr -> NatM Register +coerceDbl2Flt x = do + (src, code) <- getSomeReg x + return (Any FF32 (\dst -> code `snocOL` FxTOy FF64 FF32 src dst)) + + +-- | Coerce a single precision floating point value to double precision +coerceFlt2Dbl :: CmmExpr -> NatM Register +coerceFlt2Dbl x = do + (src, code) <- getSomeReg x + return (Any FF64 (\dst -> code `snocOL` FxTOy FF32 FF64 src dst)) + + + + +-- Condition Codes ------------------------------------------------------------- +-- +-- Evaluate a comparison, and get the result into a register. +-- +-- Do not fill the delay slots here. you will confuse the register allocator. 
+-- +condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register +condIntReg EQQ x (CmmLit (CmmInt 0 _)) = do + (src, code) <- getSomeReg x + let + code__2 dst = code `appOL` toOL [ + SUB False True g0 (RIReg src) g0, + SUB True False g0 (RIImm (ImmInt (-1))) dst] + return (Any II32 code__2) + +condIntReg EQQ x y = do + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + let + code__2 dst = code1 `appOL` code2 `appOL` toOL [ + XOR False src1 (RIReg src2) dst, + SUB False True g0 (RIReg dst) g0, + SUB True False g0 (RIImm (ImmInt (-1))) dst] + return (Any II32 code__2) + +condIntReg NE x (CmmLit (CmmInt 0 _)) = do + (src, code) <- getSomeReg x + let + code__2 dst = code `appOL` toOL [ + SUB False True g0 (RIReg src) g0, + ADD True False g0 (RIImm (ImmInt 0)) dst] + return (Any II32 code__2) + +condIntReg NE x y = do + (src1, code1) <- getSomeReg x + (src2, code2) <- getSomeReg y + let + code__2 dst = code1 `appOL` code2 `appOL` toOL [ + XOR False src1 (RIReg src2) dst, + SUB False True g0 (RIReg dst) g0, + ADD True False g0 (RIImm (ImmInt 0)) dst] + return (Any II32 code__2) + +condIntReg cond x y = do + bid1 <- liftM (\a -> seq a a) getBlockIdNat + bid2 <- liftM (\a -> seq a a) getBlockIdNat + CondCode _ cond cond_code <- condIntCode cond x y + let + code__2 dst + = cond_code + `appOL` toOL + [ BI cond False bid1 + , NOP + + , OR False g0 (RIImm (ImmInt 0)) dst + , BI ALWAYS False bid2 + , NOP + + , NEWBLOCK bid1 + , OR False g0 (RIImm (ImmInt 1)) dst + , BI ALWAYS False bid2 + , NOP + + , NEWBLOCK bid2] + + return (Any II32 code__2) + + +condFltReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register +condFltReg cond x y = do + bid1 <- liftM (\a -> seq a a) getBlockIdNat + bid2 <- liftM (\a -> seq a a) getBlockIdNat + + CondCode _ cond cond_code <- condFltCode cond x y + let + code__2 dst + = cond_code + `appOL` toOL + [ NOP + , BF cond False bid1 + , NOP + + , OR False g0 (RIImm (ImmInt 0)) dst + , BI ALWAYS False bid2 + , NOP + + , NEWBLOCK bid1 + , OR False g0 (RIImm (ImmInt 1)) dst + , BI ALWAYS False bid2 + , NOP + + , NEWBLOCK bid2 ] + + return (Any II32 code__2) diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot new file mode 100644 index 0000000000..cf9553a63c --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot @@ -0,0 +1,16 @@ + +module GHC.CmmToAsm.SPARC.CodeGen.Gen32 ( + getSomeReg, + getRegister +) + +where + +import GHC.CmmToAsm.SPARC.CodeGen.Base +import GHC.CmmToAsm.Monad +import GHC.Platform.Reg + +import GHC.Cmm + +getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock) +getRegister :: CmmExpr -> NatM Register diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs new file mode 100644 index 0000000000..43807ec027 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs @@ -0,0 +1,216 @@ +-- | Evaluation of 64 bit values on 32 bit platforms. 
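A convention that runs through the 64-bit code below: only the low half's virtual register is passed around (inside ChildCode64 from CodeGen/Base.hs above), and the high half is always recovered with getHiVRegFromLo. A small sketch naming both halves (the helper is hypothetical; the types are the ones defined above):

    -- Hypothetical convenience: expose both 32-bit halves of a 64-bit value.
    childCode64Regs :: ChildCode64 -> (InstrBlock, Reg, Reg)   -- (setup code, lo, hi)
    childCode64Regs (ChildCode64 code rlo) = (code, rlo, getHiVRegFromLo rlo)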
+module GHC.CmmToAsm.SPARC.CodeGen.Gen64 ( + assignMem_I64Code, + assignReg_I64Code, + iselExpr64 +) + +where + +import GhcPrelude + +import {-# SOURCE #-} GHC.CmmToAsm.SPARC.CodeGen.Gen32 +import GHC.CmmToAsm.SPARC.CodeGen.Base +import GHC.CmmToAsm.SPARC.CodeGen.Amode +import GHC.CmmToAsm.SPARC.Regs +import GHC.CmmToAsm.SPARC.AddrMode +import GHC.CmmToAsm.SPARC.Imm +import GHC.CmmToAsm.SPARC.Instr +-- GHC.CmmToAsm.SPARC.Ppr() +import GHC.CmmToAsm.Monad +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.Format +import GHC.Platform.Reg + +import GHC.Cmm + +import GHC.Driver.Session +import OrdList +import Outputable + +-- | Code to assign a 64 bit value to memory. +assignMem_I64Code + :: CmmExpr -- ^ expr producing the destination address + -> CmmExpr -- ^ expr producing the source value. + -> NatM InstrBlock + +assignMem_I64Code addrTree valueTree + = do + ChildCode64 vcode rlo <- iselExpr64 valueTree + + (src, acode) <- getSomeReg addrTree + let + rhi = getHiVRegFromLo rlo + + -- Big-endian store + mov_hi = ST II32 rhi (AddrRegImm src (ImmInt 0)) + mov_lo = ST II32 rlo (AddrRegImm src (ImmInt 4)) + + code = vcode `appOL` acode `snocOL` mov_hi `snocOL` mov_lo + +{- pprTrace "assignMem_I64Code" + (vcat [ text "addrTree: " <+> ppr addrTree + , text "valueTree: " <+> ppr valueTree + , text "vcode:" + , vcat $ map ppr $ fromOL vcode + , text "" + , text "acode:" + , vcat $ map ppr $ fromOL acode ]) + $ -} + return code + + +-- | Code to assign a 64 bit value to a register. +assignReg_I64Code + :: CmmReg -- ^ the destination register + -> CmmExpr -- ^ expr producing the source value + -> NatM InstrBlock + +assignReg_I64Code (CmmLocal (LocalReg u_dst pk)) valueTree + = do + ChildCode64 vcode r_src_lo <- iselExpr64 valueTree + let + r_dst_lo = RegVirtual $ mkVirtualReg u_dst (cmmTypeFormat pk) + r_dst_hi = getHiVRegFromLo r_dst_lo + r_src_hi = getHiVRegFromLo r_src_lo + mov_lo = mkMOV r_src_lo r_dst_lo + mov_hi = mkMOV r_src_hi r_dst_hi + mkMOV sreg dreg = OR False g0 (RIReg sreg) dreg + + return (vcode `snocOL` mov_hi `snocOL` mov_lo) + +assignReg_I64Code _ _ + = panic "assignReg_I64Code(sparc): invalid lvalue" + + + + +-- | Get the value of an expression into a 64 bit register. 
+ +iselExpr64 :: CmmExpr -> NatM ChildCode64 + +-- Load a 64 bit word +iselExpr64 (CmmLoad addrTree ty) + | isWord64 ty + = do Amode amode addr_code <- getAmode addrTree + let result + + | AddrRegReg r1 r2 <- amode + = do rlo <- getNewRegNat II32 + tmp <- getNewRegNat II32 + let rhi = getHiVRegFromLo rlo + + return $ ChildCode64 + ( addr_code + `appOL` toOL + [ ADD False False r1 (RIReg r2) tmp + , LD II32 (AddrRegImm tmp (ImmInt 0)) rhi + , LD II32 (AddrRegImm tmp (ImmInt 4)) rlo ]) + rlo + + | AddrRegImm r1 (ImmInt i) <- amode + = do rlo <- getNewRegNat II32 + let rhi = getHiVRegFromLo rlo + + return $ ChildCode64 + ( addr_code + `appOL` toOL + [ LD II32 (AddrRegImm r1 (ImmInt $ 0 + i)) rhi + , LD II32 (AddrRegImm r1 (ImmInt $ 4 + i)) rlo ]) + rlo + + | otherwise + = panic "SPARC.CodeGen.Gen64: no match" + + result + + +-- Add a literal to a 64 bit integer +iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) + = do ChildCode64 code1 r1_lo <- iselExpr64 e1 + let r1_hi = getHiVRegFromLo r1_lo + + r_dst_lo <- getNewRegNat II32 + let r_dst_hi = getHiVRegFromLo r_dst_lo + + let code = code1 + `appOL` toOL + [ ADD False True r1_lo (RIImm (ImmInteger i)) r_dst_lo + , ADD True False r1_hi (RIReg g0) r_dst_hi ] + + return $ ChildCode64 code r_dst_lo + + +-- Addition of II64 +iselExpr64 (CmmMachOp (MO_Add _) [e1, e2]) + = do ChildCode64 code1 r1_lo <- iselExpr64 e1 + let r1_hi = getHiVRegFromLo r1_lo + + ChildCode64 code2 r2_lo <- iselExpr64 e2 + let r2_hi = getHiVRegFromLo r2_lo + + r_dst_lo <- getNewRegNat II32 + let r_dst_hi = getHiVRegFromLo r_dst_lo + + let code = code1 + `appOL` code2 + `appOL` toOL + [ ADD False True r1_lo (RIReg r2_lo) r_dst_lo + , ADD True False r1_hi (RIReg r2_hi) r_dst_hi ] + + return $ ChildCode64 code r_dst_lo + + +iselExpr64 (CmmReg (CmmLocal (LocalReg uq ty))) + | isWord64 ty + = do + r_dst_lo <- getNewRegNat II32 + let r_dst_hi = getHiVRegFromLo r_dst_lo + r_src_lo = RegVirtual $ mkVirtualReg uq II32 + r_src_hi = getHiVRegFromLo r_src_lo + mov_lo = mkMOV r_src_lo r_dst_lo + mov_hi = mkMOV r_src_hi r_dst_hi + mkMOV sreg dreg = OR False g0 (RIReg sreg) dreg + return ( + ChildCode64 (toOL [mov_hi, mov_lo]) r_dst_lo + ) + +-- Convert something into II64 +iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) + = do + r_dst_lo <- getNewRegNat II32 + let r_dst_hi = getHiVRegFromLo r_dst_lo + + -- compute expr and load it into r_dst_lo + (a_reg, a_code) <- getSomeReg expr + + dflags <- getDynFlags + let platform = targetPlatform dflags + code = a_code + `appOL` toOL + [ mkRegRegMoveInstr platform g0 r_dst_hi -- clear high 32 bits + , mkRegRegMoveInstr platform a_reg r_dst_lo ] + + return $ ChildCode64 code r_dst_lo + +-- only W32 supported for now +iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) + = do + r_dst_lo <- getNewRegNat II32 + let r_dst_hi = getHiVRegFromLo r_dst_lo + + -- compute expr and load it into r_dst_lo + (a_reg, a_code) <- getSomeReg expr + + dflags <- getDynFlags + let platform = targetPlatform dflags + code = a_code + `appOL` toOL + [ SRA a_reg (RIImm (ImmInt 31)) r_dst_hi + , mkRegRegMoveInstr platform a_reg r_dst_lo ] + + return $ ChildCode64 code r_dst_lo + + +iselExpr64 expr + = pprPanic "iselExpr64(sparc)" (ppr expr) diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs new file mode 100644 index 0000000000..f8648c4549 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs @@ -0,0 +1,69 @@ +-- | One ounce of sanity checking is worth 10000000000000000 ounces +-- of staring blindly 
at assembly code trying to find the problem.. +module GHC.CmmToAsm.SPARC.CodeGen.Sanity ( + checkBlock +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.SPARC.Instr +import GHC.CmmToAsm.SPARC.Ppr () -- For Outputable instances +import GHC.CmmToAsm.Instr + +import GHC.Cmm + +import Outputable + + +-- | Enforce intra-block invariants. +-- +checkBlock :: CmmBlock + -> NatBasicBlock Instr + -> NatBasicBlock Instr + +checkBlock cmm block@(BasicBlock _ instrs) + | checkBlockInstrs instrs + = block + + | otherwise + = pprPanic + ("SPARC.CodeGen: bad block\n") + ( vcat [ text " -- cmm -----------------\n" + , ppr cmm + , text " -- native code ---------\n" + , ppr block ]) + + +checkBlockInstrs :: [Instr] -> Bool +checkBlockInstrs ii + + -- An unconditional jumps end the block. + -- There must be an unconditional jump in the block, otherwise + -- the register liveness determinator will get the liveness + -- information wrong. + -- + -- If the block ends with a cmm call that never returns + -- then there can be unreachable instructions after the jump, + -- but we don't mind here. + -- + | instr : NOP : _ <- ii + , isUnconditionalJump instr + = True + + -- All jumps must have a NOP in their branch delay slot. + -- The liveness determinator and register allocators aren't smart + -- enough to handle branch delay slots. + -- + | instr : NOP : is <- ii + , isJumpishInstr instr + = checkBlockInstrs is + + -- keep checking + | _:i2:is <- ii + = checkBlockInstrs (i2:is) + + -- this block is no good + | otherwise + = False diff --git a/compiler/GHC/CmmToAsm/SPARC/Cond.hs b/compiler/GHC/CmmToAsm/SPARC/Cond.hs new file mode 100644 index 0000000000..89b64b7c3a --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/Cond.hs @@ -0,0 +1,54 @@ +module GHC.CmmToAsm.SPARC.Cond ( + Cond(..), + condUnsigned, + condToSigned, + condToUnsigned +) + +where + +import GhcPrelude + +-- | Branch condition codes. +data Cond + = ALWAYS + | EQQ + | GE + | GEU + | GTT + | GU + | LE + | LEU + | LTT + | LU + | NE + | NEG + | NEVER + | POS + | VC + | VS + deriving Eq + + +condUnsigned :: Cond -> Bool +condUnsigned GU = True +condUnsigned LU = True +condUnsigned GEU = True +condUnsigned LEU = True +condUnsigned _ = False + + +condToSigned :: Cond -> Cond +condToSigned GU = GTT +condToSigned LU = LTT +condToSigned GEU = GE +condToSigned LEU = LE +condToSigned x = x + + +condToUnsigned :: Cond -> Cond +condToUnsigned GTT = GU +condToUnsigned LTT = LU +condToUnsigned GE = GEU +condToUnsigned LE = LEU +condToUnsigned x = x diff --git a/compiler/GHC/CmmToAsm/SPARC/Imm.hs b/compiler/GHC/CmmToAsm/SPARC/Imm.hs new file mode 100644 index 0000000000..71b0257ac5 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/Imm.hs @@ -0,0 +1,67 @@ +module GHC.CmmToAsm.SPARC.Imm ( + -- immediate values + Imm(..), + strImmLit, + litToImm +) + +where + +import GhcPrelude + +import GHC.Cmm +import GHC.Cmm.CLabel + +import Outputable + +-- | An immediate value. +-- Not all of these are directly representable by the machine. +-- Things like ImmLit are slurped out and put in a data segment instead. +-- +data Imm + = ImmInt Int + + -- Sigh. + | ImmInteger Integer + + -- AbstractC Label (with baggage) + | ImmCLbl CLabel + + -- Simple string + | ImmLit SDoc + | ImmIndex CLabel Int + | ImmFloat Rational + | ImmDouble Rational + + | ImmConstantSum Imm Imm + | ImmConstantDiff Imm Imm + + | LO Imm + | HI Imm + + +-- | Create a ImmLit containing this string. +strImmLit :: String -> Imm +strImmLit s = ImmLit (text s) + + +-- | Convert a CmmLit to an Imm. 
+-- Narrow to the width: a CmmInt might be out of +-- range, but we assume that ImmInteger only contains +-- in-range values. A signed value should be fine here. +-- +litToImm :: CmmLit -> Imm +litToImm lit + = case lit of + CmmInt i w -> ImmInteger (narrowS w i) + CmmFloat f W32 -> ImmFloat f + CmmFloat f W64 -> ImmDouble f + CmmLabel l -> ImmCLbl l + CmmLabelOff l off -> ImmIndex l off + + CmmLabelDiffOff l1 l2 off _ + -> ImmConstantSum + (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2)) + (ImmInt off) + + _ -> panic "SPARC.Regs.litToImm: no match" diff --git a/compiler/GHC/CmmToAsm/SPARC/Instr.hs b/compiler/GHC/CmmToAsm/SPARC/Instr.hs new file mode 100644 index 0000000000..ec74d3723b --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/Instr.hs @@ -0,0 +1,481 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Machine-dependent assembly language +-- +-- (c) The University of Glasgow 1993-2004 +-- +----------------------------------------------------------------------------- +#include "HsVersions.h" + +module GHC.CmmToAsm.SPARC.Instr ( + RI(..), + riZero, + + fpRelEA, + moveSp, + + isUnconditionalJump, + + Instr(..), + maxSpillSlots +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.SPARC.Stack +import GHC.CmmToAsm.SPARC.Imm +import GHC.CmmToAsm.SPARC.AddrMode +import GHC.CmmToAsm.SPARC.Cond +import GHC.CmmToAsm.SPARC.Regs +import GHC.CmmToAsm.SPARC.Base +import GHC.CmmToAsm.Reg.Target +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg.Class +import GHC.Platform.Reg +import GHC.CmmToAsm.Format + +import GHC.Cmm.CLabel +import GHC.Platform.Regs +import GHC.Cmm.BlockId +import GHC.Driver.Session +import GHC.Cmm +import FastString +import Outputable +import GHC.Platform + + +-- | Register or immediate +data RI + = RIReg Reg + | RIImm Imm + +-- | Check if a RI represents a zero value. +-- - a literal zero +-- - register %g0, which is always zero. +-- +riZero :: RI -> Bool +riZero (RIImm (ImmInt 0)) = True +riZero (RIImm (ImmInteger 0)) = True +riZero (RIReg (RegReal (RealRegSingle 0))) = True +riZero _ = False + + +-- | Calculate the effective address which would be used by the +-- corresponding fpRel sequence. +fpRelEA :: Int -> Reg -> Instr +fpRelEA n dst + = ADD False False fp (RIImm (ImmInt (n * wordLength))) dst + + +-- | Code to shift the stack pointer by n words. +moveSp :: Int -> Instr +moveSp n + = ADD False False sp (RIImm (ImmInt (n * wordLength))) sp + +-- | An instruction that will cause the one after it never to be exectuted +isUnconditionalJump :: Instr -> Bool +isUnconditionalJump ii + = case ii of + CALL{} -> True + JMP{} -> True + JMP_TBL{} -> True + BI ALWAYS _ _ -> True + BF ALWAYS _ _ -> True + _ -> False + + +-- | instance for sparc instruction set +instance Instruction Instr where + regUsageOfInstr = sparc_regUsageOfInstr + patchRegsOfInstr = sparc_patchRegsOfInstr + isJumpishInstr = sparc_isJumpishInstr + jumpDestsOfInstr = sparc_jumpDestsOfInstr + patchJumpInstr = sparc_patchJumpInstr + mkSpillInstr = sparc_mkSpillInstr + mkLoadInstr = sparc_mkLoadInstr + takeDeltaInstr = sparc_takeDeltaInstr + isMetaInstr = sparc_isMetaInstr + mkRegRegMoveInstr = sparc_mkRegRegMoveInstr + takeRegRegMoveInstr = sparc_takeRegRegMoveInstr + mkJumpInstr = sparc_mkJumpInstr + mkStackAllocInstr = panic "no sparc_mkStackAllocInstr" + mkStackDeallocInstr = panic "no sparc_mkStackDeallocInstr" + + +-- | SPARC instruction set. +-- Not complete. This is only the ones we need. 
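SPARC arithmetic immediates are only 13 bits, so a full 32-bit constant has to be built with a SETHI of the upper bits followed by an OR of the lower bits, using the HI and LO immediate forms defined just above; that pair shows up throughout Gen32.hs and Amode.hs earlier in this patch. A hedged helper capturing the idiom with the constructors defined below (the helper itself is not part of the patch):

    -- Hypothetical helper: materialise an immediate into a register.
    loadImm32 :: Imm -> Reg -> OrdList Instr
    loadImm32 (ImmInt i) dst
      | fits13Bits i
      = unitOL (OR False g0 (RIImm (ImmInt i)) dst)    -- fits in one instruction
    loadImm32 imm dst
      = toOL [ SETHI (HI imm) dst                      -- upper 22 bits
             , OR False dst (RIImm (LO imm)) dst ]     -- lower 10 bits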
+-- +data Instr + + -- meta ops -------------------------------------------------- + -- comment pseudo-op + = COMMENT FastString + + -- some static data spat out during code generation. + -- Will be extracted before pretty-printing. + | LDATA Section RawCmmStatics + + -- Start a new basic block. Useful during codegen, removed later. + -- Preceding instruction should be a jump, as per the invariants + -- for a BasicBlock (see Cmm). + | NEWBLOCK BlockId + + -- specify current stack offset for benefit of subsequent passes. + | DELTA Int + + -- real instrs ----------------------------------------------- + -- Loads and stores. + | LD Format AddrMode Reg -- format, src, dst + | ST Format Reg AddrMode -- format, src, dst + + -- Int Arithmetic. + -- x: add/sub with carry bit. + -- In SPARC V9 addx and friends were renamed addc. + -- + -- cc: modify condition codes + -- + | ADD Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst + | SUB Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst + + | UMUL Bool Reg RI Reg -- cc?, src1, src2, dst + | SMUL Bool Reg RI Reg -- cc?, src1, src2, dst + + + -- The SPARC divide instructions perform 64bit by 32bit division + -- The Y register is xored into the first operand. + + -- On _some implementations_ the Y register is overwritten by + -- the remainder, so we have to make sure it is 0 each time. + + -- dst <- ((Y `shiftL` 32) `or` src1) `div` src2 + | UDIV Bool Reg RI Reg -- cc?, src1, src2, dst + | SDIV Bool Reg RI Reg -- cc?, src1, src2, dst + + | RDY Reg -- move contents of Y register to reg + | WRY Reg Reg -- Y <- src1 `xor` src2 + + -- Logic operations. + | AND Bool Reg RI Reg -- cc?, src1, src2, dst + | ANDN Bool Reg RI Reg -- cc?, src1, src2, dst + | OR Bool Reg RI Reg -- cc?, src1, src2, dst + | ORN Bool Reg RI Reg -- cc?, src1, src2, dst + | XOR Bool Reg RI Reg -- cc?, src1, src2, dst + | XNOR Bool Reg RI Reg -- cc?, src1, src2, dst + | SLL Reg RI Reg -- src1, src2, dst + | SRL Reg RI Reg -- src1, src2, dst + | SRA Reg RI Reg -- src1, src2, dst + + -- Load immediates. + | SETHI Imm Reg -- src, dst + + -- Do nothing. + -- Implemented by the assembler as SETHI 0, %g0, but worth an alias + | NOP + + -- Float Arithmetic. + -- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single + -- instructions right up until we spit them out. + -- + | FABS Format Reg Reg -- src dst + | FADD Format Reg Reg Reg -- src1, src2, dst + | FCMP Bool Format Reg Reg -- exception?, src1, src2, dst + | FDIV Format Reg Reg Reg -- src1, src2, dst + | FMOV Format Reg Reg -- src, dst + | FMUL Format Reg Reg Reg -- src1, src2, dst + | FNEG Format Reg Reg -- src, dst + | FSQRT Format Reg Reg -- src, dst + | FSUB Format Reg Reg Reg -- src1, src2, dst + | FxTOy Format Format Reg Reg -- src, dst + + -- Jumping around. + | BI Cond Bool BlockId -- cond, annul?, target + | BF Cond Bool BlockId -- cond, annul?, target + + | JMP AddrMode -- target + + -- With a tabled jump we know all the possible destinations. + -- We also need this info so we can work out what regs are live across the jump. + -- + | JMP_TBL AddrMode [Maybe BlockId] CLabel + + | CALL (Either Imm Reg) Int Bool -- target, args, terminal + + +-- | regUsage returns the sets of src and destination registers used +-- by a particular instruction. Machine registers that are +-- pre-allocated to stgRegs are filtered out, because they are +-- uninteresting from a register allocation standpoint. (We wouldn't +-- want them to end up on the free list!) 
As far as we are concerned, +-- the fixed registers simply don't exist (for allocation purposes, +-- anyway). + +-- regUsage doesn't need to do any trickery for jumps and such. Just +-- state precisely the regs read and written by that insn. The +-- consequences of control flow transfers, as far as register +-- allocation goes, are taken care of by the register allocator. +-- +sparc_regUsageOfInstr :: Platform -> Instr -> RegUsage +sparc_regUsageOfInstr platform instr + = case instr of + LD _ addr reg -> usage (regAddr addr, [reg]) + ST _ reg addr -> usage (reg : regAddr addr, []) + ADD _ _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + SUB _ _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + UMUL _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + SMUL _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + UDIV _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + SDIV _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + RDY rd -> usage ([], [rd]) + WRY r1 r2 -> usage ([r1, r2], []) + AND _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + ANDN _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + OR _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + ORN _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + XOR _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + XNOR _ r1 ar r2 -> usage (r1 : regRI ar, [r2]) + SLL r1 ar r2 -> usage (r1 : regRI ar, [r2]) + SRL r1 ar r2 -> usage (r1 : regRI ar, [r2]) + SRA r1 ar r2 -> usage (r1 : regRI ar, [r2]) + SETHI _ reg -> usage ([], [reg]) + FABS _ r1 r2 -> usage ([r1], [r2]) + FADD _ r1 r2 r3 -> usage ([r1, r2], [r3]) + FCMP _ _ r1 r2 -> usage ([r1, r2], []) + FDIV _ r1 r2 r3 -> usage ([r1, r2], [r3]) + FMOV _ r1 r2 -> usage ([r1], [r2]) + FMUL _ r1 r2 r3 -> usage ([r1, r2], [r3]) + FNEG _ r1 r2 -> usage ([r1], [r2]) + FSQRT _ r1 r2 -> usage ([r1], [r2]) + FSUB _ r1 r2 r3 -> usage ([r1, r2], [r3]) + FxTOy _ _ r1 r2 -> usage ([r1], [r2]) + + JMP addr -> usage (regAddr addr, []) + JMP_TBL addr _ _ -> usage (regAddr addr, []) + + CALL (Left _ ) _ True -> noUsage + CALL (Left _ ) n False -> usage (argRegs n, callClobberedRegs) + CALL (Right reg) _ True -> usage ([reg], []) + CALL (Right reg) n False -> usage (reg : (argRegs n), callClobberedRegs) + _ -> noUsage + + where + usage (src, dst) + = RU (filter (interesting platform) src) + (filter (interesting platform) dst) + + regAddr (AddrRegReg r1 r2) = [r1, r2] + regAddr (AddrRegImm r1 _) = [r1] + + regRI (RIReg r) = [r] + regRI _ = [] + + +-- | Interesting regs are virtuals, or ones that are allocatable +-- by the register allocator. +interesting :: Platform -> Reg -> Bool +interesting platform reg + = case reg of + RegVirtual _ -> True + RegReal (RealRegSingle r1) -> freeReg platform r1 + RegReal (RealRegPair r1 _) -> freeReg platform r1 + + + +-- | Apply a given mapping to tall the register references in this instruction. 
+sparc_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr +sparc_patchRegsOfInstr instr env = case instr of + LD fmt addr reg -> LD fmt (fixAddr addr) (env reg) + ST fmt reg addr -> ST fmt (env reg) (fixAddr addr) + + ADD x cc r1 ar r2 -> ADD x cc (env r1) (fixRI ar) (env r2) + SUB x cc r1 ar r2 -> SUB x cc (env r1) (fixRI ar) (env r2) + UMUL cc r1 ar r2 -> UMUL cc (env r1) (fixRI ar) (env r2) + SMUL cc r1 ar r2 -> SMUL cc (env r1) (fixRI ar) (env r2) + UDIV cc r1 ar r2 -> UDIV cc (env r1) (fixRI ar) (env r2) + SDIV cc r1 ar r2 -> SDIV cc (env r1) (fixRI ar) (env r2) + RDY rd -> RDY (env rd) + WRY r1 r2 -> WRY (env r1) (env r2) + AND b r1 ar r2 -> AND b (env r1) (fixRI ar) (env r2) + ANDN b r1 ar r2 -> ANDN b (env r1) (fixRI ar) (env r2) + OR b r1 ar r2 -> OR b (env r1) (fixRI ar) (env r2) + ORN b r1 ar r2 -> ORN b (env r1) (fixRI ar) (env r2) + XOR b r1 ar r2 -> XOR b (env r1) (fixRI ar) (env r2) + XNOR b r1 ar r2 -> XNOR b (env r1) (fixRI ar) (env r2) + SLL r1 ar r2 -> SLL (env r1) (fixRI ar) (env r2) + SRL r1 ar r2 -> SRL (env r1) (fixRI ar) (env r2) + SRA r1 ar r2 -> SRA (env r1) (fixRI ar) (env r2) + + SETHI imm reg -> SETHI imm (env reg) + + FABS s r1 r2 -> FABS s (env r1) (env r2) + FADD s r1 r2 r3 -> FADD s (env r1) (env r2) (env r3) + FCMP e s r1 r2 -> FCMP e s (env r1) (env r2) + FDIV s r1 r2 r3 -> FDIV s (env r1) (env r2) (env r3) + FMOV s r1 r2 -> FMOV s (env r1) (env r2) + FMUL s r1 r2 r3 -> FMUL s (env r1) (env r2) (env r3) + FNEG s r1 r2 -> FNEG s (env r1) (env r2) + FSQRT s r1 r2 -> FSQRT s (env r1) (env r2) + FSUB s r1 r2 r3 -> FSUB s (env r1) (env r2) (env r3) + FxTOy s1 s2 r1 r2 -> FxTOy s1 s2 (env r1) (env r2) + + JMP addr -> JMP (fixAddr addr) + JMP_TBL addr ids l -> JMP_TBL (fixAddr addr) ids l + + CALL (Left i) n t -> CALL (Left i) n t + CALL (Right r) n t -> CALL (Right (env r)) n t + _ -> instr + + where + fixAddr (AddrRegReg r1 r2) = AddrRegReg (env r1) (env r2) + fixAddr (AddrRegImm r1 i) = AddrRegImm (env r1) i + + fixRI (RIReg r) = RIReg (env r) + fixRI other = other + + +-------------------------------------------------------------------------------- +sparc_isJumpishInstr :: Instr -> Bool +sparc_isJumpishInstr instr + = case instr of + BI{} -> True + BF{} -> True + JMP{} -> True + JMP_TBL{} -> True + CALL{} -> True + _ -> False + +sparc_jumpDestsOfInstr :: Instr -> [BlockId] +sparc_jumpDestsOfInstr insn + = case insn of + BI _ _ id -> [id] + BF _ _ id -> [id] + JMP_TBL _ ids _ -> [id | Just id <- ids] + _ -> [] + + +sparc_patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr +sparc_patchJumpInstr insn patchF + = case insn of + BI cc annul id -> BI cc annul (patchF id) + BF cc annul id -> BF cc annul (patchF id) + JMP_TBL n ids l -> JMP_TBL n (map (fmap patchF) ids) l + _ -> insn + + +-------------------------------------------------------------------------------- +-- | Make a spill instruction. +-- On SPARC we spill below frame pointer leaving 2 words/spill +sparc_mkSpillInstr + :: DynFlags + -> Reg -- ^ register to spill + -> Int -- ^ current stack delta + -> Int -- ^ spill slot to use + -> Instr + +sparc_mkSpillInstr dflags reg _ slot + = let platform = targetPlatform dflags + off = spillSlotToOffset dflags slot + off_w = 1 + (off `div` 4) + fmt = case targetClassOfReg platform reg of + RcInteger -> II32 + RcFloat -> FF32 + RcDouble -> FF64 + + in ST fmt reg (fpRel (negate off_w)) + + +-- | Make a spill reload instruction. 
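mkSpillInstr above and its counterpart mkLoadInstr below address the same frame-pointer-relative slot, so a spill and its later reload form a matched ST/LD pair. A minimal sketch of that round trip through the Instruction class methods these functions are bound to (the wrapper is hypothetical; a current stack delta of 0 is assumed):

    -- Hypothetical illustration: spill a register to a slot, then reload it.
    spillReload :: DynFlags -> Reg -> Int -> [Instr]
    spillReload dflags reg slot
      = [ mkSpillInstr dflags reg 0 slot    -- ST fmt reg  (fpRel (-off_w))
        , mkLoadInstr  dflags reg 0 slot ]  -- LD fmt (fpRel (-off_w)) reg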
+sparc_mkLoadInstr + :: DynFlags + -> Reg -- ^ register to load into + -> Int -- ^ current stack delta + -> Int -- ^ spill slot to use + -> Instr + +sparc_mkLoadInstr dflags reg _ slot + = let platform = targetPlatform dflags + off = spillSlotToOffset dflags slot + off_w = 1 + (off `div` 4) + fmt = case targetClassOfReg platform reg of + RcInteger -> II32 + RcFloat -> FF32 + RcDouble -> FF64 + + in LD fmt (fpRel (- off_w)) reg + + +-------------------------------------------------------------------------------- +-- | See if this instruction is telling us the current C stack delta +sparc_takeDeltaInstr + :: Instr + -> Maybe Int + +sparc_takeDeltaInstr instr + = case instr of + DELTA i -> Just i + _ -> Nothing + + +sparc_isMetaInstr + :: Instr + -> Bool + +sparc_isMetaInstr instr + = case instr of + COMMENT{} -> True + LDATA{} -> True + NEWBLOCK{} -> True + DELTA{} -> True + _ -> False + + +-- | Make a reg-reg move instruction. +-- On SPARC v8 there are no instructions to move directly between +-- floating point and integer regs. If we need to do that then we +-- have to go via memory. +-- +sparc_mkRegRegMoveInstr + :: Platform + -> Reg + -> Reg + -> Instr + +sparc_mkRegRegMoveInstr platform src dst + | srcClass <- targetClassOfReg platform src + , dstClass <- targetClassOfReg platform dst + , srcClass == dstClass + = case srcClass of + RcInteger -> ADD False False src (RIReg g0) dst + RcDouble -> FMOV FF64 src dst + RcFloat -> FMOV FF32 src dst + + | otherwise + = panic "SPARC.Instr.mkRegRegMoveInstr: classes of src and dest not the same" + + +-- | Check whether an instruction represents a reg-reg move. +-- The register allocator attempts to eliminate reg->reg moves whenever it can, +-- by assigning the src and dest temporaries to the same real register. +-- +sparc_takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg) +sparc_takeRegRegMoveInstr instr + = case instr of + ADD False False src (RIReg src2) dst + | g0 == src2 -> Just (src, dst) + + FMOV FF64 src dst -> Just (src, dst) + FMOV FF32 src dst -> Just (src, dst) + _ -> Nothing + + +-- | Make an unconditional branch instruction. +sparc_mkJumpInstr + :: BlockId + -> [Instr] + +sparc_mkJumpInstr id + = [BI ALWAYS False id + , NOP] -- fill the branch delay slot. 
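The takeRegRegMoveInstr view above exists so the register allocator can coalesce moves: once src and dst land in the same real register the move is dead. A small sketch of that clean-up, assuming the Instruction class method this is bound to and an Eq instance on Reg (the pass name is invented for the example):

    -- Hypothetical post-allocation clean-up: drop moves that became no-ops.
    -- Assumes an Eq instance on Reg for the src == dst comparison.
    elimNopMoves :: [Instr] -> [Instr]
    elimNopMoves = filter (not . isNopMove)
      where
        isNopMove i = case takeRegRegMoveInstr i of
                        Just (src, dst) -> src == dst   -- same register: move is redundant
                        Nothing         -> False

In practice the allocators fold this kind of elimination into their own bookkeeping; the sketch only illustrates what the Maybe view makes possible.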
diff --git a/compiler/GHC/CmmToAsm/SPARC/Ppr.hs b/compiler/GHC/CmmToAsm/SPARC/Ppr.hs new file mode 100644 index 0000000000..2f3ea778f7 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/Ppr.hs @@ -0,0 +1,645 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Pretty-printing assembly language +-- +-- (c) The University of Glasgow 1993-2005 +-- +----------------------------------------------------------------------------- + +{-# OPTIONS_GHC -fno-warn-orphans #-} + +module GHC.CmmToAsm.SPARC.Ppr ( + pprNatCmmDecl, + pprBasicBlock, + pprData, + pprInstr, + pprFormat, + pprImm, + pprDataItem +) + +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.CmmToAsm.SPARC.Regs +import GHC.CmmToAsm.SPARC.Instr +import GHC.CmmToAsm.SPARC.Cond +import GHC.CmmToAsm.SPARC.Imm +import GHC.CmmToAsm.SPARC.AddrMode +import GHC.CmmToAsm.SPARC.Base +import GHC.CmmToAsm.Instr +import GHC.Platform.Reg +import GHC.CmmToAsm.Format +import GHC.CmmToAsm.Ppr + +import GHC.Cmm hiding (topInfoTable) +import GHC.Cmm.Ppr() -- For Outputable instances +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.Dataflow.Collections + +import Unique ( pprUniqueAlways ) +import Outputable +import GHC.Platform +import FastString + +-- ----------------------------------------------------------------------------- +-- Printing this stuff out + +pprNatCmmDecl :: NatCmmDecl RawCmmStatics Instr -> SDoc +pprNatCmmDecl (CmmData section dats) = + pprSectionAlign section $$ pprDatas dats + +pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) = + case topInfoTable proc of + Nothing -> + -- special case for code without info table: + pprSectionAlign (Section Text lbl) $$ + pprLabel lbl $$ -- blocks guaranteed not null, so label needed + vcat (map (pprBasicBlock top_info) blocks) + + Just (RawCmmStatics info_lbl _) -> + sdocWithPlatform $ \platform -> + (if platformHasSubsectionsViaSymbols platform + then pprSectionAlign dspSection $$ + ppr (mkDeadStripPreventer info_lbl) <> char ':' + else empty) $$ + vcat (map (pprBasicBlock top_info) blocks) $$ + -- above: Even the first block gets a label, because with branch-chain + -- elimination, it might be the target of a goto. + (if platformHasSubsectionsViaSymbols platform + then + -- See Note [Subsections Via Symbols] in X86/Ppr.hs + text "\t.long " + <+> ppr info_lbl + <+> char '-' + <+> ppr (mkDeadStripPreventer info_lbl) + else empty) + +dspSection :: Section +dspSection = Section Text $ + panic "subsections-via-symbols doesn't combine with split-sections" + +pprBasicBlock :: LabelMap RawCmmStatics -> NatBasicBlock Instr -> SDoc +pprBasicBlock info_env (BasicBlock blockid instrs) + = maybe_infotable $$ + pprLabel (blockLbl blockid) $$ + vcat (map pprInstr instrs) + where + maybe_infotable = case mapLookup blockid info_env of + Nothing -> empty + Just (RawCmmStatics info_lbl info) -> + pprAlignForSection Text $$ + vcat (map pprData info) $$ + pprLabel info_lbl + + +pprDatas :: RawCmmStatics -> SDoc +-- See note [emit-time elimination of static indirections] in CLabel. 
+pprDatas (RawCmmStatics alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _]) + | lbl == mkIndStaticInfoLabel + , let labelInd (CmmLabelOff l _) = Just l + labelInd (CmmLabel l) = Just l + labelInd _ = Nothing + , Just ind' <- labelInd ind + , alias `mayRedirectTo` ind' + = pprGloblDecl alias + $$ text ".equiv" <+> ppr alias <> comma <> ppr (CmmLabel ind') +pprDatas (RawCmmStatics lbl dats) = vcat (pprLabel lbl : map pprData dats) + +pprData :: CmmStatic -> SDoc +pprData (CmmString str) = pprBytes str +pprData (CmmUninitialised bytes) = text ".skip " <> int bytes +pprData (CmmStaticLit lit) = pprDataItem lit + +pprGloblDecl :: CLabel -> SDoc +pprGloblDecl lbl + | not (externallyVisibleCLabel lbl) = empty + | otherwise = text ".global " <> ppr lbl + +pprTypeAndSizeDecl :: CLabel -> SDoc +pprTypeAndSizeDecl lbl + = sdocWithPlatform $ \platform -> + if platformOS platform == OSLinux && externallyVisibleCLabel lbl + then text ".type " <> ppr lbl <> ptext (sLit ", @object") + else empty + +pprLabel :: CLabel -> SDoc +pprLabel lbl = pprGloblDecl lbl + $$ pprTypeAndSizeDecl lbl + $$ (ppr lbl <> char ':') + +-- ----------------------------------------------------------------------------- +-- pprInstr: print an 'Instr' + +instance Outputable Instr where + ppr instr = pprInstr instr + + +-- | Pretty print a register. +pprReg :: Reg -> SDoc +pprReg reg + = case reg of + RegVirtual vr + -> case vr of + VirtualRegI u -> text "%vI_" <> pprUniqueAlways u + VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u + VirtualRegF u -> text "%vF_" <> pprUniqueAlways u + VirtualRegD u -> text "%vD_" <> pprUniqueAlways u + + + RegReal rr + -> case rr of + RealRegSingle r1 + -> pprReg_ofRegNo r1 + + RealRegPair r1 r2 + -> text "(" <> pprReg_ofRegNo r1 + <> vbar <> pprReg_ofRegNo r2 + <> text ")" + + + +-- | Pretty print a register name, based on this register number. +-- The definition has been unfolded so we get a jump-table in the +-- object code. This function is called quite a lot when emitting +-- the asm file.. +-- +pprReg_ofRegNo :: Int -> SDoc +pprReg_ofRegNo i + = ptext + (case i of { + 0 -> sLit "%g0"; 1 -> sLit "%g1"; + 2 -> sLit "%g2"; 3 -> sLit "%g3"; + 4 -> sLit "%g4"; 5 -> sLit "%g5"; + 6 -> sLit "%g6"; 7 -> sLit "%g7"; + 8 -> sLit "%o0"; 9 -> sLit "%o1"; + 10 -> sLit "%o2"; 11 -> sLit "%o3"; + 12 -> sLit "%o4"; 13 -> sLit "%o5"; + 14 -> sLit "%o6"; 15 -> sLit "%o7"; + 16 -> sLit "%l0"; 17 -> sLit "%l1"; + 18 -> sLit "%l2"; 19 -> sLit "%l3"; + 20 -> sLit "%l4"; 21 -> sLit "%l5"; + 22 -> sLit "%l6"; 23 -> sLit "%l7"; + 24 -> sLit "%i0"; 25 -> sLit "%i1"; + 26 -> sLit "%i2"; 27 -> sLit "%i3"; + 28 -> sLit "%i4"; 29 -> sLit "%i5"; + 30 -> sLit "%i6"; 31 -> sLit "%i7"; + 32 -> sLit "%f0"; 33 -> sLit "%f1"; + 34 -> sLit "%f2"; 35 -> sLit "%f3"; + 36 -> sLit "%f4"; 37 -> sLit "%f5"; + 38 -> sLit "%f6"; 39 -> sLit "%f7"; + 40 -> sLit "%f8"; 41 -> sLit "%f9"; + 42 -> sLit "%f10"; 43 -> sLit "%f11"; + 44 -> sLit "%f12"; 45 -> sLit "%f13"; + 46 -> sLit "%f14"; 47 -> sLit "%f15"; + 48 -> sLit "%f16"; 49 -> sLit "%f17"; + 50 -> sLit "%f18"; 51 -> sLit "%f19"; + 52 -> sLit "%f20"; 53 -> sLit "%f21"; + 54 -> sLit "%f22"; 55 -> sLit "%f23"; + 56 -> sLit "%f24"; 57 -> sLit "%f25"; + 58 -> sLit "%f26"; 59 -> sLit "%f27"; + 60 -> sLit "%f28"; 61 -> sLit "%f29"; + 62 -> sLit "%f30"; 63 -> sLit "%f31"; + _ -> sLit "very naughty sparc register" }) + + +-- | Pretty print a format for an instruction suffix. 
+pprFormat :: Format -> SDoc +pprFormat x + = ptext + (case x of + II8 -> sLit "ub" + II16 -> sLit "uh" + II32 -> sLit "" + II64 -> sLit "d" + FF32 -> sLit "" + FF64 -> sLit "d") + + +-- | Pretty print a format for an instruction suffix. +-- eg LD is 32bit on sparc, but LDD is 64 bit. +pprStFormat :: Format -> SDoc +pprStFormat x + = ptext + (case x of + II8 -> sLit "b" + II16 -> sLit "h" + II32 -> sLit "" + II64 -> sLit "x" + FF32 -> sLit "" + FF64 -> sLit "d") + + + +-- | Pretty print a condition code. +pprCond :: Cond -> SDoc +pprCond c + = ptext + (case c of + ALWAYS -> sLit "" + NEVER -> sLit "n" + GEU -> sLit "geu" + LU -> sLit "lu" + EQQ -> sLit "e" + GTT -> sLit "g" + GE -> sLit "ge" + GU -> sLit "gu" + LTT -> sLit "l" + LE -> sLit "le" + LEU -> sLit "leu" + NE -> sLit "ne" + NEG -> sLit "neg" + POS -> sLit "pos" + VC -> sLit "vc" + VS -> sLit "vs") + + +-- | Pretty print an address mode. +pprAddr :: AddrMode -> SDoc +pprAddr am + = case am of + AddrRegReg r1 (RegReal (RealRegSingle 0)) + -> pprReg r1 + + AddrRegReg r1 r2 + -> hcat [ pprReg r1, char '+', pprReg r2 ] + + AddrRegImm r1 (ImmInt i) + | i == 0 -> pprReg r1 + | not (fits13Bits i) -> largeOffsetError i + | otherwise -> hcat [ pprReg r1, pp_sign, int i ] + where + pp_sign = if i > 0 then char '+' else empty + + AddrRegImm r1 (ImmInteger i) + | i == 0 -> pprReg r1 + | not (fits13Bits i) -> largeOffsetError i + | otherwise -> hcat [ pprReg r1, pp_sign, integer i ] + where + pp_sign = if i > 0 then char '+' else empty + + AddrRegImm r1 imm + -> hcat [ pprReg r1, char '+', pprImm imm ] + + +-- | Pretty print an immediate value. +pprImm :: Imm -> SDoc +pprImm imm + = case imm of + ImmInt i -> int i + ImmInteger i -> integer i + ImmCLbl l -> ppr l + ImmIndex l i -> ppr l <> char '+' <> int i + ImmLit s -> s + + ImmConstantSum a b + -> pprImm a <> char '+' <> pprImm b + + ImmConstantDiff a b + -> pprImm a <> char '-' <> lparen <> pprImm b <> rparen + + LO i + -> hcat [ text "%lo(", pprImm i, rparen ] + + HI i + -> hcat [ text "%hi(", pprImm i, rparen ] + + -- these should have been converted to bytes and placed + -- in the data section. + ImmFloat _ -> text "naughty float immediate" + ImmDouble _ -> text "naughty double immediate" + + +-- | Pretty print a section \/ segment header. +-- On SPARC all the data sections must be at least 8 byte aligned +-- incase we store doubles in them. +-- +pprSectionAlign :: Section -> SDoc +pprSectionAlign sec@(Section seg _) = + sdocWithPlatform $ \platform -> + pprSectionHeader platform sec $$ + pprAlignForSection seg + +-- | Print appropriate alignment for the given section type. +pprAlignForSection :: SectionType -> SDoc +pprAlignForSection seg = + ptext (case seg of + Text -> sLit ".align 4" + Data -> sLit ".align 8" + ReadOnlyData -> sLit ".align 8" + RelocatableReadOnlyData + -> sLit ".align 8" + UninitialisedData -> sLit ".align 8" + ReadOnlyData16 -> sLit ".align 16" + -- TODO: This is copied from the ReadOnlyData case, but it can likely be + -- made more efficient. + CString -> sLit ".align 8" + OtherSection _ -> panic "PprMach.pprSectionHeader: unknown section") + +-- | Pretty print a data item. 
+pprDataItem :: CmmLit -> SDoc +pprDataItem lit + = sdocWithDynFlags $ \dflags -> + vcat (ppr_item (cmmTypeFormat $ cmmLitType dflags lit) lit) + where + imm = litToImm lit + + ppr_item II8 _ = [text "\t.byte\t" <> pprImm imm] + ppr_item II32 _ = [text "\t.long\t" <> pprImm imm] + + ppr_item FF32 (CmmFloat r _) + = let bs = floatToBytes (fromRational r) + in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs + + ppr_item FF64 (CmmFloat r _) + = let bs = doubleToBytes (fromRational r) + in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs + + ppr_item II16 _ = [text "\t.short\t" <> pprImm imm] + ppr_item II64 _ = [text "\t.quad\t" <> pprImm imm] + ppr_item _ _ = panic "SPARC.Ppr.pprDataItem: no match" + + +-- | Pretty print an instruction. +pprInstr :: Instr -> SDoc + +-- nuke comments. +pprInstr (COMMENT _) + = empty + +pprInstr (DELTA d) + = pprInstr (COMMENT (mkFastString ("\tdelta = " ++ show d))) + +-- Newblocks and LData should have been slurped out before producing the .s file. +pprInstr (NEWBLOCK _) + = panic "X86.Ppr.pprInstr: NEWBLOCK" + +pprInstr (LDATA _ _) + = panic "PprMach.pprInstr: LDATA" + +-- 64 bit FP loads are expanded into individual instructions in CodeGen.Expand +pprInstr (LD FF64 _ reg) + | RegReal (RealRegSingle{}) <- reg + = panic "SPARC.Ppr: not emitting potentially misaligned LD FF64 instr" + +pprInstr (LD format addr reg) + = hcat [ + text "\tld", + pprFormat format, + char '\t', + lbrack, + pprAddr addr, + pp_rbracket_comma, + pprReg reg + ] + +-- 64 bit FP stores are expanded into individual instructions in CodeGen.Expand +pprInstr (ST FF64 reg _) + | RegReal (RealRegSingle{}) <- reg + = panic "SPARC.Ppr: not emitting potentially misaligned ST FF64 instr" + +-- no distinction is made between signed and unsigned bytes on stores for the +-- Sparc opcodes (at least I cannot see any, and gas is nagging me --SOF), +-- so we call a special-purpose pprFormat for ST.. 
+pprInstr (ST format reg addr) + = hcat [ + text "\tst", + pprStFormat format, + char '\t', + pprReg reg, + pp_comma_lbracket, + pprAddr addr, + rbrack + ] + + +pprInstr (ADD x cc reg1 ri reg2) + | not x && not cc && riZero ri + = hcat [ text "\tmov\t", pprReg reg1, comma, pprReg reg2 ] + + | otherwise + = pprRegRIReg (if x then sLit "addx" else sLit "add") cc reg1 ri reg2 + + +pprInstr (SUB x cc reg1 ri reg2) + | not x && cc && reg2 == g0 + = hcat [ text "\tcmp\t", pprReg reg1, comma, pprRI ri ] + + | not x && not cc && riZero ri + = hcat [ text "\tmov\t", pprReg reg1, comma, pprReg reg2 ] + + | otherwise + = pprRegRIReg (if x then sLit "subx" else sLit "sub") cc reg1 ri reg2 + +pprInstr (AND b reg1 ri reg2) = pprRegRIReg (sLit "and") b reg1 ri reg2 + +pprInstr (ANDN b reg1 ri reg2) = pprRegRIReg (sLit "andn") b reg1 ri reg2 + +pprInstr (OR b reg1 ri reg2) + | not b && reg1 == g0 + = let doit = hcat [ text "\tmov\t", pprRI ri, comma, pprReg reg2 ] + in case ri of + RIReg rrr | rrr == reg2 -> empty + _ -> doit + + | otherwise + = pprRegRIReg (sLit "or") b reg1 ri reg2 + +pprInstr (ORN b reg1 ri reg2) = pprRegRIReg (sLit "orn") b reg1 ri reg2 + +pprInstr (XOR b reg1 ri reg2) = pprRegRIReg (sLit "xor") b reg1 ri reg2 +pprInstr (XNOR b reg1 ri reg2) = pprRegRIReg (sLit "xnor") b reg1 ri reg2 + +pprInstr (SLL reg1 ri reg2) = pprRegRIReg (sLit "sll") False reg1 ri reg2 +pprInstr (SRL reg1 ri reg2) = pprRegRIReg (sLit "srl") False reg1 ri reg2 +pprInstr (SRA reg1 ri reg2) = pprRegRIReg (sLit "sra") False reg1 ri reg2 + +pprInstr (RDY rd) = text "\trd\t%y," <> pprReg rd +pprInstr (WRY reg1 reg2) + = text "\twr\t" + <> pprReg reg1 + <> char ',' + <> pprReg reg2 + <> char ',' + <> text "%y" + +pprInstr (SMUL b reg1 ri reg2) = pprRegRIReg (sLit "smul") b reg1 ri reg2 +pprInstr (UMUL b reg1 ri reg2) = pprRegRIReg (sLit "umul") b reg1 ri reg2 +pprInstr (SDIV b reg1 ri reg2) = pprRegRIReg (sLit "sdiv") b reg1 ri reg2 +pprInstr (UDIV b reg1 ri reg2) = pprRegRIReg (sLit "udiv") b reg1 ri reg2 + +pprInstr (SETHI imm reg) + = hcat [ + text "\tsethi\t", + pprImm imm, + comma, + pprReg reg + ] + +pprInstr NOP + = text "\tnop" + +pprInstr (FABS format reg1 reg2) + = pprFormatRegReg (sLit "fabs") format reg1 reg2 + +pprInstr (FADD format reg1 reg2 reg3) + = pprFormatRegRegReg (sLit "fadd") format reg1 reg2 reg3 + +pprInstr (FCMP e format reg1 reg2) + = pprFormatRegReg (if e then sLit "fcmpe" else sLit "fcmp") + format reg1 reg2 + +pprInstr (FDIV format reg1 reg2 reg3) + = pprFormatRegRegReg (sLit "fdiv") format reg1 reg2 reg3 + +pprInstr (FMOV format reg1 reg2) + = pprFormatRegReg (sLit "fmov") format reg1 reg2 + +pprInstr (FMUL format reg1 reg2 reg3) + = pprFormatRegRegReg (sLit "fmul") format reg1 reg2 reg3 + +pprInstr (FNEG format reg1 reg2) + = pprFormatRegReg (sLit "fneg") format reg1 reg2 + +pprInstr (FSQRT format reg1 reg2) + = pprFormatRegReg (sLit "fsqrt") format reg1 reg2 + +pprInstr (FSUB format reg1 reg2 reg3) + = pprFormatRegRegReg (sLit "fsub") format reg1 reg2 reg3 + +pprInstr (FxTOy format1 format2 reg1 reg2) + = hcat [ + text "\tf", + ptext + (case format1 of + II32 -> sLit "ito" + FF32 -> sLit "sto" + FF64 -> sLit "dto" + _ -> panic "SPARC.Ppr.pprInstr.FxToY: no match"), + ptext + (case format2 of + II32 -> sLit "i\t" + II64 -> sLit "x\t" + FF32 -> sLit "s\t" + FF64 -> sLit "d\t" + _ -> panic "SPARC.Ppr.pprInstr.FxToY: no match"), + pprReg reg1, comma, pprReg reg2 + ] + + +pprInstr (BI cond b blockid) + = hcat [ + text "\tb", pprCond cond, + if b then pp_comma_a else empty, + char '\t', + ppr 
(blockLbl blockid) + ] + +pprInstr (BF cond b blockid) + = hcat [ + text "\tfb", pprCond cond, + if b then pp_comma_a else empty, + char '\t', + ppr (blockLbl blockid) + ] + +pprInstr (JMP addr) = text "\tjmp\t" <> pprAddr addr +pprInstr (JMP_TBL op _ _) = pprInstr (JMP op) + +pprInstr (CALL (Left imm) n _) + = hcat [ text "\tcall\t", pprImm imm, comma, int n ] + +pprInstr (CALL (Right reg) n _) + = hcat [ text "\tcall\t", pprReg reg, comma, int n ] + + +-- | Pretty print a RI +pprRI :: RI -> SDoc +pprRI (RIReg r) = pprReg r +pprRI (RIImm r) = pprImm r + + +-- | Pretty print a two reg instruction. +pprFormatRegReg :: PtrString -> Format -> Reg -> Reg -> SDoc +pprFormatRegReg name format reg1 reg2 + = hcat [ + char '\t', + ptext name, + (case format of + FF32 -> text "s\t" + FF64 -> text "d\t" + _ -> panic "SPARC.Ppr.pprFormatRegReg: no match"), + + pprReg reg1, + comma, + pprReg reg2 + ] + + +-- | Pretty print a three reg instruction. +pprFormatRegRegReg :: PtrString -> Format -> Reg -> Reg -> Reg -> SDoc +pprFormatRegRegReg name format reg1 reg2 reg3 + = hcat [ + char '\t', + ptext name, + (case format of + FF32 -> text "s\t" + FF64 -> text "d\t" + _ -> panic "SPARC.Ppr.pprFormatRegReg: no match"), + pprReg reg1, + comma, + pprReg reg2, + comma, + pprReg reg3 + ] + + +-- | Pretty print an instruction of two regs and a ri. +pprRegRIReg :: PtrString -> Bool -> Reg -> RI -> Reg -> SDoc +pprRegRIReg name b reg1 ri reg2 + = hcat [ + char '\t', + ptext name, + if b then text "cc\t" else char '\t', + pprReg reg1, + comma, + pprRI ri, + comma, + pprReg reg2 + ] + +{- +pprRIReg :: PtrString -> Bool -> RI -> Reg -> SDoc +pprRIReg name b ri reg1 + = hcat [ + char '\t', + ptext name, + if b then text "cc\t" else char '\t', + pprRI ri, + comma, + pprReg reg1 + ] +-} + +{- +pp_ld_lbracket :: SDoc +pp_ld_lbracket = text "\tld\t[" +-} + +pp_rbracket_comma :: SDoc +pp_rbracket_comma = text "]," + + +pp_comma_lbracket :: SDoc +pp_comma_lbracket = text ",[" + + +pp_comma_a :: SDoc +pp_comma_a = text ",a" diff --git a/compiler/GHC/CmmToAsm/SPARC/Regs.hs b/compiler/GHC/CmmToAsm/SPARC/Regs.hs new file mode 100644 index 0000000000..ba22470912 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/Regs.hs @@ -0,0 +1,259 @@ +-- ----------------------------------------------------------------------------- +-- +-- (c) The University of Glasgow 1994-2004 +-- +-- ----------------------------------------------------------------------------- + +module GHC.CmmToAsm.SPARC.Regs ( + -- registers + showReg, + virtualRegSqueeze, + realRegSqueeze, + classOfRealReg, + allRealRegs, + + -- machine specific info + gReg, iReg, lReg, oReg, fReg, + fp, sp, g0, g1, g2, o0, o1, f0, f1, f6, f8, f22, f26, f27, + + -- allocatable + allocatableRegs, + + -- args + argRegs, + allArgRegs, + callClobberedRegs, + + -- + mkVirtualReg, + regDotColor +) + +where + + +import GhcPrelude + +import GHC.Platform.SPARC +import GHC.Platform.Reg +import GHC.Platform.Reg.Class +import GHC.CmmToAsm.Format + +import Unique +import Outputable + +{- + The SPARC has 64 registers of interest; 32 integer registers and 32 + floating point registers. The mapping of STG registers to SPARC + machine registers is defined in StgRegs.h. We are, of course, + prepared for any eventuality. + + The whole fp-register pairing thing on sparcs is a huge nuisance. See + includes/stg/MachRegs.h for a description of what's going on + here. +-} + + +-- | Get the standard name for the register with this number. 
+showReg :: RegNo -> String +showReg n + | n >= 0 && n < 8 = "%g" ++ show n + | n >= 8 && n < 16 = "%o" ++ show (n-8) + | n >= 16 && n < 24 = "%l" ++ show (n-16) + | n >= 24 && n < 32 = "%i" ++ show (n-24) + | n >= 32 && n < 64 = "%f" ++ show (n-32) + | otherwise = panic "SPARC.Regs.showReg: unknown sparc register" + + +-- Get the register class of a certain real reg +classOfRealReg :: RealReg -> RegClass +classOfRealReg reg + = case reg of + RealRegSingle i + | i < 32 -> RcInteger + | otherwise -> RcFloat + + RealRegPair{} -> RcDouble + + +-- | regSqueeze_class reg +-- Calculate the maximum number of register colors that could be +-- denied to a node of this class due to having this reg +-- as a neighbour. +-- +{-# INLINE virtualRegSqueeze #-} +virtualRegSqueeze :: RegClass -> VirtualReg -> Int + +virtualRegSqueeze cls vr + = case cls of + RcInteger + -> case vr of + VirtualRegI{} -> 1 + VirtualRegHi{} -> 1 + _other -> 0 + + RcFloat + -> case vr of + VirtualRegF{} -> 1 + VirtualRegD{} -> 2 + _other -> 0 + + RcDouble + -> case vr of + VirtualRegF{} -> 1 + VirtualRegD{} -> 1 + _other -> 0 + + +{-# INLINE realRegSqueeze #-} +realRegSqueeze :: RegClass -> RealReg -> Int + +realRegSqueeze cls rr + = case cls of + RcInteger + -> case rr of + RealRegSingle regNo + | regNo < 32 -> 1 + | otherwise -> 0 + + RealRegPair{} -> 0 + + RcFloat + -> case rr of + RealRegSingle regNo + | regNo < 32 -> 0 + | otherwise -> 1 + + RealRegPair{} -> 2 + + RcDouble + -> case rr of + RealRegSingle regNo + | regNo < 32 -> 0 + | otherwise -> 1 + + RealRegPair{} -> 1 + + +-- | All the allocatable registers in the machine, +-- including register pairs. +allRealRegs :: [RealReg] +allRealRegs + = [ (RealRegSingle i) | i <- [0..63] ] + ++ [ (RealRegPair i (i+1)) | i <- [32, 34 .. 62 ] ] + + +-- | Get the regno for this sort of reg +gReg, lReg, iReg, oReg, fReg :: Int -> RegNo + +gReg x = x -- global regs +oReg x = (8 + x) -- output regs +lReg x = (16 + x) -- local regs +iReg x = (24 + x) -- input regs +fReg x = (32 + x) -- float regs + + +-- | Some specific regs used by the code generator. +g0, g1, g2, fp, sp, o0, o1, f0, f1, f6, f8, f22, f26, f27 :: Reg + +f6 = RegReal (RealRegSingle (fReg 6)) +f8 = RegReal (RealRegSingle (fReg 8)) +f22 = RegReal (RealRegSingle (fReg 22)) +f26 = RegReal (RealRegSingle (fReg 26)) +f27 = RegReal (RealRegSingle (fReg 27)) + +-- g0 is always zero, and writes to it vanish. +g0 = RegReal (RealRegSingle (gReg 0)) +g1 = RegReal (RealRegSingle (gReg 1)) +g2 = RegReal (RealRegSingle (gReg 2)) + +-- FP, SP, int and float return (from C) regs. +fp = RegReal (RealRegSingle (iReg 6)) +sp = RegReal (RealRegSingle (oReg 6)) +o0 = RegReal (RealRegSingle (oReg 0)) +o1 = RegReal (RealRegSingle (oReg 1)) +f0 = RegReal (RealRegSingle (fReg 0)) +f1 = RegReal (RealRegSingle (fReg 1)) + +-- | Produce the second-half-of-a-double register given the first half. +{- +fPair :: Reg -> Maybe Reg +fPair (RealReg n) + | n >= 32 && n `mod` 2 == 0 = Just (RealReg (n+1)) + +fPair (VirtualRegD u) + = Just (VirtualRegHi u) + +fPair reg + = trace ("MachInstrs.fPair: can't get high half of supposed double reg " ++ showPpr reg) + Nothing +-} + + +-- | All the regs that the register allocator can allocate to, +-- with the fixed use regs removed. 
+-- +allocatableRegs :: [RealReg] +allocatableRegs + = let isFree rr + = case rr of + RealRegSingle r -> freeReg r + RealRegPair r1 r2 -> freeReg r1 && freeReg r2 + in filter isFree allRealRegs + + +-- | The registers to place arguments for function calls, +-- for some number of arguments. +-- +argRegs :: RegNo -> [Reg] +argRegs r + = case r of + 0 -> [] + 1 -> map (RegReal . RealRegSingle . oReg) [0] + 2 -> map (RegReal . RealRegSingle . oReg) [0,1] + 3 -> map (RegReal . RealRegSingle . oReg) [0,1,2] + 4 -> map (RegReal . RealRegSingle . oReg) [0,1,2,3] + 5 -> map (RegReal . RealRegSingle . oReg) [0,1,2,3,4] + 6 -> map (RegReal . RealRegSingle . oReg) [0,1,2,3,4,5] + _ -> panic "MachRegs.argRegs(sparc): don't know about >6 arguments!" + + +-- | All all the regs that could possibly be returned by argRegs +-- +allArgRegs :: [Reg] +allArgRegs + = map (RegReal . RealRegSingle) [oReg i | i <- [0..5]] + + +-- These are the regs that we cannot assume stay alive over a C call. +-- TODO: Why can we assume that o6 isn't clobbered? -- BL 2009/02 +-- +callClobberedRegs :: [Reg] +callClobberedRegs + = map (RegReal . RealRegSingle) + ( oReg 7 : + [oReg i | i <- [0..5]] ++ + [gReg i | i <- [1..7]] ++ + [fReg i | i <- [0..31]] ) + + + +-- | Make a virtual reg with this format. +mkVirtualReg :: Unique -> Format -> VirtualReg +mkVirtualReg u format + | not (isFloatFormat format) + = VirtualRegI u + + | otherwise + = case format of + FF32 -> VirtualRegF u + FF64 -> VirtualRegD u + _ -> panic "mkVReg" + + +regDotColor :: RealReg -> SDoc +regDotColor reg + = case classOfRealReg reg of + RcInteger -> text "blue" + RcFloat -> text "red" + _other -> text "green" diff --git a/compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs b/compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs new file mode 100644 index 0000000000..2d1f77d737 --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs @@ -0,0 +1,74 @@ +module GHC.CmmToAsm.SPARC.ShortcutJump ( + JumpDest(..), getJumpDestBlockId, + canShortcut, + shortcutJump, + shortcutStatics, + shortBlockId +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.SPARC.Instr +import GHC.CmmToAsm.SPARC.Imm + +import GHC.Cmm.CLabel +import GHC.Cmm.BlockId +import GHC.Cmm + +import Panic +import Outputable + +data JumpDest + = DestBlockId BlockId + | DestImm Imm + +-- Debug Instance +instance Outputable JumpDest where + ppr (DestBlockId bid) = text "blk:" <> ppr bid + ppr (DestImm _bid) = text "imm:?" + +getJumpDestBlockId :: JumpDest -> Maybe BlockId +getJumpDestBlockId (DestBlockId bid) = Just bid +getJumpDestBlockId _ = Nothing + + +canShortcut :: Instr -> Maybe JumpDest +canShortcut _ = Nothing + + +shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr +shortcutJump _ other = other + + + +shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics +shortcutStatics fn (RawCmmStatics lbl statics) + = RawCmmStatics lbl $ map (shortcutStatic fn) statics + -- we need to get the jump tables, so apply the mapping to the entries + -- of a CmmData too. 
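Because jump tables are emitted as CmmData whose entries are block labels, the shortcut mapping has to be applied to those statics as well as to branch instructions; that is what shortcutStatics (together with shortcutStatic and shortBlockId below) does. A minimal sketch of redirecting table entries, assuming compilation inside the GHC tree against this patch; redirectTable and the two block ids are made up for illustration:

import GHC.Cmm                         (RawCmmStatics)
import GHC.Cmm.BlockId                 (BlockId)
import GHC.CmmToAsm.SPARC.ShortcutJump (JumpDest (..), shortcutStatics)

-- Rewrite every jump-table entry that points at 'from' so that it points at
-- 'to' instead; entries for any other block are left untouched.
redirectTable :: BlockId -> BlockId -> RawCmmStatics -> RawCmmStatics
redirectTable from to = shortcutStatics redirect
  where
    redirect bid
      | bid == from = Just (DestBlockId to)
      | otherwise   = Nothing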
+ +shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel +shortcutLabel fn lab + | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn blkId + | otherwise = lab + +shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic +shortcutStatic fn (CmmStaticLit (CmmLabel lab)) + = CmmStaticLit (CmmLabel (shortcutLabel fn lab)) +shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w)) + = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w) +-- slightly dodgy, we're ignoring the second label, but this +-- works with the way we use CmmLabelDiffOff for jump tables now. +shortcutStatic _ other_static + = other_static + + +shortBlockId :: (BlockId -> Maybe JumpDest) -> BlockId -> CLabel +shortBlockId fn blockid = + case fn blockid of + Nothing -> blockLbl blockid + Just (DestBlockId blockid') -> shortBlockId fn blockid' + Just (DestImm (ImmCLbl lbl)) -> lbl + _other -> panic "shortBlockId" diff --git a/compiler/GHC/CmmToAsm/SPARC/Stack.hs b/compiler/GHC/CmmToAsm/SPARC/Stack.hs new file mode 100644 index 0000000000..d9a0ffd7cf --- /dev/null +++ b/compiler/GHC/CmmToAsm/SPARC/Stack.hs @@ -0,0 +1,59 @@ +module GHC.CmmToAsm.SPARC.Stack ( + spRel, + fpRel, + spillSlotToOffset, + maxSpillSlots +) + +where + +import GhcPrelude + +import GHC.CmmToAsm.SPARC.AddrMode +import GHC.CmmToAsm.SPARC.Regs +import GHC.CmmToAsm.SPARC.Base +import GHC.CmmToAsm.SPARC.Imm + +import GHC.Driver.Session +import Outputable + +-- | Get an AddrMode relative to the address in sp. +-- This gives us a stack relative addressing mode for volatile +-- temporaries and for excess call arguments. +-- +spRel :: Int -- ^ stack offset in words, positive or negative + -> AddrMode + +spRel n = AddrRegImm sp (ImmInt (n * wordLength)) + + +-- | Get an address relative to the frame pointer. +-- This doesn't work work for offsets greater than 13 bits; we just hope for the best +-- +fpRel :: Int -> AddrMode +fpRel n + = AddrRegImm fp (ImmInt (n * wordLength)) + + +-- | Convert a spill slot number to a *byte* offset, with no sign. +-- +spillSlotToOffset :: DynFlags -> Int -> Int +spillSlotToOffset dflags slot + | slot >= 0 && slot < maxSpillSlots dflags + = 64 + spillSlotSize * slot + + | otherwise + = pprPanic "spillSlotToOffset:" + ( text "invalid spill location: " <> int slot + $$ text "maxSpillSlots: " <> int (maxSpillSlots dflags)) + + +-- | The maximum number of spill slots available on the C stack. +-- If we use up all of the slots, then we're screwed. +-- +-- Why do we reserve 64 bytes, instead of using the whole thing?? +-- -- BL 2009/02/15 +-- +maxSpillSlots :: DynFlags -> Int +maxSpillSlots dflags + = ((spillAreaLength dflags - 64) `div` spillSlotSize) - 1 diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs new file mode 100644 index 0000000000..f3b20c19e1 --- /dev/null +++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs @@ -0,0 +1,3747 @@ +{-# LANGUAGE CPP, GADTs, NondecreasingIndentation #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE BangPatterns #-} + +#if __GLASGOW_HASKELL__ <= 808 +-- GHC 8.10 deprecates this flag, but GHC 8.8 needs it +-- The default iteration limit is a bit too low for the definitions +-- in this module. 
+{-# OPTIONS_GHC -fmax-pmcheck-iterations=10000000 #-} +#endif + +{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} + +----------------------------------------------------------------------------- +-- +-- Generating machine code (instruction selection) +-- +-- (c) The University of Glasgow 1996-2004 +-- +----------------------------------------------------------------------------- + +-- This is a big module, but, if you pay attention to +-- (a) the sectioning, and (b) the type signatures, the +-- structure should not be too overwhelming. + +module GHC.CmmToAsm.X86.CodeGen ( + cmmTopCodeGen, + generateJumpTableForInstr, + extractUnwindPoints, + invertCondBranches, + InstrBlock +) + +where + +#include "HsVersions.h" + +-- NCG stuff: +import GhcPrelude + +import GHC.CmmToAsm.X86.Instr +import GHC.CmmToAsm.X86.Cond +import GHC.CmmToAsm.X86.Regs +import GHC.CmmToAsm.X86.Ppr ( ) +import GHC.CmmToAsm.X86.RegInfo + +import GHC.Platform.Regs +import GHC.CmmToAsm.CPrim +import GHC.Cmm.DebugBlock + ( DebugBlock(..), UnwindPoint(..), UnwindTable + , UnwindExpr(UwReg), toUnwindExpr + ) +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.PIC +import GHC.CmmToAsm.Monad + ( NatM, getNewRegNat, getNewLabelNat, setDeltaNat + , getDeltaNat, getBlockIdNat, getPicBaseNat, getNewRegPairNat + , getPicBaseMaybeNat, getDebugBlock, getFileId + , addImmediateSuccessorNat, updateCfgNat + ) +import GHC.CmmToAsm.CFG +import GHC.CmmToAsm.Format +import GHC.Platform.Reg +import GHC.Platform + +-- Our intermediate code: +import BasicTypes +import GHC.Cmm.BlockId +import Module ( primUnitId ) +import GHC.Cmm.Utils +import GHC.Cmm.Switch +import GHC.Cmm +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.CLabel +import CoreSyn ( Tickish(..) ) +import SrcLoc ( srcSpanFile, srcSpanStartLine, srcSpanStartCol ) + +-- The rest: +import ForeignCall ( CCallConv(..) ) +import OrdList +import Outputable +import FastString +import GHC.Driver.Session +import Util +import UniqSupply ( getUniqueM ) + +import Control.Monad +import Data.Bits +import Data.Foldable (fold) +import Data.Int +import Data.Maybe +import Data.Word + +import qualified Data.Map as M + +is32BitPlatform :: NatM Bool +is32BitPlatform = do + dflags <- getDynFlags + return $ target32Bit (targetPlatform dflags) + +sse2Enabled :: NatM Bool +sse2Enabled = do + dflags <- getDynFlags + case platformArch (targetPlatform dflags) of + -- We Assume SSE1 and SSE2 operations are available on both + -- x86 and x86_64. Historically we didn't default to SSE2 and + -- SSE1 on x86, which results in defacto nondeterminism for how + -- rounding behaves in the associated x87 floating point instructions + -- because variations in the spill/fpu stack placement of arguments for + -- operations would change the precision and final result of what + -- would otherwise be the same expressions with respect to single or + -- double precision IEEE floating point computations. 
+ ArchX86_64 -> return True + ArchX86 -> return True + _ -> panic "trying to generate x86/x86_64 on the wrong platform" + + +sse4_2Enabled :: NatM Bool +sse4_2Enabled = do + dflags <- getDynFlags + return (isSse4_2Enabled dflags) + + +cmmTopCodeGen + :: RawCmmDecl + -> NatM [NatCmmDecl (Alignment, RawCmmStatics) Instr] + +cmmTopCodeGen (CmmProc info lab live graph) = do + let blocks = toBlockListEntryFirst graph + (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks + picBaseMb <- getPicBaseMaybeNat + dflags <- getDynFlags + let proc = CmmProc info lab live (ListGraph $ concat nat_blocks) + tops = proc : concat statics + os = platformOS $ targetPlatform dflags + + case picBaseMb of + Just picBase -> initializePicBase_x86 ArchX86 os picBase tops + Nothing -> return tops + +cmmTopCodeGen (CmmData sec dat) = do + return [CmmData sec (mkAlignment 1, dat)] -- no translation, we just use CmmStatic + +{- Note [Verifying basic blocks] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + We want to guarantee a few things about the results + of instruction selection. + + Namely that each basic blocks consists of: + * A (potentially empty) sequence of straight line instructions + followed by + * A (potentially empty) sequence of jump like instructions. + + We can verify this by going through the instructions and + making sure that any non-jumpish instruction can't appear + after a jumpish instruction. + + There are gotchas however: + * CALLs are strictly speaking control flow but here we care + not about them. Hence we treat them as regular instructions. + + It's safe for them to appear inside a basic block + as (ignoring side effects inside the call) they will result in + straight line code. + + * NEWBLOCK marks the start of a new basic block so can + be followed by any instructions. +-} + +-- Verifying basic blocks is cheap, but not cheap enough to enable it unconditionally. +verifyBasicBlock :: [Instr] -> () +verifyBasicBlock instrs + | debugIsOn = go False instrs + | otherwise = () + where + go _ [] = () + go atEnd (i:instr) + = case i of + -- Start a new basic block + NEWBLOCK {} -> go False instr + -- Calls are not viable block terminators + CALL {} | atEnd -> faultyBlockWith i + | not atEnd -> go atEnd instr + -- All instructions ok, check if we reached the end and continue. + _ | not atEnd -> go (isJumpishInstr i) instr + -- Only jumps allowed at the end of basic blocks. + | otherwise -> if isJumpishInstr i + then go True instr + else faultyBlockWith i + faultyBlockWith i + = pprPanic "Non control flow instructions after end of basic block." + (ppr i <+> text "in:" $$ vcat (map ppr instrs)) + +basicBlockCodeGen + :: CmmBlock + -> NatM ( [NatBasicBlock Instr] + , [NatCmmDecl (Alignment, RawCmmStatics) Instr]) + +basicBlockCodeGen block = do + let (_, nodes, tail) = blockSplit block + id = entryLabel block + stmts = blockToList nodes + -- Generate location directive + dbg <- getDebugBlock (entryLabel block) + loc_instrs <- case dblSourceTick =<< dbg of + Just (SourceNote span name) + -> do fileId <- getFileId (srcSpanFile span) + let line = srcSpanStartLine span; col = srcSpanStartCol span + return $ unitOL $ LOCATION fileId line col name + _ -> return nilOL + (mid_instrs,mid_bid) <- stmtsToInstrs id stmts + (!tail_instrs,_) <- stmtToInstrs mid_bid tail + let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs + return $! 
verifyBasicBlock (fromOL instrs) + instrs' <- fold <$> traverse addSpUnwindings instrs + -- code generation may introduce new basic block boundaries, which + -- are indicated by the NEWBLOCK instruction. We must split up the + -- instruction stream into basic blocks again. Also, we extract + -- LDATAs here too. + let + (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs' + + mkBlocks (NEWBLOCK id) (instrs,blocks,statics) + = ([], BasicBlock id instrs : blocks, statics) + mkBlocks (LDATA sec dat) (instrs,blocks,statics) + = (instrs, blocks, CmmData sec dat:statics) + mkBlocks instr (instrs,blocks,statics) + = (instr:instrs, blocks, statics) + return (BasicBlock id top : other_blocks, statics) + +-- | Convert 'DELTA' instructions into 'UNWIND' instructions to capture changes +-- in the @sp@ register. See Note [What is this unwinding business?] in Debug +-- for details. +addSpUnwindings :: Instr -> NatM (OrdList Instr) +addSpUnwindings instr@(DELTA d) = do + dflags <- getDynFlags + if debugLevel dflags >= 1 + then do lbl <- mkAsmTempLabel <$> getUniqueM + let unwind = M.singleton MachSp (Just $ UwReg MachSp $ negate d) + return $ toOL [ instr, UNWIND lbl unwind ] + else return (unitOL instr) +addSpUnwindings instr = return $ unitOL instr + +{- Note [Keeping track of the current block] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When generating instructions for Cmm we sometimes require +the current block for things like retry loops. + +We also sometimes change the current block, if a MachOP +results in branching control flow. + +Issues arise if we have two statements in the same block, +which both depend on the current block id *and* change the +basic block after them. This happens for atomic primops +in the X86 backend where we want to update the CFG data structure +when introducing new basic blocks. + +For example in #17334 we got this Cmm code: + + c3Bf: // global + (_s3t1::I64) = call MO_AtomicRMW W64 AMO_And(_s3sQ::P64 + 88, 18); + (_s3t4::I64) = call MO_AtomicRMW W64 AMO_Or(_s3sQ::P64 + 88, 0); + _s3sT::I64 = _s3sV::I64; + goto c3B1; + +This resulted in two new basic blocks being inserted: + + c3Bf: + movl $18,%vI_n3Bo + movq 88(%vI_s3sQ),%rax + jmp _n3Bp + n3Bp: + ... + cmpxchgq %vI_n3Bq,88(%vI_s3sQ) + jne _n3Bp + ... + jmp _n3Bs + n3Bs: + ... + cmpxchgq %vI_n3Bt,88(%vI_s3sQ) + jne _n3Bs + ... + jmp _c3B1 + ... + +Based on the Cmm we called stmtToInstrs we translated both atomic operations under +the assumption they would be placed into their Cmm basic block `c3Bf`. +However for the retry loop we introduce new labels, so this is not the case +for the second statement. +This resulted in a desync between the explicit control flow graph +we construct as a separate data type and the actual control flow graph in the code. + +Instead we now return the new basic block if a statement causes a change +in the current block and use the block for all following statements. + +For this reason genCCall is also split into two parts. +One for calls which *won't* change the basic blocks in +which successive instructions will be placed. +A different one for calls which *are* known to change the +basic block. + +-} + +-- See Note [Keeping track of the current block] for why +-- we pass the BlockId. +stmtsToInstrs :: BlockId -- ^ Basic block these statement will start to be placed in. 
+ -> [CmmNode O O] -- ^ Cmm Statement + -> NatM (InstrBlock, BlockId) -- ^ Resulting instruction +stmtsToInstrs bid stmts = + go bid stmts nilOL + where + go bid [] instrs = return (instrs,bid) + go bid (s:stmts) instrs = do + (instrs',bid') <- stmtToInstrs bid s + -- If the statement introduced a new block, we use that one + let !newBid = fromMaybe bid bid' + go newBid stmts (instrs `appOL` instrs') + +-- | `bid` refers to the current block and is used to update the CFG +-- if new blocks are inserted in the control flow. +-- See Note [Keeping track of the current block] for more details. +stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in. + -> CmmNode e x + -> NatM (InstrBlock, Maybe BlockId) + -- ^ Instructions, and bid of new block if successive + -- statements are placed in a different basic block. +stmtToInstrs bid stmt = do + dflags <- getDynFlags + is32Bit <- is32BitPlatform + case stmt of + CmmUnsafeForeignCall target result_regs args + -> genCCall dflags is32Bit target result_regs args bid + + _ -> (,Nothing) <$> case stmt of + CmmComment s -> return (unitOL (COMMENT s)) + CmmTick {} -> return nilOL + + CmmUnwind regs -> do + let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable + to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr) + case foldMap to_unwind_entry regs of + tbl | M.null tbl -> return nilOL + | otherwise -> do + lbl <- mkAsmTempLabel <$> getUniqueM + return $ unitOL $ UNWIND lbl tbl + + CmmAssign reg src + | isFloatType ty -> assignReg_FltCode format reg src + | is32Bit && isWord64 ty -> assignReg_I64Code reg src + | otherwise -> assignReg_IntCode format reg src + where ty = cmmRegType dflags reg + format = cmmTypeFormat ty + + CmmStore addr src + | isFloatType ty -> assignMem_FltCode format addr src + | is32Bit && isWord64 ty -> assignMem_I64Code addr src + | otherwise -> assignMem_IntCode format addr src + where ty = cmmExprType dflags src + format = cmmTypeFormat ty + + CmmBranch id -> return $ genBranch id + + --We try to arrange blocks such that the likely branch is the fallthrough + --in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here. + CmmCondBranch arg true false _ -> genCondBranch bid true false arg + CmmSwitch arg ids -> do dflags <- getDynFlags + genSwitch dflags arg ids + CmmCall { cml_target = arg + , cml_args_regs = gregs } -> do + dflags <- getDynFlags + genJump arg (jumpRegs dflags gregs) + _ -> + panic "stmtToInstrs: statement should have been cps'd away" + + +jumpRegs :: DynFlags -> [GlobalReg] -> [Reg] +jumpRegs dflags gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ] + where platform = targetPlatform dflags + +-------------------------------------------------------------------------------- +-- | 'InstrBlock's are the insn sequences generated by the insn selectors. +-- They are really trees of insns to facilitate fast appending, where a +-- left-to-right traversal yields the insns in the correct order. +-- +type InstrBlock + = OrdList Instr + + +-- | Condition codes passed up the tree. +-- +data CondCode + = CondCode Bool Cond InstrBlock + + +-- | a.k.a "Register64" +-- Reg is the lower 32-bit temporary which contains the result. +-- Use getHiVRegFromLo to find the other VRegUnique. +-- +-- Rules of this simplified insn selection game are therefore that +-- the returned Reg may be modified +-- +data ChildCode64 + = ChildCode64 + InstrBlock + Reg + + +-- | Register's passed up the tree. 
If the stix code forces the register +-- to live in a pre-decided machine register, it comes out as @Fixed@; +-- otherwise, it comes out as @Any@, and the parent can decide which +-- register to put it in. +-- +data Register + = Fixed Format Reg InstrBlock + | Any Format (Reg -> InstrBlock) + + +swizzleRegisterRep :: Register -> Format -> Register +swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code +swizzleRegisterRep (Any _ codefn) format = Any format codefn + + +-- | Grab the Reg for a CmmReg +getRegisterReg :: Platform -> CmmReg -> Reg + +getRegisterReg _ (CmmLocal (LocalReg u pk)) + = -- by Assuming SSE2, Int,Word,Float,Double all can be register allocated + let fmt = cmmTypeFormat pk in + RegVirtual (mkVirtualReg u fmt) + +getRegisterReg platform (CmmGlobal mid) + = case globalRegMaybe platform mid of + Just reg -> RegReal $ reg + Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid) + -- By this stage, the only MagicIds remaining should be the + -- ones which map to a real machine register on this + -- platform. Hence ... + + +-- | Memory addressing modes passed up the tree. +data Amode + = Amode AddrMode InstrBlock + +{- +Now, given a tree (the argument to a CmmLoad) that references memory, +produce a suitable addressing mode. + +A Rule of the Game (tm) for Amodes: use of the addr bit must +immediately follow use of the code part, since the code part puts +values in registers which the addr then refers to. So you can't put +anything in between, lest it overwrite some of those registers. If +you need to do some other computation between the code part and use of +the addr bit, first store the effective address from the amode in a +temporary, then do the other computation, and then use the temporary: + + code + LEA amode, tmp + ... other computation ... + ... (tmp) ... +-} + + +-- | Check whether an integer will fit in 32 bits. +-- A CmmInt is intended to be truncated to the appropriate +-- number of bits, so here we truncate it to Int64. This is +-- important because e.g. -1 as a CmmInt might be either +-- -1 or 18446744073709551615. +-- +is32BitInteger :: Integer -> Bool +is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000 + where i64 = fromIntegral i :: Int64 + + +-- | Convert a BlockId to some CmmStatic data +jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic +jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags)) +jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel) + where blockLabel = blockLbl blockid + + +-- ----------------------------------------------------------------------------- +-- General things for putting together code sequences + +-- Expand CmmRegOff. ToDo: should we do it this way around, or convert +-- CmmExprs into CmmRegOff? +mangleIndexTree :: DynFlags -> CmmReg -> Int -> CmmExpr +mangleIndexTree dflags reg off + = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)] + where width = typeWidth (cmmRegType dflags reg) + +-- | The dual to getAnyReg: compute an expression into a register, but +-- we don't mind which one it is. 
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock) +getSomeReg expr = do + r <- getRegister expr + case r of + Any rep code -> do + tmp <- getNewRegNat rep + return (tmp, code tmp) + Fixed _ reg code -> + return (reg, code) + + +assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock +assignMem_I64Code addrTree valueTree = do + Amode addr addr_code <- getAmode addrTree + ChildCode64 vcode rlo <- iselExpr64 valueTree + let + rhi = getHiVRegFromLo rlo + + -- Little-endian store + mov_lo = MOV II32 (OpReg rlo) (OpAddr addr) + mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4))) + return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi) + + +assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock +assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do + ChildCode64 vcode r_src_lo <- iselExpr64 valueTree + let + r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32 + r_dst_hi = getHiVRegFromLo r_dst_lo + r_src_hi = getHiVRegFromLo r_src_lo + mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo) + mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi) + return ( + vcode `snocOL` mov_lo `snocOL` mov_hi + ) + +assignReg_I64Code _ _ + = panic "assignReg_I64Code(i386): invalid lvalue" + + +iselExpr64 :: CmmExpr -> NatM ChildCode64 +iselExpr64 (CmmLit (CmmInt i _)) = do + (rlo,rhi) <- getNewRegPairNat II32 + let + r = fromIntegral (fromIntegral i :: Word32) + q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32) + code = toOL [ + MOV II32 (OpImm (ImmInteger r)) (OpReg rlo), + MOV II32 (OpImm (ImmInteger q)) (OpReg rhi) + ] + return (ChildCode64 code rlo) + +iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do + Amode addr addr_code <- getAmode addrTree + (rlo,rhi) <- getNewRegPairNat II32 + let + mov_lo = MOV II32 (OpAddr addr) (OpReg rlo) + mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi) + return ( + ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi) + rlo + ) + +iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty + = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32)) + +-- we handle addition, but rather badly +iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do + ChildCode64 code1 r1lo <- iselExpr64 e1 + (rlo,rhi) <- getNewRegPairNat II32 + let + r = fromIntegral (fromIntegral i :: Word32) + q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32) + r1hi = getHiVRegFromLo r1lo + code = code1 `appOL` + toOL [ MOV II32 (OpReg r1lo) (OpReg rlo), + ADD II32 (OpImm (ImmInteger r)) (OpReg rlo), + MOV II32 (OpReg r1hi) (OpReg rhi), + ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ] + return (ChildCode64 code rlo) + +iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do + ChildCode64 code1 r1lo <- iselExpr64 e1 + ChildCode64 code2 r2lo <- iselExpr64 e2 + (rlo,rhi) <- getNewRegPairNat II32 + let + r1hi = getHiVRegFromLo r1lo + r2hi = getHiVRegFromLo r2lo + code = code1 `appOL` + code2 `appOL` + toOL [ MOV II32 (OpReg r1lo) (OpReg rlo), + ADD II32 (OpReg r2lo) (OpReg rlo), + MOV II32 (OpReg r1hi) (OpReg rhi), + ADC II32 (OpReg r2hi) (OpReg rhi) ] + return (ChildCode64 code rlo) + +iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do + ChildCode64 code1 r1lo <- iselExpr64 e1 + ChildCode64 code2 r2lo <- iselExpr64 e2 + (rlo,rhi) <- getNewRegPairNat II32 + let + r1hi = getHiVRegFromLo r1lo + r2hi = getHiVRegFromLo r2lo + code = code1 `appOL` + code2 `appOL` + toOL [ MOV II32 (OpReg r1lo) (OpReg rlo), + SUB II32 (OpReg r2lo) (OpReg rlo), + MOV II32 (OpReg r1hi) (OpReg rhi), + SBB II32 (OpReg r2hi) (OpReg rhi) ] + return (ChildCode64 code 
rlo) + +iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do + fn <- getAnyReg expr + r_dst_lo <- getNewRegNat II32 + let r_dst_hi = getHiVRegFromLo r_dst_lo + code = fn r_dst_lo + return ( + ChildCode64 (code `snocOL` + MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi)) + r_dst_lo + ) + +iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do + fn <- getAnyReg expr + r_dst_lo <- getNewRegNat II32 + let r_dst_hi = getHiVRegFromLo r_dst_lo + code = fn r_dst_lo + return ( + ChildCode64 (code `snocOL` + MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL` + CLTD II32 `snocOL` + MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL` + MOV II32 (OpReg edx) (OpReg r_dst_hi)) + r_dst_lo + ) + +iselExpr64 expr + = pprPanic "iselExpr64(i386)" (ppr expr) + + +-------------------------------------------------------------------------------- +getRegister :: CmmExpr -> NatM Register +getRegister e = do dflags <- getDynFlags + is32Bit <- is32BitPlatform + getRegister' dflags is32Bit e + +getRegister' :: DynFlags -> Bool -> CmmExpr -> NatM Register + +getRegister' dflags is32Bit (CmmReg reg) + = case reg of + CmmGlobal PicBaseReg + | is32Bit -> + -- on x86_64, we have %rip for PicBaseReg, but it's not + -- a full-featured register, it can only be used for + -- rip-relative addressing. + do reg' <- getPicBaseNat (archWordFormat is32Bit) + return (Fixed (archWordFormat is32Bit) reg' nilOL) + _ -> + do + let + fmt = cmmTypeFormat (cmmRegType dflags reg) + format = fmt + -- + let platform = targetPlatform dflags + return (Fixed format + (getRegisterReg platform reg) + nilOL) + + +getRegister' dflags is32Bit (CmmRegOff r n) + = getRegister' dflags is32Bit $ mangleIndexTree dflags r n + +getRegister' dflags is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e]) + = addAlignmentCheck align <$> getRegister' dflags is32Bit e + +-- for 32-bit architectures, support some 64 -> 32 bit conversions: +-- TO_W_(x), TO_W_(x >> 32) + +getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32) + [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) + | is32Bit = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 (getHiVRegFromLo rlo) code + +getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) + [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) + | is32Bit = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 (getHiVRegFromLo rlo) code + +getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32) [x]) + | is32Bit = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 rlo code + +getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x]) + | is32Bit = do + ChildCode64 code rlo <- iselExpr64 x + return $ Fixed II32 rlo code + +getRegister' _ _ (CmmLit lit@(CmmFloat f w)) = + float_const_sse2 where + float_const_sse2 + | f == 0.0 = do + let + format = floatFormat w + code dst = unitOL (XOR format (OpReg dst) (OpReg dst)) + -- I don't know why there are xorpd, xorps, and pxor instructions. 
+ -- They all appear to do the same thing --SDM + return (Any format code) + + | otherwise = do + Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit + loadFloatAmode w addr code + +-- catch simple cases of zero- or sign-extended load +getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do + code <- intLoadCode (MOVZxL II8) addr + return (Any II32 code) + +getRegister' _ _ (CmmMachOp (MO_SS_Conv W8 W32) [CmmLoad addr _]) = do + code <- intLoadCode (MOVSxL II8) addr + return (Any II32 code) + +getRegister' _ _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad addr _]) = do + code <- intLoadCode (MOVZxL II16) addr + return (Any II32 code) + +getRegister' _ _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad addr _]) = do + code <- intLoadCode (MOVSxL II16) addr + return (Any II32 code) + +-- catch simple cases of zero- or sign-extended load +getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad addr _]) + | not is32Bit = do + code <- intLoadCode (MOVZxL II8) addr + return (Any II64 code) + +getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W8 W64) [CmmLoad addr _]) + | not is32Bit = do + code <- intLoadCode (MOVSxL II8) addr + return (Any II64 code) + +getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad addr _]) + | not is32Bit = do + code <- intLoadCode (MOVZxL II16) addr + return (Any II64 code) + +getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _]) + | not is32Bit = do + code <- intLoadCode (MOVSxL II16) addr + return (Any II64 code) + +getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _]) + | not is32Bit = do + code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend + return (Any II64 code) + +getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _]) + | not is32Bit = do + code <- intLoadCode (MOVSxL II32) addr + return (Any II64 code) + +getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg), + CmmLit displacement]) + | not is32Bit = do + return $ Any II64 (\dst -> unitOL $ + LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst)) + +getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps + case mop of + MO_F_Neg w -> sse2NegCode w x + + + MO_S_Neg w -> triv_ucode NEGI (intFormat w) + MO_Not w -> triv_ucode NOT (intFormat w) + + -- Nop conversions + MO_UU_Conv W32 W8 -> toI8Reg W32 x + MO_SS_Conv W32 W8 -> toI8Reg W32 x + MO_XX_Conv W32 W8 -> toI8Reg W32 x + MO_UU_Conv W16 W8 -> toI8Reg W16 x + MO_SS_Conv W16 W8 -> toI8Reg W16 x + MO_XX_Conv W16 W8 -> toI8Reg W16 x + MO_UU_Conv W32 W16 -> toI16Reg W32 x + MO_SS_Conv W32 W16 -> toI16Reg W32 x + MO_XX_Conv W32 W16 -> toI16Reg W32 x + + MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x + MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x + MO_XX_Conv W64 W32 | not is32Bit -> conversionNop II64 x + MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x + MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x + MO_XX_Conv W64 W16 | not is32Bit -> toI16Reg W64 x + MO_UU_Conv W64 W8 | not is32Bit -> toI8Reg W64 x + MO_SS_Conv W64 W8 | not is32Bit -> toI8Reg W64 x + MO_XX_Conv W64 W8 | not is32Bit -> toI8Reg W64 x + + MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x + MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x + MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x + + -- widenings + MO_UU_Conv W8 W32 -> integerExtend W8 W32 MOVZxL x + MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x + MO_UU_Conv W8 W16 -> integerExtend W8 W16 
MOVZxL x + + MO_SS_Conv W8 W32 -> integerExtend W8 W32 MOVSxL x + MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x + MO_SS_Conv W8 W16 -> integerExtend W8 W16 MOVSxL x + + -- We don't care about the upper bits for MO_XX_Conv, so MOV is enough. However, on 32-bit we + -- have 8-bit registers only for a few registers (as opposed to x86-64 where every register + -- has 8-bit version). So for 32-bit code, we'll just zero-extend. + MO_XX_Conv W8 W32 + | is32Bit -> integerExtend W8 W32 MOVZxL x + | otherwise -> integerExtend W8 W32 MOV x + MO_XX_Conv W8 W16 + | is32Bit -> integerExtend W8 W16 MOVZxL x + | otherwise -> integerExtend W8 W16 MOV x + MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x + + MO_UU_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVZxL x + MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x + MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x + MO_SS_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVSxL x + MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x + MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x + -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl. + -- However, we don't want the register allocator to throw it + -- away as an unnecessary reg-to-reg move, so we keep it in + -- the form of a movzl and print it as a movl later. + -- This doesn't apply to MO_XX_Conv since in this case we don't care about + -- the upper bits. So we can just use MOV. + MO_XX_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOV x + MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x + MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x + + MO_FF_Conv W32 W64 -> coerceFP2FP W64 x + + + MO_FF_Conv W64 W32 -> coerceFP2FP W32 x + + MO_FS_Conv from to -> coerceFP2Int from to x + MO_SF_Conv from to -> coerceInt2FP from to x + + MO_V_Insert {} -> needLlvm + MO_V_Extract {} -> needLlvm + MO_V_Add {} -> needLlvm + MO_V_Sub {} -> needLlvm + MO_V_Mul {} -> needLlvm + MO_VS_Quot {} -> needLlvm + MO_VS_Rem {} -> needLlvm + MO_VS_Neg {} -> needLlvm + MO_VU_Quot {} -> needLlvm + MO_VU_Rem {} -> needLlvm + MO_VF_Insert {} -> needLlvm + MO_VF_Extract {} -> needLlvm + MO_VF_Add {} -> needLlvm + MO_VF_Sub {} -> needLlvm + MO_VF_Mul {} -> needLlvm + MO_VF_Quot {} -> needLlvm + MO_VF_Neg {} -> needLlvm + + _other -> pprPanic "getRegister" (pprMachOp mop) + where + triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register + triv_ucode instr format = trivialUCode format (instr format) x + + -- signed or unsigned extension. + integerExtend :: Width -> Width + -> (Format -> Operand -> Operand -> Instr) + -> CmmExpr -> NatM Register + integerExtend from to instr expr = do + (reg,e_code) <- if from == W8 then getByteReg expr + else getSomeReg expr + let + code dst = + e_code `snocOL` + instr (intFormat from) (OpReg reg) (OpReg dst) + return (Any (intFormat to) code) + + toI8Reg :: Width -> CmmExpr -> NatM Register + toI8Reg new_rep expr + = do codefn <- getAnyReg expr + return (Any (intFormat new_rep) codefn) + -- HACK: use getAnyReg to get a byte-addressable register. + -- If the source was a Fixed register, this will add the + -- mov instruction to put it into the desired destination. + -- We're assuming that the destination won't be a fixed + -- non-byte-addressable register; it won't be, because all + -- fixed registers are word-sized. 
+ + toI16Reg = toI8Reg -- for now + + conversionNop :: Format -> CmmExpr -> NatM Register + conversionNop new_format expr + = do e_code <- getRegister' dflags is32Bit expr + return (swizzleRegisterRep e_code new_format) + + +getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps + case mop of + MO_F_Eq _ -> condFltReg is32Bit EQQ x y + MO_F_Ne _ -> condFltReg is32Bit NE x y + MO_F_Gt _ -> condFltReg is32Bit GTT x y + MO_F_Ge _ -> condFltReg is32Bit GE x y + -- Invert comparison condition and swap operands + -- See Note [SSE Parity Checks] + MO_F_Lt _ -> condFltReg is32Bit GTT y x + MO_F_Le _ -> condFltReg is32Bit GE y x + + MO_Eq _ -> condIntReg EQQ x y + MO_Ne _ -> condIntReg NE x y + + MO_S_Gt _ -> condIntReg GTT x y + MO_S_Ge _ -> condIntReg GE x y + MO_S_Lt _ -> condIntReg LTT x y + MO_S_Le _ -> condIntReg LE x y + + MO_U_Gt _ -> condIntReg GU x y + MO_U_Ge _ -> condIntReg GEU x y + MO_U_Lt _ -> condIntReg LU x y + MO_U_Le _ -> condIntReg LEU x y + + MO_F_Add w -> trivialFCode_sse2 w ADD x y + + MO_F_Sub w -> trivialFCode_sse2 w SUB x y + + MO_F_Quot w -> trivialFCode_sse2 w FDIV x y + + MO_F_Mul w -> trivialFCode_sse2 w MUL x y + + + MO_Add rep -> add_code rep x y + MO_Sub rep -> sub_code rep x y + + MO_S_Quot rep -> div_code rep True True x y + MO_S_Rem rep -> div_code rep True False x y + MO_U_Quot rep -> div_code rep False True x y + MO_U_Rem rep -> div_code rep False False x y + + MO_S_MulMayOflo rep -> imulMayOflo rep x y + + MO_Mul W8 -> imulW8 x y + MO_Mul rep -> triv_op rep IMUL + MO_And rep -> triv_op rep AND + MO_Or rep -> triv_op rep OR + MO_Xor rep -> triv_op rep XOR + + {- Shift ops on x86s have constraints on their source, it + either has to be Imm, CL or 1 + => trivialCode is not restrictive enough (sigh.) + -} + MO_Shl rep -> shift_code rep SHL x y {-False-} + MO_U_Shr rep -> shift_code rep SHR x y {-False-} + MO_S_Shr rep -> shift_code rep SAR x y {-False-} + + MO_V_Insert {} -> needLlvm + MO_V_Extract {} -> needLlvm + MO_V_Add {} -> needLlvm + MO_V_Sub {} -> needLlvm + MO_V_Mul {} -> needLlvm + MO_VS_Quot {} -> needLlvm + MO_VS_Rem {} -> needLlvm + MO_VS_Neg {} -> needLlvm + MO_VF_Insert {} -> needLlvm + MO_VF_Extract {} -> needLlvm + MO_VF_Add {} -> needLlvm + MO_VF_Sub {} -> needLlvm + MO_VF_Mul {} -> needLlvm + MO_VF_Quot {} -> needLlvm + MO_VF_Neg {} -> needLlvm + + _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop) + where + -------------------- + triv_op width instr = trivialCode width op (Just op) x y + where op = instr (intFormat width) + + -- Special case for IMUL for bytes, since the result of IMULB will be in + -- %ax, the split to %dx/%edx/%rdx and %ax/%eax/%rax happens only for wider + -- values. 
+ imulW8 :: CmmExpr -> CmmExpr -> NatM Register + imulW8 arg_a arg_b = do + (a_reg, a_code) <- getNonClobberedReg arg_a + b_code <- getAnyReg arg_b + + let code = a_code `appOL` b_code eax `appOL` + toOL [ IMUL2 format (OpReg a_reg) ] + format = intFormat W8 + + return (Fixed format eax code) + + + imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register + imulMayOflo rep a b = do + (a_reg, a_code) <- getNonClobberedReg a + b_code <- getAnyReg b + let + shift_amt = case rep of + W32 -> 31 + W64 -> 63 + _ -> panic "shift_amt" + + format = intFormat rep + code = a_code `appOL` b_code eax `appOL` + toOL [ + IMUL2 format (OpReg a_reg), -- result in %edx:%eax + SAR format (OpImm (ImmInt shift_amt)) (OpReg eax), + -- sign extend lower part + SUB format (OpReg edx) (OpReg eax) + -- compare against upper + -- eax==0 if high part == sign extended low part + ] + return (Fixed format eax code) + + -------------------- + shift_code :: Width + -> (Format -> Operand -> Operand -> Instr) + -> CmmExpr + -> CmmExpr + -> NatM Register + + {- Case1: shift length as immediate -} + shift_code width instr x (CmmLit lit) = do + x_code <- getAnyReg x + let + format = intFormat width + code dst + = x_code dst `snocOL` + instr format (OpImm (litToImm lit)) (OpReg dst) + return (Any format code) + + {- Case2: shift length is complex (non-immediate) + * y must go in %ecx. + * we cannot do y first *and* put its result in %ecx, because + %ecx might be clobbered by x. + * if we do y second, then x cannot be + in a clobbered reg. Also, we cannot clobber x's reg + with the instruction itself. + * so we can either: + - do y first, put its result in a fresh tmp, then copy it to %ecx later + - do y second and put its result into %ecx. x gets placed in a fresh + tmp. This is likely to be better, because the reg alloc can + eliminate this reg->reg move here (it won't eliminate the other one, + because the move is into the fixed %ecx). + -} + shift_code width instr x y{-amount-} = do + x_code <- getAnyReg x + let format = intFormat width + tmp <- getNewRegNat format + y_code <- getAnyReg y + let + code = x_code tmp `appOL` + y_code ecx `snocOL` + instr format (OpReg ecx) (OpReg tmp) + return (Fixed format tmp code) + + -------------------- + add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register + add_code rep x (CmmLit (CmmInt y _)) + | is32BitInteger y = add_int rep x y + add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y + where format = intFormat rep + -- TODO: There are other interesting patterns we want to replace + -- with a LEA, e.g. `(x + offset) + (y << shift)`. 
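+       -- (The literal case above goes through 'add_int' below, which emits
+       -- e.g. @lea 4(%src),%dst@ for @x + 4@; unlike ADD this leaves the
+       -- source register intact, so the register allocator is free to pick a
+       -- different destination.)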
+ + -------------------- + sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register + sub_code rep x (CmmLit (CmmInt y _)) + | is32BitInteger (-y) = add_int rep x (-y) + sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y + + -- our three-operand add instruction: + add_int width x y = do + (x_reg, x_code) <- getSomeReg x + let + format = intFormat width + imm = ImmInt (fromInteger y) + code dst + = x_code `snocOL` + LEA format + (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm)) + (OpReg dst) + -- + return (Any format code) + + ---------------------- + + -- See Note [DIV/IDIV for bytes] + div_code W8 signed quotient x y = do + let widen | signed = MO_SS_Conv W8 W16 + | otherwise = MO_UU_Conv W8 W16 + div_code + W16 + signed + quotient + (CmmMachOp widen [x]) + (CmmMachOp widen [y]) + + div_code width signed quotient x y = do + (y_op, y_code) <- getRegOrMem y -- cannot be clobbered + x_code <- getAnyReg x + let + format = intFormat width + widen | signed = CLTD format + | otherwise = XOR format (OpReg edx) (OpReg edx) + + instr | signed = IDIV + | otherwise = DIV + + code = y_code `appOL` + x_code eax `appOL` + toOL [widen, instr format y_op] + + result | quotient = eax + | otherwise = edx + + return (Fixed format result code) + + +getRegister' _ _ (CmmLoad mem pk) + | isFloatType pk + = do + Amode addr mem_code <- getAmode mem + loadFloatAmode (typeWidth pk) addr mem_code + +getRegister' _ is32Bit (CmmLoad mem pk) + | is32Bit && not (isWord64 pk) + = do + code <- intLoadCode instr mem + return (Any format code) + where + width = typeWidth pk + format = intFormat width + instr = case width of + W8 -> MOVZxL II8 + _other -> MOV format + -- We always zero-extend 8-bit loads, if we + -- can't think of anything better. This is because + -- we can't guarantee access to an 8-bit variant of every register + -- (esi and edi don't have 8-bit variants), so to make things + -- simpler we do our 8-bit arithmetic with full 32-bit registers. + +-- Simpler memory load code on x86_64 +getRegister' _ is32Bit (CmmLoad mem pk) + | not is32Bit + = do + code <- intLoadCode (MOV format) mem + return (Any format code) + where format = intFormat $ typeWidth pk + +getRegister' _ is32Bit (CmmLit (CmmInt 0 width)) + = let + format = intFormat width + + -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits + format1 = if is32Bit then format + else case format of + II64 -> II32 + _ -> format + code dst + = unitOL (XOR format1 (OpReg dst) (OpReg dst)) + in + return (Any format code) + + -- optimisation for loading small literals on x86_64: take advantage + -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit + -- instruction forms are shorter. +getRegister' dflags is32Bit (CmmLit lit) + | not is32Bit, isWord64 (cmmLitType dflags lit), not (isBigLit lit) + = let + imm = litToImm lit + code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst)) + in + return (Any II64 code) + where + isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff + isBigLit _ = False + -- note1: not the same as (not.is32BitLit), because that checks for + -- signed literals that fit in 32 bits, but we want unsigned + -- literals here. + -- note2: all labels are small, because we're assuming the + -- small memory model (see gcc docs, -mcmodel=small). 
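+        -- (Concretely, @movl $0x11223344, %eax@ zero-extends into %rax and is
+        -- a 5-byte instruction, whereas the equivalent @movabsq@ form needs
+        -- 10 bytes.)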
+ +getRegister' dflags _ (CmmLit lit) + = do let format = cmmTypeFormat (cmmLitType dflags lit) + imm = litToImm lit + code dst = unitOL (MOV format (OpImm imm) (OpReg dst)) + return (Any format code) + +getRegister' _ _ other + | isVecExpr other = needLlvm + | otherwise = pprPanic "getRegister(x86)" (ppr other) + + +intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr + -> NatM (Reg -> InstrBlock) +intLoadCode instr mem = do + Amode src mem_code <- getAmode mem + return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst)) + +-- Compute an expression into *any* register, adding the appropriate +-- move instruction if necessary. +getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock) +getAnyReg expr = do + r <- getRegister expr + anyReg r + +anyReg :: Register -> NatM (Reg -> InstrBlock) +anyReg (Any _ code) = return code +anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst) + +-- A bit like getSomeReg, but we want a reg that can be byte-addressed. +-- Fixed registers might not be byte-addressable, so we make sure we've +-- got a temporary, inserting an extra reg copy if necessary. +getByteReg :: CmmExpr -> NatM (Reg, InstrBlock) +getByteReg expr = do + is32Bit <- is32BitPlatform + if is32Bit + then do r <- getRegister expr + case r of + Any rep code -> do + tmp <- getNewRegNat rep + return (tmp, code tmp) + Fixed rep reg code + | isVirtualReg reg -> return (reg,code) + | otherwise -> do + tmp <- getNewRegNat rep + return (tmp, code `snocOL` reg2reg rep reg tmp) + -- ToDo: could optimise slightly by checking for + -- byte-addressable real registers, but that will + -- happen very rarely if at all. + else getSomeReg expr -- all regs are byte-addressable on x86_64 + +-- Another variant: this time we want the result in a register that cannot +-- be modified by code to evaluate an arbitrary expression. +getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock) +getNonClobberedReg expr = do + dflags <- getDynFlags + r <- getRegister expr + case r of + Any rep code -> do + tmp <- getNewRegNat rep + return (tmp, code tmp) + Fixed rep reg code + -- only certain regs can be clobbered + | reg `elem` instrClobberedRegs (targetPlatform dflags) + -> do + tmp <- getNewRegNat rep + return (tmp, code `snocOL` reg2reg rep reg tmp) + | otherwise -> + return (reg, code) + +reg2reg :: Format -> Reg -> Reg -> Instr +reg2reg format src dst = MOV format (OpReg src) (OpReg dst) + + +-------------------------------------------------------------------------------- +getAmode :: CmmExpr -> NatM Amode +getAmode e = do is32Bit <- is32BitPlatform + getAmode' is32Bit e + +getAmode' :: Bool -> CmmExpr -> NatM Amode +getAmode' _ (CmmRegOff r n) = do dflags <- getDynFlags + getAmode $ mangleIndexTree dflags r n + +getAmode' is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg), + CmmLit displacement]) + | not is32Bit + = return $ Amode (ripRel (litToImm displacement)) nilOL + + +-- This is all just ridiculous, since it carefully undoes +-- what mangleIndexTree has just done. +getAmode' is32Bit (CmmMachOp (MO_Sub _rep) [x, CmmLit lit@(CmmInt i _)]) + | is32BitLit is32Bit lit + -- ASSERT(rep == II32)??? + = do (x_reg, x_code) <- getSomeReg x + let off = ImmInt (-(fromInteger i)) + return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code) + +getAmode' is32Bit (CmmMachOp (MO_Add _rep) [x, CmmLit lit]) + | is32BitLit is32Bit lit + -- ASSERT(rep == II32)??? 
+ = do (x_reg, x_code) <- getSomeReg x + let off = litToImm lit + return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code) + +-- Turn (lit1 << n + lit2) into (lit2 + lit1 << n) so it will be +-- recognised by the next rule. +getAmode' is32Bit (CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _), + b@(CmmLit _)]) + = getAmode' is32Bit (CmmMachOp (MO_Add rep) [b,a]) + +-- Matches: (x + offset) + (y << shift) +getAmode' _ (CmmMachOp (MO_Add _) [CmmRegOff x offset, + CmmMachOp (MO_Shl _) + [y, CmmLit (CmmInt shift _)]]) + | shift == 0 || shift == 1 || shift == 2 || shift == 3 + = x86_complex_amode (CmmReg x) y shift (fromIntegral offset) + +getAmode' _ (CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Shl _) + [y, CmmLit (CmmInt shift _)]]) + | shift == 0 || shift == 1 || shift == 2 || shift == 3 + = x86_complex_amode x y shift 0 + +getAmode' _ (CmmMachOp (MO_Add _) + [x, CmmMachOp (MO_Add _) + [CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)], + CmmLit (CmmInt offset _)]]) + | shift == 0 || shift == 1 || shift == 2 || shift == 3 + && is32BitInteger offset + = x86_complex_amode x y shift offset + +getAmode' _ (CmmMachOp (MO_Add _) [x,y]) + = x86_complex_amode x y 0 0 + +getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit + = return (Amode (ImmAddr (litToImm lit) 0) nilOL) + +getAmode' _ expr = do + (reg,code) <- getSomeReg expr + return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code) + +-- | Like 'getAmode', but on 32-bit use simple register addressing +-- (i.e. no index register). This stops us from running out of +-- registers on x86 when using instructions such as cmpxchg, which can +-- use up to three virtual registers and one fixed register. +getSimpleAmode :: DynFlags -> Bool -> CmmExpr -> NatM Amode +getSimpleAmode dflags is32Bit addr + | is32Bit = do + addr_code <- getAnyReg addr + addr_r <- getNewRegNat (intFormat (wordWidth dflags)) + let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0) + return $! Amode amode (addr_code addr_r) + | otherwise = getAmode addr + +x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode +x86_complex_amode base index shift offset + = do (x_reg, x_code) <- getNonClobberedReg base + -- x must be in a temp, because it has to stay live over y_code + -- we could compare x_reg and y_reg and do something better here... + (y_reg, y_code) <- getSomeReg index + let + code = x_code `appOL` y_code + base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8; + n -> panic $ "x86_complex_amode: unhandled shift! (" ++ show n ++ ")" + return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt (fromIntegral offset))) + code) + + + + +-- ----------------------------------------------------------------------------- +-- getOperand: sometimes any operand will do. + +-- getNonClobberedOperand: the value of the operand will remain valid across +-- the computation of an arbitrary expression, unless the expression +-- is computed directly into a register which the operand refers to +-- (see trivialCode where this function is used for an example). 
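+-- Roughly: for a two-operand instruction, one argument is fetched with
+-- getNonClobberedOperand so that evaluating the other argument afterwards
+-- cannot overwrite it before the instruction itself is emitted.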
+ +getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock) +getNonClobberedOperand (CmmLit lit) = do + if isSuitableFloatingPointLit lit + then do + let CmmFloat _ w = lit + Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit + return (OpAddr addr, code) + else do + + is32Bit <- is32BitPlatform + dflags <- getDynFlags + if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit)) + then return (OpImm (litToImm lit), nilOL) + else getNonClobberedOperand_generic (CmmLit lit) + +getNonClobberedOperand (CmmLoad mem pk) = do + is32Bit <- is32BitPlatform + -- this logic could be simplified + -- TODO FIXME + if (if is32Bit then not (isWord64 pk) else True) + -- if 32bit and pk is at float/double/simd value + -- or if 64bit + -- this could use some eyeballs or i'll need to stare at it more later + then do + dflags <- getDynFlags + let platform = targetPlatform dflags + Amode src mem_code <- getAmode mem + (src',save_code) <- + if (amodeCouldBeClobbered platform src) + then do + tmp <- getNewRegNat (archWordFormat is32Bit) + return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0), + unitOL (LEA (archWordFormat is32Bit) + (OpAddr src) + (OpReg tmp))) + else + return (src, nilOL) + return (OpAddr src', mem_code `appOL` save_code) + else do + -- if its a word or gcptr on 32bit? + getNonClobberedOperand_generic (CmmLoad mem pk) + +getNonClobberedOperand e = getNonClobberedOperand_generic e + +getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock) +getNonClobberedOperand_generic e = do + (reg, code) <- getNonClobberedReg e + return (OpReg reg, code) + +amodeCouldBeClobbered :: Platform -> AddrMode -> Bool +amodeCouldBeClobbered platform amode = any (regClobbered platform) (addrModeRegs amode) + +regClobbered :: Platform -> Reg -> Bool +regClobbered platform (RegReal (RealRegSingle rr)) = freeReg platform rr +regClobbered _ _ = False + +-- getOperand: the operand is not required to remain valid across the +-- computation of an arbitrary expression. +getOperand :: CmmExpr -> NatM (Operand, InstrBlock) + +getOperand (CmmLit lit) = do + use_sse2 <- sse2Enabled + if (use_sse2 && isSuitableFloatingPointLit lit) + then do + let CmmFloat _ w = lit + Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit + return (OpAddr addr, code) + else do + + is32Bit <- is32BitPlatform + dflags <- getDynFlags + if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit)) + then return (OpImm (litToImm lit), nilOL) + else getOperand_generic (CmmLit lit) + +getOperand (CmmLoad mem pk) = do + is32Bit <- is32BitPlatform + use_sse2 <- sse2Enabled + if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True) + then do + Amode src mem_code <- getAmode mem + return (OpAddr src, mem_code) + else + getOperand_generic (CmmLoad mem pk) + +getOperand e = getOperand_generic e + +getOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock) +getOperand_generic e = do + (reg, code) <- getSomeReg e + return (OpReg reg, code) + +isOperand :: Bool -> CmmExpr -> Bool +isOperand _ (CmmLoad _ _) = True +isOperand is32Bit (CmmLit lit) = is32BitLit is32Bit lit + || isSuitableFloatingPointLit lit +isOperand _ _ = False + +-- | Given a 'Register', produce a new 'Register' with an instruction block +-- which will check the value for alignment. Used for @-falignment-sanitisation@. 
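+-- The inserted check TESTs the value against @align-1@ and jumps to the
+-- bad-alignment label when any of the low bits are set.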
+addAlignmentCheck :: Int -> Register -> Register +addAlignmentCheck align reg = + case reg of + Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt reg) + Any fmt f -> Any fmt (\reg -> f reg `appOL` check fmt reg) + where + check :: Format -> Reg -> InstrBlock + check fmt reg = + ASSERT(not $ isFloatFormat fmt) + toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg) + , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel + ] + +memConstant :: Alignment -> CmmLit -> NatM Amode +memConstant align lit = do + lbl <- getNewLabelNat + let rosection = Section ReadOnlyData lbl + dflags <- getDynFlags + (addr, addr_code) <- if target32Bit (targetPlatform dflags) + then do dynRef <- cmmMakeDynamicReference + dflags + DataReference + lbl + Amode addr addr_code <- getAmode dynRef + return (addr, addr_code) + else return (ripRel (ImmCLbl lbl), nilOL) + let code = + LDATA rosection (align, RawCmmStatics lbl [CmmStaticLit lit]) + `consOL` addr_code + return (Amode addr code) + + +loadFloatAmode :: Width -> AddrMode -> InstrBlock -> NatM Register +loadFloatAmode w addr addr_code = do + let format = floatFormat w + code dst = addr_code `snocOL` + MOV format (OpAddr addr) (OpReg dst) + + return (Any format code) + + +-- if we want a floating-point literal as an operand, we can +-- use it directly from memory. However, if the literal is +-- zero, we're better off generating it into a register using +-- xor. +isSuitableFloatingPointLit :: CmmLit -> Bool +isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0 +isSuitableFloatingPointLit _ = False + +getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock) +getRegOrMem e@(CmmLoad mem pk) = do + is32Bit <- is32BitPlatform + use_sse2 <- sse2Enabled + if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True) + then do + Amode src mem_code <- getAmode mem + return (OpAddr src, mem_code) + else do + (reg, code) <- getNonClobberedReg e + return (OpReg reg, code) +getRegOrMem e = do + (reg, code) <- getNonClobberedReg e + return (OpReg reg, code) + +is32BitLit :: Bool -> CmmLit -> Bool +is32BitLit is32Bit (CmmInt i W64) + | not is32Bit + = -- assume that labels are in the range 0-2^31-1: this assumes the + -- small memory model (see gcc docs, -mcmodel=small). + is32BitInteger i +is32BitLit _ _ = True + + + + +-- Set up a condition code for a conditional branch. + +getCondCode :: CmmExpr -> NatM CondCode + +-- yes, they really do seem to want exactly the same! + +getCondCode (CmmMachOp mop [x, y]) + = + case mop of + MO_F_Eq W32 -> condFltCode EQQ x y + MO_F_Ne W32 -> condFltCode NE x y + MO_F_Gt W32 -> condFltCode GTT x y + MO_F_Ge W32 -> condFltCode GE x y + -- Invert comparison condition and swap operands + -- See Note [SSE Parity Checks] + MO_F_Lt W32 -> condFltCode GTT y x + MO_F_Le W32 -> condFltCode GE y x + + MO_F_Eq W64 -> condFltCode EQQ x y + MO_F_Ne W64 -> condFltCode NE x y + MO_F_Gt W64 -> condFltCode GTT x y + MO_F_Ge W64 -> condFltCode GE x y + MO_F_Lt W64 -> condFltCode GTT y x + MO_F_Le W64 -> condFltCode GE y x + + _ -> condIntCode (machOpToCond mop) x y + +getCondCode other = pprPanic "getCondCode(2)(x86,x86_64)" (ppr other) + +machOpToCond :: MachOp -> Cond +machOpToCond mo = case mo of + MO_Eq _ -> EQQ + MO_Ne _ -> NE + MO_S_Gt _ -> GTT + MO_S_Ge _ -> GE + MO_S_Lt _ -> LTT + MO_S_Le _ -> LE + MO_U_Gt _ -> GU + MO_U_Ge _ -> GEU + MO_U_Lt _ -> LU + MO_U_Le _ -> LEU + _other -> pprPanic "machOpToCond" (pprMachOp mo) + + +-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be +-- passed back up the tree. 
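+-- Callers either branch on the resulting condition directly (genCondBranch)
+-- or materialise it as a 0/1 value in a register (condIntReg, condFltReg).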
+ +condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode +condIntCode cond x y = do is32Bit <- is32BitPlatform + condIntCode' is32Bit cond x y + +condIntCode' :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM CondCode + +-- memory vs immediate +condIntCode' is32Bit cond (CmmLoad x pk) (CmmLit lit) + | is32BitLit is32Bit lit = do + Amode x_addr x_code <- getAmode x + let + imm = litToImm lit + code = x_code `snocOL` + CMP (cmmTypeFormat pk) (OpImm imm) (OpAddr x_addr) + -- + return (CondCode False cond code) + +-- anything vs zero, using a mask +-- TODO: Add some sanity checking!!!! +condIntCode' is32Bit cond (CmmMachOp (MO_And _) [x,o2]) (CmmLit (CmmInt 0 pk)) + | (CmmLit lit@(CmmInt mask _)) <- o2, is32BitLit is32Bit lit + = do + (x_reg, x_code) <- getSomeReg x + let + code = x_code `snocOL` + TEST (intFormat pk) (OpImm (ImmInteger mask)) (OpReg x_reg) + -- + return (CondCode False cond code) + +-- anything vs zero +condIntCode' _ cond x (CmmLit (CmmInt 0 pk)) = do + (x_reg, x_code) <- getSomeReg x + let + code = x_code `snocOL` + TEST (intFormat pk) (OpReg x_reg) (OpReg x_reg) + -- + return (CondCode False cond code) + +-- anything vs operand +condIntCode' is32Bit cond x y + | isOperand is32Bit y = do + dflags <- getDynFlags + (x_reg, x_code) <- getNonClobberedReg x + (y_op, y_code) <- getOperand y + let + code = x_code `appOL` y_code `snocOL` + CMP (cmmTypeFormat (cmmExprType dflags x)) y_op (OpReg x_reg) + return (CondCode False cond code) +-- operand vs. anything: invert the comparison so that we can use a +-- single comparison instruction. + | isOperand is32Bit x + , Just revcond <- maybeFlipCond cond = do + dflags <- getDynFlags + (y_reg, y_code) <- getNonClobberedReg y + (x_op, x_code) <- getOperand x + let + code = y_code `appOL` x_code `snocOL` + CMP (cmmTypeFormat (cmmExprType dflags x)) x_op (OpReg y_reg) + return (CondCode False revcond code) + +-- anything vs anything +condIntCode' _ cond x y = do + dflags <- getDynFlags + (y_reg, y_code) <- getNonClobberedReg y + (x_op, x_code) <- getRegOrMem x + let + code = y_code `appOL` + x_code `snocOL` + CMP (cmmTypeFormat (cmmExprType dflags x)) (OpReg y_reg) x_op + return (CondCode False cond code) + + + +-------------------------------------------------------------------------------- +condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode + +condFltCode cond x y + = condFltCode_sse2 + where + + + -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be + -- an operand, but the right must be a reg. We can probably do better + -- than this general case... + condFltCode_sse2 = do + dflags <- getDynFlags + (x_reg, x_code) <- getNonClobberedReg x + (y_op, y_code) <- getOperand y + let + code = x_code `appOL` + y_code `snocOL` + CMP (floatFormat $ cmmExprWidth dflags x) y_op (OpReg x_reg) + -- NB(1): we need to use the unsigned comparison operators on the + -- result of this comparison. + return (CondCode True (condToUnsigned cond) code) + +-- ----------------------------------------------------------------------------- +-- Generating assignments + +-- Assignments are really at the heart of the whole code generation +-- business. Almost all top-level nodes of any real importance are +-- assignments, which correspond to loads, stores, or register +-- transfers. If we're really lucky, some of the register transfers +-- will go away, because we can use the destination register to +-- complete the code generation for the right hand side. This only +-- fails when the right hand side is forced into a fixed register +-- (e.g. 
the result of a call). + +assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock +assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock + +assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock +assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock + + +-- integer assignment to memory + +-- specific case of adding/subtracting an integer to a particular address. +-- ToDo: catch other cases where we can use an operation directly on a memory +-- address. +assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _, + CmmLit (CmmInt i _)]) + | addr == addr2, pk /= II64 || is32BitInteger i, + Just instr <- check op + = do Amode amode code_addr <- getAmode addr + let code = code_addr `snocOL` + instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode) + return code + where + check (MO_Add _) = Just ADD + check (MO_Sub _) = Just SUB + check _ = Nothing + -- ToDo: more? + +-- general case +assignMem_IntCode pk addr src = do + is32Bit <- is32BitPlatform + Amode addr code_addr <- getAmode addr + (code_src, op_src) <- get_op_RI is32Bit src + let + code = code_src `appOL` + code_addr `snocOL` + MOV pk op_src (OpAddr addr) + -- NOTE: op_src is stable, so it will still be valid + -- after code_addr. This may involve the introduction + -- of an extra MOV to a temporary register, but we hope + -- the register allocator will get rid of it. + -- + return code + where + get_op_RI :: Bool -> CmmExpr -> NatM (InstrBlock,Operand) -- code, operator + get_op_RI is32Bit (CmmLit lit) | is32BitLit is32Bit lit + = return (nilOL, OpImm (litToImm lit)) + get_op_RI _ op + = do (reg,code) <- getNonClobberedReg op + return (code, OpReg reg) + + +-- Assign; dst is a reg, rhs is mem +assignReg_IntCode pk reg (CmmLoad src _) = do + load_code <- intLoadCode (MOV pk) src + dflags <- getDynFlags + let platform = targetPlatform dflags + return (load_code (getRegisterReg platform reg)) + +-- dst is a reg, but src could be anything +assignReg_IntCode _ reg src = do + dflags <- getDynFlags + let platform = targetPlatform dflags + code <- getAnyReg src + return (code (getRegisterReg platform reg)) + + +-- Floating point assignment to memory +assignMem_FltCode pk addr src = do + (src_reg, src_code) <- getNonClobberedReg src + Amode addr addr_code <- getAmode addr + let + code = src_code `appOL` + addr_code `snocOL` + MOV pk (OpReg src_reg) (OpAddr addr) + + return code + +-- Floating point assignment to a register/temporary +assignReg_FltCode _ reg src = do + src_code <- getAnyReg src + dflags <- getDynFlags + let platform = targetPlatform dflags + return (src_code (getRegisterReg platform reg)) + + +genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock + +genJump (CmmLoad mem _) regs = do + Amode target code <- getAmode mem + return (code `snocOL` JMP (OpAddr target) regs) + +genJump (CmmLit lit) regs = do + return (unitOL (JMP (OpImm (litToImm lit)) regs)) + +genJump expr regs = do + (reg,code) <- getSomeReg expr + return (code `snocOL` JMP (OpReg reg) regs) + + +-- ----------------------------------------------------------------------------- +-- Unconditional branches + +genBranch :: BlockId -> InstrBlock +genBranch = toOL . mkJumpInstr + + + +-- ----------------------------------------------------------------------------- +-- Conditional jumps/branches + +{- +Conditional jumps are always to local labels, so we can use branch +instructions. We peek at the arguments to decide what kind of +comparison to do. 
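+
+(The awkward case is a 64-bit integer comparison on a 32-bit target: there the
+comparison is split into a compare of the high words followed by a compare of
+the low words, see genCondBranch' below.)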
+ +I386: First, we have to ensure that the condition +codes are set according to the supplied comparison operation. +-} + + +genCondBranch + :: BlockId -- the source of the jump + -> BlockId -- the true branch target + -> BlockId -- the false branch target + -> CmmExpr -- the condition on which to branch + -> NatM InstrBlock -- Instructions + +genCondBranch bid id false expr = do + is32Bit <- is32BitPlatform + genCondBranch' is32Bit bid id false expr + +-- | We return the instructions generated. +genCondBranch' :: Bool -> BlockId -> BlockId -> BlockId -> CmmExpr + -> NatM InstrBlock + +-- 64-bit integer comparisons on 32-bit +genCondBranch' is32Bit _bid true false (CmmMachOp mop [e1,e2]) + | is32Bit, Just W64 <- maybeIntComparison mop = do + ChildCode64 code1 r1_lo <- iselExpr64 e1 + ChildCode64 code2 r2_lo <- iselExpr64 e2 + let r1_hi = getHiVRegFromLo r1_lo + r2_hi = getHiVRegFromLo r2_lo + cond = machOpToCond mop + Just cond' = maybeFlipCond cond + --TODO: Update CFG for x86 + let code = code1 `appOL` code2 `appOL` toOL [ + CMP II32 (OpReg r2_hi) (OpReg r1_hi), + JXX cond true, + JXX cond' false, + CMP II32 (OpReg r2_lo) (OpReg r1_lo), + JXX cond true] `appOL` genBranch false + return code + +genCondBranch' _ bid id false bool = do + CondCode is_float cond cond_code <- getCondCode bool + use_sse2 <- sse2Enabled + if not is_float || not use_sse2 + then + return (cond_code `snocOL` JXX cond id `appOL` genBranch false) + else do + -- See Note [SSE Parity Checks] + let jmpFalse = genBranch false + code + = case cond of + NE -> or_unordered + GU -> plain_test + GEU -> plain_test + -- Use ASSERT so we don't break releases if + -- LTT/LE creep in somehow. + LTT -> + ASSERT2(False, ppr "Should have been turned into >") + and_ordered + LE -> + ASSERT2(False, ppr "Should have been turned into >=") + and_ordered + _ -> and_ordered + + plain_test = unitOL ( + JXX cond id + ) `appOL` jmpFalse + or_unordered = toOL [ + JXX cond id, + JXX PARITY id + ] `appOL` jmpFalse + and_ordered = toOL [ + JXX PARITY false, + JXX cond id, + JXX ALWAYS false + ] + updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false) + return (cond_code `appOL` code) + +{- Note [Introducing cfg edges inside basic blocks] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + During instruction selection a statement `s` + in a block B with control of the sort: B -> C + will sometimes result in control + flow of the sort: + + ┌ < ┐ + v ^ + B -> B1 ┴ -> C + + as is the case for some atomic operations. + + Now to keep the CFG in sync when introducing B1 we clearly + want to insert it between B and C. However there is + a catch when we have to deal with self loops. + + We might start with code and a CFG of these forms: + + loop: + stmt1 ┌ < ┐ + .... v ^ + stmtX loop ┘ + stmtY + .... + goto loop: + + Now we introduce B1: + ┌ ─ ─ ─ ─ ─┐ + loop: │ ┌ < ┐ │ + instrs v │ │ ^ + .... loop ┴ B1 ┴ ┘ + instrsFromX + stmtY + goto loop: + + This is simple, all outgoing edges from loop now simply + start from B1 instead and the code generator knows which + new edges it introduced for the self loop of B1. + + Disaster strikes if the statement Y follows the same pattern. + If we apply the same rule that all outgoing edges change then + we end up with: + + loop ─> B1 ─> B2 ┬─┐ + │ │ └─<┤ │ + │ └───<───┘ │ + └───────<────────┘ + + This is problematic. The edge B1->B1 is modified as expected. + However the modification is wrong! + + The assembly in this case looked like this: + + _loop: + <instrs> + _B1: + ... + cmpxchgq ... 
+ jne _B1 + <instrs> + <end _B1> + _B2: + ... + cmpxchgq ... + jne _B2 + <instrs> + jmp loop + + There is no edge _B2 -> _B1 here. It's still a self loop onto _B1. + + The problem here is that really B1 should be two basic blocks. + Otherwise we have control flow in the *middle* of a basic block. + A contradiction! + + So to account for this we add yet another basic block marker: + + _B: + <instrs> + _B1: + ... + cmpxchgq ... + jne _B1 + jmp _B1' + _B1': + <instrs> + <end _B1> + _B2: + ... + + Now when inserting B2 we will only look at the outgoing edges of B1' and + everything will work out nicely. + + You might also wonder why we don't insert jumps at the end of _B1'. There is + no way another block ends up jumping to the labels _B1 or _B2 since they are + essentially invisible to other blocks. View them as control flow labels local + to the basic block if you'd like. + + Not doing this ultimately caused (part 2 of) #17334. +-} + + +-- ----------------------------------------------------------------------------- +-- Generating C calls + +-- Now the biggest nightmare---calls. Most of the nastiness is buried in +-- @get_arg@, which moves the arguments to the correct registers/stack +-- locations. Apart from that, the code is easy. +-- +-- (If applicable) Do not fill the delay slots here; you will confuse the +-- register allocator. +-- +-- See Note [Keeping track of the current block] for information why we need +-- to take/return a block id. + +genCCall + :: DynFlags + -> Bool -- 32 bit platform? + -> ForeignTarget -- function to call + -> [CmmFormal] -- where to put the result + -> [CmmActual] -- arguments (of mixed type) + -> BlockId -- The block we are in + -> NatM (InstrBlock, Maybe BlockId) + +-- First we deal with cases which might introduce new blocks in the stream. + +genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop)) + [dst] [addr, n] bid = do + Amode amode addr_code <- + if amop `elem` [AMO_Add, AMO_Sub] + then getAmode addr + else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg + arg <- getNewRegNat format + arg_code <- getAnyReg n + let platform = targetPlatform dflags + dst_r = getRegisterReg platform (CmmLocal dst) + (code, lbl) <- op_code dst_r arg amode + return (addr_code `appOL` arg_code arg `appOL` code, Just lbl) + where + -- Code for the operation + op_code :: Reg -- Destination reg + -> Reg -- Register containing argument + -> AddrMode -- Address of location to mutate + -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId + op_code dst_r arg amode = case amop of + -- In the common case where dst_r is a virtual register the + -- final move should go away, because it's the last use of arg + -- and the first use of dst_r. + AMO_Add -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode)) + , MOV format (OpReg arg) (OpReg dst_r) + ], bid) + AMO_Sub -> return $ (toOL [ NEGI format (OpReg arg) + , LOCK (XADD format (OpReg arg) (OpAddr amode)) + , MOV format (OpReg arg) (OpReg dst_r) + ], bid) + -- In these cases we need a new block id, and have to return it so + -- that later instruction selection can reference it. + AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst) + AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst + , NOT format dst + ]) + AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst) + AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst) + where + -- Simulate operation that lacks a dedicated instruction using + -- cmpxchg. 
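+        -- The loop generated below is essentially:
+        --          %eax := *addr
+        --   retry:
+        --          dst  := %eax                -- remember the old value
+        --          tmp  := %eax `op` arg
+        --          lock cmpxchg tmp, *addr     -- store tmp if *addr still
+        --                                      -- equals %eax, else reload %eax
+        --          jne retry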
+ cmpxchg_code :: (Operand -> Operand -> OrdList Instr) + -> NatM (OrdList Instr, BlockId) + cmpxchg_code instrs = do + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + tmp <- getNewRegNat format + + --Record inserted blocks + -- We turn A -> B into A -> A' -> A'' -> B + -- with a self loop on A'. + addImmediateSuccessorNat bid lbl1 + addImmediateSuccessorNat lbl1 lbl2 + updateCfgNat (addWeightEdge lbl1 lbl1 0) + + return $ (toOL + [ MOV format (OpAddr amode) (OpReg eax) + , JXX ALWAYS lbl1 + , NEWBLOCK lbl1 + -- Keep old value so we can return it: + , MOV format (OpReg eax) (OpReg dst_r) + , MOV format (OpReg eax) (OpReg tmp) + ] + `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL + [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode)) + , JXX NE lbl1 + -- See Note [Introducing cfg edges inside basic blocks] + -- why this basic block is required. + , JXX ALWAYS lbl2 + , NEWBLOCK lbl2 + ], + lbl2) + format = intFormat width + +genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid + | is32Bit, width == W64 = do + ChildCode64 vcode rlo <- iselExpr64 src + let rhi = getHiVRegFromLo rlo + dst_r = getRegisterReg platform (CmmLocal dst) + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + let format = if width == W8 then II16 else intFormat width + tmp_r <- getNewRegNat format + + -- New CFG Edges: + -- bid -> lbl2 + -- bid -> lbl1 -> lbl2 + -- We also changes edges originating at bid to start at lbl2 instead. + updateCfgNat (addWeightEdge bid lbl1 110 . + addWeightEdge lbl1 lbl2 110 . + addImmediateSuccessor bid lbl2) + + -- The following instruction sequence corresponds to the pseudo-code + -- + -- if (src) { + -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32); + -- } else { + -- dst = 64; + -- } + let !instrs = vcode `appOL` toOL + ([ MOV II32 (OpReg rhi) (OpReg tmp_r) + , OR II32 (OpReg rlo) (OpReg tmp_r) + , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r) + , JXX EQQ lbl2 + , JXX ALWAYS lbl1 + + , NEWBLOCK lbl1 + , BSF II32 (OpReg rhi) dst_r + , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r) + , BSF II32 (OpReg rlo) tmp_r + , CMOV NE II32 (OpReg tmp_r) dst_r + , JXX ALWAYS lbl2 + + , NEWBLOCK lbl2 + ]) + return (instrs, Just lbl2) + + | otherwise = do + code_src <- getAnyReg src + let dst_r = getRegisterReg platform (CmmLocal dst) + + if isBmi2Enabled dflags + then do + src_r <- getNewRegNat (intFormat width) + let instrs = appOL (code_src src_r) $ case width of + W8 -> toOL + [ OR II32 (OpImm (ImmInteger 0xFFFFFF00)) (OpReg src_r) + , TZCNT II32 (OpReg src_r) dst_r + ] + W16 -> toOL + [ TZCNT II16 (OpReg src_r) dst_r + , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) + ] + _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r + return (instrs, Nothing) + else do + -- The following insn sequence makes sure 'ctz 0' has a defined value. + -- starting with Haswell, one could use the TZCNT insn instead. 
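+          -- (BSF leaves its destination undefined when the source is zero, so
+          -- dst_r is loaded with the bit width and the CMOV below only
+          -- replaces it with the BSF result when the source was non-zero.)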
+ let format = if width == W8 then II16 else intFormat width + src_r <- getNewRegNat format + tmp_r <- getNewRegNat format + let !instrs = code_src src_r `appOL` toOL + ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ + [ BSF format (OpReg src_r) tmp_r + , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r) + , CMOV NE format (OpReg tmp_r) dst_r + ]) -- NB: We don't need to zero-extend the result for the + -- W8/W16 cases because the 'MOV' insn already + -- took care of implicitly clearing the upper bits + return (instrs, Nothing) + where + bw = widthInBits width + platform = targetPlatform dflags + +genCCall dflags bits mop dst args bid = do + instr <- genCCall' dflags bits mop dst args bid + return (instr, Nothing) + +-- genCCall' handles cases not introducing new code blocks. +genCCall' + :: DynFlags + -> Bool -- 32 bit platform? + -> ForeignTarget -- function to call + -> [CmmFormal] -- where to put the result + -> [CmmActual] -- arguments (of mixed type) + -> BlockId -- The block we are in + -> NatM InstrBlock + +-- Unroll memcpy calls if the number of bytes to copy isn't too +-- large. Otherwise, call C's memcpy. +genCCall' dflags _ (PrimTarget (MO_Memcpy align)) _ + [dst, src, CmmLit (CmmInt n _)] _ + | fromInteger insns <= maxInlineMemcpyInsns dflags = do + code_dst <- getAnyReg dst + dst_r <- getNewRegNat format + code_src <- getAnyReg src + src_r <- getNewRegNat format + tmp_r <- getNewRegNat format + return $ code_dst dst_r `appOL` code_src src_r `appOL` + go dst_r src_r tmp_r (fromInteger n) + where + -- The number of instructions we will generate (approx). We need 2 + -- instructions per move. + insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes) + + maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported + effectiveAlignment = min (alignmentOf align) maxAlignment + format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment + + -- The size of each move, in bytes. + sizeBytes :: Integer + sizeBytes = fromIntegral (formatInBytes format) + + go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr + go dst src tmp i + | i >= sizeBytes = + unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL` + unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL` + go dst src tmp (i - sizeBytes) + -- Deal with remaining bytes. 
+ | i >= 4 = -- Will never happen on 32-bit + unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL` + unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL` + go dst src tmp (i - 4) + | i >= 2 = + unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL` + unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL` + go dst src tmp (i - 2) + | i >= 1 = + unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL` + unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL` + go dst src tmp (i - 1) + | otherwise = nilOL + where + src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone + (ImmInteger (n - i)) + dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone + (ImmInteger (n - i)) + +genCCall' dflags _ (PrimTarget (MO_Memset align)) _ + [dst, + CmmLit (CmmInt c _), + CmmLit (CmmInt n _)] + _ + | fromInteger insns <= maxInlineMemsetInsns dflags = do + code_dst <- getAnyReg dst + dst_r <- getNewRegNat format + if format == II64 && n >= 8 then do + code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64)) + imm8byte_r <- getNewRegNat II64 + return $ code_dst dst_r `appOL` + code_imm8byte imm8byte_r `appOL` + go8 dst_r imm8byte_r (fromInteger n) + else + return $ code_dst dst_r `appOL` + go4 dst_r (fromInteger n) + where + maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported + effectiveAlignment = min (alignmentOf align) maxAlignment + format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment + c2 = c `shiftL` 8 .|. c + c4 = c2 `shiftL` 16 .|. c2 + c8 = c4 `shiftL` 32 .|. c4 + + -- The number of instructions we will generate (approx). We need 1 + -- instructions per move. + insns = (n + sizeBytes - 1) `div` sizeBytes + + -- The size of each move, in bytes. + sizeBytes :: Integer + sizeBytes = fromIntegral (formatInBytes format) + + -- Depending on size returns the widest MOV instruction and its + -- width. + gen4 :: AddrMode -> Integer -> (InstrBlock, Integer) + gen4 addr size + | size >= 4 = + (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4) + | size >= 2 = + (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2) + | size >= 1 = + (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1) + | otherwise = (nilOL, 0) + + -- Generates a 64-bit wide MOV instruction from REG to MEM. + gen8 :: AddrMode -> Reg -> InstrBlock + gen8 addr reg8byte = + unitOL (MOV format (OpReg reg8byte) (OpAddr addr)) + + -- Unrolls memset when the widest MOV is <= 4 bytes. + go4 :: Reg -> Integer -> InstrBlock + go4 dst left = + if left <= 0 then nilOL + else curMov `appOL` go4 dst (left - curWidth) + where + possibleWidth = minimum [left, sizeBytes] + dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left)) + (curMov, curWidth) = gen4 dst_addr possibleWidth + + -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg + -- argument). Falls back to go4 when all 8 byte moves are + -- exhausted. + go8 :: Reg -> Reg -> Integer -> InstrBlock + go8 dst reg8byte left = + if possibleWidth >= 8 then + let curMov = gen8 dst_addr reg8byte + in curMov `appOL` go8 dst reg8byte (left - 8) + else go4 dst left + where + possibleWidth = minimum [left, sizeBytes] + dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left)) + +genCCall' _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL +genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL + -- barriers compile to no code on x86/x86-64; + -- we keep it this long in order to prevent earlier optimisations. 
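+        -- (x86's TSO memory model already rules out load/load and store/store
+        -- reordering, so read and write barriers only have to act as compiler
+        -- barriers here.)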
+ +genCCall' _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL + +genCCall' _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ = + case n of + 0 -> genPrefetch src $ PREFETCH NTA format + 1 -> genPrefetch src $ PREFETCH Lvl2 format + 2 -> genPrefetch src $ PREFETCH Lvl1 format + 3 -> genPrefetch src $ PREFETCH Lvl0 format + l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l) + -- the c / llvm prefetch convention is 0, 1, 2, and 3 + -- the x86 corresponding names are : NTA, 2 , 1, and 0 + where + format = archWordFormat is32bit + -- need to know what register width for pointers! + genPrefetch inRegSrc prefetchCTor = + do + code_src <- getAnyReg inRegSrc + src_r <- getNewRegNat format + return $ code_src src_r `appOL` + (unitOL (prefetchCTor (OpAddr + ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) )) + -- prefetch always takes an address + +genCCall' dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do + let platform = targetPlatform dflags + let dst_r = getRegisterReg platform (CmmLocal dst) + case width of + W64 | is32Bit -> do + ChildCode64 vcode rlo <- iselExpr64 src + let dst_rhi = getHiVRegFromLo dst_r + rhi = getHiVRegFromLo rlo + return $ vcode `appOL` + toOL [ MOV II32 (OpReg rlo) (OpReg dst_rhi), + MOV II32 (OpReg rhi) (OpReg dst_r), + BSWAP II32 dst_rhi, + BSWAP II32 dst_r ] + W16 -> do code_src <- getAnyReg src + return $ code_src dst_r `appOL` + unitOL (BSWAP II32 dst_r) `appOL` + unitOL (SHR II32 (OpImm $ ImmInt 16) (OpReg dst_r)) + _ -> do code_src <- getAnyReg src + return $ code_src dst_r `appOL` unitOL (BSWAP format dst_r) + where + format = intFormat width + +genCCall' dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] + args@[src] bid = do + sse4_2 <- sse4_2Enabled + let platform = targetPlatform dflags + if sse4_2 + then do code_src <- getAnyReg src + src_r <- getNewRegNat format + let dst_r = getRegisterReg platform (CmmLocal dst) + return $ code_src src_r `appOL` + (if width == W8 then + -- The POPCNT instruction doesn't take a r/m8 + unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL` + unitOL (POPCNT II16 (OpReg src_r) dst_r) + else + unitOL (POPCNT format (OpReg src_r) dst_r)) `appOL` + (if width == W8 || width == W16 then + -- We used a 16-bit destination register above, + -- so zero-extend + unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r)) + else nilOL) + else do + targetExpr <- cmmMakeDynamicReference dflags + CallReference lbl + let target = ForeignTarget targetExpr (ForeignConvention CCallConv + [NoHint] [NoHint] + CmmMayReturn) + genCCall' dflags is32Bit target dest_regs args bid + where + format = intFormat width + lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width)) + +genCCall' dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst] + args@[src, mask] bid = do + let platform = targetPlatform dflags + if isBmi2Enabled dflags + then do code_src <- getAnyReg src + code_mask <- getAnyReg mask + src_r <- getNewRegNat format + mask_r <- getNewRegNat format + let dst_r = getRegisterReg platform (CmmLocal dst) + return $ code_src src_r `appOL` code_mask mask_r `appOL` + (if width == W8 then + -- The PDEP instruction doesn't take a r/m8 + unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL` + unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL` + unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r) + else + unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL` + (if width == W8 || width == W16 then + -- We used a 16-bit destination register above, + -- so 
zero-extend + unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r)) + else nilOL) + else do + targetExpr <- cmmMakeDynamicReference dflags + CallReference lbl + let target = ForeignTarget targetExpr (ForeignConvention CCallConv + [NoHint] [NoHint] + CmmMayReturn) + genCCall' dflags is32Bit target dest_regs args bid + where + format = intFormat width + lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width)) + +genCCall' dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst] + args@[src, mask] bid = do + let platform = targetPlatform dflags + if isBmi2Enabled dflags + then do code_src <- getAnyReg src + code_mask <- getAnyReg mask + src_r <- getNewRegNat format + mask_r <- getNewRegNat format + let dst_r = getRegisterReg platform (CmmLocal dst) + return $ code_src src_r `appOL` code_mask mask_r `appOL` + (if width == W8 then + -- The PEXT instruction doesn't take a r/m8 + unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL` + unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL` + unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r) + else + unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL` + (if width == W8 || width == W16 then + -- We used a 16-bit destination register above, + -- so zero-extend + unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r)) + else nilOL) + else do + targetExpr <- cmmMakeDynamicReference dflags + CallReference lbl + let target = ForeignTarget targetExpr (ForeignConvention CCallConv + [NoHint] [NoHint] + CmmMayReturn) + genCCall' dflags is32Bit target dest_regs args bid + where + format = intFormat width + lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width)) + +genCCall' dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid + | is32Bit && width == W64 = do + -- Fallback to `hs_clz64` on i386 + targetExpr <- cmmMakeDynamicReference dflags CallReference lbl + let target = ForeignTarget targetExpr (ForeignConvention CCallConv + [NoHint] [NoHint] + CmmMayReturn) + genCCall' dflags is32Bit target dest_regs args bid + + | otherwise = do + code_src <- getAnyReg src + let dst_r = getRegisterReg platform (CmmLocal dst) + if isBmi2Enabled dflags + then do + src_r <- getNewRegNat (intFormat width) + return $ appOL (code_src src_r) $ case width of + W8 -> toOL + [ MOVZxL II8 (OpReg src_r) (OpReg src_r) -- zero-extend to 32 bit + , LZCNT II32 (OpReg src_r) dst_r -- lzcnt with extra 24 zeros + , SUB II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros + ] + W16 -> toOL + [ LZCNT II16 (OpReg src_r) dst_r + , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit + ] + _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r) + else do + let format = if width == W8 then II16 else intFormat width + src_r <- getNewRegNat format + tmp_r <- getNewRegNat format + return $ code_src src_r `appOL` toOL + ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ + [ BSR format (OpReg src_r) tmp_r + , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r) + , CMOV NE format (OpReg tmp_r) dst_r + , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r) + ]) -- NB: We don't need to zero-extend the result for the + -- W8/W16 cases because the 'MOV' insn already + -- took care of implicitly clearing the upper bits + where + bw = widthInBits width + platform = targetPlatform dflags + lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width)) + +genCCall' dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do + targetExpr <- cmmMakeDynamicReference dflags + CallReference lbl + let target = ForeignTarget targetExpr 
(ForeignConvention CCallConv + [NoHint] [NoHint] + CmmMayReturn) + genCCall' dflags is32Bit target dest_regs args bid + where + lbl = mkCmmCodeLabel primUnitId (fsLit (word2FloatLabel width)) + +genCCall' dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do + load_code <- intLoadCode (MOV (intFormat width)) addr + let platform = targetPlatform dflags + + return (load_code (getRegisterReg platform (CmmLocal dst))) + +genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do + code <- assignMem_IntCode (intFormat width) addr val + return $ code `snocOL` MFENCE + +genCCall' dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do + -- On x86 we don't have enough registers to use cmpxchg with a + -- complicated addressing mode, so on that architecture we + -- pre-compute the address first. + Amode amode addr_code <- getSimpleAmode dflags is32Bit addr + newval <- getNewRegNat format + newval_code <- getAnyReg new + oldval <- getNewRegNat format + oldval_code <- getAnyReg old + let platform = targetPlatform dflags + dst_r = getRegisterReg platform (CmmLocal dst) + code = toOL + [ MOV format (OpReg oldval) (OpReg eax) + , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode)) + , MOV format (OpReg eax) (OpReg dst_r) + ] + return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval + `appOL` code + where + format = intFormat width + +genCCall' _ is32Bit target dest_regs args bid = do + dflags <- getDynFlags + let platform = targetPlatform dflags + case (target, dest_regs) of + -- void return type prim op + (PrimTarget op, []) -> + outOfLineCmmOp bid op Nothing args + -- we only cope with a single result for foreign calls + (PrimTarget op, [r]) -> case op of + MO_F32_Fabs -> case args of + [x] -> sse2FabsCode W32 x + _ -> panic "genCCall: Wrong number of arguments for fabs" + MO_F64_Fabs -> case args of + [x] -> sse2FabsCode W64 x + _ -> panic "genCCall: Wrong number of arguments for fabs" + + MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args + MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args + _other_op -> outOfLineCmmOp bid op (Just r) args + + where + actuallyInlineSSE2Op = actuallyInlineFloatOp' + + actuallyInlineFloatOp' instr format [x] + = do res <- trivialUFCode format (instr format) x + any <- anyReg res + return (any (getRegisterReg platform (CmmLocal r))) + + actuallyInlineFloatOp' _ _ args + = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! 
(" + ++ show (length args) ++ ")" + + sse2FabsCode :: Width -> CmmExpr -> NatM InstrBlock + sse2FabsCode w x = do + let fmt = floatFormat w + x_code <- getAnyReg x + let + const | FF32 <- fmt = CmmInt 0x7fffffff W32 + | otherwise = CmmInt 0x7fffffffffffffff W64 + Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const + tmp <- getNewRegNat fmt + let + code dst = x_code dst `appOL` amode_code `appOL` toOL [ + MOV fmt (OpAddr amode) (OpReg tmp), + AND fmt (OpReg tmp) (OpReg dst) + ] + + return $ code (getRegisterReg platform (CmmLocal r)) + + (PrimTarget (MO_S_QuotRem width), _) -> divOp1 platform True width dest_regs args + (PrimTarget (MO_U_QuotRem width), _) -> divOp1 platform False width dest_regs args + (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args + (PrimTarget (MO_Add2 width), [res_h, res_l]) -> + case args of + [arg_x, arg_y] -> + do hCode <- getAnyReg (CmmLit (CmmInt 0 width)) + let format = intFormat width + lCode <- anyReg =<< trivialCode width (ADD_CC format) + (Just (ADD_CC format)) arg_x arg_y + let reg_l = getRegisterReg platform (CmmLocal res_l) + reg_h = getRegisterReg platform (CmmLocal res_h) + code = hCode reg_h `appOL` + lCode reg_l `snocOL` + ADC format (OpImm (ImmInteger 0)) (OpReg reg_h) + return code + _ -> panic "genCCall: Wrong number of arguments/results for add2" + (PrimTarget (MO_AddWordC width), [res_r, res_c]) -> + addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args + (PrimTarget (MO_SubWordC width), [res_r, res_c]) -> + addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args + (PrimTarget (MO_AddIntC width), [res_r, res_c]) -> + addSubIntC platform ADD_CC (Just . ADD_CC) OFLO width res_r res_c args + (PrimTarget (MO_SubIntC width), [res_r, res_c]) -> + addSubIntC platform SUB_CC (const Nothing) OFLO width res_r res_c args + (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) -> + case args of + [arg_x, arg_y] -> + do (y_reg, y_code) <- getRegOrMem arg_y + x_code <- getAnyReg arg_x + let format = intFormat width + reg_h = getRegisterReg platform (CmmLocal res_h) + reg_l = getRegisterReg platform (CmmLocal res_l) + code = y_code `appOL` + x_code rax `appOL` + toOL [MUL2 format y_reg, + MOV format (OpReg rdx) (OpReg reg_h), + MOV format (OpReg rax) (OpReg reg_l)] + return code + _ -> panic "genCCall: Wrong number of arguments/results for mul2" + (PrimTarget (MO_S_Mul2 width), [res_c, res_h, res_l]) -> + case args of + [arg_x, arg_y] -> + do (y_reg, y_code) <- getRegOrMem arg_y + x_code <- getAnyReg arg_x + reg_tmp <- getNewRegNat II8 + let format = intFormat width + reg_h = getRegisterReg platform (CmmLocal res_h) + reg_l = getRegisterReg platform (CmmLocal res_l) + reg_c = getRegisterReg platform (CmmLocal res_c) + code = y_code `appOL` + x_code rax `appOL` + toOL [ IMUL2 format y_reg + , MOV format (OpReg rdx) (OpReg reg_h) + , MOV format (OpReg rax) (OpReg reg_l) + , SETCC CARRY (OpReg reg_tmp) + , MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c) + ] + return code + _ -> panic "genCCall: Wrong number of arguments/results for imul2" + + _ -> if is32Bit + then genCCall32' dflags target dest_regs args + else genCCall64' dflags target dest_regs args + + where divOp1 platform signed width results [arg_x, arg_y] + = divOp platform signed width results Nothing arg_x arg_y + divOp1 _ _ _ _ _ + = panic "genCCall: Wrong number of arguments for divOp1" + divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y] + = divOp platform signed width results (Just arg_x_high) arg_x_low 
arg_y + divOp2 _ _ _ _ _ + = panic "genCCall: Wrong number of arguments for divOp2" + + -- See Note [DIV/IDIV for bytes] + divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y = + let widen | signed = MO_SS_Conv W8 W16 + | otherwise = MO_UU_Conv W8 W16 + arg_x_low_16 = CmmMachOp widen [arg_x_low] + arg_y_16 = CmmMachOp widen [arg_y] + m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high + in divOp + platform signed W16 [res_q, res_r] + m_arg_x_high_16 arg_x_low_16 arg_y_16 + + divOp platform signed width [res_q, res_r] + m_arg_x_high arg_x_low arg_y + = do let format = intFormat width + reg_q = getRegisterReg platform (CmmLocal res_q) + reg_r = getRegisterReg platform (CmmLocal res_r) + widen | signed = CLTD format + | otherwise = XOR format (OpReg rdx) (OpReg rdx) + instr | signed = IDIV + | otherwise = DIV + (y_reg, y_code) <- getRegOrMem arg_y + x_low_code <- getAnyReg arg_x_low + x_high_code <- case m_arg_x_high of + Just arg_x_high -> + getAnyReg arg_x_high + Nothing -> + return $ const $ unitOL widen + return $ y_code `appOL` + x_low_code rax `appOL` + x_high_code rdx `appOL` + toOL [instr format y_reg, + MOV format (OpReg rax) (OpReg reg_q), + MOV format (OpReg rdx) (OpReg reg_r)] + divOp _ _ _ _ _ _ _ + = panic "genCCall: Wrong number of results for divOp" + + addSubIntC platform instr mrevinstr cond width + res_r res_c [arg_x, arg_y] + = do let format = intFormat width + rCode <- anyReg =<< trivialCode width (instr format) + (mrevinstr format) arg_x arg_y + reg_tmp <- getNewRegNat II8 + let reg_c = getRegisterReg platform (CmmLocal res_c) + reg_r = getRegisterReg platform (CmmLocal res_r) + code = rCode reg_r `snocOL` + SETCC cond (OpReg reg_tmp) `snocOL` + MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c) + + return code + addSubIntC _ _ _ _ _ _ _ _ + = panic "genCCall: Wrong number of arguments/results for addSubIntC" + +-- Note [DIV/IDIV for bytes] +-- +-- IDIV reminder: +-- Size Dividend Divisor Quotient Remainder +-- byte %ax r/m8 %al %ah +-- word %dx:%ax r/m16 %ax %dx +-- dword %edx:%eax r/m32 %eax %edx +-- qword %rdx:%rax r/m64 %rax %rdx +-- +-- We do a special case for the byte division because the current +-- codegen doesn't deal well with accessing %ah register (also, +-- accessing %ah in 64-bit mode is complicated because it cannot be an +-- operand of many instructions). So we just widen operands to 16 bits +-- and get the results from %al, %dl. This is not optimal, but a few +-- register moves are probably not a huge deal when doing division. + +genCCall32' :: DynFlags + -> ForeignTarget -- function to call + -> [CmmFormal] -- where to put the result + -> [CmmActual] -- arguments (of mixed type) + -> NatM InstrBlock +genCCall32' dflags target dest_regs args = do + let + prom_args = map (maybePromoteCArg dflags W32) args + + -- Align stack to 16n for calls, assuming a starting stack + -- alignment of 16n - word_size on procedure entry. Which we + -- maintiain. See Note [rts/StgCRun.c : Stack Alignment on X86] + sizes = map (arg_size_bytes . 
cmmExprType dflags) (reverse args) + raw_arg_size = sum sizes + wORD_SIZE dflags + arg_pad_size = (roundTo 16 $ raw_arg_size) - raw_arg_size + tot_arg_size = raw_arg_size + arg_pad_size - wORD_SIZE dflags + delta0 <- getDeltaNat + setDeltaNat (delta0 - arg_pad_size) + + push_codes <- mapM push_arg (reverse prom_args) + delta <- getDeltaNat + MASSERT(delta == delta0 - tot_arg_size) + + -- deal with static vs dynamic call targets + (callinsns,cconv) <- + case target of + ForeignTarget (CmmLit (CmmLabel lbl)) conv + -> -- ToDo: stdcall arg sizes + return (unitOL (CALL (Left fn_imm) []), conv) + where fn_imm = ImmCLbl lbl + ForeignTarget expr conv + -> do { (dyn_r, dyn_c) <- getSomeReg expr + ; ASSERT( isWord32 (cmmExprType dflags expr) ) + return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) } + PrimTarget _ + -> panic $ "genCCall: Can't handle PrimTarget call type here, error " + ++ "probably because too many return values." + + let push_code + | arg_pad_size /= 0 + = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp), + DELTA (delta0 - arg_pad_size)] + `appOL` concatOL push_codes + | otherwise + = concatOL push_codes + + -- Deallocate parameters after call for ccall; + -- but not for stdcall (callee does it) + -- + -- We have to pop any stack padding we added + -- even if we are doing stdcall, though (#5052) + pop_size + | ForeignConvention StdCallConv _ _ _ <- cconv = arg_pad_size + | otherwise = tot_arg_size + + call = callinsns `appOL` + toOL ( + (if pop_size==0 then [] else + [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)]) + ++ + [DELTA delta0] + ) + setDeltaNat delta0 + + dflags <- getDynFlags + let platform = targetPlatform dflags + + let + -- assign the results, if necessary + assign_code [] = nilOL + assign_code [dest] + | isFloatType ty = + -- we assume SSE2 + let tmp_amode = AddrBaseIndex (EABaseReg esp) + EAIndexNone + (ImmInt 0) + fmt = floatFormat w + in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp), + DELTA (delta0 - b), + X87Store fmt tmp_amode, + -- X87Store only supported for the CDECL ABI + -- NB: This code will need to be + -- revisted once GHC does more work around + -- SIGFPE f + MOV fmt (OpAddr tmp_amode) (OpReg r_dest), + ADD II32 (OpImm (ImmInt b)) (OpReg esp), + DELTA delta0] + | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest), + MOV II32 (OpReg edx) (OpReg r_dest_hi)] + | otherwise = unitOL (MOV (intFormat w) + (OpReg eax) + (OpReg r_dest)) + where + ty = localRegType dest + w = typeWidth ty + b = widthInBytes w + r_dest_hi = getHiVRegFromLo r_dest + r_dest = getRegisterReg platform (CmmLocal dest) + assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many) + + return (push_code `appOL` + call `appOL` + assign_code dest_regs) + + where + -- If the size is smaller than the word, we widen things (see maybePromoteCArg) + arg_size_bytes :: CmmType -> Int + arg_size_bytes ty = max (widthInBytes (typeWidth ty)) (widthInBytes (wordWidth dflags)) + + roundTo a x | x `mod` a == 0 = x + | otherwise = x + a - (x `mod` a) + + push_arg :: CmmActual {-current argument-} + -> NatM InstrBlock -- code + + push_arg arg -- we don't need the hints on x86 + | isWord64 arg_ty = do + ChildCode64 code r_lo <- iselExpr64 arg + delta <- getDeltaNat + setDeltaNat (delta - 8) + let r_hi = getHiVRegFromLo r_lo + return ( code `appOL` + toOL [PUSH II32 (OpReg r_hi), DELTA (delta - 4), + PUSH II32 (OpReg r_lo), DELTA (delta - 8), + DELTA (delta-8)] + ) + + | isFloatType arg_ty = do + (reg, code) <- getSomeReg arg + delta <- getDeltaNat + setDeltaNat 
(delta-size) + return (code `appOL` + toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp), + DELTA (delta-size), + let addr = AddrBaseIndex (EABaseReg esp) + EAIndexNone + (ImmInt 0) + format = floatFormat (typeWidth arg_ty) + in + + -- assume SSE2 + MOV format (OpReg reg) (OpAddr addr) + + ] + ) + + | otherwise = do + -- Arguments can be smaller than 32-bit, but we still use @PUSH + -- II32@ - the usual calling conventions expect integers to be + -- 4-byte aligned. + ASSERT((typeWidth arg_ty) <= W32) return () + (operand, code) <- getOperand arg + delta <- getDeltaNat + setDeltaNat (delta-size) + return (code `snocOL` + PUSH II32 operand `snocOL` + DELTA (delta-size)) + + where + arg_ty = cmmExprType dflags arg + size = arg_size_bytes arg_ty -- Byte size + +genCCall64' :: DynFlags + -> ForeignTarget -- function to call + -> [CmmFormal] -- where to put the result + -> [CmmActual] -- arguments (of mixed type) + -> NatM InstrBlock +genCCall64' dflags target dest_regs args = do + -- load up the register arguments + let prom_args = map (maybePromoteCArg dflags W32) args + + (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code) + <- + if platformOS platform == OSMinGW32 + then load_args_win prom_args [] [] (allArgRegs platform) nilOL + else do + (stack_args, aregs, fregs, load_args_code, assign_args_code) + <- load_args prom_args (allIntArgRegs platform) + (allFPArgRegs platform) + nilOL nilOL + let used_regs rs as = reverse (drop (length rs) (reverse as)) + fregs_used = used_regs fregs (allFPArgRegs platform) + aregs_used = used_regs aregs (allIntArgRegs platform) + return (stack_args, aregs_used, fregs_used, load_args_code + , assign_args_code) + + let + arg_regs_used = int_regs_used ++ fp_regs_used + arg_regs = [eax] ++ arg_regs_used + -- for annotating the call instruction with + sse_regs = length fp_regs_used + arg_stack_slots = if platformOS platform == OSMinGW32 + then length stack_args + length (allArgRegs platform) + else length stack_args + tot_arg_size = arg_size * arg_stack_slots + + + -- Align stack to 16n for calls, assuming a starting stack + -- alignment of 16n - word_size on procedure entry. Which we + -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86] + (real_size, adjust_rsp) <- + if (tot_arg_size + wORD_SIZE dflags) `rem` 16 == 0 + then return (tot_arg_size, nilOL) + else do -- we need to adjust... + delta <- getDeltaNat + setDeltaNat (delta - wORD_SIZE dflags) + return (tot_arg_size + wORD_SIZE dflags, toOL [ + SUB II64 (OpImm (ImmInt (wORD_SIZE dflags))) (OpReg rsp), + DELTA (delta - wORD_SIZE dflags) ]) + + -- push the stack args, right to left + push_code <- push_args (reverse stack_args) nilOL + -- On Win64, we also have to leave stack space for the arguments + -- that we are passing in registers + lss_code <- if platformOS platform == OSMinGW32 + then leaveStackSpace (length (allArgRegs platform)) + else return nilOL + delta <- getDeltaNat + + -- deal with static vs dynamic call targets + (callinsns,_cconv) <- + case target of + ForeignTarget (CmmLit (CmmLabel lbl)) conv + -> -- ToDo: stdcall arg sizes + return (unitOL (CALL (Left fn_imm) arg_regs), conv) + where fn_imm = ImmCLbl lbl + ForeignTarget expr conv + -> do (dyn_r, dyn_c) <- getSomeReg expr + return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv) + PrimTarget _ + -> panic $ "genCCall: Can't handle PrimTarget call type here, error " + ++ "probably because too many return values." 
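{-
A worked example of the alignment arithmetic above (the sizes are
illustrative, not taken from any particular call site). On x86_64 we
assume the stack sits at 16n - word_size on procedure entry, so it is
back at a multiple of 16 right before our own CALL exactly when
(tot_arg_size + wORD_SIZE dflags) is a multiple of 16:

    tot_arg_size = 24    -- three stack-passed quad words
    (24 + 8) `rem` 16 == 0      -- already aligned, adjust_rsp = nilOL

    tot_arg_size = 16    -- two stack-passed quad words
    (16 + 8) `rem` 16 == 8      -- misaligned: emit SUB $8, %rsp first,
                                -- and real_size becomes 16 + 8 = 24
-}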
+ + let + -- The x86_64 ABI requires us to set %al to the number of SSE2 + -- registers that contain arguments, if the called routine + -- is a varargs function. We don't know whether it's a + -- varargs function or not, so we have to assume it is. + -- + -- It's not safe to omit this assignment, even if the number + -- of SSE2 regs in use is zero. If %al is larger than 8 + -- on entry to a varargs function, seg faults ensue. + assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax)) + + let call = callinsns `appOL` + toOL ( + -- Deallocate parameters after call for ccall; + -- stdcall has callee do it, but is not supported on + -- x86_64 target (see #3336) + (if real_size==0 then [] else + [ADD (intFormat (wordWidth dflags)) (OpImm (ImmInt real_size)) (OpReg esp)]) + ++ + [DELTA (delta + real_size)] + ) + setDeltaNat (delta + real_size) + + let + -- assign the results, if necessary + assign_code [] = nilOL + assign_code [dest] = + case typeWidth rep of + W32 | isFloatType rep -> unitOL (MOV (floatFormat W32) + (OpReg xmm0) + (OpReg r_dest)) + W64 | isFloatType rep -> unitOL (MOV (floatFormat W64) + (OpReg xmm0) + (OpReg r_dest)) + _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest)) + where + rep = localRegType dest + r_dest = getRegisterReg platform (CmmLocal dest) + assign_code _many = panic "genCCall.assign_code many" + + return (adjust_rsp `appOL` + push_code `appOL` + load_args_code `appOL` + assign_args_code `appOL` + lss_code `appOL` + assign_eax sse_regs `appOL` + call `appOL` + assign_code dest_regs) + + where platform = targetPlatform dflags + arg_size = 8 -- always, at the mo + + + load_args :: [CmmExpr] + -> [Reg] -- int regs avail for args + -> [Reg] -- FP regs avail for args + -> InstrBlock -- code computing args + -> InstrBlock -- code assigning args to ABI regs + -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock) + -- no more regs to use + load_args args [] [] code acode = + return (args, [], [], code, acode) + + -- no more args to push + load_args [] aregs fregs code acode = + return ([], aregs, fregs, code, acode) + + load_args (arg : rest) aregs fregs code acode + | isFloatType arg_rep = case fregs of + [] -> push_this_arg + (r:rs) -> do + (code',acode') <- reg_this_arg r + load_args rest aregs rs code' acode' + | otherwise = case aregs of + [] -> push_this_arg + (r:rs) -> do + (code',acode') <- reg_this_arg r + load_args rest rs fregs code' acode' + where + + -- put arg into the list of stack pushed args + push_this_arg = do + (args',ars,frs,code',acode') + <- load_args rest aregs fregs code acode + return (arg:args', ars, frs, code', acode') + + -- pass the arg into the given register + reg_this_arg r + -- "operand" args can be directly assigned into r + | isOperand False arg = do + arg_code <- getAnyReg arg + return (code, (acode `appOL` arg_code r)) + -- The last non-operand arg can be directly assigned after its + -- computation without going into a temporary register + | all (isOperand False) rest = do + arg_code <- getAnyReg arg + return (code `appOL` arg_code r,acode) + + -- other args need to be computed beforehand to avoid clobbering + -- previously assigned registers used to pass parameters (see + -- #11792, #12614). They are assigned into temporary registers + -- and get assigned to proper call ABI registers after they all + -- have been computed. 
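{-
A sketch of what the code/acode split buys us (the register names are
the usual System V ones and purely illustrative): for a call f e1 e2
where neither argument is a simple operand, the load_args cases around
this point build

    code  :  <compute e1 into t1> ; <compute e2 into t2>
    acode :  move t1 into %rdi    ; move t2 into %rsi

so all the real work happens in fresh temporaries first, and the ABI
argument registers are written only at the very end, once nothing can
clobber them any more (cf. #11792, #12614).
-}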
+ | otherwise = do + arg_code <- getAnyReg arg + tmp <- getNewRegNat arg_fmt + let + code' = code `appOL` arg_code tmp + acode' = acode `snocOL` reg2reg arg_fmt tmp r + return (code',acode') + + arg_rep = cmmExprType dflags arg + arg_fmt = cmmTypeFormat arg_rep + + load_args_win :: [CmmExpr] + -> [Reg] -- used int regs + -> [Reg] -- used FP regs + -> [(Reg, Reg)] -- (int, FP) regs avail for args + -> InstrBlock + -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock) + load_args_win args usedInt usedFP [] code + = return (args, usedInt, usedFP, code, nilOL) + -- no more regs to use + load_args_win [] usedInt usedFP _ code + = return ([], usedInt, usedFP, code, nilOL) + -- no more args to push + load_args_win (arg : rest) usedInt usedFP + ((ireg, freg) : regs) code + | isFloatType arg_rep = do + arg_code <- getAnyReg arg + load_args_win rest (ireg : usedInt) (freg : usedFP) regs + (code `appOL` + arg_code freg `snocOL` + -- If we are calling a varargs function + -- then we need to define ireg as well + -- as freg + MOV II64 (OpReg freg) (OpReg ireg)) + | otherwise = do + arg_code <- getAnyReg arg + load_args_win rest (ireg : usedInt) usedFP regs + (code `appOL` arg_code ireg) + where + arg_rep = cmmExprType dflags arg + + push_args [] code = return code + push_args (arg:rest) code + | isFloatType arg_rep = do + (arg_reg, arg_code) <- getSomeReg arg + delta <- getDeltaNat + setDeltaNat (delta-arg_size) + let code' = code `appOL` arg_code `appOL` toOL [ + SUB (intFormat (wordWidth dflags)) (OpImm (ImmInt arg_size)) (OpReg rsp), + DELTA (delta-arg_size), + MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel dflags 0))] + push_args rest code' + + | otherwise = do + -- Arguments can be smaller than 64-bit, but we still use @PUSH + -- II64@ - the usual calling conventions expect integers to be + -- 8-byte aligned. + ASSERT(width <= W64) return () + (arg_op, arg_code) <- getOperand arg + delta <- getDeltaNat + setDeltaNat (delta-arg_size) + let code' = code `appOL` arg_code `appOL` toOL [ + PUSH II64 arg_op, + DELTA (delta-arg_size)] + push_args rest code' + where + arg_rep = cmmExprType dflags arg + width = typeWidth arg_rep + + leaveStackSpace n = do + delta <- getDeltaNat + setDeltaNat (delta - n * arg_size) + return $ toOL [ + SUB II64 (OpImm (ImmInt (n * wORD_SIZE dflags))) (OpReg rsp), + DELTA (delta - n * arg_size)] + +maybePromoteCArg :: DynFlags -> Width -> CmmExpr -> CmmExpr +maybePromoteCArg dflags wto arg + | wfrom < wto = CmmMachOp (MO_UU_Conv wfrom wto) [arg] + | otherwise = arg + where + wfrom = cmmExprWidth dflags arg + +outOfLineCmmOp :: BlockId -> CallishMachOp -> Maybe CmmFormal -> [CmmActual] + -> NatM InstrBlock +outOfLineCmmOp bid mop res args + = do + dflags <- getDynFlags + targetExpr <- cmmMakeDynamicReference dflags CallReference lbl + let target = ForeignTarget targetExpr + (ForeignConvention CCallConv [] [] CmmMayReturn) + + -- We know foreign calls results in no new basic blocks, so we can ignore + -- the returned block id. + (instrs, _) <- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args) + return instrs + where + -- Assume we can call these functions directly, and that they're not in a dynamic library. + -- TODO: Why is this ok? Under linux this code will be in libm.so + -- Is it because they're really implemented as a primitive instruction by the assembler?? 
-- BL 2009/12/31 + lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction + + fn = case mop of + MO_F32_Sqrt -> fsLit "sqrtf" + MO_F32_Fabs -> fsLit "fabsf" + MO_F32_Sin -> fsLit "sinf" + MO_F32_Cos -> fsLit "cosf" + MO_F32_Tan -> fsLit "tanf" + MO_F32_Exp -> fsLit "expf" + MO_F32_ExpM1 -> fsLit "expm1f" + MO_F32_Log -> fsLit "logf" + MO_F32_Log1P -> fsLit "log1pf" + + MO_F32_Asin -> fsLit "asinf" + MO_F32_Acos -> fsLit "acosf" + MO_F32_Atan -> fsLit "atanf" + + MO_F32_Sinh -> fsLit "sinhf" + MO_F32_Cosh -> fsLit "coshf" + MO_F32_Tanh -> fsLit "tanhf" + MO_F32_Pwr -> fsLit "powf" + + MO_F32_Asinh -> fsLit "asinhf" + MO_F32_Acosh -> fsLit "acoshf" + MO_F32_Atanh -> fsLit "atanhf" + + MO_F64_Sqrt -> fsLit "sqrt" + MO_F64_Fabs -> fsLit "fabs" + MO_F64_Sin -> fsLit "sin" + MO_F64_Cos -> fsLit "cos" + MO_F64_Tan -> fsLit "tan" + MO_F64_Exp -> fsLit "exp" + MO_F64_ExpM1 -> fsLit "expm1" + MO_F64_Log -> fsLit "log" + MO_F64_Log1P -> fsLit "log1p" + + MO_F64_Asin -> fsLit "asin" + MO_F64_Acos -> fsLit "acos" + MO_F64_Atan -> fsLit "atan" + + MO_F64_Sinh -> fsLit "sinh" + MO_F64_Cosh -> fsLit "cosh" + MO_F64_Tanh -> fsLit "tanh" + MO_F64_Pwr -> fsLit "pow" + + MO_F64_Asinh -> fsLit "asinh" + MO_F64_Acosh -> fsLit "acosh" + MO_F64_Atanh -> fsLit "atanh" + + MO_Memcpy _ -> fsLit "memcpy" + MO_Memset _ -> fsLit "memset" + MO_Memmove _ -> fsLit "memmove" + MO_Memcmp _ -> fsLit "memcmp" + + MO_PopCnt _ -> fsLit "popcnt" + MO_BSwap _ -> fsLit "bswap" + {- Here the C implementation is used as there is no x86 + instruction to reverse a word's bit order. + -} + MO_BRev w -> fsLit $ bRevLabel w + MO_Clz w -> fsLit $ clzLabel w + MO_Ctz _ -> unsupported + + MO_Pdep w -> fsLit $ pdepLabel w + MO_Pext w -> fsLit $ pextLabel w + + MO_AtomicRMW _ _ -> fsLit "atomicrmw" + MO_AtomicRead _ -> fsLit "atomicread" + MO_AtomicWrite _ -> fsLit "atomicwrite" + MO_Cmpxchg _ -> fsLit "cmpxchg" + + MO_UF_Conv _ -> unsupported + + MO_S_Mul2 {} -> unsupported + MO_S_QuotRem {} -> unsupported + MO_U_QuotRem {} -> unsupported + MO_U_QuotRem2 {} -> unsupported + MO_Add2 {} -> unsupported + MO_AddIntC {} -> unsupported + MO_SubIntC {} -> unsupported + MO_AddWordC {} -> unsupported + MO_SubWordC {} -> unsupported + MO_U_Mul2 {} -> unsupported + MO_ReadBarrier -> unsupported + MO_WriteBarrier -> unsupported + MO_Touch -> unsupported + (MO_Prefetch_Data _ ) -> unsupported + unsupported = panic ("outOfLineCmmOp: " ++ show mop + ++ " not supported here") + +-- ----------------------------------------------------------------------------- +-- Generating a table-branch + +genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock + +genSwitch dflags expr targets + | positionIndependent dflags + = do + (reg,e_code) <- getNonClobberedReg (cmmOffset dflags expr offset) + -- getNonClobberedReg because it needs to survive across t_code + lbl <- getNewLabelNat + dflags <- getDynFlags + let is32bit = target32Bit (targetPlatform dflags) + os = platformOS (targetPlatform dflags) + -- Might want to use .rodata.<function we're in> instead, but as + -- long as it's something unique it'll work out since the + -- references to the jump table are in the appropriate section. + rosection = case os of + -- on Mac OS X/x86_64, put the jump table in the text section to + -- work around a limitation of the linker. + -- ld64 is unable to handle the relocations for + -- .quad L1 - L0 + -- if L0 is not preceded by a non-anonymous label in its section. 
+ OSDarwin | not is32bit -> Section Text lbl + _ -> Section ReadOnlyData lbl + dynRef <- cmmMakeDynamicReference dflags DataReference lbl + (tableReg,t_code) <- getSomeReg $ dynRef + let op = OpAddr (AddrBaseIndex (EABaseReg tableReg) + (EAIndex reg (wORD_SIZE dflags)) (ImmInt 0)) + + offsetReg <- getNewRegNat (intFormat (wordWidth dflags)) + return $ if is32bit || os == OSDarwin + then e_code `appOL` t_code `appOL` toOL [ + ADD (intFormat (wordWidth dflags)) op (OpReg tableReg), + JMP_TBL (OpReg tableReg) ids rosection lbl + ] + else -- HACK: On x86_64 binutils<2.17 is only able to generate + -- PC32 relocations, hence we only get 32-bit offsets in + -- the jump table. As these offsets are always negative + -- we need to properly sign extend them to 64-bit. This + -- hack should be removed in conjunction with the hack in + -- PprMach.hs/pprDataItem once binutils 2.17 is standard. + e_code `appOL` t_code `appOL` toOL [ + MOVSxL II32 op (OpReg offsetReg), + ADD (intFormat (wordWidth dflags)) + (OpReg offsetReg) + (OpReg tableReg), + JMP_TBL (OpReg tableReg) ids rosection lbl + ] + | otherwise + = do + (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset) + lbl <- getNewLabelNat + let op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg (wORD_SIZE dflags)) (ImmCLbl lbl)) + code = e_code `appOL` toOL [ + JMP_TBL op ids (Section ReadOnlyData lbl) lbl + ] + return code + where + (offset, blockIds) = switchTargetsToTable targets + ids = map (fmap DestBlockId) blockIds + +generateJumpTableForInstr :: DynFlags -> Instr -> Maybe (NatCmmDecl (Alignment, RawCmmStatics) Instr) +generateJumpTableForInstr dflags (JMP_TBL _ ids section lbl) + = let getBlockId (DestBlockId id) = id + getBlockId _ = panic "Non-Label target in Jump Table" + blockIds = map (fmap getBlockId) ids + in Just (createJumpTable dflags blockIds section lbl) +generateJumpTableForInstr _ _ = Nothing + +createJumpTable :: DynFlags -> [Maybe BlockId] -> Section -> CLabel + -> GenCmmDecl (Alignment, RawCmmStatics) h g +createJumpTable dflags ids section lbl + = let jumpTable + | positionIndependent dflags = + let ww = wordWidth dflags + jumpTableEntryRel Nothing + = CmmStaticLit (CmmInt 0 ww) + jumpTableEntryRel (Just blockid) + = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 ww) + where blockLabel = blockLbl blockid + in map jumpTableEntryRel ids + | otherwise = map (jumpTableEntry dflags) ids + in CmmData section (mkAlignment 1, RawCmmStatics lbl jumpTable) + +extractUnwindPoints :: [Instr] -> [UnwindPoint] +extractUnwindPoints instrs = + [ UnwindPoint lbl unwinds | UNWIND lbl unwinds <- instrs] + +-- ----------------------------------------------------------------------------- +-- 'condIntReg' and 'condFltReg': condition codes into registers + +-- Turn those condition codes into integers now (when they appear on +-- the right hand side of an assignment). +-- +-- (If applicable) Do not fill the delay slots here; you will confuse the +-- register allocator. + +condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register + +condIntReg cond x y = do + CondCode _ cond cond_code <- condIntCode cond x y + tmp <- getNewRegNat II8 + let + code dst = cond_code `appOL` toOL [ + SETCC cond (OpReg tmp), + MOVZxL II8 (OpReg tmp) (OpReg dst) + ] + return (Any II32 code) + + +----------------------------------------------------------- +--- Note [SSE Parity Checks] --- +----------------------------------------------------------- + +-- We have to worry about unordered operands (eg. comparisons +-- against NaN). 
If the operands are unordered, the comparison +-- sets the parity flag, carry flag and zero flag. +-- All comparisons are supposed to return false for unordered +-- operands except for !=, which returns true. +-- +-- Optimisation: we don't have to test the parity flag if we +-- know the test has already excluded the unordered case: eg > +-- and >= test for a zero carry flag, which can only occur for +-- ordered operands. +-- +-- By reversing comparisons we can avoid testing the parity +-- for < and <= as well. If any of the arguments is an NaN we +-- return false either way. If both arguments are valid then +-- x <= y <-> y >= x holds. So it's safe to swap these. +-- +-- We invert the condition inside getRegister'and getCondCode +-- which should cover all invertable cases. +-- All other functions translating FP comparisons to assembly +-- use these to two generate the comparison code. +-- +-- As an example consider a simple check: +-- +-- func :: Float -> Float -> Int +-- func x y = if x < y then 1 else 0 +-- +-- Which in Cmm gives the floating point comparison. +-- +-- if (%MO_F_Lt_W32(F1, F2)) goto c2gg; else goto c2gf; +-- +-- We used to compile this to an assembly code block like this: +-- _c2gh: +-- ucomiss %xmm2,%xmm1 +-- jp _c2gf +-- jb _c2gg +-- jmp _c2gf +-- +-- Where we have to introduce an explicit +-- check for unordered results (using jmp parity): +-- +-- We can avoid this by exchanging the arguments and inverting the direction +-- of the comparison. This results in the sequence of: +-- +-- ucomiss %xmm1,%xmm2 +-- ja _c2g2 +-- jmp _c2g1 +-- +-- Removing the jump reduces the pressure on the branch predidiction system +-- and plays better with the uOP cache. + +condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register +condFltReg is32Bit cond x y = condFltReg_sse2 + where + + + condFltReg_sse2 = do + CondCode _ cond cond_code <- condFltCode cond x y + tmp1 <- getNewRegNat (archWordFormat is32Bit) + tmp2 <- getNewRegNat (archWordFormat is32Bit) + let -- See Note [SSE Parity Checks] + code dst = + cond_code `appOL` + (case cond of + NE -> or_unordered dst + GU -> plain_test dst + GEU -> plain_test dst + -- Use ASSERT so we don't break releases if these creep in. + LTT -> ASSERT2(False, ppr "Should have been turned into >") + and_ordered dst + LE -> ASSERT2(False, ppr "Should have been turned into >=") + and_ordered dst + _ -> and_ordered dst) + + plain_test dst = toOL [ + SETCC cond (OpReg tmp1), + MOVZxL II8 (OpReg tmp1) (OpReg dst) + ] + or_unordered dst = toOL [ + SETCC cond (OpReg tmp1), + SETCC PARITY (OpReg tmp2), + OR II8 (OpReg tmp1) (OpReg tmp2), + MOVZxL II8 (OpReg tmp2) (OpReg dst) + ] + and_ordered dst = toOL [ + SETCC cond (OpReg tmp1), + SETCC NOTPARITY (OpReg tmp2), + AND II8 (OpReg tmp1) (OpReg tmp2), + MOVZxL II8 (OpReg tmp2) (OpReg dst) + ] + return (Any II32 code) + + +-- ----------------------------------------------------------------------------- +-- 'trivial*Code': deal with trivial instructions + +-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode', +-- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions. +-- Only look for constants on the right hand side, because that's +-- where the generic optimizer will have put them. + +-- Similarly, for unary instructions, we don't have to worry about +-- matching an StInt as the argument, because genericOpt will already +-- have handled the constant-folding. 
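{-
A small sketch of the code shape these helpers aim for (operand and
register choices are made up): for an addition like 1 + x, where the
first operand is a small literal and a reversed form of ADD is
available, the whole block is just the code for the non-constant
operand followed by one two-address instruction:

    <code computing x into dst>
    ADD $1, dst            -- dst := dst + 1

The general register/register case handled by genTrivialCode below has
the same shape, minus the immediate operand.
-}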
+ + +{- +The Rules of the Game are: + +* You cannot assume anything about the destination register dst; + it may be anything, including a fixed reg. + +* You may compute an operand into a fixed reg, but you may not + subsequently change the contents of that fixed reg. If you + want to do so, first copy the value either to a temporary + or into dst. You are free to modify dst even if it happens + to be a fixed reg -- that's not your problem. + +* You cannot assume that a fixed reg will stay live over an + arbitrary computation. The same applies to the dst reg. + +* Temporary regs obtained from getNewRegNat are distinct from + each other and from all other regs, and stay live over + arbitrary computations. + +-------------------- + +SDM's version of The Rules: + +* If getRegister returns Any, that means it can generate correct + code which places the result in any register, period. Even if that + register happens to be read during the computation. + + Corollary #1: this means that if you are generating code for an + operation with two arbitrary operands, you cannot assign the result + of the first operand into the destination register before computing + the second operand. The second operand might require the old value + of the destination register. + + Corollary #2: A function might be able to generate more efficient + code if it knows the destination register is a new temporary (and + therefore not read by any of the sub-computations). + +* If getRegister returns Any, then the code it generates may modify only: + (a) fresh temporaries + (b) the destination register + (c) known registers (eg. %ecx is used by shifts) + In particular, it may *not* modify global registers, unless the global + register happens to be the destination register. +-} + +trivialCode :: Width -> (Operand -> Operand -> Instr) + -> Maybe (Operand -> Operand -> Instr) + -> CmmExpr -> CmmExpr -> NatM Register +trivialCode width instr m a b + = do is32Bit <- is32BitPlatform + trivialCode' is32Bit width instr m a b + +trivialCode' :: Bool -> Width -> (Operand -> Operand -> Instr) + -> Maybe (Operand -> Operand -> Instr) + -> CmmExpr -> CmmExpr -> NatM Register +trivialCode' is32Bit width _ (Just revinstr) (CmmLit lit_a) b + | is32BitLit is32Bit lit_a = do + b_code <- getAnyReg b + let + code dst + = b_code dst `snocOL` + revinstr (OpImm (litToImm lit_a)) (OpReg dst) + return (Any (intFormat width) code) + +trivialCode' _ width instr _ a b + = genTrivialCode (intFormat width) instr a b + +-- This is re-used for floating pt instructions too. +genTrivialCode :: Format -> (Operand -> Operand -> Instr) + -> CmmExpr -> CmmExpr -> NatM Register +genTrivialCode rep instr a b = do + (b_op, b_code) <- getNonClobberedOperand b + a_code <- getAnyReg a + tmp <- getNewRegNat rep + let + -- We want the value of b to stay alive across the computation of a. + -- But, we want to calculate a straight into the destination register, + -- because the instruction only has two operands (dst := dst `op` src). + -- The troublesome case is when the result of b is in the same register + -- as the destination reg. In this case, we have to save b in a + -- new temporary across the computation of a. 
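{-
Concretely (the register names are invented for illustration): suppose
b's value was left in %rax and the register allocator also hands us
%rax as dst. Emitting a's code straight into dst would destroy b, so
the clashing branch below produces

    MOV  %rax, %tmp        -- save b out of the way
    <a's code into %rax>
    OP   %tmp, %rax        -- dst := dst `op` b

while in the non-clashing case the extra MOV disappears and the final
instruction reads b_op directly.
-}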
+ code dst + | dst `regClashesWithOp` b_op = + b_code `appOL` + unitOL (MOV rep b_op (OpReg tmp)) `appOL` + a_code dst `snocOL` + instr (OpReg tmp) (OpReg dst) + | otherwise = + b_code `appOL` + a_code dst `snocOL` + instr b_op (OpReg dst) + return (Any rep code) + +regClashesWithOp :: Reg -> Operand -> Bool +reg `regClashesWithOp` OpReg reg2 = reg == reg2 +reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode) +_ `regClashesWithOp` _ = False + +----------- + +trivialUCode :: Format -> (Operand -> Instr) + -> CmmExpr -> NatM Register +trivialUCode rep instr x = do + x_code <- getAnyReg x + let + code dst = + x_code dst `snocOL` + instr (OpReg dst) + return (Any rep code) + +----------- + + +trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr) + -> CmmExpr -> CmmExpr -> NatM Register +trivialFCode_sse2 pk instr x y + = genTrivialCode format (instr format) x y + where format = floatFormat pk + + +trivialUFCode :: Format -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register +trivialUFCode format instr x = do + (x_reg, x_code) <- getSomeReg x + let + code dst = + x_code `snocOL` + instr x_reg dst + return (Any format code) + + +-------------------------------------------------------------------------------- +coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register +coerceInt2FP from to x = coerce_sse2 + where + + coerce_sse2 = do + (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand + let + opc = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD + n -> panic $ "coerceInt2FP.sse: unhandled width (" + ++ show n ++ ")" + code dst = x_code `snocOL` opc (intFormat from) x_op dst + return (Any (floatFormat to) code) + -- works even if the destination rep is <II32 + +-------------------------------------------------------------------------------- +coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register +coerceFP2Int from to x = coerceFP2Int_sse2 + where + coerceFP2Int_sse2 = do + (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand + let + opc = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ; + n -> panic $ "coerceFP2Init.sse: unhandled width (" + ++ show n ++ ")" + code dst = x_code `snocOL` opc (intFormat to) x_op dst + return (Any (intFormat to) code) + -- works even if the destination rep is <II32 + + +-------------------------------------------------------------------------------- +coerceFP2FP :: Width -> CmmExpr -> NatM Register +coerceFP2FP to x = do + (x_reg, x_code) <- getSomeReg x + let + opc = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD; + n -> panic $ "coerceFP2FP: unhandled width (" + ++ show n ++ ")" + code dst = x_code `snocOL` opc x_reg dst + return (Any ( floatFormat to) code) + +-------------------------------------------------------------------------------- + +sse2NegCode :: Width -> CmmExpr -> NatM Register +sse2NegCode w x = do + let fmt = floatFormat w + x_code <- getAnyReg x + -- This is how gcc does it, so it can't be that bad: + let + const = case fmt of + FF32 -> CmmInt 0x80000000 W32 + FF64 -> CmmInt 0x8000000000000000 W64 + x@II8 -> wrongFmt x + x@II16 -> wrongFmt x + x@II32 -> wrongFmt x + x@II64 -> wrongFmt x + + where + wrongFmt x = panic $ "sse2NegCode: " ++ show x + Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const + tmp <- getNewRegNat fmt + let + code dst = x_code dst `appOL` amode_code `appOL` toOL [ + MOV fmt (OpAddr amode) (OpReg tmp), + XOR fmt (OpReg tmp) (OpReg dst) + ] + -- + return (Any fmt code) + +isVecExpr :: CmmExpr -> Bool +isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True 
+isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True +isVecExpr (CmmMachOp (MO_V_Add {}) _) = True +isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True +isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True +isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True +isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True +isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True +isVecExpr (CmmMachOp _ [e]) = isVecExpr e +isVecExpr _ = False + +needLlvm :: NatM a +needLlvm = + sorry $ unlines ["The native code generator does not support vector" + ,"instructions. Please use -fllvm."] + +-- | This works on the invariant that all jumps in the given blocks are required. +-- Starting from there we try to make a few more jumps redundant by reordering +-- them. +-- We depend on the information in the CFG to do so so without a given CFG +-- we do nothing. +invertCondBranches :: Maybe CFG -- ^ CFG if present + -> LabelMap a -- ^ Blocks with info tables + -> [NatBasicBlock Instr] -- ^ List of basic blocks + -> [NatBasicBlock Instr] +invertCondBranches Nothing _ bs = bs +invertCondBranches (Just cfg) keep bs = + invert bs + where + invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr] + invert ((BasicBlock lbl1 ins@(_:_:_xs)):b2@(BasicBlock lbl2 _):bs) + | --pprTrace "Block" (ppr lbl1) True, + (jmp1,jmp2) <- last2 ins + , JXX cond1 target1 <- jmp1 + , target1 == lbl2 + --, pprTrace "CutChance" (ppr b1) True + , JXX ALWAYS target2 <- jmp2 + -- We have enough information to check if we can perform the inversion + -- TODO: We could also check for the last asm instruction which sets + -- status flags instead. Which I suspect is worse in terms of compiler + -- performance, but might be applicable to more cases + , Just edgeInfo1 <- getEdgeInfo lbl1 target1 cfg + , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg + -- Both jumps come from the same cmm statement + , transitionSource edgeInfo1 == transitionSource edgeInfo2 + , CmmSource {trans_cmmNode = cmmCondBranch} <- transitionSource edgeInfo1 + + --Int comparisons are invertable + , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch + , Just _ <- maybeIntComparison op + , Just invCond <- maybeInvertCond cond1 + + --Swap the last two jumps, invert the conditional jumps condition. + = let jumps = + case () of + -- We are free the eliminate the jmp. So we do so. + _ | not (mapMember target1 keep) + -> [JXX invCond target2] + -- If the conditional target is unlikely we put the other + -- target at the front. + | edgeWeight edgeInfo2 > edgeWeight edgeInfo1 + -> [JXX invCond target2, JXX ALWAYS target1] + -- Keep things as-is otherwise + | otherwise + -> [jmp1, jmp2] + in --pprTrace "Cutable" (ppr [jmp1,jmp2] <+> text "=>" <+> ppr jumps) $ + (BasicBlock lbl1 + (dropTail 2 ins ++ jumps)) + : invert (b2:bs) + invert (b:bs) = b : invert bs + invert [] = [] diff --git a/compiler/GHC/CmmToAsm/X86/Cond.hs b/compiler/GHC/CmmToAsm/X86/Cond.hs new file mode 100644 index 0000000000..bb8f61438b --- /dev/null +++ b/compiler/GHC/CmmToAsm/X86/Cond.hs @@ -0,0 +1,109 @@ +module GHC.CmmToAsm.X86.Cond ( + Cond(..), + condUnsigned, + condToSigned, + condToUnsigned, + maybeFlipCond, + maybeInvertCond +) + +where + +import GhcPrelude + +data Cond + = ALWAYS -- What's really used? 
ToDo + | EQQ + | GE + | GEU + | GTT + | GU + | LE + | LEU + | LTT + | LU + | NE + | NEG + | POS + | CARRY + | OFLO + | PARITY + | NOTPARITY + deriving Eq + +condUnsigned :: Cond -> Bool +condUnsigned GU = True +condUnsigned LU = True +condUnsigned GEU = True +condUnsigned LEU = True +condUnsigned _ = False + + +condToSigned :: Cond -> Cond +condToSigned GU = GTT +condToSigned LU = LTT +condToSigned GEU = GE +condToSigned LEU = LE +condToSigned x = x + + +condToUnsigned :: Cond -> Cond +condToUnsigned GTT = GU +condToUnsigned LTT = LU +condToUnsigned GE = GEU +condToUnsigned LE = LEU +condToUnsigned x = x + +-- | @maybeFlipCond c@ returns @Just c'@ if it is possible to flip the +-- arguments to the conditional @c@, and the new condition should be @c'@. +maybeFlipCond :: Cond -> Maybe Cond +maybeFlipCond cond = case cond of + EQQ -> Just EQQ + NE -> Just NE + LU -> Just GU + GU -> Just LU + LEU -> Just GEU + GEU -> Just LEU + LTT -> Just GTT + GTT -> Just LTT + LE -> Just GE + GE -> Just LE + _other -> Nothing + +-- | If we apply @maybeInvertCond@ to the condition of a jump we turn +-- jumps taken into jumps not taken and vice versa. +-- +-- Careful! If the used comparison and the conditional jump +-- don't match the above behaviour will NOT hold. +-- When used for FP comparisons this does not consider unordered +-- numbers. +-- Also inverting twice might return a synonym for the original condition. +maybeInvertCond :: Cond -> Maybe Cond +maybeInvertCond cond = case cond of + ALWAYS -> Nothing + EQQ -> Just NE + NE -> Just EQQ + + NEG -> Just POS + POS -> Just NEG + + GEU -> Just LU + LU -> Just GEU + + GE -> Just LTT + LTT -> Just GE + + GTT -> Just LE + LE -> Just GTT + + GU -> Just LEU + LEU -> Just GU + + --GEU "==" NOTCARRY, they are synonyms + --at the assembly level + CARRY -> Just GEU + + OFLO -> Nothing + + PARITY -> Just NOTPARITY + NOTPARITY -> Just PARITY diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs new file mode 100644 index 0000000000..4171806695 --- /dev/null +++ b/compiler/GHC/CmmToAsm/X86/Instr.hs @@ -0,0 +1,1056 @@ +{-# LANGUAGE CPP, TypeFamilies #-} + +----------------------------------------------------------------------------- +-- +-- Machine-dependent assembly language +-- +-- (c) The University of Glasgow 1993-2004 +-- +----------------------------------------------------------------------------- + +module GHC.CmmToAsm.X86.Instr + ( Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..) + , getJumpDestBlockId, canShortcut, shortcutStatics + , shortcutJump, allocMoreStack + , maxSpillSlots, archWordFormat + ) +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.CmmToAsm.X86.Cond +import GHC.CmmToAsm.X86.Regs +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.Format +import GHC.Platform.Reg.Class +import GHC.Platform.Reg +import GHC.CmmToAsm.Reg.Target + +import GHC.Cmm.BlockId +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import GHC.Platform.Regs +import GHC.Cmm +import FastString +import Outputable +import GHC.Platform + +import BasicTypes (Alignment) +import GHC.Cmm.CLabel +import GHC.Driver.Session +import UniqSet +import Unique +import UniqSupply +import GHC.Cmm.DebugBlock (UnwindTable) + +import Control.Monad +import Data.Maybe (fromMaybe) + +-- Format of an x86/x86_64 memory address, in bytes. +-- +archWordFormat :: Bool -> Format +archWordFormat is32Bit + | is32Bit = II32 + | otherwise = II64 + +-- | Instruction instance for x86 instruction set. 
+instance Instruction Instr where + regUsageOfInstr = x86_regUsageOfInstr + patchRegsOfInstr = x86_patchRegsOfInstr + isJumpishInstr = x86_isJumpishInstr + jumpDestsOfInstr = x86_jumpDestsOfInstr + patchJumpInstr = x86_patchJumpInstr + mkSpillInstr = x86_mkSpillInstr + mkLoadInstr = x86_mkLoadInstr + takeDeltaInstr = x86_takeDeltaInstr + isMetaInstr = x86_isMetaInstr + mkRegRegMoveInstr = x86_mkRegRegMoveInstr + takeRegRegMoveInstr = x86_takeRegRegMoveInstr + mkJumpInstr = x86_mkJumpInstr + mkStackAllocInstr = x86_mkStackAllocInstr + mkStackDeallocInstr = x86_mkStackDeallocInstr + + +-- ----------------------------------------------------------------------------- +-- Intel x86 instructions + +{- +Intel, in their infinite wisdom, selected a stack model for floating +point registers on x86. That might have made sense back in 1979 -- +nowadays we can see it for the nonsense it really is. A stack model +fits poorly with the existing nativeGen infrastructure, which assumes +flat integer and FP register sets. Prior to this commit, nativeGen +could not generate correct x86 FP code -- to do so would have meant +somehow working the register-stack paradigm into the register +allocator and spiller, which sounds very difficult. + +We have decided to cheat, and go for a simple fix which requires no +infrastructure modifications, at the expense of generating ropey but +correct FP code. All notions of the x86 FP stack and its insns have +been removed. Instead, we pretend (to the instruction selector and +register allocator) that x86 has six floating point registers, %fake0 +.. %fake5, which can be used in the usual flat manner. We further +claim that x86 has floating point instructions very similar to SPARC +and Alpha, that is, a simple 3-operand register-register arrangement. +Code generation and register allocation proceed on this basis. + +When we come to print out the final assembly, our convenient fiction +is converted to dismal reality. Each fake instruction is +independently converted to a series of real x86 instructions. +%fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg +arithmetic operations, the two operands are pushed onto the top of the +FP stack, the operation done, and the result copied back into the +relevant register. There are only six %fake registers because 2 are +needed for the translation, and x86 has 8 in total. + +The translation is inefficient but is simple and it works. A cleverer +translation would handle a sequence of insns, simulating the FP stack +contents, would not impose a fixed mapping from %fake to %st regs, and +hopefully could avoid most of the redundant reg-reg moves of the +current translation. + +We might as well make use of whatever unique FP facilities Intel have +chosen to bless us with (let's not be churlish, after all). +Hence GLDZ and GLD1. Bwahahahahahahaha! +-} + +{- +Note [x86 Floating point precision] + +Intel's internal floating point registers are by default 80 bit +extended precision. This means that all operations done on values in +registers are done at 80 bits, and unless the intermediate values are +truncated to the appropriate size (32 or 64 bits) by storing in +memory, calculations in registers will give different results from +calculations which pass intermediate values in memory (eg. via +function calls). + +One solution is to set the FPU into 64 bit precision mode. Some OSs +do this (eg. FreeBSD) and some don't (eg. Linux). 
The problem here is +that this will only affect 64-bit precision arithmetic; 32-bit +calculations will still be done at 64-bit precision in registers. So +it doesn't solve the whole problem. + +There's also the issue of what the C library is expecting in terms of +precision. It seems to be the case that glibc on Linux expects the +FPU to be set to 80 bit precision, so setting it to 64 bit could have +unexpected effects. Changing the default could have undesirable +effects on other 3rd-party library code too, so the right thing would +be to save/restore the FPU control word across Haskell code if we were +to do this. + +gcc's -ffloat-store gives consistent results by always storing the +results of floating-point calculations in memory, which works for both +32 and 64-bit precision. However, it only affects the values of +user-declared floating point variables in C, not intermediate results. +GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision +flag). + +Another problem is how to spill floating point registers in the +register allocator. Should we spill the whole 80 bits, or just 64? +On an OS which is set to 64 bit precision, spilling 64 is fine. On +Linux, spilling 64 bits will round the results of some operations. +This is what gcc does. Spilling at 80 bits requires taking up a full +128 bit slot (so we get alignment). We spill at 80-bits and ignore +the alignment problems. + +In the future [edit: now available in GHC 7.0.1, with the -msse2 +flag], we'll use the SSE registers for floating point. This requires +a CPU that supports SSE2 (ordinary SSE only supports 32 bit precision +float ops), which means P4 or Xeon and above. Using SSE will solve +all these problems, because the SSE registers use fixed 32 bit or 64 +bit precision. + +--SDM 1/2003 +-} + +data Instr + -- comment pseudo-op + = COMMENT FastString + + -- location pseudo-op (file, line, col, name) + | LOCATION Int Int Int String + + -- some static data spat out during code + -- generation. Will be extracted before + -- pretty-printing. + | LDATA Section (Alignment, RawCmmStatics) + + -- start a new basic block. Useful during + -- codegen, removed later. Preceding + -- instruction should be a jump, as per the + -- invariants for a BasicBlock (see Cmm). + | NEWBLOCK BlockId + + -- unwinding information + -- See Note [Unwinding information in the NCG]. + | UNWIND CLabel UnwindTable + + -- specify current stack offset for benefit of subsequent passes. + -- This carries a BlockId so it can be used in unwinding information. + | DELTA Int + + -- Moves. + | MOV Format Operand Operand + | CMOV Cond Format Operand Reg + | MOVZxL Format Operand Operand -- format is the size of operand 1 + | MOVSxL Format Operand Operand -- format is the size of operand 1 + -- x86_64 note: plain mov into a 32-bit register always zero-extends + -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which + -- don't affect the high bits of the register. + + -- Load effective address (also a very useful three-operand add instruction :-) + | LEA Format Operand Operand + + -- Int Arithmetic. 
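{-
The LEA form just above really is a three-operand add in disguise: for
example (registers made up), with an AddrBaseIndex of base %rbx, index
%rcx scaled by 4 and displacement 8, the assembly

    leaq 8(%rbx,%rcx,4), %rax

computes %rax := %rbx + 4*%rcx + 8 without reading memory or touching
the condition flags.
-}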
+ | ADD Format Operand Operand + | ADC Format Operand Operand + | SUB Format Operand Operand + | SBB Format Operand Operand + + | MUL Format Operand Operand + | MUL2 Format Operand -- %edx:%eax = operand * %rax + | IMUL Format Operand Operand -- signed int mul + | IMUL2 Format Operand -- %edx:%eax = operand * %eax + + | DIV Format Operand -- eax := eax:edx/op, edx := eax:edx%op + | IDIV Format Operand -- ditto, but signed + + -- Int Arithmetic, where the effects on the condition register + -- are important. Used in specialized sequences such as MO_Add2. + -- Do not rewrite these instructions to "equivalent" ones that + -- have different effect on the condition register! (See #9013.) + | ADD_CC Format Operand Operand + | SUB_CC Format Operand Operand + + -- Simple bit-twiddling. + | AND Format Operand Operand + | OR Format Operand Operand + | XOR Format Operand Operand + | NOT Format Operand + | NEGI Format Operand -- NEG instruction (name clash with Cond) + | BSWAP Format Reg + + -- Shifts (amount may be immediate or %cl only) + | SHL Format Operand{-amount-} Operand + | SAR Format Operand{-amount-} Operand + | SHR Format Operand{-amount-} Operand + + | BT Format Imm Operand + | NOP + + + -- We need to support the FSTP (x87 store and pop) instruction + -- so that we can correctly read off the return value of an + -- x86 CDECL C function call when its floating point. + -- so we dont include a register argument, and just use st(0) + -- this instruction is used ONLY for return values of C ffi calls + -- in x86_32 abi + | X87Store Format AddrMode -- st(0), dst + + + -- SSE2 floating point: we use a restricted set of the available SSE2 + -- instructions for floating-point. + -- use MOV for moving (either movss or movsd (movlpd better?)) + | CVTSS2SD Reg Reg -- F32 to F64 + | CVTSD2SS Reg Reg -- F64 to F32 + | CVTTSS2SIQ Format Operand Reg -- F32 to I32/I64 (with truncation) + | CVTTSD2SIQ Format Operand Reg -- F64 to I32/I64 (with truncation) + | CVTSI2SS Format Operand Reg -- I32/I64 to F32 + | CVTSI2SD Format Operand Reg -- I32/I64 to F64 + + -- use ADD, SUB, and SQRT for arithmetic. In both cases, operands + -- are Operand Reg. + + -- SSE2 floating-point division: + | FDIV Format Operand Operand -- divisor, dividend(dst) + + -- use CMP for comparisons. ucomiss and ucomisd instructions + -- compare single/double prec floating point respectively. + + | SQRT Format Operand Reg -- src, dst + + + -- Comparison + | TEST Format Operand Operand + | CMP Format Operand Operand + | SETCC Cond Operand + + -- Stack Operations. + | PUSH Format Operand + | POP Format Operand + -- both unused (SDM): + -- | PUSHA + -- | POPA + + -- Jumping around. + | JMP Operand [Reg] -- including live Regs at the call + | JXX Cond BlockId -- includes unconditional branches + | JXX_GBL Cond Imm -- non-local version of JXX + -- Table jump + | JMP_TBL Operand -- Address to jump to + [Maybe JumpDest] -- Targets of the jump table + Section -- Data section jump table should be put in + CLabel -- Label of jump table + -- | X86 call instruction + | CALL (Either Imm Reg) -- ^ Jump target + [Reg] -- ^ Arguments (required for register allocation) + + -- Other things. 
+ | CLTD Format -- sign extend %eax into %edx:%eax + + | FETCHGOT Reg -- pseudo-insn for ELF position-independent code + -- pretty-prints as + -- call 1f + -- 1: popl %reg + -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg + | FETCHPC Reg -- pseudo-insn for Darwin position-independent code + -- pretty-prints as + -- call 1f + -- 1: popl %reg + + -- bit counting instructions + | POPCNT Format Operand Reg -- [SSE4.2] count number of bits set to 1 + | LZCNT Format Operand Reg -- [BMI2] count number of leading zeros + | TZCNT Format Operand Reg -- [BMI2] count number of trailing zeros + | BSF Format Operand Reg -- bit scan forward + | BSR Format Operand Reg -- bit scan reverse + + -- bit manipulation instructions + | PDEP Format Operand Operand Reg -- [BMI2] deposit bits to the specified mask + | PEXT Format Operand Operand Reg -- [BMI2] extract bits from the specified mask + + -- prefetch + | PREFETCH PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch + -- variant can be NTA, Lvl0, Lvl1, or Lvl2 + + | LOCK Instr -- lock prefix + | XADD Format Operand Operand -- src (r), dst (r/m) + | CMPXCHG Format Operand Operand -- src (r), dst (r/m), eax implicit + | MFENCE + +data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2 + + +data Operand + = OpReg Reg -- register + | OpImm Imm -- immediate value + | OpAddr AddrMode -- memory reference + + + +-- | Returns which registers are read and written as a (read, written) +-- pair. +x86_regUsageOfInstr :: Platform -> Instr -> RegUsage +x86_regUsageOfInstr platform instr + = case instr of + MOV _ src dst -> usageRW src dst + CMOV _ _ src dst -> mkRU (use_R src [dst]) [dst] + MOVZxL _ src dst -> usageRW src dst + MOVSxL _ src dst -> usageRW src dst + LEA _ src dst -> usageRW src dst + ADD _ src dst -> usageRM src dst + ADC _ src dst -> usageRM src dst + SUB _ src dst -> usageRM src dst + SBB _ src dst -> usageRM src dst + IMUL _ src dst -> usageRM src dst + + -- Result of IMULB will be in just in %ax + IMUL2 II8 src -> mkRU (eax:use_R src []) [eax] + -- Result of IMUL for wider values, will be split between %dx/%edx/%rdx and + -- %ax/%eax/%rax. + IMUL2 _ src -> mkRU (eax:use_R src []) [eax,edx] + + MUL _ src dst -> usageRM src dst + MUL2 _ src -> mkRU (eax:use_R src []) [eax,edx] + DIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx] + IDIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx] + ADD_CC _ src dst -> usageRM src dst + SUB_CC _ src dst -> usageRM src dst + AND _ src dst -> usageRM src dst + OR _ src dst -> usageRM src dst + + XOR _ (OpReg src) (OpReg dst) + | src == dst -> mkRU [] [dst] + + XOR _ src dst -> usageRM src dst + NOT _ op -> usageM op + BSWAP _ reg -> mkRU [reg] [reg] + NEGI _ op -> usageM op + SHL _ imm dst -> usageRM imm dst + SAR _ imm dst -> usageRM imm dst + SHR _ imm dst -> usageRM imm dst + BT _ _ src -> mkRUR (use_R src []) + + PUSH _ op -> mkRUR (use_R op []) + POP _ op -> mkRU [] (def_W op) + TEST _ src dst -> mkRUR (use_R src $! use_R dst []) + CMP _ src dst -> mkRUR (use_R src $! 
use_R dst []) + SETCC _ op -> mkRU [] (def_W op) + JXX _ _ -> mkRU [] [] + JXX_GBL _ _ -> mkRU [] [] + JMP op regs -> mkRUR (use_R op regs) + JMP_TBL op _ _ _ -> mkRUR (use_R op []) + CALL (Left _) params -> mkRU params (callClobberedRegs platform) + CALL (Right reg) params -> mkRU (reg:params) (callClobberedRegs platform) + CLTD _ -> mkRU [eax] [edx] + NOP -> mkRU [] [] + + X87Store _ dst -> mkRUR ( use_EA dst []) + + CVTSS2SD src dst -> mkRU [src] [dst] + CVTSD2SS src dst -> mkRU [src] [dst] + CVTTSS2SIQ _ src dst -> mkRU (use_R src []) [dst] + CVTTSD2SIQ _ src dst -> mkRU (use_R src []) [dst] + CVTSI2SS _ src dst -> mkRU (use_R src []) [dst] + CVTSI2SD _ src dst -> mkRU (use_R src []) [dst] + FDIV _ src dst -> usageRM src dst + SQRT _ src dst -> mkRU (use_R src []) [dst] + + FETCHGOT reg -> mkRU [] [reg] + FETCHPC reg -> mkRU [] [reg] + + COMMENT _ -> noUsage + LOCATION{} -> noUsage + UNWIND{} -> noUsage + DELTA _ -> noUsage + + POPCNT _ src dst -> mkRU (use_R src []) [dst] + LZCNT _ src dst -> mkRU (use_R src []) [dst] + TZCNT _ src dst -> mkRU (use_R src []) [dst] + BSF _ src dst -> mkRU (use_R src []) [dst] + BSR _ src dst -> mkRU (use_R src []) [dst] + + PDEP _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst] + PEXT _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst] + + -- note: might be a better way to do this + PREFETCH _ _ src -> mkRU (use_R src []) [] + LOCK i -> x86_regUsageOfInstr platform i + XADD _ src dst -> usageMM src dst + CMPXCHG _ src dst -> usageRMM src dst (OpReg eax) + MFENCE -> noUsage + + _other -> panic "regUsage: unrecognised instr" + where + -- # Definitions + -- + -- Written: If the operand is a register, it's written. If it's an + -- address, registers mentioned in the address are read. + -- + -- Modified: If the operand is a register, it's both read and + -- written. If it's an address, registers mentioned in the address + -- are read. + + -- 2 operand form; first operand Read; second Written + usageRW :: Operand -> Operand -> RegUsage + usageRW op (OpReg reg) = mkRU (use_R op []) [reg] + usageRW op (OpAddr ea) = mkRUR (use_R op $! use_EA ea []) + usageRW _ _ = panic "X86.RegInfo.usageRW: no match" + + -- 2 operand form; first operand Read; second Modified + usageRM :: Operand -> Operand -> RegUsage + usageRM op (OpReg reg) = mkRU (use_R op [reg]) [reg] + usageRM op (OpAddr ea) = mkRUR (use_R op $! use_EA ea []) + usageRM _ _ = panic "X86.RegInfo.usageRM: no match" + + -- 2 operand form; first operand Modified; second Modified + usageMM :: Operand -> Operand -> RegUsage + usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst] + usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src] + usageMM _ _ = panic "X86.RegInfo.usageMM: no match" + + -- 3 operand form; first operand Read; second Modified; third Modified + usageRMM :: Operand -> Operand -> Operand -> RegUsage + usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg] + usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg] + usageRMM _ _ _ = panic "X86.RegInfo.usageRMM: no match" + + -- 1 operand form; operand Modified + usageM :: Operand -> RegUsage + usageM (OpReg reg) = mkRU [reg] [reg] + usageM (OpAddr ea) = mkRUR (use_EA ea []) + usageM _ = panic "X86.RegInfo.usageM: no match" + + -- Registers defd when an operand is written. + def_W (OpReg reg) = [reg] + def_W (OpAddr _ ) = [] + def_W _ = panic "X86.RegInfo.def_W: no match" + + -- Registers used when an operand is read. 
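{-
Two illustrative cases of how these helpers classify registers
(formats abbreviated): ADD II64 (OpReg src) (OpReg dst) goes through
usageRM, so it reads [src, dst] and writes [dst]; a store such as
MOV II64 (OpReg src) (OpAddr ea) goes through usageRW with an address
destination, so it only reads src plus the registers mentioned in ea,
and writes nothing.
-}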
+ use_R (OpReg reg) tl = reg : tl + use_R (OpImm _) tl = tl + use_R (OpAddr ea) tl = use_EA ea tl + + -- Registers used to compute an effective address. + use_EA (ImmAddr _ _) tl = tl + use_EA (AddrBaseIndex base index _) tl = + use_base base $! use_index index tl + where use_base (EABaseReg r) tl = r : tl + use_base _ tl = tl + use_index EAIndexNone tl = tl + use_index (EAIndex i _) tl = i : tl + + mkRUR src = src' `seq` RU src' [] + where src' = filter (interesting platform) src + + mkRU src dst = src' `seq` dst' `seq` RU src' dst' + where src' = filter (interesting platform) src + dst' = filter (interesting platform) dst + +-- | Is this register interesting for the register allocator? +interesting :: Platform -> Reg -> Bool +interesting _ (RegVirtual _) = True +interesting platform (RegReal (RealRegSingle i)) = freeReg platform i +interesting _ (RegReal (RealRegPair{})) = panic "X86.interesting: no reg pairs on this arch" + + + +-- | Applies the supplied function to all registers in instructions. +-- Typically used to change virtual registers to real registers. +x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr +x86_patchRegsOfInstr instr env + = case instr of + MOV fmt src dst -> patch2 (MOV fmt) src dst + CMOV cc fmt src dst -> CMOV cc fmt (patchOp src) (env dst) + MOVZxL fmt src dst -> patch2 (MOVZxL fmt) src dst + MOVSxL fmt src dst -> patch2 (MOVSxL fmt) src dst + LEA fmt src dst -> patch2 (LEA fmt) src dst + ADD fmt src dst -> patch2 (ADD fmt) src dst + ADC fmt src dst -> patch2 (ADC fmt) src dst + SUB fmt src dst -> patch2 (SUB fmt) src dst + SBB fmt src dst -> patch2 (SBB fmt) src dst + IMUL fmt src dst -> patch2 (IMUL fmt) src dst + IMUL2 fmt src -> patch1 (IMUL2 fmt) src + MUL fmt src dst -> patch2 (MUL fmt) src dst + MUL2 fmt src -> patch1 (MUL2 fmt) src + IDIV fmt op -> patch1 (IDIV fmt) op + DIV fmt op -> patch1 (DIV fmt) op + ADD_CC fmt src dst -> patch2 (ADD_CC fmt) src dst + SUB_CC fmt src dst -> patch2 (SUB_CC fmt) src dst + AND fmt src dst -> patch2 (AND fmt) src dst + OR fmt src dst -> patch2 (OR fmt) src dst + XOR fmt src dst -> patch2 (XOR fmt) src dst + NOT fmt op -> patch1 (NOT fmt) op + BSWAP fmt reg -> BSWAP fmt (env reg) + NEGI fmt op -> patch1 (NEGI fmt) op + SHL fmt imm dst -> patch1 (SHL fmt imm) dst + SAR fmt imm dst -> patch1 (SAR fmt imm) dst + SHR fmt imm dst -> patch1 (SHR fmt imm) dst + BT fmt imm src -> patch1 (BT fmt imm) src + TEST fmt src dst -> patch2 (TEST fmt) src dst + CMP fmt src dst -> patch2 (CMP fmt) src dst + PUSH fmt op -> patch1 (PUSH fmt) op + POP fmt op -> patch1 (POP fmt) op + SETCC cond op -> patch1 (SETCC cond) op + JMP op regs -> JMP (patchOp op) regs + JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl + + -- literally only support storing the top x87 stack value st(0) + X87Store fmt dst -> X87Store fmt (lookupAddr dst) + + CVTSS2SD src dst -> CVTSS2SD (env src) (env dst) + CVTSD2SS src dst -> CVTSD2SS (env src) (env dst) + CVTTSS2SIQ fmt src dst -> CVTTSS2SIQ fmt (patchOp src) (env dst) + CVTTSD2SIQ fmt src dst -> CVTTSD2SIQ fmt (patchOp src) (env dst) + CVTSI2SS fmt src dst -> CVTSI2SS fmt (patchOp src) (env dst) + CVTSI2SD fmt src dst -> CVTSI2SD fmt (patchOp src) (env dst) + FDIV fmt src dst -> FDIV fmt (patchOp src) (patchOp dst) + SQRT fmt src dst -> SQRT fmt (patchOp src) (env dst) + + CALL (Left _) _ -> instr + CALL (Right reg) p -> CALL (Right (env reg)) p + + FETCHGOT reg -> FETCHGOT (env reg) + FETCHPC reg -> FETCHPC (env reg) + + NOP -> instr + COMMENT _ -> instr + LOCATION {} -> instr + UNWIND {} -> instr + 
DELTA _ -> instr + + JXX _ _ -> instr + JXX_GBL _ _ -> instr + CLTD _ -> instr + + POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst) + LZCNT fmt src dst -> LZCNT fmt (patchOp src) (env dst) + TZCNT fmt src dst -> TZCNT fmt (patchOp src) (env dst) + PDEP fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst) + PEXT fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst) + BSF fmt src dst -> BSF fmt (patchOp src) (env dst) + BSR fmt src dst -> BSR fmt (patchOp src) (env dst) + + PREFETCH lvl format src -> PREFETCH lvl format (patchOp src) + + LOCK i -> LOCK (x86_patchRegsOfInstr i env) + XADD fmt src dst -> patch2 (XADD fmt) src dst + CMPXCHG fmt src dst -> patch2 (CMPXCHG fmt) src dst + MFENCE -> instr + + _other -> panic "patchRegs: unrecognised instr" + + where + patch1 :: (Operand -> a) -> Operand -> a + patch1 insn op = insn $! patchOp op + patch2 :: (Operand -> Operand -> a) -> Operand -> Operand -> a + patch2 insn src dst = (insn $! patchOp src) $! patchOp dst + + patchOp (OpReg reg) = OpReg $! env reg + patchOp (OpImm imm) = OpImm imm + patchOp (OpAddr ea) = OpAddr $! lookupAddr ea + + lookupAddr (ImmAddr imm off) = ImmAddr imm off + lookupAddr (AddrBaseIndex base index disp) + = ((AddrBaseIndex $! lookupBase base) $! lookupIndex index) disp + where + lookupBase EABaseNone = EABaseNone + lookupBase EABaseRip = EABaseRip + lookupBase (EABaseReg r) = EABaseReg $! env r + + lookupIndex EAIndexNone = EAIndexNone + lookupIndex (EAIndex r i) = (EAIndex $! env r) i + + +-------------------------------------------------------------------------------- +x86_isJumpishInstr + :: Instr -> Bool + +x86_isJumpishInstr instr + = case instr of + JMP{} -> True + JXX{} -> True + JXX_GBL{} -> True + JMP_TBL{} -> True + CALL{} -> True + _ -> False + + +x86_jumpDestsOfInstr + :: Instr + -> [BlockId] + +x86_jumpDestsOfInstr insn + = case insn of + JXX _ id -> [id] + JMP_TBL _ ids _ _ -> [id | Just (DestBlockId id) <- ids] + _ -> [] + + +x86_patchJumpInstr + :: Instr -> (BlockId -> BlockId) -> Instr + +x86_patchJumpInstr insn patchF + = case insn of + JXX cc id -> JXX cc (patchF id) + JMP_TBL op ids section lbl + -> JMP_TBL op (map (fmap (patchJumpDest patchF)) ids) section lbl + _ -> insn + where + patchJumpDest f (DestBlockId id) = DestBlockId (f id) + patchJumpDest _ dest = dest + + + + + +-- ----------------------------------------------------------------------------- +-- | Make a spill instruction. +x86_mkSpillInstr + :: DynFlags + -> Reg -- register to spill + -> Int -- current stack delta + -> Int -- spill slot to use + -> Instr + +x86_mkSpillInstr dflags reg delta slot + = let off = spillSlotToOffset platform slot - delta + in + case targetClassOfReg platform reg of + RcInteger -> MOV (archWordFormat is32Bit) + (OpReg reg) (OpAddr (spRel dflags off)) + RcDouble -> MOV FF64 (OpReg reg) (OpAddr (spRel dflags off)) + _ -> panic "X86.mkSpillInstr: no match" + where platform = targetPlatform dflags + is32Bit = target32Bit platform + +-- | Make a spill reload instruction. 
+x86_mkLoadInstr + :: DynFlags + -> Reg -- register to load + -> Int -- current stack delta + -> Int -- spill slot to use + -> Instr + +x86_mkLoadInstr dflags reg delta slot + = let off = spillSlotToOffset platform slot - delta + in + case targetClassOfReg platform reg of + RcInteger -> MOV (archWordFormat is32Bit) + (OpAddr (spRel dflags off)) (OpReg reg) + RcDouble -> MOV FF64 (OpAddr (spRel dflags off)) (OpReg reg) + _ -> panic "X86.x86_mkLoadInstr" + where platform = targetPlatform dflags + is32Bit = target32Bit platform + +spillSlotSize :: Platform -> Int +spillSlotSize dflags = if is32Bit then 12 else 8 + where is32Bit = target32Bit dflags + +maxSpillSlots :: DynFlags -> Int +maxSpillSlots dflags + = ((rESERVED_C_STACK_BYTES dflags - 64) `div` spillSlotSize (targetPlatform dflags)) - 1 +-- = 0 -- useful for testing allocMoreStack + +-- number of bytes that the stack pointer should be aligned to +stackAlign :: Int +stackAlign = 16 + +-- convert a spill slot number to a *byte* offset, with no sign: +-- decide on a per arch basis whether you are spilling above or below +-- the C stack pointer. +spillSlotToOffset :: Platform -> Int -> Int +spillSlotToOffset platform slot + = 64 + spillSlotSize platform * slot + +-------------------------------------------------------------------------------- + +-- | See if this instruction is telling us the current C stack delta +x86_takeDeltaInstr + :: Instr + -> Maybe Int + +x86_takeDeltaInstr instr + = case instr of + DELTA i -> Just i + _ -> Nothing + + +x86_isMetaInstr + :: Instr + -> Bool + +x86_isMetaInstr instr + = case instr of + COMMENT{} -> True + LOCATION{} -> True + LDATA{} -> True + NEWBLOCK{} -> True + UNWIND{} -> True + DELTA{} -> True + _ -> False + + + +--- TODO: why is there +-- | Make a reg-reg move instruction. +-- On SPARC v8 there are no instructions to move directly between +-- floating point and integer regs. If we need to do that then we +-- have to go via memory. +-- +x86_mkRegRegMoveInstr + :: Platform + -> Reg + -> Reg + -> Instr + +x86_mkRegRegMoveInstr platform src dst + = case targetClassOfReg platform src of + RcInteger -> case platformArch platform of + ArchX86 -> MOV II32 (OpReg src) (OpReg dst) + ArchX86_64 -> MOV II64 (OpReg src) (OpReg dst) + _ -> panic "x86_mkRegRegMoveInstr: Bad arch" + RcDouble -> MOV FF64 (OpReg src) (OpReg dst) + -- this code is the lie we tell ourselves because both float and double + -- use the same register class.on x86_64 and x86 32bit with SSE2, + -- more plainly, both use the XMM registers + _ -> panic "X86.RegInfo.mkRegRegMoveInstr: no match" + +-- | Check whether an instruction represents a reg-reg move. +-- The register allocator attempts to eliminate reg->reg moves whenever it can, +-- by assigning the src and dest temporaries to the same real register. +-- +x86_takeRegRegMoveInstr + :: Instr + -> Maybe (Reg,Reg) + +x86_takeRegRegMoveInstr (MOV _ (OpReg r1) (OpReg r2)) + = Just (r1,r2) + +x86_takeRegRegMoveInstr _ = Nothing + + +-- | Make an unconditional branch instruction. +x86_mkJumpInstr + :: BlockId + -> [Instr] + +x86_mkJumpInstr id + = [JXX ALWAYS id] + +-- Note [Windows stack layout] +-- | On most OSes the kernel will place a guard page after the current stack +-- page. If you allocate larger than a page worth you may jump over this +-- guard page. Not only is this a security issue, but on certain OSes such +-- as Windows a new page won't be allocated if you don't hit the guard. This +-- will cause a segfault or access fault. 
+-- +-- This function defines if the current allocation amount requires a probe. +-- On Windows (for now) we emit a call to _chkstk for this. For other OSes +-- this is not yet implemented. +-- See https://docs.microsoft.com/en-us/windows/desktop/DevNotes/-win32-chkstk +-- The Windows stack looks like this: +-- +-- +-------------------+ +-- | SP | +-- +-------------------+ +-- | | +-- | GUARD PAGE | +-- | | +-- +-------------------+ +-- | | +-- | | +-- | UNMAPPED | +-- | | +-- | | +-- +-------------------+ +-- +-- In essence each allocation larger than a page size needs to be chunked and +-- a probe emitted after each page allocation. You have to hit the guard +-- page so the kernel can map in the next page, otherwise you'll segfault. +-- +needs_probe_call :: Platform -> Int -> Bool +needs_probe_call platform amount + = case platformOS platform of + OSMinGW32 -> case platformArch platform of + ArchX86 -> amount > (4 * 1024) + ArchX86_64 -> amount > (8 * 1024) + _ -> False + _ -> False + +x86_mkStackAllocInstr + :: Platform + -> Int + -> [Instr] +x86_mkStackAllocInstr platform amount + = case platformOS platform of + OSMinGW32 -> + -- These will clobber AX but this should be ok because + -- + -- 1. It is the first thing we do when entering the closure and AX is + -- a caller saved registers on Windows both on x86_64 and x86. + -- + -- 2. The closures are only entered via a call or longjmp in which case + -- there are no expectations for volatile registers. + -- + -- 3. When the target is a local branch point it is re-targeted + -- after the dealloc, preserving #2. See note [extra spill slots]. + -- + -- We emit a call because the stack probes are quite involved and + -- would bloat code size a lot. GHC doesn't really have an -Os. + -- __chkstk is guaranteed to leave all nonvolatile registers and AX + -- untouched. It's part of the standard prologue code for any Windows + -- function dropping the stack more than a page. + -- See Note [Windows stack layout] + case platformArch platform of + ArchX86 | needs_probe_call platform amount -> + [ MOV II32 (OpImm (ImmInt amount)) (OpReg eax) + , CALL (Left $ strImmLit "___chkstk_ms") [eax] + , SUB II32 (OpReg eax) (OpReg esp) + ] + | otherwise -> + [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp) + , TEST II32 (OpReg esp) (OpReg esp) + ] + ArchX86_64 | needs_probe_call platform amount -> + [ MOV II64 (OpImm (ImmInt amount)) (OpReg rax) + , CALL (Left $ strImmLit "___chkstk_ms") [rax] + , SUB II64 (OpReg rax) (OpReg rsp) + ] + | otherwise -> + [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp) + , TEST II64 (OpReg rsp) (OpReg rsp) + ] + _ -> panic "x86_mkStackAllocInstr" + _ -> + case platformArch platform of + ArchX86 -> [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp) ] + ArchX86_64 -> [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp) ] + _ -> panic "x86_mkStackAllocInstr" + +x86_mkStackDeallocInstr + :: Platform + -> Int + -> [Instr] +x86_mkStackDeallocInstr platform amount + = case platformArch platform of + ArchX86 -> [ADD II32 (OpImm (ImmInt amount)) (OpReg esp)] + ArchX86_64 -> [ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)] + _ -> panic "x86_mkStackDeallocInstr" + + +-- +-- Note [extra spill slots] +-- +-- If the register allocator used more spill slots than we have +-- pre-allocated (rESERVED_C_STACK_BYTES), then we must allocate more +-- C stack space on entry and exit from this proc. Therefore we +-- insert a "sub $N, %rsp" at every entry point, and an "add $N, %rsp" +-- before every non-local jump. 
+-- +-- This became necessary when the new codegen started bundling entire +-- functions together into one proc, because the register allocator +-- assigns a different stack slot to each virtual reg within a proc. +-- To avoid using so many slots we could also: +-- +-- - split up the proc into connected components before code generator +-- +-- - rename the virtual regs, so that we re-use vreg names and hence +-- stack slots for non-overlapping vregs. +-- +-- Note that when a block is both a non-local entry point (with an +-- info table) and a local branch target, we have to split it into +-- two, like so: +-- +-- <info table> +-- L: +-- <code> +-- +-- becomes +-- +-- <info table> +-- L: +-- subl $rsp, N +-- jmp Lnew +-- Lnew: +-- <code> +-- +-- and all branches pointing to L are retargetted to point to Lnew. +-- Otherwise, we would repeat the $rsp adjustment for each branch to +-- L. +-- +-- Returns a list of (L,Lnew) pairs. +-- +allocMoreStack + :: Platform + -> Int + -> NatCmmDecl statics GHC.CmmToAsm.X86.Instr.Instr + -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.X86.Instr.Instr, [(BlockId,BlockId)]) + +allocMoreStack _ _ top@(CmmData _ _) = return (top,[]) +allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do + let entries = entryBlocks proc + + uniqs <- replicateM (length entries) getUniqueM + + let + delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up + where x = slots * spillSlotSize platform -- sp delta + + alloc = mkStackAllocInstr platform delta + dealloc = mkStackDeallocInstr platform delta + + retargetList = (zip entries (map mkBlockId uniqs)) + + new_blockmap :: LabelMap BlockId + new_blockmap = mapFromList retargetList + + insert_stack_insns (BasicBlock id insns) + | Just new_blockid <- mapLookup id new_blockmap + = [ BasicBlock id $ alloc ++ [JXX ALWAYS new_blockid] + , BasicBlock new_blockid block' ] + | otherwise + = [ BasicBlock id block' ] + where + block' = foldr insert_dealloc [] insns + + insert_dealloc insn r = case insn of + JMP _ _ -> dealloc ++ (insn : r) + JXX_GBL _ _ -> panic "insert_dealloc: cannot handle JXX_GBL" + _other -> x86_patchJumpInstr insn retarget : r + where retarget b = fromMaybe b (mapLookup b new_blockmap) + + new_code = concatMap insert_stack_insns code + -- in + return (CmmProc info lbl live (ListGraph new_code), retargetList) + +data JumpDest = DestBlockId BlockId | DestImm Imm + +-- Debug Instance +instance Outputable JumpDest where + ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid + ppr (DestImm _imm) = text "jd<imm>:noShow" + + +getJumpDestBlockId :: JumpDest -> Maybe BlockId +getJumpDestBlockId (DestBlockId bid) = Just bid +getJumpDestBlockId _ = Nothing + +canShortcut :: Instr -> Maybe JumpDest +canShortcut (JXX ALWAYS id) = Just (DestBlockId id) +canShortcut (JMP (OpImm imm) _) = Just (DestImm imm) +canShortcut _ = Nothing + + +-- This helper shortcuts a sequence of branches. +-- The blockset helps avoid following cycles. 
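-- For example (block ids made up for illustration): with a mapping where
--   fn b1 == Just (DestBlockId b2)  and  fn b2 == Just (DestBlockId b3),
-- a branch  JXX EQQ b1  is rewritten to  JXX EQQ b3.  A cyclic mapping is
-- left untouched, because a block id that has already been seen is not
-- followed again.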
+shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr +shortcutJump fn insn = shortcutJump' fn (setEmpty :: LabelSet) insn + where + shortcutJump' :: (BlockId -> Maybe JumpDest) -> LabelSet -> Instr -> Instr + shortcutJump' fn seen insn@(JXX cc id) = + if setMember id seen then insn + else case fn id of + Nothing -> insn + Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id') + Just (DestImm imm) -> shortcutJump' fn seen' (JXX_GBL cc imm) + where seen' = setInsert id seen + shortcutJump' fn _ (JMP_TBL addr blocks section tblId) = + let updateBlock (Just (DestBlockId bid)) = + case fn bid of + Nothing -> Just (DestBlockId bid ) + Just dest -> Just dest + updateBlock dest = dest + blocks' = map updateBlock blocks + in JMP_TBL addr blocks' section tblId + shortcutJump' _ _ other = other + +-- Here because it knows about JumpDest +shortcutStatics :: (BlockId -> Maybe JumpDest) -> (Alignment, RawCmmStatics) -> (Alignment, RawCmmStatics) +shortcutStatics fn (align, RawCmmStatics lbl statics) + = (align, RawCmmStatics lbl $ map (shortcutStatic fn) statics) + -- we need to get the jump tables, so apply the mapping to the entries + -- of a CmmData too. + +shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel +shortcutLabel fn lab + | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn emptyUniqSet blkId + | otherwise = lab + +shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic +shortcutStatic fn (CmmStaticLit (CmmLabel lab)) + = CmmStaticLit (CmmLabel (shortcutLabel fn lab)) +shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w)) + = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w) + -- slightly dodgy, we're ignoring the second label, but this + -- works with the way we use CmmLabelDiffOff for jump tables now. 
+shortcutStatic _ other_static + = other_static + +shortBlockId + :: (BlockId -> Maybe JumpDest) + -> UniqSet Unique + -> BlockId + -> CLabel + +shortBlockId fn seen blockid = + case (elementOfUniqSet uq seen, fn blockid) of + (True, _) -> blockLbl blockid + (_, Nothing) -> blockLbl blockid + (_, Just (DestBlockId blockid')) -> shortBlockId fn (addOneToUniqSet seen uq) blockid' + (_, Just (DestImm (ImmCLbl lbl))) -> lbl + (_, _other) -> panic "shortBlockId" + where uq = getUnique blockid diff --git a/compiler/GHC/CmmToAsm/X86/Ppr.hs b/compiler/GHC/CmmToAsm/X86/Ppr.hs new file mode 100644 index 0000000000..a5b9041974 --- /dev/null +++ b/compiler/GHC/CmmToAsm/X86/Ppr.hs @@ -0,0 +1,1014 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Pretty-printing assembly language +-- +-- (c) The University of Glasgow 1993-2005 +-- +----------------------------------------------------------------------------- + +{-# OPTIONS_GHC -fno-warn-orphans #-} +module GHC.CmmToAsm.X86.Ppr ( + pprNatCmmDecl, + pprData, + pprInstr, + pprFormat, + pprImm, + pprDataItem, +) + +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.CmmToAsm.X86.Regs +import GHC.CmmToAsm.X86.Instr +import GHC.CmmToAsm.X86.Cond +import GHC.CmmToAsm.Instr +import GHC.CmmToAsm.Format +import GHC.Platform.Reg +import GHC.CmmToAsm.Ppr + + +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import BasicTypes (Alignment, mkAlignment, alignmentBytes) +import GHC.Driver.Session +import GHC.Cmm hiding (topInfoTable) +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import Unique ( pprUniqueAlways ) +import GHC.Platform +import FastString +import Outputable + +import Data.Word +import Data.Bits + +-- ----------------------------------------------------------------------------- +-- Printing this stuff out +-- +-- +-- Note [Subsections Via Symbols] +-- +-- If we are using the .subsections_via_symbols directive +-- (available on recent versions of Darwin), +-- we have to make sure that there is some kind of reference +-- from the entry code to a label on the _top_ of of the info table, +-- so that the linker will not think it is unreferenced and dead-strip +-- it. That's why the label is called a DeadStripPreventer (_dsp). +-- +-- The LLVM code gen already creates `iTableSuf` symbols, where +-- the X86 would generate the DeadStripPreventer (_dsp) symbol. +-- Therefore all that is left for llvm code gen, is to ensure +-- that all the `iTableSuf` symbols are marked as used. +-- As of this writing the documentation regarding the +-- .subsections_via_symbols and -dead_strip can be found at +-- <https://developer.apple.com/library/mac/documentation/DeveloperTools/Reference/Assembler/040-Assembler_Directives/asm_directives.html#//apple_ref/doc/uid/TP30000823-TPXREF101> + +pprProcAlignment :: SDoc +pprProcAlignment = sdocWithDynFlags $ \dflags -> + (maybe empty (pprAlign . 
mkAlignment) (cmmProcAlignment dflags)) + +pprNatCmmDecl :: NatCmmDecl (Alignment, RawCmmStatics) Instr -> SDoc +pprNatCmmDecl (CmmData section dats) = + pprSectionAlign section $$ pprDatas dats + +pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) = + sdocWithDynFlags $ \dflags -> + pprProcAlignment $$ + case topInfoTable proc of + Nothing -> + -- special case for code without info table: + pprSectionAlign (Section Text lbl) $$ + pprProcAlignment $$ + pprLabel lbl $$ -- blocks guaranteed not null, so label needed + vcat (map (pprBasicBlock top_info) blocks) $$ + (if debugLevel dflags > 0 + then ppr (mkAsmTempEndLabel lbl) <> char ':' else empty) $$ + pprSizeDecl lbl + + Just (RawCmmStatics info_lbl _) -> + sdocWithPlatform $ \platform -> + pprSectionAlign (Section Text info_lbl) $$ + pprProcAlignment $$ + (if platformHasSubsectionsViaSymbols platform + then ppr (mkDeadStripPreventer info_lbl) <> char ':' + else empty) $$ + vcat (map (pprBasicBlock top_info) blocks) $$ + -- above: Even the first block gets a label, because with branch-chain + -- elimination, it might be the target of a goto. + (if platformHasSubsectionsViaSymbols platform + then -- See Note [Subsections Via Symbols] + text "\t.long " + <+> ppr info_lbl + <+> char '-' + <+> ppr (mkDeadStripPreventer info_lbl) + else empty) $$ + pprSizeDecl info_lbl + +-- | Output the ELF .size directive. +pprSizeDecl :: CLabel -> SDoc +pprSizeDecl lbl + = sdocWithPlatform $ \platform -> + if osElfTarget (platformOS platform) + then text "\t.size" <+> ppr lbl <> ptext (sLit ", .-") <> ppr lbl + else empty + +pprBasicBlock :: LabelMap RawCmmStatics -> NatBasicBlock Instr -> SDoc +pprBasicBlock info_env (BasicBlock blockid instrs) + = maybe_infotable $ + pprLabel asmLbl $$ + vcat (map pprInstr instrs) $$ + (sdocOption sdocDebugLevel $ \level -> + if level > 0 + then ppr (mkAsmTempEndLabel asmLbl) <> char ':' + else empty + ) + where + asmLbl = blockLbl blockid + maybe_infotable c = case mapLookup blockid info_env of + Nothing -> c + Just (RawCmmStatics infoLbl info) -> + pprAlignForSection Text $$ + infoTableLoc $$ + vcat (map pprData info) $$ + pprLabel infoLbl $$ + c $$ + (sdocOption sdocDebugLevel $ \level -> + if level > 0 + then ppr (mkAsmTempEndLabel infoLbl) <> char ':' + else empty + ) + -- Make sure the info table has the right .loc for the block + -- coming right after it. See [Note: Info Offset] + infoTableLoc = case instrs of + (l@LOCATION{} : _) -> pprInstr l + _other -> empty + + +pprDatas :: (Alignment, RawCmmStatics) -> SDoc +-- See note [emit-time elimination of static indirections] in CLabel. 
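-- For example (label names made up), a static indirection from
-- Foo_bar_closure to Foo_baz_closure is emitted roughly as
--
--      .globl Foo_bar_closure
--      .equiv Foo_bar_closure,Foo_baz_closure
--
-- rather than as a full IND_STATIC closure.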
+pprDatas (_, RawCmmStatics alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _]) + | lbl == mkIndStaticInfoLabel + , let labelInd (CmmLabelOff l _) = Just l + labelInd (CmmLabel l) = Just l + labelInd _ = Nothing + , Just ind' <- labelInd ind + , alias `mayRedirectTo` ind' + = pprGloblDecl alias + $$ text ".equiv" <+> ppr alias <> comma <> ppr (CmmLabel ind') + +pprDatas (align, (RawCmmStatics lbl dats)) + = vcat (pprAlign align : pprLabel lbl : map pprData dats) + +pprData :: CmmStatic -> SDoc +pprData (CmmString str) = pprBytes str + +pprData (CmmUninitialised bytes) + = sdocWithPlatform $ \platform -> + if platformOS platform == OSDarwin then text ".space " <> int bytes + else text ".skip " <> int bytes + +pprData (CmmStaticLit lit) = pprDataItem lit + +pprGloblDecl :: CLabel -> SDoc +pprGloblDecl lbl + | not (externallyVisibleCLabel lbl) = empty + | otherwise = text ".globl " <> ppr lbl + +pprLabelType' :: DynFlags -> CLabel -> SDoc +pprLabelType' dflags lbl = + if isCFunctionLabel lbl || functionOkInfoTable then + text "@function" + else + text "@object" + where + {- + NOTE: This is a bit hacky. + + With the `tablesNextToCode` info tables look like this: + ``` + <info table data> + label_info: + <info table code> + ``` + So actually info table label points exactly to the code and we can mark + the label as @function. (This is required to make perf and potentially other + tools to work on Haskell binaries). + This usually works well but it can cause issues with a linker. + A linker uses different algorithms for the relocation depending on + the symbol type.For some reason, a linker will generate JUMP_SLOT relocation + when constructor info table is referenced from a data section. + This only happens with static constructor call so + we mark _con_info symbols as `@object` to avoid the issue with relocations. + + @SimonMarlow hack explanation: + "The reasoning goes like this: + + * The danger when we mark a symbol as `@function` is that the linker will + redirect it to point to the PLT and use a `JUMP_SLOT` relocation when + the symbol refers to something outside the current shared object. + A PLT / JUMP_SLOT reference only works for symbols that we jump to, not + for symbols representing data,, nor for info table symbol references which + we expect to point directly to the info table. + * GHC generates code that might refer to any info table symbol from the text + segment, but that's OK, because those will be explicit GOT references + generated by the code generator. + * When we refer to info tables from the data segment, it's either + * a FUN_STATIC/THUNK_STATIC local to this module + * a `con_info` that could be from anywhere + + So, the only info table symbols that we might refer to from the data segment + of another shared object are `con_info` symbols, so those are the ones we + need to exclude from getting the @function treatment. + " + + A good place to check for more + https://gitlab.haskell.org/ghc/ghc/wikis/commentary/position-independent-code + + Another possible hack is to create an extra local function symbol for + every code-like thing to give the needed information for to the tools + but mess up with the relocation. 
https://phabricator.haskell.org/D4730 + -} + functionOkInfoTable = tablesNextToCode dflags && + isInfoTableLabel lbl && not (isConInfoTableLabel lbl) + + +pprTypeDecl :: CLabel -> SDoc +pprTypeDecl lbl + = sdocWithPlatform $ \platform -> + if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl + then + sdocWithDynFlags $ \df -> + text ".type " <> ppr lbl <> ptext (sLit ", ") <> pprLabelType' df lbl + else empty + +pprLabel :: CLabel -> SDoc +pprLabel lbl = pprGloblDecl lbl + $$ pprTypeDecl lbl + $$ (ppr lbl <> char ':') + +pprAlign :: Alignment -> SDoc +pprAlign alignment + = sdocWithPlatform $ \platform -> + text ".align " <> int (alignmentOn platform) + where + bytes = alignmentBytes alignment + alignmentOn platform = if platformOS platform == OSDarwin + then log2 bytes + else bytes + + log2 :: Int -> Int -- cache the common ones + log2 1 = 0 + log2 2 = 1 + log2 4 = 2 + log2 8 = 3 + log2 n = 1 + log2 (n `quot` 2) + +-- ----------------------------------------------------------------------------- +-- pprInstr: print an 'Instr' + +instance Outputable Instr where + ppr instr = pprInstr instr + + +pprReg :: Format -> Reg -> SDoc +pprReg f r + = case r of + RegReal (RealRegSingle i) -> + sdocWithPlatform $ \platform -> + if target32Bit platform then ppr32_reg_no f i + else ppr64_reg_no f i + RegReal (RealRegPair _ _) -> panic "X86.Ppr: no reg pairs on this arch" + RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u + RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u + RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u + RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u + + where + ppr32_reg_no :: Format -> Int -> SDoc + ppr32_reg_no II8 = ppr32_reg_byte + ppr32_reg_no II16 = ppr32_reg_word + ppr32_reg_no _ = ppr32_reg_long + + ppr32_reg_byte i = ptext + (case i of { + 0 -> sLit "%al"; 1 -> sLit "%bl"; + 2 -> sLit "%cl"; 3 -> sLit "%dl"; + _ -> sLit $ "very naughty I386 byte register: " ++ show i + }) + + ppr32_reg_word i = ptext + (case i of { + 0 -> sLit "%ax"; 1 -> sLit "%bx"; + 2 -> sLit "%cx"; 3 -> sLit "%dx"; + 4 -> sLit "%si"; 5 -> sLit "%di"; + 6 -> sLit "%bp"; 7 -> sLit "%sp"; + _ -> sLit "very naughty I386 word register" + }) + + ppr32_reg_long i = ptext + (case i of { + 0 -> sLit "%eax"; 1 -> sLit "%ebx"; + 2 -> sLit "%ecx"; 3 -> sLit "%edx"; + 4 -> sLit "%esi"; 5 -> sLit "%edi"; + 6 -> sLit "%ebp"; 7 -> sLit "%esp"; + _ -> ppr_reg_float i + }) + + ppr64_reg_no :: Format -> Int -> SDoc + ppr64_reg_no II8 = ppr64_reg_byte + ppr64_reg_no II16 = ppr64_reg_word + ppr64_reg_no II32 = ppr64_reg_long + ppr64_reg_no _ = ppr64_reg_quad + + ppr64_reg_byte i = ptext + (case i of { + 0 -> sLit "%al"; 1 -> sLit "%bl"; + 2 -> sLit "%cl"; 3 -> sLit "%dl"; + 4 -> sLit "%sil"; 5 -> sLit "%dil"; -- new 8-bit regs! 
+ 6 -> sLit "%bpl"; 7 -> sLit "%spl"; + 8 -> sLit "%r8b"; 9 -> sLit "%r9b"; + 10 -> sLit "%r10b"; 11 -> sLit "%r11b"; + 12 -> sLit "%r12b"; 13 -> sLit "%r13b"; + 14 -> sLit "%r14b"; 15 -> sLit "%r15b"; + _ -> sLit $ "very naughty x86_64 byte register: " ++ show i + }) + + ppr64_reg_word i = ptext + (case i of { + 0 -> sLit "%ax"; 1 -> sLit "%bx"; + 2 -> sLit "%cx"; 3 -> sLit "%dx"; + 4 -> sLit "%si"; 5 -> sLit "%di"; + 6 -> sLit "%bp"; 7 -> sLit "%sp"; + 8 -> sLit "%r8w"; 9 -> sLit "%r9w"; + 10 -> sLit "%r10w"; 11 -> sLit "%r11w"; + 12 -> sLit "%r12w"; 13 -> sLit "%r13w"; + 14 -> sLit "%r14w"; 15 -> sLit "%r15w"; + _ -> sLit "very naughty x86_64 word register" + }) + + ppr64_reg_long i = ptext + (case i of { + 0 -> sLit "%eax"; 1 -> sLit "%ebx"; + 2 -> sLit "%ecx"; 3 -> sLit "%edx"; + 4 -> sLit "%esi"; 5 -> sLit "%edi"; + 6 -> sLit "%ebp"; 7 -> sLit "%esp"; + 8 -> sLit "%r8d"; 9 -> sLit "%r9d"; + 10 -> sLit "%r10d"; 11 -> sLit "%r11d"; + 12 -> sLit "%r12d"; 13 -> sLit "%r13d"; + 14 -> sLit "%r14d"; 15 -> sLit "%r15d"; + _ -> sLit "very naughty x86_64 register" + }) + + ppr64_reg_quad i = ptext + (case i of { + 0 -> sLit "%rax"; 1 -> sLit "%rbx"; + 2 -> sLit "%rcx"; 3 -> sLit "%rdx"; + 4 -> sLit "%rsi"; 5 -> sLit "%rdi"; + 6 -> sLit "%rbp"; 7 -> sLit "%rsp"; + 8 -> sLit "%r8"; 9 -> sLit "%r9"; + 10 -> sLit "%r10"; 11 -> sLit "%r11"; + 12 -> sLit "%r12"; 13 -> sLit "%r13"; + 14 -> sLit "%r14"; 15 -> sLit "%r15"; + _ -> ppr_reg_float i + }) + +ppr_reg_float :: Int -> PtrString +ppr_reg_float i = case i of + 16 -> sLit "%xmm0" ; 17 -> sLit "%xmm1" + 18 -> sLit "%xmm2" ; 19 -> sLit "%xmm3" + 20 -> sLit "%xmm4" ; 21 -> sLit "%xmm5" + 22 -> sLit "%xmm6" ; 23 -> sLit "%xmm7" + 24 -> sLit "%xmm8" ; 25 -> sLit "%xmm9" + 26 -> sLit "%xmm10"; 27 -> sLit "%xmm11" + 28 -> sLit "%xmm12"; 29 -> sLit "%xmm13" + 30 -> sLit "%xmm14"; 31 -> sLit "%xmm15" + _ -> sLit "very naughty x86 register" + +pprFormat :: Format -> SDoc +pprFormat x + = ptext (case x of + II8 -> sLit "b" + II16 -> sLit "w" + II32 -> sLit "l" + II64 -> sLit "q" + FF32 -> sLit "ss" -- "scalar single-precision float" (SSE2) + FF64 -> sLit "sd" -- "scalar double-precision float" (SSE2) + ) + +pprFormat_x87 :: Format -> SDoc +pprFormat_x87 x + = ptext $ case x of + FF32 -> sLit "s" + FF64 -> sLit "l" + _ -> panic "X86.Ppr.pprFormat_x87" + + +pprCond :: Cond -> SDoc +pprCond c + = ptext (case c of { + GEU -> sLit "ae"; LU -> sLit "b"; + EQQ -> sLit "e"; GTT -> sLit "g"; + GE -> sLit "ge"; GU -> sLit "a"; + LTT -> sLit "l"; LE -> sLit "le"; + LEU -> sLit "be"; NE -> sLit "ne"; + NEG -> sLit "s"; POS -> sLit "ns"; + CARRY -> sLit "c"; OFLO -> sLit "o"; + PARITY -> sLit "p"; NOTPARITY -> sLit "np"; + ALWAYS -> sLit "mp"}) + + +pprImm :: Imm -> SDoc +pprImm (ImmInt i) = int i +pprImm (ImmInteger i) = integer i +pprImm (ImmCLbl l) = ppr l +pprImm (ImmIndex l i) = ppr l <> char '+' <> int i +pprImm (ImmLit s) = s + +pprImm (ImmFloat _) = text "naughty float immediate" +pprImm (ImmDouble _) = text "naughty double immediate" + +pprImm (ImmConstantSum a b) = pprImm a <> char '+' <> pprImm b +pprImm (ImmConstantDiff a b) = pprImm a <> char '-' + <> lparen <> pprImm b <> rparen + + + +pprAddr :: AddrMode -> SDoc +pprAddr (ImmAddr imm off) + = let pp_imm = pprImm imm + in + if (off == 0) then + pp_imm + else if (off < 0) then + pp_imm <> int off + else + pp_imm <> char '+' <> int off + +pprAddr (AddrBaseIndex base index displacement) + = sdocWithPlatform $ \platform -> + let + pp_disp = ppr_disp displacement + pp_off p = pp_disp <> char '(' <> p <> char 
')' + pp_reg r = pprReg (archWordFormat (target32Bit platform)) r + in + case (base, index) of + (EABaseNone, EAIndexNone) -> pp_disp + (EABaseReg b, EAIndexNone) -> pp_off (pp_reg b) + (EABaseRip, EAIndexNone) -> pp_off (text "%rip") + (EABaseNone, EAIndex r i) -> pp_off (comma <> pp_reg r <> comma <> int i) + (EABaseReg b, EAIndex r i) -> pp_off (pp_reg b <> comma <> pp_reg r + <> comma <> int i) + _ -> panic "X86.Ppr.pprAddr: no match" + + where + ppr_disp (ImmInt 0) = empty + ppr_disp imm = pprImm imm + +-- | Print section header and appropriate alignment for that section. +pprSectionAlign :: Section -> SDoc +pprSectionAlign (Section (OtherSection _) _) = + panic "X86.Ppr.pprSectionAlign: unknown section" +pprSectionAlign sec@(Section seg _) = + sdocWithPlatform $ \platform -> + pprSectionHeader platform sec $$ + pprAlignForSection seg + +-- | Print appropriate alignment for the given section type. +pprAlignForSection :: SectionType -> SDoc +pprAlignForSection seg = + sdocWithPlatform $ \platform -> + text ".align " <> + case platformOS platform of + -- Darwin: alignments are given as shifts. + OSDarwin + | target32Bit platform -> + case seg of + ReadOnlyData16 -> int 4 + CString -> int 1 + _ -> int 2 + | otherwise -> + case seg of + ReadOnlyData16 -> int 4 + CString -> int 1 + _ -> int 3 + -- Other: alignments are given as bytes. + _ + | target32Bit platform -> + case seg of + Text -> text "4,0x90" + ReadOnlyData16 -> int 16 + CString -> int 1 + _ -> int 4 + | otherwise -> + case seg of + ReadOnlyData16 -> int 16 + CString -> int 1 + _ -> int 8 + +pprDataItem :: CmmLit -> SDoc +pprDataItem lit = sdocWithDynFlags $ \dflags -> pprDataItem' dflags lit + +pprDataItem' :: DynFlags -> CmmLit -> SDoc +pprDataItem' dflags lit + = vcat (ppr_item (cmmTypeFormat $ cmmLitType dflags lit) lit) + where + platform = targetPlatform dflags + imm = litToImm lit + + -- These seem to be common: + ppr_item II8 _ = [text "\t.byte\t" <> pprImm imm] + ppr_item II16 _ = [text "\t.word\t" <> pprImm imm] + ppr_item II32 _ = [text "\t.long\t" <> pprImm imm] + + ppr_item FF32 (CmmFloat r _) + = let bs = floatToBytes (fromRational r) + in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs + + ppr_item FF64 (CmmFloat r _) + = let bs = doubleToBytes (fromRational r) + in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs + + ppr_item II64 _ + = case platformOS platform of + OSDarwin + | target32Bit platform -> + case lit of + CmmInt x _ -> + [text "\t.long\t" + <> int (fromIntegral (fromIntegral x :: Word32)), + text "\t.long\t" + <> int (fromIntegral + (fromIntegral (x `shiftR` 32) :: Word32))] + _ -> panic "X86.Ppr.ppr_item: no match for II64" + | otherwise -> + [text "\t.quad\t" <> pprImm imm] + _ + | target32Bit platform -> + [text "\t.quad\t" <> pprImm imm] + | otherwise -> + -- x86_64: binutils can't handle the R_X86_64_PC64 + -- relocation type, which means we can't do + -- pc-relative 64-bit addresses. 
Fortunately we're + -- assuming the small memory model, in which all such + -- offsets will fit into 32 bits, so we have to stick + -- to 32-bit offset fields and modify the RTS + -- appropriately + -- + -- See Note [x86-64-relative] in includes/rts/storage/InfoTables.h + -- + case lit of + -- A relative relocation: + CmmLabelDiffOff _ _ _ _ -> + [text "\t.long\t" <> pprImm imm, + text "\t.long\t0"] + _ -> + [text "\t.quad\t" <> pprImm imm] + + ppr_item _ _ + = panic "X86.Ppr.ppr_item: no match" + + +asmComment :: SDoc -> SDoc +asmComment c = whenPprDebug $ text "# " <> c + +pprInstr :: Instr -> SDoc + +pprInstr (COMMENT s) + = asmComment (ftext s) + +pprInstr (LOCATION file line col _name) + = text "\t.loc " <> ppr file <+> ppr line <+> ppr col + +pprInstr (DELTA d) + = asmComment $ text ("\tdelta = " ++ show d) + +pprInstr (NEWBLOCK _) + = panic "PprMach.pprInstr: NEWBLOCK" + +pprInstr (UNWIND lbl d) + = asmComment (text "\tunwind = " <> ppr d) + $$ ppr lbl <> colon + +pprInstr (LDATA _ _) + = panic "PprMach.pprInstr: LDATA" + +{- +pprInstr (SPILL reg slot) + = hcat [ + text "\tSPILL", + char ' ', + pprUserReg reg, + comma, + text "SLOT" <> parens (int slot)] + +pprInstr (RELOAD slot reg) + = hcat [ + text "\tRELOAD", + char ' ', + text "SLOT" <> parens (int slot), + comma, + pprUserReg reg] +-} + +-- Replace 'mov $0x0,%reg' by 'xor %reg,%reg', which is smaller and cheaper. +-- The code generator catches most of these already, but not all. +pprInstr (MOV format (OpImm (ImmInt 0)) dst@(OpReg _)) + = pprInstr (XOR format' dst dst) + where format' = case format of + II64 -> II32 -- 32-bit version is equivalent, and smaller + _ -> format +pprInstr (MOV format src dst) + = pprFormatOpOp (sLit "mov") format src dst + +pprInstr (CMOV cc format src dst) + = pprCondOpReg (sLit "cmov") format cc src dst + +pprInstr (MOVZxL II32 src dst) = pprFormatOpOp (sLit "mov") II32 src dst + -- 32-to-64 bit zero extension on x86_64 is accomplished by a simple + -- movl. But we represent it as a MOVZxL instruction, because + -- the reg alloc would tend to throw away a plain reg-to-reg + -- move, and we still want it to do that. + +pprInstr (MOVZxL formats src dst) + = pprFormatOpOpCoerce (sLit "movz") formats II32 src dst + -- zero-extension only needs to extend to 32 bits: on x86_64, + -- the remaining zero-extension to 64 bits is automatic, and the 32-bit + -- instruction is shorter. + +pprInstr (MOVSxL formats src dst) + = sdocWithPlatform $ \platform -> + pprFormatOpOpCoerce (sLit "movs") formats (archWordFormat (target32Bit platform)) src dst + +-- here we do some patching, since the physical registers are only set late +-- in the code generation. 
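-- For instance (AT&T syntax, illustrative):
--
--      leaq (%rbx,%rax,1),%rax   is printed as   addq %rbx,%rax
--      leaq 16(%rbx),%rbx        is printed as   addq $16,%rbx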
+pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) (EAIndex reg2 1) (ImmInt 0))) dst@(OpReg reg3)) + | reg1 == reg3 + = pprFormatOpOp (sLit "add") format (OpReg reg2) dst + +pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) (EAIndex reg2 1) (ImmInt 0))) dst@(OpReg reg3)) + | reg2 == reg3 + = pprFormatOpOp (sLit "add") format (OpReg reg1) dst + +pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) EAIndexNone displ)) dst@(OpReg reg3)) + | reg1 == reg3 + = pprInstr (ADD format (OpImm displ) dst) + +pprInstr (LEA format src dst) = pprFormatOpOp (sLit "lea") format src dst + +pprInstr (ADD format (OpImm (ImmInt (-1))) dst) + = pprFormatOp (sLit "dec") format dst +pprInstr (ADD format (OpImm (ImmInt 1)) dst) + = pprFormatOp (sLit "inc") format dst +pprInstr (ADD format src dst) = pprFormatOpOp (sLit "add") format src dst +pprInstr (ADC format src dst) = pprFormatOpOp (sLit "adc") format src dst +pprInstr (SUB format src dst) = pprFormatOpOp (sLit "sub") format src dst +pprInstr (SBB format src dst) = pprFormatOpOp (sLit "sbb") format src dst +pprInstr (IMUL format op1 op2) = pprFormatOpOp (sLit "imul") format op1 op2 + +pprInstr (ADD_CC format src dst) + = pprFormatOpOp (sLit "add") format src dst +pprInstr (SUB_CC format src dst) + = pprFormatOpOp (sLit "sub") format src dst + +{- A hack. The Intel documentation says that "The two and three + operand forms [of IMUL] may also be used with unsigned operands + because the lower half of the product is the same regardless if + (sic) the operands are signed or unsigned. The CF and OF flags, + however, cannot be used to determine if the upper half of the + result is non-zero." So there. +-} + +-- Use a 32-bit instruction when possible as it saves a byte. +-- Notably, extracting the tag bits of a pointer has this form. 
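-- For example (illustrative): a tag-bit test such as  andq $7,%rbx  can be
-- emitted as  andl $7,%ebx ; either encoding clears the upper 32 bits when
-- the mask fits in 32 bits, so the shorter 32-bit form is equivalent.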
+-- TODO: we could save a byte in a subsequent CMP instruction too, +-- but need something like a peephole pass for this +pprInstr (AND II64 src@(OpImm (ImmInteger mask)) dst) + | 0 <= mask && mask < 0xffffffff + = pprInstr (AND II32 src dst) +pprInstr (AND FF32 src dst) = pprOpOp (sLit "andps") FF32 src dst +pprInstr (AND FF64 src dst) = pprOpOp (sLit "andpd") FF64 src dst +pprInstr (AND format src dst) = pprFormatOpOp (sLit "and") format src dst +pprInstr (OR format src dst) = pprFormatOpOp (sLit "or") format src dst + +pprInstr (XOR FF32 src dst) = pprOpOp (sLit "xorps") FF32 src dst +pprInstr (XOR FF64 src dst) = pprOpOp (sLit "xorpd") FF64 src dst +pprInstr (XOR format src dst) = pprFormatOpOp (sLit "xor") format src dst + +pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst) +pprInstr (LZCNT format src dst) = pprOpOp (sLit "lzcnt") format src (OpReg dst) +pprInstr (TZCNT format src dst) = pprOpOp (sLit "tzcnt") format src (OpReg dst) +pprInstr (BSF format src dst) = pprOpOp (sLit "bsf") format src (OpReg dst) +pprInstr (BSR format src dst) = pprOpOp (sLit "bsr") format src (OpReg dst) + +pprInstr (PDEP format src mask dst) = pprFormatOpOpReg (sLit "pdep") format src mask dst +pprInstr (PEXT format src mask dst) = pprFormatOpOpReg (sLit "pext") format src mask dst + +pprInstr (PREFETCH NTA format src ) = pprFormatOp_ (sLit "prefetchnta") format src +pprInstr (PREFETCH Lvl0 format src) = pprFormatOp_ (sLit "prefetcht0") format src +pprInstr (PREFETCH Lvl1 format src) = pprFormatOp_ (sLit "prefetcht1") format src +pprInstr (PREFETCH Lvl2 format src) = pprFormatOp_ (sLit "prefetcht2") format src + +pprInstr (NOT format op) = pprFormatOp (sLit "not") format op +pprInstr (BSWAP format op) = pprFormatOp (sLit "bswap") format (OpReg op) +pprInstr (NEGI format op) = pprFormatOp (sLit "neg") format op + +pprInstr (SHL format src dst) = pprShift (sLit "shl") format src dst +pprInstr (SAR format src dst) = pprShift (sLit "sar") format src dst +pprInstr (SHR format src dst) = pprShift (sLit "shr") format src dst + +pprInstr (BT format imm src) = pprFormatImmOp (sLit "bt") format imm src + +pprInstr (CMP format src dst) + | isFloatFormat format = pprFormatOpOp (sLit "ucomi") format src dst -- SSE2 + | otherwise = pprFormatOpOp (sLit "cmp") format src dst + +pprInstr (TEST format src dst) = sdocWithPlatform $ \platform -> + let format' = case (src,dst) of + -- Match instructions like 'test $0x3,%esi' or 'test $0x7,%rbx'. + -- We can replace them by equivalent, but smaller instructions + -- by reducing the size of the immediate operand as far as possible. + -- (We could handle masks larger than a single byte too, + -- but it would complicate the code considerably + -- and tag checks are by far the most common case.) + -- The mask must have the high bit clear for this smaller encoding + -- to be completely equivalent to the original; in particular so + -- that the signed comparison condition bits are the same as they + -- would be if doing a full word comparison. See #13425. + (OpImm (ImmInteger mask), OpReg dstReg) + | 0 <= mask && mask < 128 -> minSizeOfReg platform dstReg + _ -> format + in pprFormatOpOp (sLit "test") format' src dst + where + minSizeOfReg platform (RegReal (RealRegSingle i)) + | target32Bit platform && i <= 3 = II8 -- al, bl, cl, dl + | target32Bit platform && i <= 7 = II16 -- si, di, bp, sp + | not (target32Bit platform) && i <= 15 = II8 -- al .. 
r15b + minSizeOfReg _ _ = format -- other + +pprInstr (PUSH format op) = pprFormatOp (sLit "push") format op +pprInstr (POP format op) = pprFormatOp (sLit "pop") format op + +-- both unused (SDM): +-- pprInstr PUSHA = text "\tpushal" +-- pprInstr POPA = text "\tpopal" + +pprInstr NOP = text "\tnop" +pprInstr (CLTD II8) = text "\tcbtw" +pprInstr (CLTD II16) = text "\tcwtd" +pprInstr (CLTD II32) = text "\tcltd" +pprInstr (CLTD II64) = text "\tcqto" +pprInstr (CLTD x) = panic $ "pprInstr: " ++ show x + +pprInstr (SETCC cond op) = pprCondInstr (sLit "set") cond (pprOperand II8 op) + +pprInstr (JXX cond blockid) + = pprCondInstr (sLit "j") cond (ppr lab) + where lab = blockLbl blockid + +pprInstr (JXX_GBL cond imm) = pprCondInstr (sLit "j") cond (pprImm imm) + +pprInstr (JMP (OpImm imm) _) = text "\tjmp " <> pprImm imm +pprInstr (JMP op _) = sdocWithPlatform $ \platform -> + text "\tjmp *" + <> pprOperand (archWordFormat (target32Bit platform)) op +pprInstr (JMP_TBL op _ _ _) = pprInstr (JMP op []) +pprInstr (CALL (Left imm) _) = text "\tcall " <> pprImm imm +pprInstr (CALL (Right reg) _) = sdocWithPlatform $ \platform -> + text "\tcall *" + <> pprReg (archWordFormat (target32Bit platform)) reg + +pprInstr (IDIV fmt op) = pprFormatOp (sLit "idiv") fmt op +pprInstr (DIV fmt op) = pprFormatOp (sLit "div") fmt op +pprInstr (IMUL2 fmt op) = pprFormatOp (sLit "imul") fmt op + +-- x86_64 only +pprInstr (MUL format op1 op2) = pprFormatOpOp (sLit "mul") format op1 op2 +pprInstr (MUL2 format op) = pprFormatOp (sLit "mul") format op + +pprInstr (FDIV format op1 op2) = pprFormatOpOp (sLit "div") format op1 op2 +pprInstr (SQRT format op1 op2) = pprFormatOpReg (sLit "sqrt") format op1 op2 + +pprInstr (CVTSS2SD from to) = pprRegReg (sLit "cvtss2sd") from to +pprInstr (CVTSD2SS from to) = pprRegReg (sLit "cvtsd2ss") from to +pprInstr (CVTTSS2SIQ fmt from to) = pprFormatFormatOpReg (sLit "cvttss2si") FF32 fmt from to +pprInstr (CVTTSD2SIQ fmt from to) = pprFormatFormatOpReg (sLit "cvttsd2si") FF64 fmt from to +pprInstr (CVTSI2SS fmt from to) = pprFormatOpReg (sLit "cvtsi2ss") fmt from to +pprInstr (CVTSI2SD fmt from to) = pprFormatOpReg (sLit "cvtsi2sd") fmt from to + + -- FETCHGOT for PIC on ELF platforms +pprInstr (FETCHGOT reg) + = vcat [ text "\tcall 1f", + hcat [ text "1:\tpopl\t", pprReg II32 reg ], + hcat [ text "\taddl\t$_GLOBAL_OFFSET_TABLE_+(.-1b), ", + pprReg II32 reg ] + ] + + -- FETCHPC for PIC on Darwin/x86 + -- get the instruction pointer into a register + -- (Terminology note: the IP is called Program Counter on PPC, + -- and it's a good thing to use the same name on both platforms) +pprInstr (FETCHPC reg) + = vcat [ text "\tcall 1f", + hcat [ text "1:\tpopl\t", pprReg II32 reg ] + ] + + +-- the +-- GST fmt src addr ==> FLD dst ; FSTPsz addr +pprInstr g@(X87Store fmt addr) + = pprX87 g (hcat [gtab, + text "fstp", pprFormat_x87 fmt, gsp, pprAddr addr]) + + +-- Atomics + +pprInstr (LOCK i) = text "\tlock" $$ pprInstr i + +pprInstr MFENCE = text "\tmfence" + +pprInstr (XADD format src dst) = pprFormatOpOp (sLit "xadd") format src dst + +pprInstr (CMPXCHG format src dst) + = pprFormatOpOp (sLit "cmpxchg") format src dst + + + +-------------------------- +-- some left over + + + +gtab :: SDoc +gtab = char '\t' + +gsp :: SDoc +gsp = char ' ' + + + +pprX87 :: Instr -> SDoc -> SDoc +pprX87 fake actual + = (char '#' <> pprX87Instr fake) $$ actual + +pprX87Instr :: Instr -> SDoc +pprX87Instr (X87Store fmt dst) = pprFormatAddr (sLit "gst") fmt dst +pprX87Instr _ = panic "X86.Ppr.pprX87Instr: no match" + 
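-- The printing helpers below follow the GNU assembler's AT&T conventions:
-- a tab, the mnemonic with its format suffix, then comma-separated operands.
-- For example, on x86_64 (illustrative):
--
--   pprInstr (MOV II64 (OpReg rbx) (OpReg rax))   renders as   movq %rbx,%rax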
+pprDollImm :: Imm -> SDoc +pprDollImm i = text "$" <> pprImm i + + +pprOperand :: Format -> Operand -> SDoc +pprOperand f (OpReg r) = pprReg f r +pprOperand _ (OpImm i) = pprDollImm i +pprOperand _ (OpAddr ea) = pprAddr ea + + +pprMnemonic_ :: PtrString -> SDoc +pprMnemonic_ name = + char '\t' <> ptext name <> space + + +pprMnemonic :: PtrString -> Format -> SDoc +pprMnemonic name format = + char '\t' <> ptext name <> pprFormat format <> space + + +pprFormatImmOp :: PtrString -> Format -> Imm -> Operand -> SDoc +pprFormatImmOp name format imm op1 + = hcat [ + pprMnemonic name format, + char '$', + pprImm imm, + comma, + pprOperand format op1 + ] + + +pprFormatOp_ :: PtrString -> Format -> Operand -> SDoc +pprFormatOp_ name format op1 + = hcat [ + pprMnemonic_ name , + pprOperand format op1 + ] + +pprFormatOp :: PtrString -> Format -> Operand -> SDoc +pprFormatOp name format op1 + = hcat [ + pprMnemonic name format, + pprOperand format op1 + ] + + +pprFormatOpOp :: PtrString -> Format -> Operand -> Operand -> SDoc +pprFormatOpOp name format op1 op2 + = hcat [ + pprMnemonic name format, + pprOperand format op1, + comma, + pprOperand format op2 + ] + + +pprOpOp :: PtrString -> Format -> Operand -> Operand -> SDoc +pprOpOp name format op1 op2 + = hcat [ + pprMnemonic_ name, + pprOperand format op1, + comma, + pprOperand format op2 + ] + + + +pprRegReg :: PtrString -> Reg -> Reg -> SDoc +pprRegReg name reg1 reg2 + = sdocWithPlatform $ \platform -> + hcat [ + pprMnemonic_ name, + pprReg (archWordFormat (target32Bit platform)) reg1, + comma, + pprReg (archWordFormat (target32Bit platform)) reg2 + ] + + +pprFormatOpReg :: PtrString -> Format -> Operand -> Reg -> SDoc +pprFormatOpReg name format op1 reg2 + = sdocWithPlatform $ \platform -> + hcat [ + pprMnemonic name format, + pprOperand format op1, + comma, + pprReg (archWordFormat (target32Bit platform)) reg2 + ] + +pprCondOpReg :: PtrString -> Format -> Cond -> Operand -> Reg -> SDoc +pprCondOpReg name format cond op1 reg2 + = hcat [ + char '\t', + ptext name, + pprCond cond, + space, + pprOperand format op1, + comma, + pprReg format reg2 + ] + +pprFormatFormatOpReg :: PtrString -> Format -> Format -> Operand -> Reg -> SDoc +pprFormatFormatOpReg name format1 format2 op1 reg2 + = hcat [ + pprMnemonic name format2, + pprOperand format1 op1, + comma, + pprReg format2 reg2 + ] + +pprFormatOpOpReg :: PtrString -> Format -> Operand -> Operand -> Reg -> SDoc +pprFormatOpOpReg name format op1 op2 reg3 + = hcat [ + pprMnemonic name format, + pprOperand format op1, + comma, + pprOperand format op2, + comma, + pprReg format reg3 + ] + + + +pprFormatAddr :: PtrString -> Format -> AddrMode -> SDoc +pprFormatAddr name format op + = hcat [ + pprMnemonic name format, + comma, + pprAddr op + ] + +pprShift :: PtrString -> Format -> Operand -> Operand -> SDoc +pprShift name format src dest + = hcat [ + pprMnemonic name format, + pprOperand II8 src, -- src is 8-bit sized + comma, + pprOperand format dest + ] + + +pprFormatOpOpCoerce :: PtrString -> Format -> Format -> Operand -> Operand -> SDoc +pprFormatOpOpCoerce name format1 format2 op1 op2 + = hcat [ char '\t', ptext name, pprFormat format1, pprFormat format2, space, + pprOperand format1 op1, + comma, + pprOperand format2 op2 + ] + + +pprCondInstr :: PtrString -> Cond -> SDoc -> SDoc +pprCondInstr name cond arg + = hcat [ char '\t', ptext name, pprCond cond, space, arg] diff --git a/compiler/GHC/CmmToAsm/X86/RegInfo.hs b/compiler/GHC/CmmToAsm/X86/RegInfo.hs new file mode 100644 index 0000000000..597efe1c3e --- 
/dev/null +++ b/compiler/GHC/CmmToAsm/X86/RegInfo.hs @@ -0,0 +1,73 @@ +{-# LANGUAGE CPP #-} +module GHC.CmmToAsm.X86.RegInfo ( + mkVirtualReg, + regDotColor +) + +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.CmmToAsm.Format +import GHC.Platform.Reg + +import Outputable +import GHC.Platform +import Unique + +import UniqFM +import GHC.CmmToAsm.X86.Regs + + +mkVirtualReg :: Unique -> Format -> VirtualReg +mkVirtualReg u format + = case format of + FF32 -> VirtualRegD u + -- for scalar F32, we use the same xmm as F64! + -- this is a hack that needs some improvement. + -- For now we map both to being allocated as "Double" Registers + -- on X86/X86_64 + FF64 -> VirtualRegD u + _other -> VirtualRegI u + +regDotColor :: Platform -> RealReg -> SDoc +regDotColor platform reg + = case (lookupUFM (regColors platform) reg) of + Just str -> text str + _ -> panic "Register not assigned a color" + +regColors :: Platform -> UniqFM [Char] +regColors platform = listToUFM (normalRegColors platform) + +normalRegColors :: Platform -> [(Reg,String)] +normalRegColors platform = + zip (map regSingle [0..lastint platform]) colors + ++ zip (map regSingle [firstxmm..lastxmm platform]) greys + where + -- 16 colors - enough for amd64 gp regs + colors = ["#800000","#ff0000","#808000","#ffff00","#008000" + ,"#00ff00","#008080","#00ffff","#000080","#0000ff" + ,"#800080","#ff00ff","#87005f","#875f00","#87af00" + ,"#ff00af"] + + -- 16 shades of grey, enough for the currently supported + -- SSE extensions. + greys = ["#0e0e0e","#1c1c1c","#2a2a2a","#383838","#464646" + ,"#545454","#626262","#707070","#7e7e7e","#8c8c8c" + ,"#9a9a9a","#a8a8a8","#b6b6b6","#c4c4c4","#d2d2d2" + ,"#e0e0e0"] + + + +-- 32 shades of grey - use for avx 512 if we ever need it +-- greys = ["#070707","#0e0e0e","#151515","#1c1c1c" +-- ,"#232323","#2a2a2a","#313131","#383838","#3f3f3f" +-- ,"#464646","#4d4d4d","#545454","#5b5b5b","#626262" +-- ,"#696969","#707070","#777777","#7e7e7e","#858585" +-- ,"#8c8c8c","#939393","#9a9a9a","#a1a1a1","#a8a8a8" +-- ,"#afafaf","#b6b6b6","#bdbdbd","#c4c4c4","#cbcbcb" +-- ,"#d2d2d2","#d9d9d9","#e0e0e0"] + + diff --git a/compiler/GHC/CmmToAsm/X86/Regs.hs b/compiler/GHC/CmmToAsm/X86/Regs.hs new file mode 100644 index 0000000000..87e31a1428 --- /dev/null +++ b/compiler/GHC/CmmToAsm/X86/Regs.hs @@ -0,0 +1,442 @@ +{-# LANGUAGE CPP #-} + +module GHC.CmmToAsm.X86.Regs ( + -- squeese functions for the graph allocator + virtualRegSqueeze, + realRegSqueeze, + + -- immediates + Imm(..), + strImmLit, + litToImm, + + -- addressing modes + AddrMode(..), + addrOffset, + + -- registers + spRel, + argRegs, + allArgRegs, + allIntArgRegs, + callClobberedRegs, + instrClobberedRegs, + allMachRegNos, + classOfRealReg, + showReg, + + -- machine specific + EABase(..), EAIndex(..), addrModeRegs, + + eax, ebx, ecx, edx, esi, edi, ebp, esp, + + + rax, rbx, rcx, rdx, rsi, rdi, rbp, rsp, + r8, r9, r10, r11, r12, r13, r14, r15, + lastint, + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, + xmm, + firstxmm, lastxmm, + + ripRel, + allFPArgRegs, + + allocatableRegs +) + +where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.Platform.Regs +import GHC.Platform.Reg +import GHC.Platform.Reg.Class + +import GHC.Cmm +import GHC.Cmm.CLabel ( CLabel ) +import GHC.Driver.Session +import Outputable +import GHC.Platform + +import qualified Data.Array as A + +-- | regSqueeze_class reg +-- Calculate the maximum number of register colors that could be +-- denied to a node of 
this class due to having this reg +-- as a neighbour. +-- +{-# INLINE virtualRegSqueeze #-} +virtualRegSqueeze :: RegClass -> VirtualReg -> Int + +virtualRegSqueeze cls vr + = case cls of + RcInteger + -> case vr of + VirtualRegI{} -> 1 + VirtualRegHi{} -> 1 + _other -> 0 + + RcDouble + -> case vr of + VirtualRegD{} -> 1 + VirtualRegF{} -> 0 + _other -> 0 + + + _other -> 0 + +{-# INLINE realRegSqueeze #-} +realRegSqueeze :: RegClass -> RealReg -> Int +realRegSqueeze cls rr + = case cls of + RcInteger + -> case rr of + RealRegSingle regNo + | regNo < firstxmm -> 1 + | otherwise -> 0 + + RealRegPair{} -> 0 + + RcDouble + -> case rr of + RealRegSingle regNo + | regNo >= firstxmm -> 1 + | otherwise -> 0 + + RealRegPair{} -> 0 + + + _other -> 0 + +-- ----------------------------------------------------------------------------- +-- Immediates + +data Imm + = ImmInt Int + | ImmInteger Integer -- Sigh. + | ImmCLbl CLabel -- AbstractC Label (with baggage) + | ImmLit SDoc -- Simple string + | ImmIndex CLabel Int + | ImmFloat Rational + | ImmDouble Rational + | ImmConstantSum Imm Imm + | ImmConstantDiff Imm Imm + +strImmLit :: String -> Imm +strImmLit s = ImmLit (text s) + + +litToImm :: CmmLit -> Imm +litToImm (CmmInt i w) = ImmInteger (narrowS w i) + -- narrow to the width: a CmmInt might be out of + -- range, but we assume that ImmInteger only contains + -- in-range values. A signed value should be fine here. +litToImm (CmmFloat f W32) = ImmFloat f +litToImm (CmmFloat f W64) = ImmDouble f +litToImm (CmmLabel l) = ImmCLbl l +litToImm (CmmLabelOff l off) = ImmIndex l off +litToImm (CmmLabelDiffOff l1 l2 off _) + = ImmConstantSum + (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2)) + (ImmInt off) +litToImm _ = panic "X86.Regs.litToImm: no match" + +-- addressing modes ------------------------------------------------------------ + +data AddrMode + = AddrBaseIndex EABase EAIndex Displacement + | ImmAddr Imm Int + +data EABase = EABaseNone | EABaseReg Reg | EABaseRip +data EAIndex = EAIndexNone | EAIndex Reg Int +type Displacement = Imm + + +addrOffset :: AddrMode -> Int -> Maybe AddrMode +addrOffset addr off + = case addr of + ImmAddr i off0 -> Just (ImmAddr i (off0 + off)) + + AddrBaseIndex r i (ImmInt n) -> Just (AddrBaseIndex r i (ImmInt (n + off))) + AddrBaseIndex r i (ImmInteger n) + -> Just (AddrBaseIndex r i (ImmInt (fromInteger (n + toInteger off)))) + + AddrBaseIndex r i (ImmCLbl lbl) + -> Just (AddrBaseIndex r i (ImmIndex lbl off)) + + AddrBaseIndex r i (ImmIndex lbl ix) + -> Just (AddrBaseIndex r i (ImmIndex lbl (ix+off))) + + _ -> Nothing -- in theory, shouldn't happen + + +addrModeRegs :: AddrMode -> [Reg] +addrModeRegs (AddrBaseIndex b i _) = b_regs ++ i_regs + where + b_regs = case b of { EABaseReg r -> [r]; _ -> [] } + i_regs = case i of { EAIndex r _ -> [r]; _ -> [] } +addrModeRegs _ = [] + + +-- registers ------------------------------------------------------------------- + +-- @spRel@ gives us a stack relative addressing mode for volatile +-- temporaries and for excess call arguments. @fpRel@, where +-- applicable, is the same but for the frame pointer. + + +spRel :: DynFlags + -> Int -- ^ desired stack offset in bytes, positive or negative + -> AddrMode +spRel dflags n + | target32Bit (targetPlatform dflags) + = AddrBaseIndex (EABaseReg esp) EAIndexNone (ImmInt n) + | otherwise + = AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt n) + +-- The register numbers must fit into 32 bits on x86, so that we can +-- use a Word32 to represent the set of free registers in the register +-- allocator. 
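-- A minimal sketch of that bitmap representation (illustrative only; the
-- names below are made up, and the real allocator defines its own FreeRegs
-- type elsewhere):
--
--   import Data.Bits (clearBit, setBit, testBit)
--   import Data.Word (Word32)
--
--   type FreeRegSet = Word32
--
--   allocate, release :: RegNo -> FreeRegSet -> FreeRegSet
--   allocate r = (`clearBit` r)   -- register r is now in use
--   release  r = (`setBit` r)     -- register r is free again
--
--   isFree :: RegNo -> FreeRegSet -> Bool
--   isFree r s = s `testBit` r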
+
+
+firstxmm :: RegNo
+firstxmm = 16
+
+-- on 32-bit platforms, only the first 8 XMM/YMM/ZMM registers are available
+lastxmm :: Platform -> RegNo
+lastxmm platform
+ | target32Bit platform = firstxmm + 7  -- xmm0 - xmm7
+ | otherwise            = firstxmm + 15 -- xmm0 - xmm15
+
+lastint :: Platform -> RegNo
+lastint platform
+ | target32Bit platform = 7  -- not %r8..%r15
+ | otherwise            = 15
+
+intregnos :: Platform -> [RegNo]
+intregnos platform = [0 .. lastint platform]
+
+
+
+xmmregnos :: Platform -> [RegNo]
+xmmregnos platform = [firstxmm .. lastxmm platform]
+
+floatregnos :: Platform -> [RegNo]
+floatregnos platform = xmmregnos platform
+
+-- argRegs is the set of regs which are read for an n-argument call to C.
+-- For archs which pass all args on the stack (x86), it is empty.
+-- SPARC passes up to the first 6 args in regs.
+argRegs :: RegNo -> [Reg]
+argRegs _ = panic "MachRegs.argRegs(x86): should not be used!"
+
+-- | The complete set of machine registers.
+allMachRegNos :: Platform -> [RegNo]
+allMachRegNos platform = intregnos platform ++ floatregnos platform
+
+-- | Take the class of a register.
+{-# INLINE classOfRealReg #-}
+classOfRealReg :: Platform -> RealReg -> RegClass
+-- On x86, we might want to have an 8-bit RegClass, which would
+-- contain just regs 1-4 (the others don't have 8-bit versions).
+-- However, we can get away without this at the moment because the
+-- only allocatable integer regs are also 8-bit compatible (1, 3, 4).
+classOfRealReg platform reg
+ = case reg of
+     RealRegSingle i
+       | i <= lastint platform -> RcInteger
+       | i <= lastxmm platform -> RcDouble
+       | otherwise             -> panic "X86.Reg.classOfRealReg registerSingle too high"
+     _ -> panic "X86.Regs.classOfRealReg: RegPairs on this arch"
+
+-- | Get the name of the register with this number.
+-- NOTE: fixme, we don't track which "way" the XMM registers are used
+showReg :: Platform -> RegNo -> String
+showReg platform n
+ | n >= firstxmm && n <= lastxmm platform = "%xmm" ++ show (n-firstxmm)
+ | n >= 8 && n < firstxmm                 = "%r" ++ show n
+ | otherwise                              = regNames platform A.! n
+
+regNames :: Platform -> A.Array Int String
+regNames platform
+ = if target32Bit platform
+   then A.listArray (0,8) ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp"]
+   else A.listArray (0,8) ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp"]
+
+
+
+-- machine specific ------------------------------------------------------------
+
+
+{-
+Intel x86 architecture:
+- All registers except 7 (esp) are available for use.
+- Only ebx, esi, edi and esp are available across a C call (they are callee-saves).
+- Registers 0-7 have 16-bit counterparts (ax, bx etc.)
+- Registers 0-3 have 8-bit counterparts (ah, bh etc.)
+
+The fp registers are all Double registers; we don't have any RcFloat class
+regs. @regClass@ barfs if you give it a VirtualRegF, and mkVReg above should
+never generate them.
+
+TODO: clean up the modelling of float vs double registers and how they are the same class. 
+
+
+eax, ebx, ecx, edx, esp, ebp, esi, edi :: Reg
+
+eax   = regSingle 0
+ebx   = regSingle 1
+ecx   = regSingle 2
+edx   = regSingle 3
+esi   = regSingle 4
+edi   = regSingle 5
+ebp   = regSingle 6
+esp   = regSingle 7
+
+
+
+
+{-
+AMD x86_64 architecture:
+- All 16 integer registers are addressable as 8, 16, 32 and 64-bit values:
+
+  8     16    32    64
+  ---------------------
+  al    ax    eax   rax
+  bl    bx    ebx   rbx
+  cl    cx    ecx   rcx
+  dl    dx    edx   rdx
+  sil   si    esi   rsi
+  dil   di    edi   rdi
+  bpl   bp    ebp   rbp
+  spl   sp    esp   rsp
+  r8b   r8w   r8d   r8
+  r9b   r9w   r9d   r9
+  r10b  r10w  r10d  r10
+  r11b  r11w  r11d  r11
+  r12b  r12w  r12d  r12
+  r13b  r13w  r13d  r13
+  r14b  r14w  r14d  r14
+  r15b  r15w  r15d  r15
+-}
+
+rax, rbx, rcx, rdx, rsp, rbp, rsi, rdi,
+  r8, r9, r10, r11, r12, r13, r14, r15,
+  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
+  xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 :: Reg
+
+rax   = regSingle 0
+rbx   = regSingle 1
+rcx   = regSingle 2
+rdx   = regSingle 3
+rsi   = regSingle 4
+rdi   = regSingle 5
+rbp   = regSingle 6
+rsp   = regSingle 7
+r8    = regSingle 8
+r9    = regSingle 9
+r10   = regSingle 10
+r11   = regSingle 11
+r12   = regSingle 12
+r13   = regSingle 13
+r14   = regSingle 14
+r15   = regSingle 15
+xmm0  = regSingle 16
+xmm1  = regSingle 17
+xmm2  = regSingle 18
+xmm3  = regSingle 19
+xmm4  = regSingle 20
+xmm5  = regSingle 21
+xmm6  = regSingle 22
+xmm7  = regSingle 23
+xmm8  = regSingle 24
+xmm9  = regSingle 25
+xmm10 = regSingle 26
+xmm11 = regSingle 27
+xmm12 = regSingle 28
+xmm13 = regSingle 29
+xmm14 = regSingle 30
+xmm15 = regSingle 31
+
+ripRel :: Displacement -> AddrMode
+ripRel imm = AddrBaseIndex EABaseRip EAIndexNone imm
+
+
+ -- so we can re-use some x86 code:
+{-
+eax = rax
+ebx = rbx
+ecx = rcx
+edx = rdx
+esi = rsi
+edi = rdi
+ebp = rbp
+esp = rsp
+-}
+
+xmm :: RegNo -> Reg
+xmm n = regSingle (firstxmm+n)
+
+
+
+
+-- | these are the regs which we cannot assume stay alive over a C call.
+callClobberedRegs :: Platform -> [Reg]
+-- caller-saves registers
+callClobberedRegs platform
+ | target32Bit platform = [eax,ecx,edx] ++ map regSingle (floatregnos platform)
+ | platformOS platform == OSMinGW32
+ = [rax,rcx,rdx,r8,r9,r10,r11]
+   -- Only xmm0-5 are caller-saves registers on 64bit windows.
+   -- ( https://docs.microsoft.com/en-us/cpp/build/register-usage )
+   -- For details check the Win64 ABI.
+   ++ map xmm [0 .. 5]
+ | otherwise
+   -- all xmm regs are caller-saves
+   -- caller-saves registers
+ = [rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11]
+   ++ map regSingle (floatregnos platform)
+
+allArgRegs :: Platform -> [(Reg, Reg)]
+allArgRegs platform
+ | platformOS platform == OSMinGW32 = zip [rcx,rdx,r8,r9]
+                                          (map regSingle [firstxmm ..])
+ | otherwise = panic "X86.Regs.allArgRegs: not defined for this arch"
+
+allIntArgRegs :: Platform -> [Reg]
+allIntArgRegs platform
+ | (platformOS platform == OSMinGW32) || target32Bit platform
+ = panic "X86.Regs.allIntArgRegs: not defined for this platform"
+ | otherwise = [rdi,rsi,rdx,rcx,r8,r9]
+
+
+-- | on 64bit platforms we pass the first 8 float/double arguments
+-- in the xmm registers.
+allFPArgRegs :: Platform -> [Reg]
+allFPArgRegs platform
+ | platformOS platform == OSMinGW32
+ = panic "X86.Regs.allFPArgRegs: not defined for this platform"
+ | otherwise = map regSingle [firstxmm .. firstxmm + 7 ]
+
+
+-- Machine registers which might be clobbered by instructions that
+-- generate results into fixed registers, or need arguments in a fixed
+-- register.
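callClobberedRegs is where the two x86-64 C calling conventions diverge: both Win64 and System V treat rax, rcx, rdx and r8-r11 as caller-saved, but Win64 preserves xmm6-xmm15 across calls while System V preserves no xmm registers at all. A small standalone sketch of that split, using plain strings rather than the Reg values defined above.

data ABI = Win64 | SysV deriving Show

-- Integer and xmm registers assumed clobbered across a C call, per the
-- comments above (Win64 keeps xmm6..xmm15 callee-saved; System V does not).
callClobbered :: ABI -> ([String], [String])
callClobbered Win64 = ( ["rax","rcx","rdx","r8","r9","r10","r11"]
                      , [ "xmm" ++ show n | n <- [0 .. 5 :: Int] ] )
callClobbered SysV  = ( ["rax","rcx","rdx","rsi","rdi","r8","r9","r10","r11"]
                      , [ "xmm" ++ show n | n <- [0 .. 15 :: Int] ] )

main :: IO ()
main = mapM_ (print . callClobbered) [Win64, SysV]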
+instrClobberedRegs :: Platform -> [Reg] +instrClobberedRegs platform + | target32Bit platform = [ eax, ecx, edx ] + | otherwise = [ rax, rcx, rdx ] + +-- + +-- allocatableRegs is allMachRegNos with the fixed-use regs removed. +-- i.e., these are the regs for which we are prepared to allow the +-- register allocator to attempt to map VRegs to. +allocatableRegs :: Platform -> [RealReg] +allocatableRegs platform + = let isFree i = freeReg platform i + in map RealRegSingle $ filter isFree (allMachRegNos platform) + diff --git a/compiler/GHC/CmmToC.hs b/compiler/GHC/CmmToC.hs index f7245f5c30..6e6f58ba7d 100644 --- a/compiler/GHC/CmmToC.hs +++ b/compiler/GHC/CmmToC.hs @@ -39,7 +39,7 @@ import GHC.Cmm.Utils import GHC.Cmm.Switch -- Utils -import CPrim +import GHC.CmmToAsm.CPrim import GHC.Driver.Session import FastString import Outputable diff --git a/compiler/GHC/Driver/CodeOutput.hs b/compiler/GHC/Driver/CodeOutput.hs index e52d3216d5..507311c039 100644 --- a/compiler/GHC/Driver/CodeOutput.hs +++ b/compiler/GHC/Driver/CodeOutput.hs @@ -12,7 +12,7 @@ module GHC.Driver.CodeOutput ( codeOutput, outputForeignStubs ) where import GhcPrelude -import AsmCodeGen ( nativeCodeGen ) +import GHC.CmmToAsm ( nativeCodeGen ) import GHC.CmmToLlvm ( llvmCodeGen ) import UniqSupply ( mkSplitUniqSupply ) diff --git a/compiler/GHC/Llvm/Types.hs b/compiler/GHC/Llvm/Types.hs index f8ca0c826c..3e320634d0 100644 --- a/compiler/GHC/Llvm/Types.hs +++ b/compiler/GHC/Llvm/Types.hs @@ -20,7 +20,7 @@ import Outputable import Unique -- from NCG -import PprBase +import GHC.CmmToAsm.Ppr import GHC.Float diff --git a/compiler/GHC/Platform/Reg.hs b/compiler/GHC/Platform/Reg.hs new file mode 100644 index 0000000000..b856d7c3af --- /dev/null +++ b/compiler/GHC/Platform/Reg.hs @@ -0,0 +1,241 @@ +-- | An architecture independent description of a register. +-- This needs to stay architecture independent because it is used +-- by NCGMonad and the register allocators, which are shared +-- by all architectures. +-- +module GHC.Platform.Reg ( + RegNo, + Reg(..), + regPair, + regSingle, + isRealReg, takeRealReg, + isVirtualReg, takeVirtualReg, + + VirtualReg(..), + renameVirtualReg, + classOfVirtualReg, + getHiVirtualRegFromLo, + getHiVRegFromLo, + + RealReg(..), + regNosOfRealReg, + realRegsAlias, + + liftPatchFnToRegReg +) + +where + +import GhcPrelude + +import Outputable +import Unique +import GHC.Platform.Reg.Class +import Data.List (intersect) + +-- | An identifier for a primitive real machine register. +type RegNo + = Int + +-- VirtualRegs are virtual registers. The register allocator will +-- eventually have to map them into RealRegs, or into spill slots. +-- +-- VirtualRegs are allocated on the fly, usually to represent a single +-- value in the abstract assembly code (i.e. dynamic registers are +-- usually single assignment). +-- +-- The single assignment restriction isn't necessary to get correct code, +-- although a better register allocation will result if single +-- assignment is used -- because the allocator maps a VirtualReg into +-- a single RealReg, even if the VirtualReg has multiple live ranges. +-- +-- Virtual regs can be of either class, so that info is attached. +-- +data VirtualReg + = VirtualRegI {-# UNPACK #-} !Unique + | VirtualRegHi {-# UNPACK #-} !Unique -- High part of 2-word register + | VirtualRegF {-# UNPACK #-} !Unique + | VirtualRegD {-# UNPACK #-} !Unique + + deriving (Eq, Show) + +-- This is laborious, but necessary. We can't derive Ord because +-- Unique doesn't have an Ord instance. 
Note nonDetCmpUnique in the +-- implementation. See Note [No Ord for Unique] +-- This is non-deterministic but we do not currently support deterministic +-- code-generation. See Note [Unique Determinism and code generation] +instance Ord VirtualReg where + compare (VirtualRegI a) (VirtualRegI b) = nonDetCmpUnique a b + compare (VirtualRegHi a) (VirtualRegHi b) = nonDetCmpUnique a b + compare (VirtualRegF a) (VirtualRegF b) = nonDetCmpUnique a b + compare (VirtualRegD a) (VirtualRegD b) = nonDetCmpUnique a b + + compare VirtualRegI{} _ = LT + compare _ VirtualRegI{} = GT + compare VirtualRegHi{} _ = LT + compare _ VirtualRegHi{} = GT + compare VirtualRegF{} _ = LT + compare _ VirtualRegF{} = GT + + + +instance Uniquable VirtualReg where + getUnique reg + = case reg of + VirtualRegI u -> u + VirtualRegHi u -> u + VirtualRegF u -> u + VirtualRegD u -> u + +instance Outputable VirtualReg where + ppr reg + = case reg of + VirtualRegI u -> text "%vI_" <> pprUniqueAlways u + VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u + -- this code is kinda wrong on x86 + -- because float and double occupy the same register set + -- namely SSE2 register xmm0 .. xmm15 + VirtualRegF u -> text "%vFloat_" <> pprUniqueAlways u + VirtualRegD u -> text "%vDouble_" <> pprUniqueAlways u + + + +renameVirtualReg :: Unique -> VirtualReg -> VirtualReg +renameVirtualReg u r + = case r of + VirtualRegI _ -> VirtualRegI u + VirtualRegHi _ -> VirtualRegHi u + VirtualRegF _ -> VirtualRegF u + VirtualRegD _ -> VirtualRegD u + + +classOfVirtualReg :: VirtualReg -> RegClass +classOfVirtualReg vr + = case vr of + VirtualRegI{} -> RcInteger + VirtualRegHi{} -> RcInteger + VirtualRegF{} -> RcFloat + VirtualRegD{} -> RcDouble + + + +-- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform +-- when supplied with the vreg for the lower-half of the quantity. +-- (NB. Not reversible). +getHiVirtualRegFromLo :: VirtualReg -> VirtualReg +getHiVirtualRegFromLo reg + = case reg of + -- makes a pseudo-unique with tag 'H' + VirtualRegI u -> VirtualRegHi (newTagUnique u 'H') + _ -> panic "Reg.getHiVirtualRegFromLo" + +getHiVRegFromLo :: Reg -> Reg +getHiVRegFromLo reg + = case reg of + RegVirtual vr -> RegVirtual (getHiVirtualRegFromLo vr) + RegReal _ -> panic "Reg.getHiVRegFromLo" + + +------------------------------------------------------------------------------------ +-- | RealRegs are machine regs which are available for allocation, in +-- the usual way. We know what class they are, because that's part of +-- the processor's architecture. +-- +-- RealRegPairs are pairs of real registers that are allocated together +-- to hold a larger value, such as with Double regs on SPARC. 
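For a 64-bit value on a 32-bit target, getHiVirtualRegFromLo above manufactures the vreg for the upper word by retagging the unique of the lower-word vreg, so the two halves stay associated without extra bookkeeping. A rough standalone sketch of that idea, with a (Char, Int) pair standing in for Unique; the names are invented for this example.

type FakeUnique = (Char, Int)   -- (tag, number): a crude stand-in for GHC's Unique

data VReg = VRegI FakeUnique | VRegHi FakeUnique
  deriving Show

-- Mirrors getHiVirtualRegFromLo: keep the number, retag it with 'H'.
hiFromLo :: VReg -> VReg
hiFromLo (VRegI (_, n)) = VRegHi ('H', n)
hiFromLo _              = error "hiFromLo: expected the low-half integer vreg"

main :: IO ()
main = print (hiFromLo (VRegI ('c', 42)))
-- VRegHi ('H',42)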
+-- +data RealReg + = RealRegSingle {-# UNPACK #-} !RegNo + | RealRegPair {-# UNPACK #-} !RegNo {-# UNPACK #-} !RegNo + deriving (Eq, Show, Ord) + +instance Uniquable RealReg where + getUnique reg + = case reg of + RealRegSingle i -> mkRegSingleUnique i + RealRegPair r1 r2 -> mkRegPairUnique (r1 * 65536 + r2) + +instance Outputable RealReg where + ppr reg + = case reg of + RealRegSingle i -> text "%r" <> int i + RealRegPair r1 r2 -> text "%r(" <> int r1 + <> vbar <> int r2 <> text ")" + +regNosOfRealReg :: RealReg -> [RegNo] +regNosOfRealReg rr + = case rr of + RealRegSingle r1 -> [r1] + RealRegPair r1 r2 -> [r1, r2] + + +realRegsAlias :: RealReg -> RealReg -> Bool +realRegsAlias rr1 rr2 + = not $ null $ intersect (regNosOfRealReg rr1) (regNosOfRealReg rr2) + +-------------------------------------------------------------------------------- +-- | A register, either virtual or real +data Reg + = RegVirtual !VirtualReg + | RegReal !RealReg + deriving (Eq, Ord) + +regSingle :: RegNo -> Reg +regSingle regNo = RegReal $ RealRegSingle regNo + +regPair :: RegNo -> RegNo -> Reg +regPair regNo1 regNo2 = RegReal $ RealRegPair regNo1 regNo2 + + +-- We like to have Uniques for Reg so that we can make UniqFM and UniqSets +-- in the register allocator. +instance Uniquable Reg where + getUnique reg + = case reg of + RegVirtual vr -> getUnique vr + RegReal rr -> getUnique rr + +-- | Print a reg in a generic manner +-- If you want the architecture specific names, then use the pprReg +-- function from the appropriate Ppr module. +instance Outputable Reg where + ppr reg + = case reg of + RegVirtual vr -> ppr vr + RegReal rr -> ppr rr + + +isRealReg :: Reg -> Bool +isRealReg reg + = case reg of + RegReal _ -> True + RegVirtual _ -> False + +takeRealReg :: Reg -> Maybe RealReg +takeRealReg reg + = case reg of + RegReal rr -> Just rr + _ -> Nothing + + +isVirtualReg :: Reg -> Bool +isVirtualReg reg + = case reg of + RegReal _ -> False + RegVirtual _ -> True + +takeVirtualReg :: Reg -> Maybe VirtualReg +takeVirtualReg reg + = case reg of + RegReal _ -> Nothing + RegVirtual vr -> Just vr + + +-- | The patch function supplied by the allocator maps VirtualReg to RealReg +-- regs, but sometimes we want to apply it to plain old Reg. +-- +liftPatchFnToRegReg :: (VirtualReg -> RealReg) -> (Reg -> Reg) +liftPatchFnToRegReg patchF reg + = case reg of + RegVirtual vr -> RegReal (patchF vr) + RegReal _ -> reg diff --git a/compiler/GHC/Platform/Reg/Class.hs b/compiler/GHC/Platform/Reg/Class.hs new file mode 100644 index 0000000000..225ad05be5 --- /dev/null +++ b/compiler/GHC/Platform/Reg/Class.hs @@ -0,0 +1,32 @@ +-- | An architecture independent description of a register's class. +module GHC.Platform.Reg.Class + ( RegClass (..) ) + +where + +import GhcPrelude + +import Outputable +import Unique + + +-- | The class of a register. +-- Used in the register allocator. +-- We treat all registers in a class as being interchangeable. 
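realRegsAlias and liftPatchFnToRegReg above are small but central to the register allocator: aliasing reduces to an overlap check on the underlying register numbers, and the allocator's VirtualReg-to-RealReg mapping is lifted pointwise to arbitrary Regs, leaving real registers untouched. A self-contained sketch with cut-down stand-in types; all names here are invented for the example and are not the GHC definitions.

import Data.List (intersect)

-- Cut-down stand-ins for the types above, just enough to show the two helpers.
newtype VReg = VReg Int                      deriving (Eq, Show)
data RReg    = RSingle Int | RPair Int Int   deriving (Eq, Show)
data R       = RVirtual VReg | RReal RReg    deriving (Eq, Show)

regNos :: RReg -> [Int]
regNos (RSingle r)   = [r]
regNos (RPair r1 r2) = [r1, r2]

-- Two real registers alias exactly when their register numbers overlap.
alias :: RReg -> RReg -> Bool
alias a b = not (null (regNos a `intersect` regNos b))

-- Lift an allocator patch function (virtual -> real) to work on any register.
liftPatch :: (VReg -> RReg) -> (R -> R)
liftPatch patchF (RVirtual vr) = RReal (patchF vr)
liftPatch _      r             = r

main :: IO ()
main = do
  print (alias (RSingle 3) (RPair 3 4))          -- True: the pair covers reg 3
  print (alias (RSingle 5) (RPair 3 4))          -- False
  let patch (VReg u) = RSingle (u + 10)          -- a made-up allocation
  print (map (liftPatch patch) [RVirtual (VReg 1), RReal (RSingle 7)])
  -- [RReal (RSingle 11), RReal (RSingle 7)]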
+-- +data RegClass + = RcInteger + | RcFloat + | RcDouble + deriving Eq + + +instance Uniquable RegClass where + getUnique RcInteger = mkRegClassUnique 0 + getUnique RcFloat = mkRegClassUnique 1 + getUnique RcDouble = mkRegClassUnique 2 + +instance Outputable RegClass where + ppr RcInteger = Outputable.text "I" + ppr RcFloat = Outputable.text "F" + ppr RcDouble = Outputable.text "D" diff --git a/compiler/GHC/Platform/Regs.hs b/compiler/GHC/Platform/Regs.hs index 51f7658db2..d214b0d89f 100644 --- a/compiler/GHC/Platform/Regs.hs +++ b/compiler/GHC/Platform/Regs.hs @@ -7,7 +7,7 @@ import GhcPrelude import GHC.Cmm.Expr import GHC.Platform -import Reg +import GHC.Platform.Reg import qualified GHC.Platform.ARM as ARM import qualified GHC.Platform.ARM64 as ARM64 |