diff options
Diffstat (limited to 'compiler/GHC/Cmm/Pipeline.hs')
-rw-r--r-- | compiler/GHC/Cmm/Pipeline.hs | 367 |
1 files changed, 367 insertions, 0 deletions
diff --git a/compiler/GHC/Cmm/Pipeline.hs b/compiler/GHC/Cmm/Pipeline.hs new file mode 100644 index 0000000000..6db9e23ee1 --- /dev/null +++ b/compiler/GHC/Cmm/Pipeline.hs @@ -0,0 +1,367 @@ +{-# LANGUAGE BangPatterns #-} + +module GHC.Cmm.Pipeline ( + -- | Converts C-- with an implicit stack and native C-- calls into + -- optimized, CPS converted and native-call-less C--. The latter + -- C-- can be used to generate assembly. + cmmPipeline +) where + +import GhcPrelude + +import GHC.Cmm +import GHC.Cmm.Lint +import GHC.Cmm.Info.Build +import GHC.Cmm.CommonBlockElim +import GHC.Cmm.Switch.Implement +import GHC.Cmm.ProcPoint +import GHC.Cmm.ContFlowOpt +import GHC.Cmm.LayoutStack +import GHC.Cmm.Sink +import GHC.Cmm.Dataflow.Collections + +import UniqSupply +import DynFlags +import ErrUtils +import HscTypes +import Control.Monad +import Outputable +import GHC.Platform + +----------------------------------------------------------------------------- +-- | Top level driver for C-- pipeline +----------------------------------------------------------------------------- + +cmmPipeline + :: HscEnv -- Compilation env including + -- dynamic flags: -dcmm-lint -ddump-cmm-cps + -> ModuleSRTInfo -- Info about SRTs generated so far + -> CmmGroup -- Input C-- with Procedures + -> IO (ModuleSRTInfo, CmmGroup) -- Output CPS transformed C-- + +cmmPipeline hsc_env srtInfo prog = withTimingSilent dflags (text "Cmm pipeline") forceRes $ + do let dflags = hsc_dflags hsc_env + + tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog + + (srtInfo, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags srtInfo tops + dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" FormatCMM (ppr cmms) + + return (srtInfo, cmms) + + where forceRes (info, group) = + info `seq` foldr (\decl r -> decl `seq` r) () group + + dflags = hsc_dflags hsc_env + +cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl]) +cpsTop _ p@(CmmData {}) = return (mapEmpty, [p]) +cpsTop hsc_env proc = + do + ----------- Control-flow optimisations 
---------------------------------- + + -- The first round of control-flow optimisation speeds up the + -- later passes by removing lots of empty blocks, so we do it + -- even when optimisation isn't turned on. + -- + CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-} + return $ cmmCfgOptsProc splitting_proc_points proc + dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g + + let !TopInfo {stack_info=StackInfo { arg_space = entry_off + , do_layout = do_layout }} = h + + ----------- Eliminate common blocks ------------------------------------- + g <- {-# SCC "elimCommonBlocks" #-} + condPass Opt_CmmElimCommonBlocks elimCommonBlocks g + Opt_D_dump_cmm_cbe "Post common block elimination" + + -- Any work storing block Labels must be performed _after_ + -- elimCommonBlocks + + ----------- Implement switches ------------------------------------------ + g <- {-# SCC "createSwitchPlans" #-} + runUniqSM $ cmmImplementSwitchPlans dflags g + dump Opt_D_dump_cmm_switch "Post switch plan" g + + ----------- Proc points ------------------------------------------------- + let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g + proc_points <- + if splitting_proc_points + then do + pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $ + minimalProcPointSet (targetPlatform dflags) call_pps g + dumpWith dflags Opt_D_dump_cmm_proc "Proc points" + FormatCMM (ppr l $$ ppr pp $$ ppr g) + return pp + else + return call_pps + + ----------- Layout the stack and manifest Sp ---------------------------- + (g, stackmaps) <- + {-# SCC "layoutStack" #-} + if do_layout + then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g + else return (g, mapEmpty) + dump Opt_D_dump_cmm_sp "Layout Stack" g + + ----------- Sink and inline assignments -------------------------------- + g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout] + condPass Opt_CmmSink (cmmSink dflags) g + Opt_D_dump_cmm_sink "Sink assignments" + + ------------- CAF analysis 
---------------------------------------------- + let cafEnv = {-# SCC "cafAnal" #-} cafAnal call_pps l g + dumpWith dflags Opt_D_dump_cmm_caf "CAFEnv" FormatText (ppr cafEnv) + + g <- if splitting_proc_points + then do + ------------- Split into separate procedures ----------------------- + let pp_map = {-# SCC "procPointAnalysis" #-} + procPointAnalysis proc_points g + dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map" + FormatCMM (ppr pp_map) + g <- {-# SCC "splitAtProcPoints" #-} runUniqSM $ + splitAtProcPoints dflags l call_pps proc_points pp_map + (CmmProc h l v g) + dumps Opt_D_dump_cmm_split "Post splitting" g + return g + else do + -- attach info tables to return points + return $ [attachContInfoTables call_pps (CmmProc h l v g)] + + ------------- Populate info tables with stack info ----------------- + g <- {-# SCC "setInfoTableStackMap" #-} + return $ map (setInfoTableStackMap dflags stackmaps) g + dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g + + ----------- Control-flow optimisations ----------------------------- + g <- {-# SCC "cmmCfgOpts(2)" #-} + return $ if optLevel dflags >= 1 + then map (cmmCfgOptsProc splitting_proc_points) g + else g + g <- return (map removeUnreachableBlocksProc g) + -- See Note [unreachable blocks] + dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g + + return (cafEnv, g) + + where dflags = hsc_dflags hsc_env + platform = targetPlatform dflags + dump = dumpGraph dflags + + dumps flag name + = mapM_ (dumpWith dflags flag name FormatCMM . ppr) + + condPass flag pass g dumpflag dumpname = + if gopt flag dflags + then do + g <- return $ pass g + dump dumpflag dumpname g + return g + else return g + + -- we don't need to split proc points for the NCG, unless + -- tablesNextToCode is off. The latter is because we have no + -- label to put on info tables for basic blocks that are not + -- the entry point. 
+ splitting_proc_points = hscTarget dflags /= HscAsm
+ || not (tablesNextToCode dflags)
+ || -- Note [inconsistent-pic-reg]
+ usingInconsistentPicReg
+ usingInconsistentPicReg
+ = case (platformArch platform, platformOS platform, positionIndependent dflags)
+ of (ArchX86, OSDarwin, pic) -> pic
+ _ -> False
+
+-- Note [Sinking after stack layout]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- In the past we considered running sinking pass also before stack
+-- layout, but after making some measurements we realized that:
+--
+-- a) running sinking only before stack layout produces slower
+-- code than running sinking only after stack layout
+--
+-- b) running sinking both before and after stack layout produces
+-- code that has the same performance as when running sinking
+-- only after stack layout.
+--
+-- In other words sinking before stack layout doesn't buy us anything.
+--
+-- An interesting question is "why is it better to run sinking after
+-- stack layout"? It seems that the major reasons are stores and loads
+-- generated by stack layout. 
Consider this code before stack layout: +-- +-- c1E: +-- _c1C::P64 = R3; +-- _c1B::P64 = R2; +-- _c1A::P64 = R1; +-- I64[(young<c1D> + 8)] = c1D; +-- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8; +-- c1D: +-- R3 = _c1C::P64; +-- R2 = _c1B::P64; +-- R1 = _c1A::P64; +-- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8; +-- +-- Stack layout pass will save all local variables live across a call +-- (_c1C, _c1B and _c1A in this example) on the stack just before +-- making a call and reload them from the stack after returning from a +-- call: +-- +-- c1E: +-- _c1C::P64 = R3; +-- _c1B::P64 = R2; +-- _c1A::P64 = R1; +-- I64[Sp - 32] = c1D; +-- P64[Sp - 24] = _c1A::P64; +-- P64[Sp - 16] = _c1B::P64; +-- P64[Sp - 8] = _c1C::P64; +-- Sp = Sp - 32; +-- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8; +-- c1D: +-- _c1A::P64 = P64[Sp + 8]; +-- _c1B::P64 = P64[Sp + 16]; +-- _c1C::P64 = P64[Sp + 24]; +-- R3 = _c1C::P64; +-- R2 = _c1B::P64; +-- R1 = _c1A::P64; +-- Sp = Sp + 32; +-- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8; +-- +-- If we don't run sinking pass after stack layout we are basically +-- left with such code. However, running sinking on this code can lead +-- to significant improvements: +-- +-- c1E: +-- I64[Sp - 32] = c1D; +-- P64[Sp - 24] = R1; +-- P64[Sp - 16] = R2; +-- P64[Sp - 8] = R3; +-- Sp = Sp - 32; +-- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8; +-- c1D: +-- R3 = P64[Sp + 24]; +-- R2 = P64[Sp + 16]; +-- R1 = P64[Sp + 8]; +-- Sp = Sp + 32; +-- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8; +-- +-- Now we only have 9 assignments instead of 15. +-- +-- There is one case when running sinking before stack layout could +-- be beneficial. Consider this: +-- +-- L1: +-- x = y +-- call f() returns L2 +-- L2: ...x...y... 
+-- +-- Since both x and y are live across a call to f, they will be stored +-- on the stack during stack layout and restored after the call: +-- +-- L1: +-- x = y +-- P64[Sp - 24] = L2 +-- P64[Sp - 16] = x +-- P64[Sp - 8] = y +-- Sp = Sp - 24 +-- call f() returns L2 +-- L2: +-- y = P64[Sp + 16] +-- x = P64[Sp + 8] +-- Sp = Sp + 24 +-- ...x...y... +-- +-- However, if we run sinking before stack layout we would propagate x +-- to its usage place (both x and y must be local register for this to +-- be possible - global registers cannot be floated past a call): +-- +-- L1: +-- x = y +-- call f() returns L2 +-- L2: ...y...y... +-- +-- Thus making x dead at the call to f(). If we ran stack layout now +-- we would generate less stores and loads: +-- +-- L1: +-- x = y +-- P64[Sp - 16] = L2 +-- P64[Sp - 8] = y +-- Sp = Sp - 16 +-- call f() returns L2 +-- L2: +-- y = P64[Sp + 8] +-- Sp = Sp + 16 +-- ...y...y... +-- +-- But since we don't see any benefits from running sinking before stack +-- layout, this situation probably doesn't arise too often in practice. +-- + +{- Note [inconsistent-pic-reg] + +On x86/Darwin, PIC is implemented by inserting a sequence like + + call 1f + 1: popl %reg + +at the proc entry point, and then referring to labels as offsets from +%reg. If we don't split proc points, then we could have many entry +points in a proc that would need this sequence, and each entry point +would then get a different value for %reg. If there are any join +points, then at the join point we don't have a consistent value for +%reg, so we don't know how to refer to labels. + +Hence, on x86/Darwin, we have to split proc points, and then each proc +point will get its own PIC initialisation sequence. + +This isn't an issue on x86/ELF, where the sequence is + + call 1f + 1: popl %reg + addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg + +so %reg always has a consistent value: the address of +_GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via. 
+ +-} + +{- Note [unreachable blocks] + +The control-flow optimiser sometimes leaves unreachable blocks behind +containing junk code. These aren't necessarily a problem, but +removing them is good because it might save time in the native code +generator later. + +-} + +runUniqSM :: UniqSM a -> IO a +runUniqSM m = do + us <- mkSplitUniqSupply 'u' + return (initUs_ us m) + + +dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO () +dumpGraph dflags flag name g = do + when (gopt Opt_DoCmmLinting dflags) $ do_lint g + dumpWith dflags flag name FormatCMM (ppr g) + where + do_lint g = case cmmLintGraph dflags g of + Just err -> do { fatalErrorMsg dflags err + ; ghcExit dflags 1 + } + Nothing -> return () + +dumpWith :: DynFlags -> DumpFlag -> String -> DumpFormat -> SDoc -> IO () +dumpWith dflags flag txt fmt sdoc = do + dumpIfSet_dyn dflags flag txt fmt sdoc + when (not (dopt flag dflags)) $ + -- If `-ddump-cmm-verbose -ddump-to-file` is specified, + -- dump each Cmm pipeline stage output to a separate file. #16930 + when (dopt Opt_D_dump_cmm_verbose dflags) + $ dumpAction dflags (mkDumpStyle dflags alwaysQualify) + (dumpOptionsFromFlag flag) txt fmt sdoc + dumpIfSet_dyn dflags Opt_D_dump_cmm_verbose_by_proc txt fmt sdoc |