Diffstat (limited to 'compiler/GHC/Cmm/Pipeline.hs')
-rw-r--r--  compiler/GHC/Cmm/Pipeline.hs  367
1 file changed, 367 insertions, 0 deletions
diff --git a/compiler/GHC/Cmm/Pipeline.hs b/compiler/GHC/Cmm/Pipeline.hs
new file mode 100644
index 0000000000..6db9e23ee1
--- /dev/null
+++ b/compiler/GHC/Cmm/Pipeline.hs
@@ -0,0 +1,367 @@
+{-# LANGUAGE BangPatterns #-}
+
+module GHC.Cmm.Pipeline (
+ -- | Converts C-- with an implicit stack and native C-- calls into
+ -- optimised, CPS-converted and native-call-less C--.  The latter
+ -- C-- can be used to generate assembly.
+ cmmPipeline
+) where
+
+import GhcPrelude
+
+import GHC.Cmm
+import GHC.Cmm.Lint
+import GHC.Cmm.Info.Build
+import GHC.Cmm.CommonBlockElim
+import GHC.Cmm.Switch.Implement
+import GHC.Cmm.ProcPoint
+import GHC.Cmm.ContFlowOpt
+import GHC.Cmm.LayoutStack
+import GHC.Cmm.Sink
+import GHC.Cmm.Dataflow.Collections
+
+import UniqSupply
+import DynFlags
+import ErrUtils
+import HscTypes
+import Control.Monad
+import Outputable
+import GHC.Platform
+
+-----------------------------------------------------------------------------
+-- | Top level driver for C-- pipeline
+-----------------------------------------------------------------------------
+
+cmmPipeline
+ :: HscEnv -- Compilation env including
+ -- dynamic flags: -dcmm-lint -ddump-cmm-cps
+ -> ModuleSRTInfo -- Info about SRTs generated so far
+ -> CmmGroup -- Input C-- with Procedures
+ -> IO (ModuleSRTInfo, CmmGroup) -- Output CPS transformed C--
+
+cmmPipeline hsc_env srtInfo prog = withTimingSilent dflags (text "Cmm pipeline") forceRes $
+ do let dflags = hsc_dflags hsc_env
+
+ tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog
+
+ (srtInfo, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags srtInfo tops
+ dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" FormatCMM (ppr cmms)
+
+ return (srtInfo, cmms)
+
+ where forceRes (info, group) =
+ info `seq` foldr (\decl r -> decl `seq` r) () group
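+ -- Forcing the SRT info and each decl to WHNF here ensures that the
+ -- work is done inside the withTimingSilent scope above, so it is
+ -- attributed to the "Cmm pipeline" timing.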
+
+ dflags = hsc_dflags hsc_env
+
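+-- | Process a single 'CmmDecl'.  Data sections pass through unchanged;
+-- each procedure goes through, in order: control-flow optimisation,
+-- common block elimination, switch implementation, proc-point analysis,
+-- stack layout, sinking, CAF analysis, splitting at proc points (or
+-- attaching info tables to return points when not splitting), attaching
+-- stack maps to info tables, and a final round of control-flow
+-- optimisation.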
+cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl])
+cpsTop _ p@(CmmData {}) = return (mapEmpty, [p])
+cpsTop hsc_env proc =
+ do
+ ----------- Control-flow optimisations ----------------------------------
+
+ -- The first round of control-flow optimisation speeds up the
+ -- later passes by removing lots of empty blocks, so we do it
+ -- even when optimisation isn't turned on.
+ --
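+ -- For example, a block that does nothing but jump on,
+ --
+ --     c1: goto c2;
+ --
+ -- is short-circuited: branches to c1 are retargeted to c2 and c1
+ -- disappears.
+ --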
+ CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
+ return $ cmmCfgOptsProc splitting_proc_points proc
+ dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
+
+ let !TopInfo {stack_info=StackInfo { arg_space = entry_off
+ , do_layout = do_layout }} = h
+
+ ----------- Eliminate common blocks -------------------------------------
+ g <- {-# SCC "elimCommonBlocks" #-}
+ condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
+ Opt_D_dump_cmm_cbe "Post common block elimination"
+
+ -- Any work storing block Labels must be performed _after_
+ -- elimCommonBlocks, because that pass merges duplicate blocks, so
+ -- Labels collected earlier may refer to blocks that no longer exist.
+
+ ----------- Implement switches ------------------------------------------
+ g <- {-# SCC "createSwitchPlans" #-}
+ runUniqSM $ cmmImplementSwitchPlans dflags g
+ dump Opt_D_dump_cmm_switch "Post switch plan" g
+
+ ----------- Proc points -------------------------------------------------
+ let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
+ proc_points <-
+ if splitting_proc_points
+ then do
+ pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
+ minimalProcPointSet (targetPlatform dflags) call_pps g
+ dumpWith dflags Opt_D_dump_cmm_proc "Proc points"
+ FormatCMM (ppr l $$ ppr pp $$ ppr g)
+ return pp
+ else
+ return call_pps
+
+ ----------- Layout the stack and manifest Sp ----------------------------
+ (g, stackmaps) <-
+ {-# SCC "layoutStack" #-}
+ if do_layout
+ then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
+ else return (g, mapEmpty)
+ dump Opt_D_dump_cmm_sp "Layout Stack" g
+
+ ----------- Sink and inline assignments --------------------------------
+ g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
+ condPass Opt_CmmSink (cmmSink dflags) g
+ Opt_D_dump_cmm_sink "Sink assignments"
+
+ ------------- CAF analysis ----------------------------------------------
+ let cafEnv = {-# SCC "cafAnal" #-} cafAnal call_pps l g
+ dumpWith dflags Opt_D_dump_cmm_caf "CAFEnv" FormatText (ppr cafEnv)
+
+ g <- if splitting_proc_points
+ then do
+ ------------- Split into separate procedures -----------------------
+ let pp_map = {-# SCC "procPointAnalysis" #-}
+ procPointAnalysis proc_points g
+ dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map"
+ FormatCMM (ppr pp_map)
+ g <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
+ splitAtProcPoints dflags l call_pps proc_points pp_map
+ (CmmProc h l v g)
+ dumps Opt_D_dump_cmm_split "Post splitting" g
+ return g
+ else do
+ -- attach info tables to return points
+ return $ [attachContInfoTables call_pps (CmmProc h l v g)]
+
+ ------------- Populate info tables with stack info -----------------
+ g <- {-# SCC "setInfoTableStackMap" #-}
+ return $ map (setInfoTableStackMap dflags stackmaps) g
+ dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g
+
+ ----------- Control-flow optimisations -----------------------------
+ g <- {-# SCC "cmmCfgOpts(2)" #-}
+ return $ if optLevel dflags >= 1
+ then map (cmmCfgOptsProc splitting_proc_points) g
+ else g
+ g <- return (map removeUnreachableBlocksProc g)
+ -- See Note [unreachable blocks]
+ dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g
+
+ return (cafEnv, g)
+
+ where dflags = hsc_dflags hsc_env
+ platform = targetPlatform dflags
+ dump = dumpGraph dflags
+
+ dumps flag name
+ = mapM_ (dumpWith dflags flag name FormatCMM . ppr)
+
+ condPass flag pass g dumpflag dumpname =
+ if gopt flag dflags
+ then do
+ g <- return $ pass g
+ dump dumpflag dumpname g
+ return g
+ else return g
+
+ -- we don't need to split proc points for the NCG, unless
+ -- tablesNextToCode is off. The latter is because we have no
+ -- label to put on info tables for basic blocks that are not
+ -- the entry point.
+ splitting_proc_points = hscTarget dflags /= HscAsm
+ || not (tablesNextToCode dflags)
+ || -- Note [inconsistent-pic-reg]
+ usingInconsistentPicReg
+ usingInconsistentPicReg
+ = case (platformArch platform, platformOS platform, positionIndependent dflags)
+ of (ArchX86, OSDarwin, pic) -> pic
+ _ -> False
+
+-- Note [Sinking after stack layout]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- In the past we considered running the sinking pass before stack
+-- layout as well, but after making some measurements we realized that:
+--
+-- a) running sinking only before stack layout produces slower
+-- code than running sinking only after stack layout
+--
+-- b) running sinking both before and after stack layout produces
+-- code that has the same performance as when running sinking
+-- only after stack layout.
+--
+-- In other words, sinking before stack layout doesn't buy us anything.
+--
+-- An interesting question is "why is it better to run sinking after
+-- stack layout?". It seems that the major reason is the stores and
+-- loads generated by stack layout. Consider this code before stack layout:
+--
+-- c1E:
+-- _c1C::P64 = R3;
+-- _c1B::P64 = R2;
+-- _c1A::P64 = R1;
+-- I64[(young<c1D> + 8)] = c1D;
+-- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+-- c1D:
+-- R3 = _c1C::P64;
+-- R2 = _c1B::P64;
+-- R1 = _c1A::P64;
+-- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- The stack layout pass will save all local variables live across a
+-- call (_c1C, _c1B and _c1A in this example) on the stack just before
+-- making the call and reload them from the stack after returning from
+-- it:
+--
+-- c1E:
+-- _c1C::P64 = R3;
+-- _c1B::P64 = R2;
+-- _c1A::P64 = R1;
+-- I64[Sp - 32] = c1D;
+-- P64[Sp - 24] = _c1A::P64;
+-- P64[Sp - 16] = _c1B::P64;
+-- P64[Sp - 8] = _c1C::P64;
+-- Sp = Sp - 32;
+-- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+-- c1D:
+-- _c1A::P64 = P64[Sp + 8];
+-- _c1B::P64 = P64[Sp + 16];
+-- _c1C::P64 = P64[Sp + 24];
+-- R3 = _c1C::P64;
+-- R2 = _c1B::P64;
+-- R1 = _c1A::P64;
+-- Sp = Sp + 32;
+-- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- If we don't run the sinking pass after stack layout we are basically
+-- left with code like this. However, running sinking on it can lead
+-- to significant improvements:
+--
+-- c1E:
+-- I64[Sp - 32] = c1D;
+-- P64[Sp - 24] = R1;
+-- P64[Sp - 16] = R2;
+-- P64[Sp - 8] = R3;
+-- Sp = Sp - 32;
+-- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+-- c1D:
+-- R3 = P64[Sp + 24];
+-- R2 = P64[Sp + 16];
+-- R1 = P64[Sp + 8];
+-- Sp = Sp + 32;
+-- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- Now we only have 9 assignments instead of 15.
+--
+-- There is one case when running sinking before stack layout could
+-- be beneficial. Consider this:
+--
+-- L1:
+-- x = y
+-- call f() returns L2
+-- L2: ...x...y...
+--
+-- Since both x and y are live across a call to f, they will be stored
+-- on the stack during stack layout and restored after the call:
+--
+-- L1:
+-- x = y
+-- P64[Sp - 24] = L2
+-- P64[Sp - 16] = x
+-- P64[Sp - 8] = y
+-- Sp = Sp - 24
+-- call f() returns L2
+-- L2:
+-- y = P64[Sp + 16]
+-- x = P64[Sp + 8]
+-- Sp = Sp + 24
+-- ...x...y...
+--
+-- However, if we ran sinking before stack layout we would propagate x
+-- to its usage site (both x and y must be local registers for this to
+-- be possible - global registers cannot be floated past a call):
+--
+-- L1:
+-- x = y
+-- call f() returns L2
+-- L2: ...y...y...
+--
+-- This makes x dead at the call to f(). If we ran stack layout now
+-- we would generate fewer stores and loads:
+--
+-- L1:
+-- x = y
+-- P64[Sp - 16] = L2
+-- P64[Sp - 8] = y
+-- Sp = Sp - 16
+-- call f() returns L2
+-- L2:
+-- y = P64[Sp + 8]
+-- Sp = Sp + 16
+-- ...y...y...
+--
+-- But since we don't see any benefits from running sinking before stack
+-- layout, this situation probably doesn't arise too often in practice.
+--
+
+{- Note [inconsistent-pic-reg]
+
+On x86/Darwin, PIC is implemented by inserting a sequence like
+
+ call 1f
+ 1: popl %reg
+
+at the proc entry point, and then referring to labels as offsets from
+%reg. If we don't split proc points, then we could have many entry
+points in a proc that would need this sequence, and each entry point
+would then get a different value for %reg. If there are any join
+points, then at the join point we don't have a consistent value for
+%reg, so we don't know how to refer to labels.
+
+Hence, on x86/Darwin, we have to split proc points, and then each proc
+point will get its own PIC initialisation sequence.
+
+This isn't an issue on x86/ELF, where the sequence is
+
+ call 1f
+ 1: popl %reg
+ addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
+
+so %reg always has a consistent value: the address of
+_GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
+
+-}
+
+{- Note [unreachable blocks]
+
+The control-flow optimiser sometimes leaves unreachable blocks behind
+containing junk code. These aren't necessarily a problem, but
+removing them is good because it might save time in the native code
+generator later.
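+
+For example, when every branch to a trivial block is shortcut past it,
+the bypassed block is left with no predecessors.
+removeUnreachableBlocksProc (called from cpsTop above) drops such
+blocks.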
+
+-}
+
+runUniqSM :: UniqSM a -> IO a
+runUniqSM m = do
+ us <- mkSplitUniqSupply 'u'
+ return (initUs_ us m)
+
+
+dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
+dumpGraph dflags flag name g = do
+ when (gopt Opt_DoCmmLinting dflags) $ do_lint g
+ dumpWith dflags flag name FormatCMM (ppr g)
+ where
+ do_lint g = case cmmLintGraph dflags g of
+ Just err -> do { fatalErrorMsg dflags err
+ ; ghcExit dflags 1
+ }
+ Nothing -> return ()
+
+dumpWith :: DynFlags -> DumpFlag -> String -> DumpFormat -> SDoc -> IO ()
+dumpWith dflags flag txt fmt sdoc = do
+ dumpIfSet_dyn dflags flag txt fmt sdoc
+ when (not (dopt flag dflags)) $
+ -- If `-ddump-cmm-verbose -ddump-to-file` is specified,
+ -- dump each Cmm pipeline stage output to a separate file. #16930
+ when (dopt Opt_D_dump_cmm_verbose dflags)
+ $ dumpAction dflags (mkDumpStyle dflags alwaysQualify)
+ (dumpOptionsFromFlag flag) txt fmt sdoc
+ dumpIfSet_dyn dflags Opt_D_dump_cmm_verbose_by_proc txt fmt sdoc