diff options
author | Andreas Klebinger <klebinger.andreas@gmx.at> | 2022-04-07 17:21:47 +0200 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2022-08-06 06:13:17 -0400 |
commit | fab0ee93abda33bf5c7eb5ca0372e12bd140a252 (patch) | |
tree | dfb79e20a525328a52bd5ea9168583b836f9ab54 /compiler | |
parent | 1f6c56ae9aa4ab4977ba376ac901d5256bf0aba0 (diff) | |
download | haskell-fab0ee93abda33bf5c7eb5ca0372e12bd140a252.tar.gz |
Change `-fprof-late` to insert cost centres after unfolding creation.
The former behaviour of adding cost centres after optimization but
before unfoldings are created is not available via the flag
`prof-late-inline` instead.
I also reduced the overhead of -fprof-late* by pushing the cost centres
into lambdas. This means the cost centres will only account for
execution of functions and not their partial application.
Further I made LATE_CC cost centres it's own CC flavour so they now
won't clash with user defined ones if a user uses the same string for
a custom scc.
LateCC: Don't put cost centres inside constructor workers.
With -fprof-late they are rarely useful as the worker is usually
inlined. Even if the worker is not inlined or we use -fprof-late-linline
they are generally not helpful but bloat compile and run time
significantly. So we just don't add sccs inside constructor workers.
-------------------------
Metric Decrease:
T13701
-------------------------
Diffstat (limited to 'compiler')
-rw-r--r-- | compiler/GHC/Cmm/Info/Build.hs | 4 | ||||
-rw-r--r-- | compiler/GHC/Core/LateCC.hs | 142 | ||||
-rw-r--r-- | compiler/GHC/Core/Opt/Pipeline.hs | 7 | ||||
-rw-r--r-- | compiler/GHC/Driver/CodeOutput.hs | 3 | ||||
-rw-r--r-- | compiler/GHC/Driver/Flags.hs | 2 | ||||
-rw-r--r-- | compiler/GHC/Driver/Main.hs | 22 | ||||
-rw-r--r-- | compiler/GHC/Driver/Session.hs | 3 | ||||
-rw-r--r-- | compiler/GHC/Iface/Tidy.hs | 4 | ||||
-rw-r--r-- | compiler/GHC/Stg/Pipeline.hs | 2 | ||||
-rw-r--r-- | compiler/GHC/Types/CostCentre.hs | 13 |
10 files changed, 164 insertions, 38 deletions
diff --git a/compiler/GHC/Cmm/Info/Build.hs b/compiler/GHC/Cmm/Info/Build.hs index 5a3b22f8b8..0edf1ce3ab 100644 --- a/compiler/GHC/Cmm/Info/Build.hs +++ b/compiler/GHC/Cmm/Info/Build.hs @@ -56,8 +56,8 @@ import GHC.Types.Name.Set {- Note [SRTs] ~~~~~~~~~~~ Static Reference Tables (SRTs) are the mechanism by which the garbage collector -can determine the live CAFs in the program. An SRT is a static tables associated -with a CAFfy static closure which record which CAFfy objects are reachable from +can determine the live CAFs in the program. An SRT is a static table associated +with a CAFfy closure which record which CAFfy objects are reachable from the closure's code. Representation diff --git a/compiler/GHC/Core/LateCC.hs b/compiler/GHC/Core/LateCC.hs index 2b4f810441..7a677e9964 100644 --- a/compiler/GHC/Core/LateCC.hs +++ b/compiler/GHC/Core/LateCC.hs @@ -3,12 +3,15 @@ -- | Adds cost-centers after the core piple has run. module GHC.Core.LateCC - ( addLateCostCentres + ( addLateCostCentresMG + , addLateCostCentresPgm + , addLateCostCentres -- Might be useful for API users + , Env(..) ) where import Control.Applicative -import GHC.Utils.Monad.State.Strict import Control.Monad +import qualified Data.Set as S import GHC.Prelude import GHC.Types.CostCentre @@ -20,21 +23,83 @@ import GHC.Types.Var import GHC.Unit.Types import GHC.Data.FastString import GHC.Core -import GHC.Types.Id +import GHC.Core.Opt.Monad import GHC.Core.Utils (mkTick) +import GHC.Types.Id +import GHC.Driver.Session -addLateCostCentres :: Bool -> ModGuts -> ModGuts -addLateCostCentres prof_count_entries guts = let - env = Env - { thisModule = mg_module guts - , ccState = newCostCentreState - , countEntries = prof_count_entries - } - in guts { mg_binds = doCoreProgram env (mg_binds guts) } +import GHC.Utils.Logger +import GHC.Utils.Outputable +import GHC.Utils.Misc +import GHC.Utils.Error (withTiming) +import GHC.Utils.Monad.State.Strict + + +{- Note [Collecting late cost centres] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Usually cost centres defined by a module are collected +during tidy by collectCostCentres. However with `-fprof-late` +we insert cost centres after inlining. So we keep a list of +all the cost centres we inserted and combine that with the list +of cost centres found during tidy. + +To avoid overhead when using -fprof-inline there is a flag to stop +us from collecting them here when we run this pass before tidy. + +Note [Adding late cost centres] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The basic idea is very simple. For every top level binder +`f = rhs` we compile it as if the user had written +`f = {-# SCC f #-} rhs`. + +If we do this after unfoldings for `f` have been created this +doesn't impact core-level optimizations at all. If we do it +before the cost centre will be included in the unfolding and +might inhibit optimizations at the call site. For this reason +we provide flags for both approaches as they have different +tradeoffs. + +We also don't add a cost centre for any binder that is a constructor +worker or wrapper. These will never meaningfully enrich the resulting +profile so we improve efficiency by omitting those. + +-} + +addLateCostCentresMG :: ModGuts -> CoreM ModGuts +addLateCostCentresMG guts = do + dflags <- getDynFlags + let env :: Env + env = Env + { thisModule = mg_module guts + , ccState = newCostCentreState + , countEntries = gopt Opt_ProfCountEntries dflags + , collectCCs = False -- See Note [Collecting late cost centres] + } + let guts' = guts { mg_binds = fst (addLateCostCentres env (mg_binds guts)) + } + return guts' + +addLateCostCentresPgm :: DynFlags -> Logger -> Module -> CoreProgram -> IO (CoreProgram, S.Set CostCentre) +addLateCostCentresPgm dflags logger mod binds = + withTiming logger + (text "LateCC"<+>brackets (ppr mod)) + (\(a,b) -> a `seqList` (b `seq` ())) $ do + let env = Env + { thisModule = mod + , ccState = newCostCentreState + , countEntries = gopt Opt_ProfCountEntries dflags + , collectCCs = True -- See Note [Collecting late cost centres] + } + (binds', ccs) = addLateCostCentres env binds + when (dopt Opt_D_dump_late_cc dflags || dopt Opt_D_verbose_core2core dflags) $ + putDumpFileMaybe logger Opt_D_dump_late_cc "LateCC" FormatCore (vcat (map ppr binds')) + return (binds', ccs) + +addLateCostCentres :: Env -> CoreProgram -> (CoreProgram,S.Set CostCentre) +addLateCostCentres env binds = + let (binds', state) = runState (mapM (doBind env) binds) initLateCCState + in (binds',lcs_ccs state) -doCoreProgram :: Env -> CoreProgram -> CoreProgram -doCoreProgram env binds = flip evalState newCostCentreState $ do - mapM (doBind env) binds doBind :: Env -> CoreBind -> M CoreBind doBind env (NonRec b rhs) = NonRec b <$> doBndr env b rhs @@ -44,28 +109,59 @@ doBind env (Rec bs) = Rec <$> mapM doPair bs doPair (b,rhs) = (b,) <$> doBndr env b rhs doBndr :: Env -> Id -> CoreExpr -> M CoreExpr -doBndr env bndr rhs = do +doBndr env bndr rhs + -- Cost centres on constructor workers are pretty much useless + -- so we don't emit them if we are looking at the rhs of a constructor + -- binding. + | Just _ <- isDataConId_maybe bndr = pure rhs + | otherwise = doBndr' env bndr rhs + + +-- We want to put the cost centra below the lambda as we only care about executions of the RHS. +doBndr' :: Env -> Id -> CoreExpr -> State LateCCState CoreExpr +doBndr' env bndr (Lam b rhs) = Lam b <$> doBndr' env bndr rhs +doBndr' env bndr rhs = do let name = idName bndr name_loc = nameSrcSpan name cc_name = getOccFS name count = countEntries env - cc_flavour <- getCCExprFlavour cc_name + cc_flavour <- getCCFlavour cc_name let cc_mod = thisModule env bndrCC = NormalCC cc_flavour cc_name cc_mod name_loc note = ProfNote bndrCC count True + addCC env bndrCC return $ mkTick note rhs -type M = State CostCentreState +data LateCCState = LateCCState + { lcs_state :: !CostCentreState + , lcs_ccs :: S.Set CostCentre + } +type M = State LateCCState -getCCExprFlavour :: FastString -> M CCFlavour -getCCExprFlavour name = ExprCC <$> getCCIndex' name +initLateCCState :: LateCCState +initLateCCState = LateCCState newCostCentreState mempty + +getCCFlavour :: FastString -> M CCFlavour +getCCFlavour name = LateCC <$> getCCIndex' name getCCIndex' :: FastString -> M CostCentreIndex -getCCIndex' name = state (getCCIndex name) +getCCIndex' name = do + state <- get + let (index,cc_state') = getCCIndex name (lcs_state state) + put (state { lcs_state = cc_state'}) + return index + +addCC :: Env -> CostCentre -> M () +addCC !env cc = do + state <- get + when (collectCCs env) $ do + let ccs' = S.insert cc (lcs_ccs state) + put (state { lcs_ccs = ccs'}) data Env = Env - { thisModule :: Module - , countEntries :: Bool - , ccState :: CostCentreState + { thisModule :: !Module + , countEntries:: !Bool + , ccState :: !CostCentreState + , collectCCs :: !Bool } diff --git a/compiler/GHC/Core/Opt/Pipeline.hs b/compiler/GHC/Core/Opt/Pipeline.hs index d1ca6a2165..bbf0dc2164 100644 --- a/compiler/GHC/Core/Opt/Pipeline.hs +++ b/compiler/GHC/Core/Opt/Pipeline.hs @@ -43,7 +43,7 @@ import GHC.Core.Opt.CallArity ( callArityAnalProgram ) import GHC.Core.Opt.Exitify ( exitifyProgram ) import GHC.Core.Opt.WorkWrap ( wwTopBinds ) import GHC.Core.Opt.CallerCC ( addCallerCostCentres ) -import GHC.Core.LateCC (addLateCostCentres) +import GHC.Core.LateCC (addLateCostCentresMG) import GHC.Core.Seq (seqBinds) import GHC.Core.FamInstEnv @@ -198,7 +198,7 @@ getCoreToDo dflags rule_base extra_vars runWhen (profiling && not (null $ callerCcFilters dflags)) CoreAddCallerCcs add_late_ccs = - runWhen (profiling && gopt Opt_ProfLateCcs dflags) $ CoreAddLateCcs + runWhen (profiling && gopt Opt_ProfLateInlineCcs dflags) $ CoreAddLateCcs core_todo = [ @@ -463,7 +463,6 @@ doCorePass pass guts = do p_fam_env <- getPackageFamInstEnv let platform = targetPlatform dflags let fam_envs = (p_fam_env, mg_fam_inst_env guts) - let prof_count_entries = gopt Opt_ProfCountEntries dflags let updateBinds f = return $ guts { mg_binds = f (mg_binds guts) } let updateBindsM f = f (mg_binds guts) >>= \b' -> return $ guts { mg_binds = b' } @@ -513,7 +512,7 @@ doCorePass pass guts = do addCallerCostCentres guts CoreAddLateCcs -> {-# SCC "AddLateCcs" #-} - return (addLateCostCentres prof_count_entries guts) + addLateCostCentresMG guts CoreDoPrintCore -> {-# SCC "PrintCore" #-} liftIO $ printCore logger (mg_binds guts) >> return guts diff --git a/compiler/GHC/Driver/CodeOutput.hs b/compiler/GHC/Driver/CodeOutput.hs index 5b3f614d8e..ce5a7e156d 100644 --- a/compiler/GHC/Driver/CodeOutput.hs +++ b/compiler/GHC/Driver/CodeOutput.hs @@ -327,7 +327,8 @@ outputForeignStubs_help fname doc_str header footer -- | Generate code to initialise cost centres profilingInitCode :: Platform -> Module -> CollectedCCs -> CStub profilingInitCode platform this_mod (local_CCs, singleton_CCSs) - = initializerCStub platform fn_name decls body + = {-# SCC profilingInitCode #-} + initializerCStub platform fn_name decls body where fn_name = mkInitializerStubLabel this_mod "prof_init" decls = vcat diff --git a/compiler/GHC/Driver/Flags.hs b/compiler/GHC/Driver/Flags.hs index a4e5827bc6..fd23d2e81e 100644 --- a/compiler/GHC/Driver/Flags.hs +++ b/compiler/GHC/Driver/Flags.hs @@ -96,6 +96,7 @@ data DumpFlag | Opt_D_dump_simpl_iterations | Opt_D_dump_spec | Opt_D_dump_prep + | Opt_D_dump_late_cc | Opt_D_dump_stg_from_core -- ^ Initial STG (CoreToStg output) | Opt_D_dump_stg_unarised -- ^ STG after unarise | Opt_D_dump_stg_cg -- ^ STG (after stg2stg) @@ -303,6 +304,7 @@ data GeneralFlag -- profiling opts | Opt_AutoSccsOnIndividualCafs | Opt_ProfCountEntries + | Opt_ProfLateInlineCcs | Opt_ProfLateCcs | Opt_ProfManualCcs -- ^ Ignore manual SCC annotations diff --git a/compiler/GHC/Driver/Main.hs b/compiler/GHC/Driver/Main.hs index 2f6a3262d0..e03883702b 100644 --- a/compiler/GHC/Driver/Main.hs +++ b/compiler/GHC/Driver/Main.hs @@ -171,6 +171,8 @@ import GHC.Core.InstEnv import GHC.Core.FamInstEnv import GHC.Core.Rules import GHC.Core.Stats +import GHC.Core.LateCC (addLateCostCentresPgm) + import GHC.CoreToStg.Prep import GHC.CoreToStg ( coreToStg ) @@ -268,7 +270,6 @@ import Data.Bifunctor (first) import Data.List.NonEmpty (NonEmpty ((:|))) - {- ********************************************************************** %* * Initialisation @@ -1692,6 +1693,21 @@ hscGenHardCode hsc_env cgguts location output_filename = do -- but we don't generate any code for newtypes ------------------- + -- Insert late cost centres if enabled. + -- If `-fprof-late-inline` is enabled we can skip this, as it will have added + -- a superset of cost centres we would add here already. + + (late_cc_binds, late_local_ccs) <- + if gopt Opt_ProfLateCcs dflags && not (gopt Opt_ProfLateInlineCcs dflags) + then {-# SCC lateCC #-} do + (binds,late_ccs) <- addLateCostCentresPgm dflags logger this_mod core_binds + return ( binds, (S.toList late_ccs `mappend` local_ccs )) + else + return (core_binds, local_ccs) + + + + ------------------- -- PREPARE FOR CODE GENERATION -- Do saturation and convert to A-normal form (prepd_binds) <- {-# SCC "CorePrep" #-} do @@ -1700,7 +1716,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do (hsc_logger hsc_env) cp_cfg (initCorePrepPgmConfig (hsc_dflags hsc_env) (interactiveInScope $ hsc_IC hsc_env)) - this_mod location core_binds data_tycons + this_mod location late_cc_binds data_tycons ----------------- Convert to STG ------------------ (stg_binds, denv, (caf_ccs, caf_cc_stacks)) @@ -1711,7 +1727,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do (myCoreToStg logger dflags (hsc_IC hsc_env) False this_mod location prepd_binds) let cost_centre_info = - (local_ccs ++ caf_ccs, caf_cc_stacks) + (late_local_ccs ++ caf_ccs, caf_cc_stacks) platform = targetPlatform dflags prof_init | sccProfilingEnabled dflags = profilingInitCode platform this_mod cost_centre_info diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs index ee032a2652..0407952c33 100644 --- a/compiler/GHC/Driver/Session.hs +++ b/compiler/GHC/Driver/Session.hs @@ -2486,6 +2486,8 @@ dynamic_flags_deps = [ (setDumpFlag Opt_D_dump_spec) , make_ord_flag defGhcFlag "ddump-prep" (setDumpFlag Opt_D_dump_prep) + , make_ord_flag defGhcFlag "ddump-late-cc" + (setDumpFlag Opt_D_dump_late_cc) , make_ord_flag defGhcFlag "ddump-stg-from-core" (setDumpFlag Opt_D_dump_stg_from_core) , make_ord_flag defGhcFlag "ddump-stg-unarised" @@ -3446,6 +3448,7 @@ fFlagsDeps = [ flagSpec "prof-count-entries" Opt_ProfCountEntries, flagSpec "prof-late" Opt_ProfLateCcs, flagSpec "prof-manual" Opt_ProfManualCcs, + flagSpec "prof-late-inline" Opt_ProfLateInlineCcs, flagSpec "regs-graph" Opt_RegsGraph, flagSpec "regs-iterative" Opt_RegsIterative, depFlagSpec' "rewrite-rules" Opt_EnableRewriteRules diff --git a/compiler/GHC/Iface/Tidy.hs b/compiler/GHC/Iface/Tidy.hs index 1edd3cb9bf..7fec591196 100644 --- a/compiler/GHC/Iface/Tidy.hs +++ b/compiler/GHC/Iface/Tidy.hs @@ -347,7 +347,7 @@ data UnfoldingExposure data TidyOpts = TidyOpts { opt_name_cache :: !NameCache - , opt_collect_ccs :: !Bool + , opt_collect_ccs :: !Bool -- ^ Always true if we compile with -prof , opt_unfolding_opts :: !UnfoldingOpts , opt_expose_unfoldings :: !UnfoldingExposure -- ^ Which unfoldings to expose @@ -468,7 +468,7 @@ tidyProgram opts (ModGuts { mg_module = mod -- unfoldings. collectCostCentres :: Module -> CoreProgram -> [CoreRule] -> S.Set CostCentre collectCostCentres mod_name binds rules - = foldl' go_bind (go_rules S.empty) binds + = {-# SCC collectCostCentres #-} foldl' go_bind (go_rules S.empty) binds where go cs e = case e of Var{} -> cs diff --git a/compiler/GHC/Stg/Pipeline.hs b/compiler/GHC/Stg/Pipeline.hs index c5f7bc2da3..ca7b1aefa2 100644 --- a/compiler/GHC/Stg/Pipeline.hs +++ b/compiler/GHC/Stg/Pipeline.hs @@ -126,7 +126,7 @@ stg2stg logger extra_vars opts this_mod binds StgUnarise -> do us <- getUniqueSupplyM liftIO (stg_linter False "Pre-unarise" binds) - let binds' = unarise us binds + let binds' = {-# SCC "StgUnarise" #-} unarise us binds liftIO (dump_when Opt_D_dump_stg_unarised "Unarised STG:" binds') liftIO (stg_linter True "Unarise" binds') return binds' diff --git a/compiler/GHC/Types/CostCentre.hs b/compiler/GHC/Types/CostCentre.hs index 61f6b87c88..092b727d8d 100644 --- a/compiler/GHC/Types/CostCentre.hs +++ b/compiler/GHC/Types/CostCentre.hs @@ -31,6 +31,7 @@ import GHC.Utils.Outputable import GHC.Types.SrcLoc import GHC.Data.FastString import GHC.Types.CostCentre.State +import GHC.Utils.Panic.Plain import Data.Data @@ -71,6 +72,7 @@ data CCFlavour = CafCC -- ^ Auto-generated top-level thunk | ExprCC !CostCentreIndex -- ^ Explicitly annotated expression | DeclCC !CostCentreIndex -- ^ Explicitly annotated declaration | HpcCC !CostCentreIndex -- ^ Generated by HPC for coverage + | LateCC !CostCentreIndex -- ^ Annotated by the one of the prof-last* passes. deriving (Eq, Ord, Data) -- | Extract the index from a flavour @@ -79,6 +81,7 @@ flavourIndex CafCC = 0 flavourIndex (ExprCC x) = unCostCentreIndex x flavourIndex (DeclCC x) = unCostCentreIndex x flavourIndex (HpcCC x) = unCostCentreIndex x +flavourIndex (LateCC x) = unCostCentreIndex x instance Eq CostCentre where c1 == c2 = case c1 `cmpCostCentre` c2 of { EQ -> True; _ -> False } @@ -292,7 +295,8 @@ ppFlavourLblComponent :: CCFlavour -> SDoc ppFlavourLblComponent CafCC = text "CAF" ppFlavourLblComponent (ExprCC i) = text "EXPR" <> ppIdxLblComponent i ppFlavourLblComponent (DeclCC i) = text "DECL" <> ppIdxLblComponent i -ppFlavourLblComponent (HpcCC i) = text "HPC" <> ppIdxLblComponent i +ppFlavourLblComponent (HpcCC i) = text "HPC" <> ppIdxLblComponent i +ppFlavourLblComponent (LateCC i) = text "LATECC" <> ppIdxLblComponent i -- ^ Print the flavour index component of a C label ppIdxLblComponent :: CostCentreIndex -> SDoc @@ -328,13 +332,18 @@ instance Binary CCFlavour where put_ bh (HpcCC i) = do putByte bh 3 put_ bh i + put_ bh (LateCC i) = do + putByte bh 4 + put_ bh i get bh = do h <- getByte bh case h of 0 -> return CafCC 1 -> ExprCC <$> get bh 2 -> DeclCC <$> get bh - _ -> HpcCC <$> get bh + 3 -> HpcCC <$> get bh + 4 -> LateCC <$> get bh + _ -> panic "Invalid CCFlavour" instance Binary CostCentre where put_ bh (NormalCC aa ab ac _ad) = do |