diff options
author | Andreas Klebinger <klebinger.andreas@gmx.at> | 2022-04-07 17:21:47 +0200 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2022-08-06 06:13:17 -0400 |
commit | fab0ee93abda33bf5c7eb5ca0372e12bd140a252 (patch) | |
tree | dfb79e20a525328a52bd5ea9168583b836f9ab54 | |
parent | 1f6c56ae9aa4ab4977ba376ac901d5256bf0aba0 (diff) | |
download | haskell-fab0ee93abda33bf5c7eb5ca0372e12bd140a252.tar.gz |
Change `-fprof-late` to insert cost centres after unfolding creation.
The former behaviour of adding cost centres after optimization but
before unfoldings are created is not available via the flag
`prof-late-inline` instead.
I also reduced the overhead of -fprof-late* by pushing the cost centres
into lambdas. This means the cost centres will only account for
execution of functions and not their partial application.
Further I made LATE_CC cost centres it's own CC flavour so they now
won't clash with user defined ones if a user uses the same string for
a custom scc.
LateCC: Don't put cost centres inside constructor workers.
With -fprof-late they are rarely useful as the worker is usually
inlined. Even if the worker is not inlined or we use -fprof-late-linline
they are generally not helpful but bloat compile and run time
significantly. So we just don't add sccs inside constructor workers.
-------------------------
Metric Decrease:
T13701
-------------------------
-rw-r--r-- | compiler/GHC/Cmm/Info/Build.hs | 4 | ||||
-rw-r--r-- | compiler/GHC/Core/LateCC.hs | 142 | ||||
-rw-r--r-- | compiler/GHC/Core/Opt/Pipeline.hs | 7 | ||||
-rw-r--r-- | compiler/GHC/Driver/CodeOutput.hs | 3 | ||||
-rw-r--r-- | compiler/GHC/Driver/Flags.hs | 2 | ||||
-rw-r--r-- | compiler/GHC/Driver/Main.hs | 22 | ||||
-rw-r--r-- | compiler/GHC/Driver/Session.hs | 3 | ||||
-rw-r--r-- | compiler/GHC/Iface/Tidy.hs | 4 | ||||
-rw-r--r-- | compiler/GHC/Stg/Pipeline.hs | 2 | ||||
-rw-r--r-- | compiler/GHC/Types/CostCentre.hs | 13 | ||||
-rw-r--r-- | docs/users_guide/debugging.rst | 6 | ||||
-rw-r--r-- | docs/users_guide/profiling.rst | 38 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_compile/all.T | 2 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_compile/prof-late-cc3.hs | 8 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_compile/prof-late-cc3.stderr | 50 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_compile/prof-late-cc4.hs | 5 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_compile/prof-late-cc4.stderr | 18 |
17 files changed, 286 insertions, 43 deletions
diff --git a/compiler/GHC/Cmm/Info/Build.hs b/compiler/GHC/Cmm/Info/Build.hs index 5a3b22f8b8..0edf1ce3ab 100644 --- a/compiler/GHC/Cmm/Info/Build.hs +++ b/compiler/GHC/Cmm/Info/Build.hs @@ -56,8 +56,8 @@ import GHC.Types.Name.Set {- Note [SRTs] ~~~~~~~~~~~ Static Reference Tables (SRTs) are the mechanism by which the garbage collector -can determine the live CAFs in the program. An SRT is a static tables associated -with a CAFfy static closure which record which CAFfy objects are reachable from +can determine the live CAFs in the program. An SRT is a static table associated +with a CAFfy closure which record which CAFfy objects are reachable from the closure's code. Representation diff --git a/compiler/GHC/Core/LateCC.hs b/compiler/GHC/Core/LateCC.hs index 2b4f810441..7a677e9964 100644 --- a/compiler/GHC/Core/LateCC.hs +++ b/compiler/GHC/Core/LateCC.hs @@ -3,12 +3,15 @@ -- | Adds cost-centers after the core piple has run. module GHC.Core.LateCC - ( addLateCostCentres + ( addLateCostCentresMG + , addLateCostCentresPgm + , addLateCostCentres -- Might be useful for API users + , Env(..) ) where import Control.Applicative -import GHC.Utils.Monad.State.Strict import Control.Monad +import qualified Data.Set as S import GHC.Prelude import GHC.Types.CostCentre @@ -20,21 +23,83 @@ import GHC.Types.Var import GHC.Unit.Types import GHC.Data.FastString import GHC.Core -import GHC.Types.Id +import GHC.Core.Opt.Monad import GHC.Core.Utils (mkTick) +import GHC.Types.Id +import GHC.Driver.Session -addLateCostCentres :: Bool -> ModGuts -> ModGuts -addLateCostCentres prof_count_entries guts = let - env = Env - { thisModule = mg_module guts - , ccState = newCostCentreState - , countEntries = prof_count_entries - } - in guts { mg_binds = doCoreProgram env (mg_binds guts) } +import GHC.Utils.Logger +import GHC.Utils.Outputable +import GHC.Utils.Misc +import GHC.Utils.Error (withTiming) +import GHC.Utils.Monad.State.Strict + + +{- Note [Collecting late cost centres] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Usually cost centres defined by a module are collected +during tidy by collectCostCentres. However with `-fprof-late` +we insert cost centres after inlining. So we keep a list of +all the cost centres we inserted and combine that with the list +of cost centres found during tidy. + +To avoid overhead when using -fprof-inline there is a flag to stop +us from collecting them here when we run this pass before tidy. + +Note [Adding late cost centres] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The basic idea is very simple. For every top level binder +`f = rhs` we compile it as if the user had written +`f = {-# SCC f #-} rhs`. + +If we do this after unfoldings for `f` have been created this +doesn't impact core-level optimizations at all. If we do it +before the cost centre will be included in the unfolding and +might inhibit optimizations at the call site. For this reason +we provide flags for both approaches as they have different +tradeoffs. + +We also don't add a cost centre for any binder that is a constructor +worker or wrapper. These will never meaningfully enrich the resulting +profile so we improve efficiency by omitting those. + +-} + +addLateCostCentresMG :: ModGuts -> CoreM ModGuts +addLateCostCentresMG guts = do + dflags <- getDynFlags + let env :: Env + env = Env + { thisModule = mg_module guts + , ccState = newCostCentreState + , countEntries = gopt Opt_ProfCountEntries dflags + , collectCCs = False -- See Note [Collecting late cost centres] + } + let guts' = guts { mg_binds = fst (addLateCostCentres env (mg_binds guts)) + } + return guts' + +addLateCostCentresPgm :: DynFlags -> Logger -> Module -> CoreProgram -> IO (CoreProgram, S.Set CostCentre) +addLateCostCentresPgm dflags logger mod binds = + withTiming logger + (text "LateCC"<+>brackets (ppr mod)) + (\(a,b) -> a `seqList` (b `seq` ())) $ do + let env = Env + { thisModule = mod + , ccState = newCostCentreState + , countEntries = gopt Opt_ProfCountEntries dflags + , collectCCs = True -- See Note [Collecting late cost centres] + } + (binds', ccs) = addLateCostCentres env binds + when (dopt Opt_D_dump_late_cc dflags || dopt Opt_D_verbose_core2core dflags) $ + putDumpFileMaybe logger Opt_D_dump_late_cc "LateCC" FormatCore (vcat (map ppr binds')) + return (binds', ccs) + +addLateCostCentres :: Env -> CoreProgram -> (CoreProgram,S.Set CostCentre) +addLateCostCentres env binds = + let (binds', state) = runState (mapM (doBind env) binds) initLateCCState + in (binds',lcs_ccs state) -doCoreProgram :: Env -> CoreProgram -> CoreProgram -doCoreProgram env binds = flip evalState newCostCentreState $ do - mapM (doBind env) binds doBind :: Env -> CoreBind -> M CoreBind doBind env (NonRec b rhs) = NonRec b <$> doBndr env b rhs @@ -44,28 +109,59 @@ doBind env (Rec bs) = Rec <$> mapM doPair bs doPair (b,rhs) = (b,) <$> doBndr env b rhs doBndr :: Env -> Id -> CoreExpr -> M CoreExpr -doBndr env bndr rhs = do +doBndr env bndr rhs + -- Cost centres on constructor workers are pretty much useless + -- so we don't emit them if we are looking at the rhs of a constructor + -- binding. + | Just _ <- isDataConId_maybe bndr = pure rhs + | otherwise = doBndr' env bndr rhs + + +-- We want to put the cost centra below the lambda as we only care about executions of the RHS. +doBndr' :: Env -> Id -> CoreExpr -> State LateCCState CoreExpr +doBndr' env bndr (Lam b rhs) = Lam b <$> doBndr' env bndr rhs +doBndr' env bndr rhs = do let name = idName bndr name_loc = nameSrcSpan name cc_name = getOccFS name count = countEntries env - cc_flavour <- getCCExprFlavour cc_name + cc_flavour <- getCCFlavour cc_name let cc_mod = thisModule env bndrCC = NormalCC cc_flavour cc_name cc_mod name_loc note = ProfNote bndrCC count True + addCC env bndrCC return $ mkTick note rhs -type M = State CostCentreState +data LateCCState = LateCCState + { lcs_state :: !CostCentreState + , lcs_ccs :: S.Set CostCentre + } +type M = State LateCCState -getCCExprFlavour :: FastString -> M CCFlavour -getCCExprFlavour name = ExprCC <$> getCCIndex' name +initLateCCState :: LateCCState +initLateCCState = LateCCState newCostCentreState mempty + +getCCFlavour :: FastString -> M CCFlavour +getCCFlavour name = LateCC <$> getCCIndex' name getCCIndex' :: FastString -> M CostCentreIndex -getCCIndex' name = state (getCCIndex name) +getCCIndex' name = do + state <- get + let (index,cc_state') = getCCIndex name (lcs_state state) + put (state { lcs_state = cc_state'}) + return index + +addCC :: Env -> CostCentre -> M () +addCC !env cc = do + state <- get + when (collectCCs env) $ do + let ccs' = S.insert cc (lcs_ccs state) + put (state { lcs_ccs = ccs'}) data Env = Env - { thisModule :: Module - , countEntries :: Bool - , ccState :: CostCentreState + { thisModule :: !Module + , countEntries:: !Bool + , ccState :: !CostCentreState + , collectCCs :: !Bool } diff --git a/compiler/GHC/Core/Opt/Pipeline.hs b/compiler/GHC/Core/Opt/Pipeline.hs index d1ca6a2165..bbf0dc2164 100644 --- a/compiler/GHC/Core/Opt/Pipeline.hs +++ b/compiler/GHC/Core/Opt/Pipeline.hs @@ -43,7 +43,7 @@ import GHC.Core.Opt.CallArity ( callArityAnalProgram ) import GHC.Core.Opt.Exitify ( exitifyProgram ) import GHC.Core.Opt.WorkWrap ( wwTopBinds ) import GHC.Core.Opt.CallerCC ( addCallerCostCentres ) -import GHC.Core.LateCC (addLateCostCentres) +import GHC.Core.LateCC (addLateCostCentresMG) import GHC.Core.Seq (seqBinds) import GHC.Core.FamInstEnv @@ -198,7 +198,7 @@ getCoreToDo dflags rule_base extra_vars runWhen (profiling && not (null $ callerCcFilters dflags)) CoreAddCallerCcs add_late_ccs = - runWhen (profiling && gopt Opt_ProfLateCcs dflags) $ CoreAddLateCcs + runWhen (profiling && gopt Opt_ProfLateInlineCcs dflags) $ CoreAddLateCcs core_todo = [ @@ -463,7 +463,6 @@ doCorePass pass guts = do p_fam_env <- getPackageFamInstEnv let platform = targetPlatform dflags let fam_envs = (p_fam_env, mg_fam_inst_env guts) - let prof_count_entries = gopt Opt_ProfCountEntries dflags let updateBinds f = return $ guts { mg_binds = f (mg_binds guts) } let updateBindsM f = f (mg_binds guts) >>= \b' -> return $ guts { mg_binds = b' } @@ -513,7 +512,7 @@ doCorePass pass guts = do addCallerCostCentres guts CoreAddLateCcs -> {-# SCC "AddLateCcs" #-} - return (addLateCostCentres prof_count_entries guts) + addLateCostCentresMG guts CoreDoPrintCore -> {-# SCC "PrintCore" #-} liftIO $ printCore logger (mg_binds guts) >> return guts diff --git a/compiler/GHC/Driver/CodeOutput.hs b/compiler/GHC/Driver/CodeOutput.hs index 5b3f614d8e..ce5a7e156d 100644 --- a/compiler/GHC/Driver/CodeOutput.hs +++ b/compiler/GHC/Driver/CodeOutput.hs @@ -327,7 +327,8 @@ outputForeignStubs_help fname doc_str header footer -- | Generate code to initialise cost centres profilingInitCode :: Platform -> Module -> CollectedCCs -> CStub profilingInitCode platform this_mod (local_CCs, singleton_CCSs) - = initializerCStub platform fn_name decls body + = {-# SCC profilingInitCode #-} + initializerCStub platform fn_name decls body where fn_name = mkInitializerStubLabel this_mod "prof_init" decls = vcat diff --git a/compiler/GHC/Driver/Flags.hs b/compiler/GHC/Driver/Flags.hs index a4e5827bc6..fd23d2e81e 100644 --- a/compiler/GHC/Driver/Flags.hs +++ b/compiler/GHC/Driver/Flags.hs @@ -96,6 +96,7 @@ data DumpFlag | Opt_D_dump_simpl_iterations | Opt_D_dump_spec | Opt_D_dump_prep + | Opt_D_dump_late_cc | Opt_D_dump_stg_from_core -- ^ Initial STG (CoreToStg output) | Opt_D_dump_stg_unarised -- ^ STG after unarise | Opt_D_dump_stg_cg -- ^ STG (after stg2stg) @@ -303,6 +304,7 @@ data GeneralFlag -- profiling opts | Opt_AutoSccsOnIndividualCafs | Opt_ProfCountEntries + | Opt_ProfLateInlineCcs | Opt_ProfLateCcs | Opt_ProfManualCcs -- ^ Ignore manual SCC annotations diff --git a/compiler/GHC/Driver/Main.hs b/compiler/GHC/Driver/Main.hs index 2f6a3262d0..e03883702b 100644 --- a/compiler/GHC/Driver/Main.hs +++ b/compiler/GHC/Driver/Main.hs @@ -171,6 +171,8 @@ import GHC.Core.InstEnv import GHC.Core.FamInstEnv import GHC.Core.Rules import GHC.Core.Stats +import GHC.Core.LateCC (addLateCostCentresPgm) + import GHC.CoreToStg.Prep import GHC.CoreToStg ( coreToStg ) @@ -268,7 +270,6 @@ import Data.Bifunctor (first) import Data.List.NonEmpty (NonEmpty ((:|))) - {- ********************************************************************** %* * Initialisation @@ -1692,6 +1693,21 @@ hscGenHardCode hsc_env cgguts location output_filename = do -- but we don't generate any code for newtypes ------------------- + -- Insert late cost centres if enabled. + -- If `-fprof-late-inline` is enabled we can skip this, as it will have added + -- a superset of cost centres we would add here already. + + (late_cc_binds, late_local_ccs) <- + if gopt Opt_ProfLateCcs dflags && not (gopt Opt_ProfLateInlineCcs dflags) + then {-# SCC lateCC #-} do + (binds,late_ccs) <- addLateCostCentresPgm dflags logger this_mod core_binds + return ( binds, (S.toList late_ccs `mappend` local_ccs )) + else + return (core_binds, local_ccs) + + + + ------------------- -- PREPARE FOR CODE GENERATION -- Do saturation and convert to A-normal form (prepd_binds) <- {-# SCC "CorePrep" #-} do @@ -1700,7 +1716,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do (hsc_logger hsc_env) cp_cfg (initCorePrepPgmConfig (hsc_dflags hsc_env) (interactiveInScope $ hsc_IC hsc_env)) - this_mod location core_binds data_tycons + this_mod location late_cc_binds data_tycons ----------------- Convert to STG ------------------ (stg_binds, denv, (caf_ccs, caf_cc_stacks)) @@ -1711,7 +1727,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do (myCoreToStg logger dflags (hsc_IC hsc_env) False this_mod location prepd_binds) let cost_centre_info = - (local_ccs ++ caf_ccs, caf_cc_stacks) + (late_local_ccs ++ caf_ccs, caf_cc_stacks) platform = targetPlatform dflags prof_init | sccProfilingEnabled dflags = profilingInitCode platform this_mod cost_centre_info diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs index ee032a2652..0407952c33 100644 --- a/compiler/GHC/Driver/Session.hs +++ b/compiler/GHC/Driver/Session.hs @@ -2486,6 +2486,8 @@ dynamic_flags_deps = [ (setDumpFlag Opt_D_dump_spec) , make_ord_flag defGhcFlag "ddump-prep" (setDumpFlag Opt_D_dump_prep) + , make_ord_flag defGhcFlag "ddump-late-cc" + (setDumpFlag Opt_D_dump_late_cc) , make_ord_flag defGhcFlag "ddump-stg-from-core" (setDumpFlag Opt_D_dump_stg_from_core) , make_ord_flag defGhcFlag "ddump-stg-unarised" @@ -3446,6 +3448,7 @@ fFlagsDeps = [ flagSpec "prof-count-entries" Opt_ProfCountEntries, flagSpec "prof-late" Opt_ProfLateCcs, flagSpec "prof-manual" Opt_ProfManualCcs, + flagSpec "prof-late-inline" Opt_ProfLateInlineCcs, flagSpec "regs-graph" Opt_RegsGraph, flagSpec "regs-iterative" Opt_RegsIterative, depFlagSpec' "rewrite-rules" Opt_EnableRewriteRules diff --git a/compiler/GHC/Iface/Tidy.hs b/compiler/GHC/Iface/Tidy.hs index 1edd3cb9bf..7fec591196 100644 --- a/compiler/GHC/Iface/Tidy.hs +++ b/compiler/GHC/Iface/Tidy.hs @@ -347,7 +347,7 @@ data UnfoldingExposure data TidyOpts = TidyOpts { opt_name_cache :: !NameCache - , opt_collect_ccs :: !Bool + , opt_collect_ccs :: !Bool -- ^ Always true if we compile with -prof , opt_unfolding_opts :: !UnfoldingOpts , opt_expose_unfoldings :: !UnfoldingExposure -- ^ Which unfoldings to expose @@ -468,7 +468,7 @@ tidyProgram opts (ModGuts { mg_module = mod -- unfoldings. collectCostCentres :: Module -> CoreProgram -> [CoreRule] -> S.Set CostCentre collectCostCentres mod_name binds rules - = foldl' go_bind (go_rules S.empty) binds + = {-# SCC collectCostCentres #-} foldl' go_bind (go_rules S.empty) binds where go cs e = case e of Var{} -> cs diff --git a/compiler/GHC/Stg/Pipeline.hs b/compiler/GHC/Stg/Pipeline.hs index c5f7bc2da3..ca7b1aefa2 100644 --- a/compiler/GHC/Stg/Pipeline.hs +++ b/compiler/GHC/Stg/Pipeline.hs @@ -126,7 +126,7 @@ stg2stg logger extra_vars opts this_mod binds StgUnarise -> do us <- getUniqueSupplyM liftIO (stg_linter False "Pre-unarise" binds) - let binds' = unarise us binds + let binds' = {-# SCC "StgUnarise" #-} unarise us binds liftIO (dump_when Opt_D_dump_stg_unarised "Unarised STG:" binds') liftIO (stg_linter True "Unarise" binds') return binds' diff --git a/compiler/GHC/Types/CostCentre.hs b/compiler/GHC/Types/CostCentre.hs index 61f6b87c88..092b727d8d 100644 --- a/compiler/GHC/Types/CostCentre.hs +++ b/compiler/GHC/Types/CostCentre.hs @@ -31,6 +31,7 @@ import GHC.Utils.Outputable import GHC.Types.SrcLoc import GHC.Data.FastString import GHC.Types.CostCentre.State +import GHC.Utils.Panic.Plain import Data.Data @@ -71,6 +72,7 @@ data CCFlavour = CafCC -- ^ Auto-generated top-level thunk | ExprCC !CostCentreIndex -- ^ Explicitly annotated expression | DeclCC !CostCentreIndex -- ^ Explicitly annotated declaration | HpcCC !CostCentreIndex -- ^ Generated by HPC for coverage + | LateCC !CostCentreIndex -- ^ Annotated by the one of the prof-last* passes. deriving (Eq, Ord, Data) -- | Extract the index from a flavour @@ -79,6 +81,7 @@ flavourIndex CafCC = 0 flavourIndex (ExprCC x) = unCostCentreIndex x flavourIndex (DeclCC x) = unCostCentreIndex x flavourIndex (HpcCC x) = unCostCentreIndex x +flavourIndex (LateCC x) = unCostCentreIndex x instance Eq CostCentre where c1 == c2 = case c1 `cmpCostCentre` c2 of { EQ -> True; _ -> False } @@ -292,7 +295,8 @@ ppFlavourLblComponent :: CCFlavour -> SDoc ppFlavourLblComponent CafCC = text "CAF" ppFlavourLblComponent (ExprCC i) = text "EXPR" <> ppIdxLblComponent i ppFlavourLblComponent (DeclCC i) = text "DECL" <> ppIdxLblComponent i -ppFlavourLblComponent (HpcCC i) = text "HPC" <> ppIdxLblComponent i +ppFlavourLblComponent (HpcCC i) = text "HPC" <> ppIdxLblComponent i +ppFlavourLblComponent (LateCC i) = text "LATECC" <> ppIdxLblComponent i -- ^ Print the flavour index component of a C label ppIdxLblComponent :: CostCentreIndex -> SDoc @@ -328,13 +332,18 @@ instance Binary CCFlavour where put_ bh (HpcCC i) = do putByte bh 3 put_ bh i + put_ bh (LateCC i) = do + putByte bh 4 + put_ bh i get bh = do h <- getByte bh case h of 0 -> return CafCC 1 -> ExprCC <$> get bh 2 -> DeclCC <$> get bh - _ -> HpcCC <$> get bh + 3 -> HpcCC <$> get bh + 4 -> LateCC <$> get bh + _ -> panic "Invalid CCFlavour" instance Binary CostCentre where put_ bh (NormalCC aa ab ac _ad) = do diff --git a/docs/users_guide/debugging.rst b/docs/users_guide/debugging.rst index 0c09c4c3ec..87a689c268 100644 --- a/docs/users_guide/debugging.rst +++ b/docs/users_guide/debugging.rst @@ -446,6 +446,12 @@ subexpression elimination pass. Dump output of Core preparation pass +.. ghc-flag:: -ddump-late-cc + :shortdesc: Dump core with late cost centres added + :type: dynamic + + Dump output of LateCC pass after cost centres have been added. + .. ghc-flag:: -ddump-view-pattern-commoning :shortdesc: Dump commoned view patterns :type: dynamic diff --git a/docs/users_guide/profiling.rst b/docs/users_guide/profiling.rst index 418c9b0bb0..1c2f458f10 100644 --- a/docs/users_guide/profiling.rst +++ b/docs/users_guide/profiling.rst @@ -439,19 +439,47 @@ compiled program. details. .. ghc-flag:: -fprof-late - :shortdesc: Auto-add ``SCC``\\ s to all top level bindings *after* the optimizer has run. + :shortdesc: Auto-add ``SCC``\\ s to all top level bindings *after* the core pipeline has run. :type: dynamic :reverse: -fno-prof-late :category: :since: 9.4.1 - Adds an automatic ``SCC`` annotation to all top level bindings late in the core pipeline after - the optimizer has run. This means these cost centres will not interfere with core-level optimizations + Adds an automatic ``SCC`` annotation to all top level bindings late in the compilation pipeline after + the optimizer has run and unfoldings have been created. This means these cost centres will not interfere with core-level optimizations and the resulting profile will be closer to the performance profile of an optimized non-profiled executable. - While the results of this are generally very informative some of the compiler internal names - will leak into the profile. + While the results of this are generally informative, some of the compiler internal names + will leak into the profile. Further if a function is inlined into a use site it's costs will be counted against the + caller's cost center. + + For example if we have this code: + + .. code-block:: haskell + + {-# INLINE mysum #-} + mysum = sum + main = print $ mysum [1..9999999] + + Then ``mysum`` will not show up in the profile since it will be inlined into main and therefore + it's associated costs will be attributed to mains implicit cost centre. + +.. ghc-flag:: -fprof-late-inline + :shortdesc: Auto-add ``SCC``\\ s to all top level bindings *after* the optimizer has run and retain them when inlining. + :type: dynamic + :reverse: -fno-prof-late-inline + :category: + + :since: 9.4.1 + + Adds an automatic ``SCC`` annotation to all top level bindings late in the core pipeline after + the optimizer has run. This is the same as :ghc-flag:`-fprof-late` except that cost centers are included in some unfoldings. + + The result of which is that cost centers *can* inhibit core optimizations to some degree at use sites + after inlining. Further there can be significant overhead from cost centres added to small functions if they are inlined often. + + You can try this mode if :ghc-flag:`-fprof-late` results in a profile that's too hard to interpret. .. ghc-flag:: -fprof-cafs :shortdesc: Auto-add ``SCC``\\ s to all CAFs diff --git a/testsuite/tests/profiling/should_compile/all.T b/testsuite/tests/profiling/should_compile/all.T index 9e460e153b..1f4ae1430d 100644 --- a/testsuite/tests/profiling/should_compile/all.T +++ b/testsuite/tests/profiling/should_compile/all.T @@ -4,6 +4,8 @@ test('prof001', [only_ways(['normal']), req_profiling], compile_and_run, ['-prof test('prof002', [only_ways(['normal']), req_profiling], compile_and_run, ['-prof -fprof-cafs']) test('prof-late-cc', [only_ways(['normal']), req_profiling], compile, ['-prof -fprof-late']) test('prof-late-cc2', [only_ways(['normal']), req_profiling], compile_and_run, ['-prof -fprof-late']) +test('prof-late-cc3', [only_ways(['normal']), req_profiling, grep_errmsg('scc')], compile, ['-prof -fprof-late -ddump-prep -ddump-simpl -dno-typeable-binds']) +test('prof-late-cc4', [only_ways(['normal']), req_profiling, grep_errmsg('scc')], compile, ['-prof -fprof-late-inline -ddump-simpl -dno-typeable-binds -O']) test('T2410', [only_ways(['normal']), req_profiling], compile, ['-O2 -prof -fprof-cafs']) test('T5889', [only_ways(['normal']), req_profiling, extra_files(['T5889/A.hs', 'T5889/B.hs'])], multimod_compile, ['A B', '-O -prof -fno-prof-count-entries -v0']) diff --git a/testsuite/tests/profiling/should_compile/prof-late-cc3.hs b/testsuite/tests/profiling/should_compile/prof-late-cc3.hs new file mode 100644 index 0000000000..b7badbc216 --- /dev/null +++ b/testsuite/tests/profiling/should_compile/prof-late-cc3.hs @@ -0,0 +1,8 @@ +module M where + +-- There should be a cost center in core prep output but not in +-- -ddump-simpl output with -fprof-late +{-# INLINE doStuff #-} +doStuff x = do + print x + return x diff --git a/testsuite/tests/profiling/should_compile/prof-late-cc3.stderr b/testsuite/tests/profiling/should_compile/prof-late-cc3.stderr new file mode 100644 index 0000000000..92d9b1cd8f --- /dev/null +++ b/testsuite/tests/profiling/should_compile/prof-late-cc3.stderr @@ -0,0 +1,50 @@ + +==================== Tidy Core ==================== +Result size of Tidy Core + = {terms: 12, types: 16, coercions: 0, joins: 0/0} + +-- RHS size: {terms: 11, types: 10, coercions: 0, joins: 0/0} +doStuff [InlPrag=INLINE (sat-args=1)] + :: forall {b}. Show b => b -> IO b +[GblId, Arity=2, Unf=OtherCon []] +doStuff + = \ (@b_axP) ($dShow_aET :: Show b_axP) (eta_B0 :: b_axP) -> + >> + @IO + GHC.Base.$fMonadIO + @() + @b_axP + (print @b_axP $dShow_aET eta_B0) + (return @IO GHC.Base.$fMonadIO @b_axP eta_B0) + + + + +==================== CorePrep ==================== +Result size of CorePrep + = {terms: 16, types: 20, coercions: 0, joins: 0/2} + +-- RHS size: {terms: 15, types: 14, coercions: 0, joins: 0/2} +M.doStuff [InlPrag=INLINE (sat-args=1)] + :: forall {b}. GHC.Show.Show b => b -> GHC.Types.IO b +[GblId, Arity=2, Unf=OtherCon []] +M.doStuff + = \ (@b_axP) + ($dShow_sJs [Occ=OnceL1] :: GHC.Show.Show b_axP) + (eta_sJt :: b_axP) -> + scctick<doStuff> + let { + sat_sJv [Occ=Once1] :: GHC.Types.IO b_axP + [LclId] + sat_sJv + = GHC.Base.return + @GHC.Types.IO GHC.Base.$fMonadIO @b_axP eta_sJt } in + let { + sat_sJu [Occ=Once1] :: GHC.Types.IO () + [LclId] + sat_sJu = System.IO.print @b_axP $dShow_sJs eta_sJt } in + GHC.Base.>> + @GHC.Types.IO GHC.Base.$fMonadIO @() @b_axP sat_sJu sat_sJv + + + diff --git a/testsuite/tests/profiling/should_compile/prof-late-cc4.hs b/testsuite/tests/profiling/should_compile/prof-late-cc4.hs new file mode 100644 index 0000000000..45d628cd0f --- /dev/null +++ b/testsuite/tests/profiling/should_compile/prof-late-cc4.hs @@ -0,0 +1,5 @@ +module M where + +-- There should be a cost center in the -ddump-simpl output +addStuff x y = do + x + y :: Int diff --git a/testsuite/tests/profiling/should_compile/prof-late-cc4.stderr b/testsuite/tests/profiling/should_compile/prof-late-cc4.stderr new file mode 100644 index 0000000000..2de7dad615 --- /dev/null +++ b/testsuite/tests/profiling/should_compile/prof-late-cc4.stderr @@ -0,0 +1,18 @@ + +==================== Tidy Core ==================== +Result size of Tidy Core + = {terms: 2, types: 3, coercions: 0, joins: 0/0} + +-- RHS size: {terms: 1, types: 0, coercions: 0, joins: 0/0} +addStuff [InlPrag=INLINE (sat-args=2)] :: Int -> Int -> Int +[GblId, + Str=<1!P(L)><1!P(L)>, + Cpr=1, + Unf=Unf{Src=InlineStable, TopLvl=True, Value=True, ConLike=True, + WorkFree=True, Expandable=True, + Guidance=ALWAYS_IF(arity=2,unsat_ok=False,boring_ok=True) + Tmpl= GHC.Num.$fNumInt_$c+}] +addStuff = scctick<addStuff> GHC.Num.$fNumInt_$c+ + + + |