summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--compiler/GHC/Cmm/Info/Build.hs4
-rw-r--r--compiler/GHC/Core/LateCC.hs142
-rw-r--r--compiler/GHC/Core/Opt/Pipeline.hs7
-rw-r--r--compiler/GHC/Driver/CodeOutput.hs3
-rw-r--r--compiler/GHC/Driver/Flags.hs2
-rw-r--r--compiler/GHC/Driver/Main.hs22
-rw-r--r--compiler/GHC/Driver/Session.hs3
-rw-r--r--compiler/GHC/Iface/Tidy.hs4
-rw-r--r--compiler/GHC/Stg/Pipeline.hs2
-rw-r--r--compiler/GHC/Types/CostCentre.hs13
-rw-r--r--docs/users_guide/debugging.rst6
-rw-r--r--docs/users_guide/profiling.rst38
-rw-r--r--testsuite/tests/profiling/should_compile/all.T2
-rw-r--r--testsuite/tests/profiling/should_compile/prof-late-cc3.hs8
-rw-r--r--testsuite/tests/profiling/should_compile/prof-late-cc3.stderr50
-rw-r--r--testsuite/tests/profiling/should_compile/prof-late-cc4.hs5
-rw-r--r--testsuite/tests/profiling/should_compile/prof-late-cc4.stderr18
17 files changed, 286 insertions, 43 deletions
diff --git a/compiler/GHC/Cmm/Info/Build.hs b/compiler/GHC/Cmm/Info/Build.hs
index 5a3b22f8b8..0edf1ce3ab 100644
--- a/compiler/GHC/Cmm/Info/Build.hs
+++ b/compiler/GHC/Cmm/Info/Build.hs
@@ -56,8 +56,8 @@ import GHC.Types.Name.Set
{- Note [SRTs]
~~~~~~~~~~~
Static Reference Tables (SRTs) are the mechanism by which the garbage collector
-can determine the live CAFs in the program. An SRT is a static tables associated
-with a CAFfy static closure which record which CAFfy objects are reachable from
+can determine the live CAFs in the program. An SRT is a static table associated
+with a CAFfy closure which record which CAFfy objects are reachable from
the closure's code.
Representation
diff --git a/compiler/GHC/Core/LateCC.hs b/compiler/GHC/Core/LateCC.hs
index 2b4f810441..7a677e9964 100644
--- a/compiler/GHC/Core/LateCC.hs
+++ b/compiler/GHC/Core/LateCC.hs
@@ -3,12 +3,15 @@
-- | Adds cost-centers after the core piple has run.
module GHC.Core.LateCC
- ( addLateCostCentres
+ ( addLateCostCentresMG
+ , addLateCostCentresPgm
+ , addLateCostCentres -- Might be useful for API users
+ , Env(..)
) where
import Control.Applicative
-import GHC.Utils.Monad.State.Strict
import Control.Monad
+import qualified Data.Set as S
import GHC.Prelude
import GHC.Types.CostCentre
@@ -20,21 +23,83 @@ import GHC.Types.Var
import GHC.Unit.Types
import GHC.Data.FastString
import GHC.Core
-import GHC.Types.Id
+import GHC.Core.Opt.Monad
import GHC.Core.Utils (mkTick)
+import GHC.Types.Id
+import GHC.Driver.Session
-addLateCostCentres :: Bool -> ModGuts -> ModGuts
-addLateCostCentres prof_count_entries guts = let
- env = Env
- { thisModule = mg_module guts
- , ccState = newCostCentreState
- , countEntries = prof_count_entries
- }
- in guts { mg_binds = doCoreProgram env (mg_binds guts) }
+import GHC.Utils.Logger
+import GHC.Utils.Outputable
+import GHC.Utils.Misc
+import GHC.Utils.Error (withTiming)
+import GHC.Utils.Monad.State.Strict
+
+
+{- Note [Collecting late cost centres]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Usually cost centres defined by a module are collected
+during tidy by collectCostCentres. However with `-fprof-late`
+we insert cost centres after inlining. So we keep a list of
+all the cost centres we inserted and combine that with the list
+of cost centres found during tidy.
+
+To avoid overhead when using -fprof-inline there is a flag to stop
+us from collecting them here when we run this pass before tidy.
+
+Note [Adding late cost centres]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The basic idea is very simple. For every top level binder
+`f = rhs` we compile it as if the user had written
+`f = {-# SCC f #-} rhs`.
+
+If we do this after unfoldings for `f` have been created this
+doesn't impact core-level optimizations at all. If we do it
+before the cost centre will be included in the unfolding and
+might inhibit optimizations at the call site. For this reason
+we provide flags for both approaches as they have different
+tradeoffs.
+
+We also don't add a cost centre for any binder that is a constructor
+worker or wrapper. These will never meaningfully enrich the resulting
+profile so we improve efficiency by omitting those.
+
+-}
+
+addLateCostCentresMG :: ModGuts -> CoreM ModGuts
+addLateCostCentresMG guts = do
+ dflags <- getDynFlags
+ let env :: Env
+ env = Env
+ { thisModule = mg_module guts
+ , ccState = newCostCentreState
+ , countEntries = gopt Opt_ProfCountEntries dflags
+ , collectCCs = False -- See Note [Collecting late cost centres]
+ }
+ let guts' = guts { mg_binds = fst (addLateCostCentres env (mg_binds guts))
+ }
+ return guts'
+
+addLateCostCentresPgm :: DynFlags -> Logger -> Module -> CoreProgram -> IO (CoreProgram, S.Set CostCentre)
+addLateCostCentresPgm dflags logger mod binds =
+ withTiming logger
+ (text "LateCC"<+>brackets (ppr mod))
+ (\(a,b) -> a `seqList` (b `seq` ())) $ do
+ let env = Env
+ { thisModule = mod
+ , ccState = newCostCentreState
+ , countEntries = gopt Opt_ProfCountEntries dflags
+ , collectCCs = True -- See Note [Collecting late cost centres]
+ }
+ (binds', ccs) = addLateCostCentres env binds
+ when (dopt Opt_D_dump_late_cc dflags || dopt Opt_D_verbose_core2core dflags) $
+ putDumpFileMaybe logger Opt_D_dump_late_cc "LateCC" FormatCore (vcat (map ppr binds'))
+ return (binds', ccs)
+
+addLateCostCentres :: Env -> CoreProgram -> (CoreProgram,S.Set CostCentre)
+addLateCostCentres env binds =
+ let (binds', state) = runState (mapM (doBind env) binds) initLateCCState
+ in (binds',lcs_ccs state)
-doCoreProgram :: Env -> CoreProgram -> CoreProgram
-doCoreProgram env binds = flip evalState newCostCentreState $ do
- mapM (doBind env) binds
doBind :: Env -> CoreBind -> M CoreBind
doBind env (NonRec b rhs) = NonRec b <$> doBndr env b rhs
@@ -44,28 +109,59 @@ doBind env (Rec bs) = Rec <$> mapM doPair bs
doPair (b,rhs) = (b,) <$> doBndr env b rhs
doBndr :: Env -> Id -> CoreExpr -> M CoreExpr
-doBndr env bndr rhs = do
+doBndr env bndr rhs
+ -- Cost centres on constructor workers are pretty much useless
+ -- so we don't emit them if we are looking at the rhs of a constructor
+ -- binding.
+ | Just _ <- isDataConId_maybe bndr = pure rhs
+ | otherwise = doBndr' env bndr rhs
+
+
+-- We want to put the cost centra below the lambda as we only care about executions of the RHS.
+doBndr' :: Env -> Id -> CoreExpr -> State LateCCState CoreExpr
+doBndr' env bndr (Lam b rhs) = Lam b <$> doBndr' env bndr rhs
+doBndr' env bndr rhs = do
let name = idName bndr
name_loc = nameSrcSpan name
cc_name = getOccFS name
count = countEntries env
- cc_flavour <- getCCExprFlavour cc_name
+ cc_flavour <- getCCFlavour cc_name
let cc_mod = thisModule env
bndrCC = NormalCC cc_flavour cc_name cc_mod name_loc
note = ProfNote bndrCC count True
+ addCC env bndrCC
return $ mkTick note rhs
-type M = State CostCentreState
+data LateCCState = LateCCState
+ { lcs_state :: !CostCentreState
+ , lcs_ccs :: S.Set CostCentre
+ }
+type M = State LateCCState
-getCCExprFlavour :: FastString -> M CCFlavour
-getCCExprFlavour name = ExprCC <$> getCCIndex' name
+initLateCCState :: LateCCState
+initLateCCState = LateCCState newCostCentreState mempty
+
+getCCFlavour :: FastString -> M CCFlavour
+getCCFlavour name = LateCC <$> getCCIndex' name
getCCIndex' :: FastString -> M CostCentreIndex
-getCCIndex' name = state (getCCIndex name)
+getCCIndex' name = do
+ state <- get
+ let (index,cc_state') = getCCIndex name (lcs_state state)
+ put (state { lcs_state = cc_state'})
+ return index
+
+addCC :: Env -> CostCentre -> M ()
+addCC !env cc = do
+ state <- get
+ when (collectCCs env) $ do
+ let ccs' = S.insert cc (lcs_ccs state)
+ put (state { lcs_ccs = ccs'})
data Env = Env
- { thisModule :: Module
- , countEntries :: Bool
- , ccState :: CostCentreState
+ { thisModule :: !Module
+ , countEntries:: !Bool
+ , ccState :: !CostCentreState
+ , collectCCs :: !Bool
}
diff --git a/compiler/GHC/Core/Opt/Pipeline.hs b/compiler/GHC/Core/Opt/Pipeline.hs
index d1ca6a2165..bbf0dc2164 100644
--- a/compiler/GHC/Core/Opt/Pipeline.hs
+++ b/compiler/GHC/Core/Opt/Pipeline.hs
@@ -43,7 +43,7 @@ import GHC.Core.Opt.CallArity ( callArityAnalProgram )
import GHC.Core.Opt.Exitify ( exitifyProgram )
import GHC.Core.Opt.WorkWrap ( wwTopBinds )
import GHC.Core.Opt.CallerCC ( addCallerCostCentres )
-import GHC.Core.LateCC (addLateCostCentres)
+import GHC.Core.LateCC (addLateCostCentresMG)
import GHC.Core.Seq (seqBinds)
import GHC.Core.FamInstEnv
@@ -198,7 +198,7 @@ getCoreToDo dflags rule_base extra_vars
runWhen (profiling && not (null $ callerCcFilters dflags)) CoreAddCallerCcs
add_late_ccs =
- runWhen (profiling && gopt Opt_ProfLateCcs dflags) $ CoreAddLateCcs
+ runWhen (profiling && gopt Opt_ProfLateInlineCcs dflags) $ CoreAddLateCcs
core_todo =
[
@@ -463,7 +463,6 @@ doCorePass pass guts = do
p_fam_env <- getPackageFamInstEnv
let platform = targetPlatform dflags
let fam_envs = (p_fam_env, mg_fam_inst_env guts)
- let prof_count_entries = gopt Opt_ProfCountEntries dflags
let updateBinds f = return $ guts { mg_binds = f (mg_binds guts) }
let updateBindsM f = f (mg_binds guts) >>= \b' -> return $ guts { mg_binds = b' }
@@ -513,7 +512,7 @@ doCorePass pass guts = do
addCallerCostCentres guts
CoreAddLateCcs -> {-# SCC "AddLateCcs" #-}
- return (addLateCostCentres prof_count_entries guts)
+ addLateCostCentresMG guts
CoreDoPrintCore -> {-# SCC "PrintCore" #-}
liftIO $ printCore logger (mg_binds guts) >> return guts
diff --git a/compiler/GHC/Driver/CodeOutput.hs b/compiler/GHC/Driver/CodeOutput.hs
index 5b3f614d8e..ce5a7e156d 100644
--- a/compiler/GHC/Driver/CodeOutput.hs
+++ b/compiler/GHC/Driver/CodeOutput.hs
@@ -327,7 +327,8 @@ outputForeignStubs_help fname doc_str header footer
-- | Generate code to initialise cost centres
profilingInitCode :: Platform -> Module -> CollectedCCs -> CStub
profilingInitCode platform this_mod (local_CCs, singleton_CCSs)
- = initializerCStub platform fn_name decls body
+ = {-# SCC profilingInitCode #-}
+ initializerCStub platform fn_name decls body
where
fn_name = mkInitializerStubLabel this_mod "prof_init"
decls = vcat
diff --git a/compiler/GHC/Driver/Flags.hs b/compiler/GHC/Driver/Flags.hs
index a4e5827bc6..fd23d2e81e 100644
--- a/compiler/GHC/Driver/Flags.hs
+++ b/compiler/GHC/Driver/Flags.hs
@@ -96,6 +96,7 @@ data DumpFlag
| Opt_D_dump_simpl_iterations
| Opt_D_dump_spec
| Opt_D_dump_prep
+ | Opt_D_dump_late_cc
| Opt_D_dump_stg_from_core -- ^ Initial STG (CoreToStg output)
| Opt_D_dump_stg_unarised -- ^ STG after unarise
| Opt_D_dump_stg_cg -- ^ STG (after stg2stg)
@@ -303,6 +304,7 @@ data GeneralFlag
-- profiling opts
| Opt_AutoSccsOnIndividualCafs
| Opt_ProfCountEntries
+ | Opt_ProfLateInlineCcs
| Opt_ProfLateCcs
| Opt_ProfManualCcs -- ^ Ignore manual SCC annotations
diff --git a/compiler/GHC/Driver/Main.hs b/compiler/GHC/Driver/Main.hs
index 2f6a3262d0..e03883702b 100644
--- a/compiler/GHC/Driver/Main.hs
+++ b/compiler/GHC/Driver/Main.hs
@@ -171,6 +171,8 @@ import GHC.Core.InstEnv
import GHC.Core.FamInstEnv
import GHC.Core.Rules
import GHC.Core.Stats
+import GHC.Core.LateCC (addLateCostCentresPgm)
+
import GHC.CoreToStg.Prep
import GHC.CoreToStg ( coreToStg )
@@ -268,7 +270,6 @@ import Data.Bifunctor (first)
import Data.List.NonEmpty (NonEmpty ((:|)))
-
{- **********************************************************************
%* *
Initialisation
@@ -1692,6 +1693,21 @@ hscGenHardCode hsc_env cgguts location output_filename = do
-- but we don't generate any code for newtypes
-------------------
+ -- Insert late cost centres if enabled.
+ -- If `-fprof-late-inline` is enabled we can skip this, as it will have added
+ -- a superset of cost centres we would add here already.
+
+ (late_cc_binds, late_local_ccs) <-
+ if gopt Opt_ProfLateCcs dflags && not (gopt Opt_ProfLateInlineCcs dflags)
+ then {-# SCC lateCC #-} do
+ (binds,late_ccs) <- addLateCostCentresPgm dflags logger this_mod core_binds
+ return ( binds, (S.toList late_ccs `mappend` local_ccs ))
+ else
+ return (core_binds, local_ccs)
+
+
+
+ -------------------
-- PREPARE FOR CODE GENERATION
-- Do saturation and convert to A-normal form
(prepd_binds) <- {-# SCC "CorePrep" #-} do
@@ -1700,7 +1716,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do
(hsc_logger hsc_env)
cp_cfg
(initCorePrepPgmConfig (hsc_dflags hsc_env) (interactiveInScope $ hsc_IC hsc_env))
- this_mod location core_binds data_tycons
+ this_mod location late_cc_binds data_tycons
----------------- Convert to STG ------------------
(stg_binds, denv, (caf_ccs, caf_cc_stacks))
@@ -1711,7 +1727,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do
(myCoreToStg logger dflags (hsc_IC hsc_env) False this_mod location prepd_binds)
let cost_centre_info =
- (local_ccs ++ caf_ccs, caf_cc_stacks)
+ (late_local_ccs ++ caf_ccs, caf_cc_stacks)
platform = targetPlatform dflags
prof_init
| sccProfilingEnabled dflags = profilingInitCode platform this_mod cost_centre_info
diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs
index ee032a2652..0407952c33 100644
--- a/compiler/GHC/Driver/Session.hs
+++ b/compiler/GHC/Driver/Session.hs
@@ -2486,6 +2486,8 @@ dynamic_flags_deps = [
(setDumpFlag Opt_D_dump_spec)
, make_ord_flag defGhcFlag "ddump-prep"
(setDumpFlag Opt_D_dump_prep)
+ , make_ord_flag defGhcFlag "ddump-late-cc"
+ (setDumpFlag Opt_D_dump_late_cc)
, make_ord_flag defGhcFlag "ddump-stg-from-core"
(setDumpFlag Opt_D_dump_stg_from_core)
, make_ord_flag defGhcFlag "ddump-stg-unarised"
@@ -3446,6 +3448,7 @@ fFlagsDeps = [
flagSpec "prof-count-entries" Opt_ProfCountEntries,
flagSpec "prof-late" Opt_ProfLateCcs,
flagSpec "prof-manual" Opt_ProfManualCcs,
+ flagSpec "prof-late-inline" Opt_ProfLateInlineCcs,
flagSpec "regs-graph" Opt_RegsGraph,
flagSpec "regs-iterative" Opt_RegsIterative,
depFlagSpec' "rewrite-rules" Opt_EnableRewriteRules
diff --git a/compiler/GHC/Iface/Tidy.hs b/compiler/GHC/Iface/Tidy.hs
index 1edd3cb9bf..7fec591196 100644
--- a/compiler/GHC/Iface/Tidy.hs
+++ b/compiler/GHC/Iface/Tidy.hs
@@ -347,7 +347,7 @@ data UnfoldingExposure
data TidyOpts = TidyOpts
{ opt_name_cache :: !NameCache
- , opt_collect_ccs :: !Bool
+ , opt_collect_ccs :: !Bool -- ^ Always true if we compile with -prof
, opt_unfolding_opts :: !UnfoldingOpts
, opt_expose_unfoldings :: !UnfoldingExposure
-- ^ Which unfoldings to expose
@@ -468,7 +468,7 @@ tidyProgram opts (ModGuts { mg_module = mod
-- unfoldings.
collectCostCentres :: Module -> CoreProgram -> [CoreRule] -> S.Set CostCentre
collectCostCentres mod_name binds rules
- = foldl' go_bind (go_rules S.empty) binds
+ = {-# SCC collectCostCentres #-} foldl' go_bind (go_rules S.empty) binds
where
go cs e = case e of
Var{} -> cs
diff --git a/compiler/GHC/Stg/Pipeline.hs b/compiler/GHC/Stg/Pipeline.hs
index c5f7bc2da3..ca7b1aefa2 100644
--- a/compiler/GHC/Stg/Pipeline.hs
+++ b/compiler/GHC/Stg/Pipeline.hs
@@ -126,7 +126,7 @@ stg2stg logger extra_vars opts this_mod binds
StgUnarise -> do
us <- getUniqueSupplyM
liftIO (stg_linter False "Pre-unarise" binds)
- let binds' = unarise us binds
+ let binds' = {-# SCC "StgUnarise" #-} unarise us binds
liftIO (dump_when Opt_D_dump_stg_unarised "Unarised STG:" binds')
liftIO (stg_linter True "Unarise" binds')
return binds'
diff --git a/compiler/GHC/Types/CostCentre.hs b/compiler/GHC/Types/CostCentre.hs
index 61f6b87c88..092b727d8d 100644
--- a/compiler/GHC/Types/CostCentre.hs
+++ b/compiler/GHC/Types/CostCentre.hs
@@ -31,6 +31,7 @@ import GHC.Utils.Outputable
import GHC.Types.SrcLoc
import GHC.Data.FastString
import GHC.Types.CostCentre.State
+import GHC.Utils.Panic.Plain
import Data.Data
@@ -71,6 +72,7 @@ data CCFlavour = CafCC -- ^ Auto-generated top-level thunk
| ExprCC !CostCentreIndex -- ^ Explicitly annotated expression
| DeclCC !CostCentreIndex -- ^ Explicitly annotated declaration
| HpcCC !CostCentreIndex -- ^ Generated by HPC for coverage
+ | LateCC !CostCentreIndex -- ^ Annotated by the one of the prof-last* passes.
deriving (Eq, Ord, Data)
-- | Extract the index from a flavour
@@ -79,6 +81,7 @@ flavourIndex CafCC = 0
flavourIndex (ExprCC x) = unCostCentreIndex x
flavourIndex (DeclCC x) = unCostCentreIndex x
flavourIndex (HpcCC x) = unCostCentreIndex x
+flavourIndex (LateCC x) = unCostCentreIndex x
instance Eq CostCentre where
c1 == c2 = case c1 `cmpCostCentre` c2 of { EQ -> True; _ -> False }
@@ -292,7 +295,8 @@ ppFlavourLblComponent :: CCFlavour -> SDoc
ppFlavourLblComponent CafCC = text "CAF"
ppFlavourLblComponent (ExprCC i) = text "EXPR" <> ppIdxLblComponent i
ppFlavourLblComponent (DeclCC i) = text "DECL" <> ppIdxLblComponent i
-ppFlavourLblComponent (HpcCC i) = text "HPC" <> ppIdxLblComponent i
+ppFlavourLblComponent (HpcCC i) = text "HPC" <> ppIdxLblComponent i
+ppFlavourLblComponent (LateCC i) = text "LATECC" <> ppIdxLblComponent i
-- ^ Print the flavour index component of a C label
ppIdxLblComponent :: CostCentreIndex -> SDoc
@@ -328,13 +332,18 @@ instance Binary CCFlavour where
put_ bh (HpcCC i) = do
putByte bh 3
put_ bh i
+ put_ bh (LateCC i) = do
+ putByte bh 4
+ put_ bh i
get bh = do
h <- getByte bh
case h of
0 -> return CafCC
1 -> ExprCC <$> get bh
2 -> DeclCC <$> get bh
- _ -> HpcCC <$> get bh
+ 3 -> HpcCC <$> get bh
+ 4 -> LateCC <$> get bh
+ _ -> panic "Invalid CCFlavour"
instance Binary CostCentre where
put_ bh (NormalCC aa ab ac _ad) = do
diff --git a/docs/users_guide/debugging.rst b/docs/users_guide/debugging.rst
index 0c09c4c3ec..87a689c268 100644
--- a/docs/users_guide/debugging.rst
+++ b/docs/users_guide/debugging.rst
@@ -446,6 +446,12 @@ subexpression elimination pass.
Dump output of Core preparation pass
+.. ghc-flag:: -ddump-late-cc
+ :shortdesc: Dump core with late cost centres added
+ :type: dynamic
+
+ Dump output of LateCC pass after cost centres have been added.
+
.. ghc-flag:: -ddump-view-pattern-commoning
:shortdesc: Dump commoned view patterns
:type: dynamic
diff --git a/docs/users_guide/profiling.rst b/docs/users_guide/profiling.rst
index 418c9b0bb0..1c2f458f10 100644
--- a/docs/users_guide/profiling.rst
+++ b/docs/users_guide/profiling.rst
@@ -439,19 +439,47 @@ compiled program.
details.
.. ghc-flag:: -fprof-late
- :shortdesc: Auto-add ``SCC``\\ s to all top level bindings *after* the optimizer has run.
+ :shortdesc: Auto-add ``SCC``\\ s to all top level bindings *after* the core pipeline has run.
:type: dynamic
:reverse: -fno-prof-late
:category:
:since: 9.4.1
- Adds an automatic ``SCC`` annotation to all top level bindings late in the core pipeline after
- the optimizer has run. This means these cost centres will not interfere with core-level optimizations
+ Adds an automatic ``SCC`` annotation to all top level bindings late in the compilation pipeline after
+ the optimizer has run and unfoldings have been created. This means these cost centres will not interfere with core-level optimizations
and the resulting profile will be closer to the performance profile of an optimized non-profiled
executable.
- While the results of this are generally very informative some of the compiler internal names
- will leak into the profile.
+ While the results of this are generally informative, some of the compiler internal names
+ will leak into the profile. Further if a function is inlined into a use site it's costs will be counted against the
+ caller's cost center.
+
+ For example if we have this code:
+
+ .. code-block:: haskell
+
+ {-# INLINE mysum #-}
+ mysum = sum
+ main = print $ mysum [1..9999999]
+
+ Then ``mysum`` will not show up in the profile since it will be inlined into main and therefore
+ it's associated costs will be attributed to mains implicit cost centre.
+
+.. ghc-flag:: -fprof-late-inline
+ :shortdesc: Auto-add ``SCC``\\ s to all top level bindings *after* the optimizer has run and retain them when inlining.
+ :type: dynamic
+ :reverse: -fno-prof-late-inline
+ :category:
+
+ :since: 9.4.1
+
+ Adds an automatic ``SCC`` annotation to all top level bindings late in the core pipeline after
+ the optimizer has run. This is the same as :ghc-flag:`-fprof-late` except that cost centers are included in some unfoldings.
+
+ The result of which is that cost centers *can* inhibit core optimizations to some degree at use sites
+ after inlining. Further there can be significant overhead from cost centres added to small functions if they are inlined often.
+
+ You can try this mode if :ghc-flag:`-fprof-late` results in a profile that's too hard to interpret.
.. ghc-flag:: -fprof-cafs
:shortdesc: Auto-add ``SCC``\\ s to all CAFs
diff --git a/testsuite/tests/profiling/should_compile/all.T b/testsuite/tests/profiling/should_compile/all.T
index 9e460e153b..1f4ae1430d 100644
--- a/testsuite/tests/profiling/should_compile/all.T
+++ b/testsuite/tests/profiling/should_compile/all.T
@@ -4,6 +4,8 @@ test('prof001', [only_ways(['normal']), req_profiling], compile_and_run, ['-prof
test('prof002', [only_ways(['normal']), req_profiling], compile_and_run, ['-prof -fprof-cafs'])
test('prof-late-cc', [only_ways(['normal']), req_profiling], compile, ['-prof -fprof-late'])
test('prof-late-cc2', [only_ways(['normal']), req_profiling], compile_and_run, ['-prof -fprof-late'])
+test('prof-late-cc3', [only_ways(['normal']), req_profiling, grep_errmsg('scc')], compile, ['-prof -fprof-late -ddump-prep -ddump-simpl -dno-typeable-binds'])
+test('prof-late-cc4', [only_ways(['normal']), req_profiling, grep_errmsg('scc')], compile, ['-prof -fprof-late-inline -ddump-simpl -dno-typeable-binds -O'])
test('T2410', [only_ways(['normal']), req_profiling], compile, ['-O2 -prof -fprof-cafs'])
test('T5889', [only_ways(['normal']), req_profiling, extra_files(['T5889/A.hs', 'T5889/B.hs'])], multimod_compile, ['A B', '-O -prof -fno-prof-count-entries -v0'])
diff --git a/testsuite/tests/profiling/should_compile/prof-late-cc3.hs b/testsuite/tests/profiling/should_compile/prof-late-cc3.hs
new file mode 100644
index 0000000000..b7badbc216
--- /dev/null
+++ b/testsuite/tests/profiling/should_compile/prof-late-cc3.hs
@@ -0,0 +1,8 @@
+module M where
+
+-- There should be a cost center in core prep output but not in
+-- -ddump-simpl output with -fprof-late
+{-# INLINE doStuff #-}
+doStuff x = do
+ print x
+ return x
diff --git a/testsuite/tests/profiling/should_compile/prof-late-cc3.stderr b/testsuite/tests/profiling/should_compile/prof-late-cc3.stderr
new file mode 100644
index 0000000000..92d9b1cd8f
--- /dev/null
+++ b/testsuite/tests/profiling/should_compile/prof-late-cc3.stderr
@@ -0,0 +1,50 @@
+
+==================== Tidy Core ====================
+Result size of Tidy Core
+ = {terms: 12, types: 16, coercions: 0, joins: 0/0}
+
+-- RHS size: {terms: 11, types: 10, coercions: 0, joins: 0/0}
+doStuff [InlPrag=INLINE (sat-args=1)]
+ :: forall {b}. Show b => b -> IO b
+[GblId, Arity=2, Unf=OtherCon []]
+doStuff
+ = \ (@b_axP) ($dShow_aET :: Show b_axP) (eta_B0 :: b_axP) ->
+ >>
+ @IO
+ GHC.Base.$fMonadIO
+ @()
+ @b_axP
+ (print @b_axP $dShow_aET eta_B0)
+ (return @IO GHC.Base.$fMonadIO @b_axP eta_B0)
+
+
+
+
+==================== CorePrep ====================
+Result size of CorePrep
+ = {terms: 16, types: 20, coercions: 0, joins: 0/2}
+
+-- RHS size: {terms: 15, types: 14, coercions: 0, joins: 0/2}
+M.doStuff [InlPrag=INLINE (sat-args=1)]
+ :: forall {b}. GHC.Show.Show b => b -> GHC.Types.IO b
+[GblId, Arity=2, Unf=OtherCon []]
+M.doStuff
+ = \ (@b_axP)
+ ($dShow_sJs [Occ=OnceL1] :: GHC.Show.Show b_axP)
+ (eta_sJt :: b_axP) ->
+ scctick<doStuff>
+ let {
+ sat_sJv [Occ=Once1] :: GHC.Types.IO b_axP
+ [LclId]
+ sat_sJv
+ = GHC.Base.return
+ @GHC.Types.IO GHC.Base.$fMonadIO @b_axP eta_sJt } in
+ let {
+ sat_sJu [Occ=Once1] :: GHC.Types.IO ()
+ [LclId]
+ sat_sJu = System.IO.print @b_axP $dShow_sJs eta_sJt } in
+ GHC.Base.>>
+ @GHC.Types.IO GHC.Base.$fMonadIO @() @b_axP sat_sJu sat_sJv
+
+
+
diff --git a/testsuite/tests/profiling/should_compile/prof-late-cc4.hs b/testsuite/tests/profiling/should_compile/prof-late-cc4.hs
new file mode 100644
index 0000000000..45d628cd0f
--- /dev/null
+++ b/testsuite/tests/profiling/should_compile/prof-late-cc4.hs
@@ -0,0 +1,5 @@
+module M where
+
+-- There should be a cost center in the -ddump-simpl output
+addStuff x y = do
+ x + y :: Int
diff --git a/testsuite/tests/profiling/should_compile/prof-late-cc4.stderr b/testsuite/tests/profiling/should_compile/prof-late-cc4.stderr
new file mode 100644
index 0000000000..2de7dad615
--- /dev/null
+++ b/testsuite/tests/profiling/should_compile/prof-late-cc4.stderr
@@ -0,0 +1,18 @@
+
+==================== Tidy Core ====================
+Result size of Tidy Core
+ = {terms: 2, types: 3, coercions: 0, joins: 0/0}
+
+-- RHS size: {terms: 1, types: 0, coercions: 0, joins: 0/0}
+addStuff [InlPrag=INLINE (sat-args=2)] :: Int -> Int -> Int
+[GblId,
+ Str=<1!P(L)><1!P(L)>,
+ Cpr=1,
+ Unf=Unf{Src=InlineStable, TopLvl=True, Value=True, ConLike=True,
+ WorkFree=True, Expandable=True,
+ Guidance=ALWAYS_IF(arity=2,unsat_ok=False,boring_ok=True)
+ Tmpl= GHC.Num.$fNumInt_$c+}]
+addStuff = scctick<addStuff> GHC.Num.$fNumInt_$c+
+
+
+