Reduce inlining in deeply-nested cases

This adds a new heuristic, controllable via two new flags to better tune inlining behaviour. The new flags are -funfolding-case-threshold and -funfolding-case-scaling which are document both in the user guide and in Note [Avoid inlining into deeply nested cases]. Co-authored-by: Andreas Klebinger <klebinger.andreas@gmx.at>
author: Simon Peyton Jones <simonpj@microsoft.com> 2020-12-16 12:57:17 +0000
committer: Marge Bot <ben+marge-bot@smart-cactus.org> 2021-02-09 16:21:40 -0500
commit: 891a791f3f2438e5d768e1f96119d275e58e8d37 (patch)
tree: 5a051da2d04b14ce92bdcb98a5c5ab84c9d344fb /compiler
parent: ab5fd982a7a501136cb8b90fa841c02cc9551b5a (diff)
download: haskell-891a791f3f2438e5d768e1f96119d275e58e8d37.tar.gz
6 files changed, 175 insertions, 26 deletions
diff --git a/compiler/GHC/Core/Opt/Simplify.hs b/compiler/GHC/Core/Opt/Simplify.hs
index 0cea6da0bf..4ca8985f8b 100644
--- a/compiler/GHC/Core/Opt/Simplify.hs
+++ b/compiler/GHC/Core/Opt/Simplify.hs
@@ -1882,7 +1882,7 @@ simplIdF env var cont
 
 completeCall :: SimplEnv -> OutId -> SimplCont -> SimplM (SimplFloats, OutExpr)
 completeCall env var cont
-  | Just expr <- callSiteInline dflags var active_unf
+  | Just expr <- callSiteInline dflags case_depth var active_unf
                                 lone_variable arg_infos interesting_cont
   -- Inline the variable's RHS
   = do { checkedTick (UnfoldingDone var)
@@ -1897,7 +1897,8 @@ completeCall env var cont
        ; rebuildCall env info cont }
 
   where
-    dflags = seDynFlags env
+    dflags     = seDynFlags env
+    case_depth = seCaseDepth env
     (lone_variable, arg_infos, call_cont) = contArgs cont
     n_val_args       = length arg_infos
     interesting_cont = interestingCallContext env call_cont
@@ -2724,9 +2725,11 @@ reallyRebuildCase env scrut case_bndr alts cont
        ; rebuild env case_expr cont }
 
   | otherwise
-  = do { (floats, cont') <- mkDupableCaseCont env alts cont
-       ; case_expr <- simplAlts (env `setInScopeFromF` floats)
-                                scrut (scaleIdBy holeScaling case_bndr) (scaleAltsBy holeScaling alts) cont'
+  = do { (floats, env', cont') <- mkDupableCaseCont env alts cont
+       ; case_expr <- simplAlts env' scrut
+                                (scaleIdBy holeScaling case_bndr)
+                                (scaleAltsBy holeScaling alts)
+                                cont'
        ; return (floats, case_expr) }
   where
     holeScaling = contHoleScaling cont
@@ -3234,10 +3237,15 @@ join points and inlining them away.  See #4930.
 
 --------------------
 mkDupableCaseCont :: SimplEnv -> [InAlt] -> SimplCont
-                  -> SimplM (SimplFloats, SimplCont)
+                  -> SimplM ( SimplFloats  -- Join points (if any)
+                            , SimplEnv     -- Use this for the alts
+                            , SimplCont)
 mkDupableCaseCont env alts cont
-  | altsWouldDup alts = mkDupableCont env cont
-  | otherwise         = return (emptyFloats env, cont)
+  | altsWouldDup alts = do { (floats, cont) <- mkDupableCont env cont
+                           ; let env' = bumpCaseDepth $
+                                        env `setInScopeFromF` floats
+                           ; return (floats, env', cont) }
+  | otherwise         = return (emptyFloats env, env, cont)
 
 altsWouldDup :: [InAlt] -> Bool -- True iff strictly > 1 non-bottom alternative
 altsWouldDup []  = False        -- See Note [Bottom alternatives]
@@ -3370,12 +3378,11 @@ mkDupableContWithDmds env _
         --              in case [...hole...] of { pi -> ji xij }
         -- NB: sc_dup /= OkToDup; that is caught earlier by contIsDupable
     do  { tick (CaseOfCase case_bndr)
-        ; (floats, alt_cont) <- mkDupableCaseCont env alts cont
+        ; (floats, alt_env, alt_cont) <- mkDupableCaseCont (se `setInScopeFromE` env) alts cont
                 -- NB: We call mkDupableCaseCont here to make cont duplicable
                 --     (if necessary, depending on the number of alts)
                 -- And this is important: see Note [Fusing case continuations]
 
-        ; let alt_env = se `setInScopeFromF` floats
         ; let cont_scaling = contHoleScaling cont
           -- See Note [Scaling in case-of-case]
         ; (alt_env', case_bndr') <- simplBinder alt_env (scaleIdBy cont_scaling case_bndr)
diff --git a/compiler/GHC/Core/Opt/Simplify/Env.hs b/compiler/GHC/Core/Opt/Simplify/Env.hs
index 180e562c73..0d4e06f9c2 100644
--- a/compiler/GHC/Core/Opt/Simplify/Env.hs
+++ b/compiler/GHC/Core/Opt/Simplify/Env.hs
@@ -14,7 +14,7 @@ module GHC.Core.Opt.Simplify.Env (
         SimplEnv(..), pprSimplEnv,   -- Temp not abstract
         mkSimplEnv, extendIdSubst,
         extendTvSubst, extendCvSubst,
-        zapSubstEnv, setSubstEnv,
+        zapSubstEnv, setSubstEnv, bumpCaseDepth,
         getInScope, setInScopeFromE, setInScopeFromF,
         setInScopeSet, modifyInScope, addNewInScopeIds,
         getSimplRules,
@@ -103,6 +103,8 @@ data SimplEnv
         -- The current set of in-scope variables
         -- They are all OutVars, and all bound in this module
       , seInScope   :: InScopeSet       -- OutVars only
+
+      , seCaseDepth :: !Int  -- Depth of multi-branch case alternatives
     }
 
 data SimplFloats
@@ -272,11 +274,12 @@ points we're substituting. -}
 
 mkSimplEnv :: SimplMode -> SimplEnv
 mkSimplEnv mode
-  = SimplEnv { seMode = mode
-             , seInScope = init_in_scope
-             , seTvSubst = emptyVarEnv
-             , seCvSubst = emptyVarEnv
-             , seIdSubst = emptyVarEnv }
+  = SimplEnv { seMode      = mode
+             , seInScope   = init_in_scope
+             , seTvSubst   = emptyVarEnv
+             , seCvSubst   = emptyVarEnv
+             , seIdSubst   = emptyVarEnv
+             , seCaseDepth = 0 }
         -- The top level "enclosing CC" is "SUBSUMED".
 
 init_in_scope :: InScopeSet
@@ -319,6 +322,9 @@ setMode mode env = env { seMode = mode }
 updMode :: (SimplMode -> SimplMode) -> SimplEnv -> SimplEnv
 updMode upd env = env { seMode = upd (seMode env) }
 
+bumpCaseDepth :: SimplEnv -> SimplEnv
+bumpCaseDepth env = env { seCaseDepth = seCaseDepth env + 1 }
+
 ---------------------
 extendIdSubst :: SimplEnv -> Id -> SimplSR -> SimplEnv
 extendIdSubst env@(SimplEnv {seIdSubst = subst}) var res
diff --git a/compiler/GHC/Core/Opt/Simplify/Monad.hs b/compiler/GHC/Core/Opt/Simplify/Monad.hs
index 0130fcf61e..d1e27f9fca 100644
--- a/compiler/GHC/Core/Opt/Simplify/Monad.hs
+++ b/compiler/GHC/Core/Opt/Simplify/Monad.hs
@@ -254,8 +254,13 @@ checkedTick t
       [ text "When trying" <+> ppr t
       , text "To increase the limit, use -fsimpl-tick-factor=N (default 100)."
       , space
-      , text "If you need to increase the limit substantially, please file a"
-      , text "bug report and indicate the factor you needed."
+      , text "In addition try adjusting -funfolding-case-threshold=N and"
+      , text "-funfolding-case-scaling=N for the module in question."
+      , text "Using threshold=1 and scaling=5 should break most inlining loops."
+      , space
+      , text "If you need to increase the tick factor substantially, while also"
+      , text "adjusting unfolding parameters please file a bug report and"
+      , text "indicate the factor you needed."
       , space
       , text "If GHC was unable to complete compilation even"
                <+> text "with a very large factor"
diff --git a/compiler/GHC/Core/Unfold.hs b/compiler/GHC/Core/Unfold.hs
index b97e10cd46..8a61eec3c7 100644
--- a/compiler/GHC/Core/Unfold.hs
+++ b/compiler/GHC/Core/Unfold.hs
@@ -26,7 +26,7 @@ module GHC.Core.Unfold (
         UnfoldingOpts (..), defaultUnfoldingOpts,
         updateCreationThreshold, updateUseThreshold,
         updateFunAppDiscount, updateDictDiscount,
-        updateVeryAggressive,
+        updateVeryAggressive, updateCaseScaling, updateCaseThreshold,
 
         ArgSummary(..),
 
@@ -82,6 +82,12 @@ data UnfoldingOpts = UnfoldingOpts
 
    , unfoldingVeryAggressive :: !Bool
       -- ^ Force inlining in many more cases
+
+      -- Don't consider depth up to x
+   , unfoldingCaseThreshold :: !Int
+
+      -- Penalize depth with 1/x
+   , unfoldingCaseScaling :: !Int
    }
 
 defaultUnfoldingOpts :: UnfoldingOpts
@@ -106,6 +112,13 @@ defaultUnfoldingOpts = UnfoldingOpts
       -- we'll be able to pick the right method from a dictionary
 
    , unfoldingVeryAggressive = False
+
+      -- Only apply scaling once we are deeper than threshold cases
+      -- in an RHS.
+   , unfoldingCaseThreshold = 2
+
+      -- Penalize depth with (size*depth)/scaling
+   , unfoldingCaseScaling = 30
    }
 
 -- Helpers for "GHC.Driver.Session"
@@ -125,6 +138,13 @@ updateDictDiscount n opts = opts { unfoldingDictDiscount = n }
 updateVeryAggressive :: Bool -> UnfoldingOpts -> UnfoldingOpts
 updateVeryAggressive n opts = opts { unfoldingVeryAggressive = n }
 
+
+updateCaseThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts
+updateCaseThreshold n opts = opts { unfoldingCaseThreshold = n }
+
+updateCaseScaling :: Int -> UnfoldingOpts -> UnfoldingOpts
+updateCaseScaling n opts = opts { unfoldingCaseScaling = n }
+
 {-
 Note [Occurrence analysis of unfoldings]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1033,6 +1053,7 @@ StrictAnal.addStrictnessInfoToTopId
 -}
 
 callSiteInline :: DynFlags
+               -> Int                   -- Case depth
                -> Id                    -- The Id
                -> Bool                  -- True <=> unfolding is active
                -> Bool                  -- True if there are no arguments at all (incl type args)
@@ -1075,7 +1096,7 @@ instance Outputable CallCtxt where
   ppr DiscArgCtxt = text "DiscArgCtxt"
   ppr RuleArgCtxt = text "RuleArgCtxt"
 
-callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
+callSiteInline dflags !case_depth id active_unfolding lone_variable arg_infos cont_info
   = case idUnfolding id of
       -- idUnfolding checks for loop-breakers, returning NoUnfolding
       -- Things with an INLINE pragma may have an unfolding *and*
@@ -1083,7 +1104,7 @@ callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
         CoreUnfolding { uf_tmpl = unf_template
                       , uf_is_work_free = is_wf
                       , uf_guidance = guidance, uf_expandable = is_exp }
-          | active_unfolding -> tryUnfolding dflags id lone_variable
+          | active_unfolding -> tryUnfolding dflags case_depth id lone_variable
                                     arg_infos cont_info unf_template
                                     is_wf is_exp guidance
           | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing
@@ -1110,10 +1131,106 @@ traceInline dflags inline_id str doc result
       = False
 {-# INLINE traceInline #-} -- see Note [INLINE conditional tracing utilities]
 
-tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
+{- Note [Avoid inlining into deeply nested cases]
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Consider a function f like this:
+
+  f arg1 arg2 =
+    case ...
+      ... -> g arg1
+      ... -> g arg2
+
+This function is small. So should be safe to inline.
+However sometimes this doesn't quite work out like that.
+Consider this code:
+
+f1 arg1 arg2 ... = ...
+    case _foo of
+      alt1 -> ... f2 arg1 ...
+      alt2 -> ... f2 arg2 ...
+
+f2 arg1 arg2 ... = ...
+    case _foo of
+      alt1 -> ... f3 arg1 ...
+      alt2 -> ... f3 arg2 ...
+
+f3 arg1 arg2 ... = ...
+
+... repeats up to n times. And then f1 is
+applied to some arguments:
+
+foo = ... f1 <interestingArgs> ...
+
+Initially f2..fn are not interesting to inline so we don't.
+However we see that f1 is applied to interesting args.
+So it's an obvious choice to inline those:
+
+foo =
+    ...
+      case _foo of
+        alt1 -> ... f2 <interestingArg> ...
+        alt2 -> ... f2 <interestingArg> ...
+
+As a result we go and inline f2 both mentions of f2 in turn are now applied to interesting
+arguments and f2 is small:
+
+foo =
+    ...
+      case _foo of
+        alt1 -> ... case _foo of
+            alt1 -> ... f3 <interestingArg> ...
+            alt2 -> ... f3 <interestingArg> ...
+
+        alt2 -> ... case _foo of
+            alt1 -> ... f3 <interestingArg> ...
+            alt2 -> ... f3 <interestingArg> ...
+
+The same thing happens for each binding up to f_n, duplicating the amount of inlining
+done in each step. Until at some point we are either done or run out of simplifier
+ticks/RAM. This pattern happened #18730.
+
+To combat this we introduce one more heuristic when weighing inlining decision.
+We keep track of a "case-depth". Which increases each time we look inside a case
+expression with more than one alternative.
+
+We then apply a penalty to inlinings based on the case-depth at which they would
+be inlined. Bounding the number of inlinings in such a scenario.
+
+The heuristic can be tuned in two ways:
+
+* We can ignore the first n levels of case nestings for inlining decisions using
+  -funfolding-case-threshold.
+* The penalty grows linear with the depth. It's computed as size*(depth-threshold)/scaling.
+  Scaling can be set with -funfolding-case-scaling.
+
+Some guidance on setting these defaults:
+
+* A low treshold (<= 2) is needed to prevent exponential cases from spiraling out of
+  control. We picked 2 for no particular reason.
+* Scaling the penalty by any more than 30 means the reproducer from
+  T18730 won't compile even with reasonably small values of n. Instead
+  it will run out of runs/ticks. This means to positively affect the reproducer
+  a scaling <= 30 is required.
+* A scaling of >= 15 still causes a few very large regressions on some nofib benchmarks.
+  (+80% for gc/fulsom, +90% for real/ben-raytrace, +20% for spectral/fibheaps)
+* A scaling of >= 25 showed no regressions on nofib. However it showed a number of
+  (small) regression for compiler perf benchmarks.
+
+The end result is that we are settling for a scaling of 30, with a threshold of 2.
+This gives us minimal compiler perf regressions. No nofib runtime regressions and
+will still avoid this pattern sometimes. This is a "safe" default, where we err on
+the side of compiler blowup instead of risking runtime regressions.
+
+For cases where the default falls short the flag can be changed to allow more/less inlining as
+needed on a per-module basis.
+
+-}
+
+tryUnfolding :: DynFlags -> Int -> Id -> Bool -> [ArgSummary] -> CallCtxt
              -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
              -> Maybe CoreExpr
-tryUnfolding dflags id lone_variable
+tryUnfolding dflags !case_depth id lone_variable
              arg_infos cont_info unf_template
              is_wf is_exp guidance
  = case guidance of
@@ -1138,9 +1255,16 @@ tryUnfolding dflags id lone_variable
         -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing
         where
           some_benefit = calc_some_benefit (length arg_discounts)
-          extra_doc = text "discounted size =" <+> int discounted_size
-          discounted_size = size - discount
-          small_enough = discounted_size <= unfoldingUseThreshold uf_opts
+          extra_doc = vcat [ text "case depth =" <+> int case_depth
+                           , text "depth based penalty =" <+> int depth_penalty
+                           , text "discounted size =" <+> int adjusted_size ]
+          -- See Note [Avoid inlining into deeply nested cases]
+          depth_treshold = unfoldingCaseThreshold uf_opts
+          depth_scaling = unfoldingCaseScaling uf_opts
+          depth_penalty | case_depth <= depth_treshold = 0
+                        | otherwise       = (size * (case_depth - depth_treshold)) `div` depth_scaling
+          adjusted_size = size + depth_penalty - discount
+          small_enough = adjusted_size <= unfoldingUseThreshold uf_opts
           discount = computeDiscount arg_discounts res_discount arg_infos cont_info
 
   where
diff --git a/compiler/GHC/Core/Unfold.hs-boot b/compiler/GHC/Core/Unfold.hs-boot
index b86f8b2585..c62f1915c9 100644
--- a/compiler/GHC/Core/Unfold.hs-boot
+++ b/compiler/GHC/Core/Unfold.hs-boot
@@ -11,3 +11,5 @@ updateUseThreshold      :: Int -> UnfoldingOpts -> UnfoldingOpts
 updateFunAppDiscount    :: Int -> UnfoldingOpts -> UnfoldingOpts
 updateDictDiscount      :: Int -> UnfoldingOpts -> UnfoldingOpts
 updateVeryAggressive    :: Bool -> UnfoldingOpts -> UnfoldingOpts
+updateCaseThreshold     :: Int -> UnfoldingOpts -> UnfoldingOpts
+updateCaseScaling       :: Int -> UnfoldingOpts -> UnfoldingOpts
diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs
index 85f1b71852..cee4ba692b 100644
--- a/compiler/GHC/Driver/Session.hs
+++ b/compiler/GHC/Driver/Session.hs
@@ -2845,6 +2845,11 @@ dynamic_flags_deps = [
   , make_ord_flag defFlag "funfolding-dict-discount"
       (intSuffix   (\n d -> d { unfoldingOpts = updateDictDiscount n (unfoldingOpts d)}))
 
+  , make_ord_flag defFlag "funfolding-case-threshold"
+      (intSuffix   (\n d -> d { unfoldingOpts = updateCaseThreshold n (unfoldingOpts d)}))
+  , make_ord_flag defFlag "funfolding-case-scaling"
+      (intSuffix   (\n d -> d { unfoldingOpts = updateCaseScaling n (unfoldingOpts d)}))
+
   , make_dep_flag defFlag "funfolding-keeness-factor"
       (floatSuffix (\_ d -> d))
       "-funfolding-keeness-factor is no longer respected as of GHC 9.0"
author	Simon Peyton Jones <simonpj@microsoft.com>	2020-12-16 12:57:17 +0000
committer	Marge Bot <ben+marge-bot@smart-cactus.org>	2021-02-09 16:21:40 -0500
commit	891a791f3f2438e5d768e1f96119d275e58e8d37 (patch)
tree	5a051da2d04b14ce92bdcb98a5c5ab84c9d344fb /compiler
parent	ab5fd982a7a501136cb8b90fa841c02cc9551b5a (diff)
download	haskell-891a791f3f2438e5d768e1f96119d275e58e8d37.tar.gz