{-# OPTIONS_GHC -Wno-incomplete-record-updates #-}

-- | Unfolding creation
module GHC.Core.Unfold.Make
   ( noUnfolding
   , mkUnfolding
   , mkCoreUnfolding
   , mkFinalUnfolding
   , mkFinalUnfolding'
   , mkSimpleUnfolding
   , mkWorkerUnfolding
   , mkInlineUnfoldingWithArity, mkInlineUnfoldingNoArity
   , mkInlinableUnfolding
   , mkWrapperUnfolding
   , mkCompulsoryUnfolding, mkCompulsoryUnfolding'
   , mkDFunUnfolding
   , mkDataConUnfolding
   , specUnfolding
   , certainlyWillInline
   )
where

import GHC.Prelude
import GHC.Core
import GHC.Core.Unfold
import GHC.Core.Unfoldings
import GHC.Core.Opt.OccurAnal ( occurAnalyseExpr )
import GHC.Core.Opt.Arity   ( manifestArity )
import GHC.Core.DataCon
import GHC.Core.Utils
import GHC.Types.Basic
import GHC.Types.Id
import GHC.Types.Id.Info
import GHC.Types.Demand ( DmdSig, isDeadEndSig )

import GHC.Utils.Outputable
import GHC.Utils.Misc
import GHC.Utils.Panic

import Data.Maybe ( fromMaybe )

-- the very simple optimiser is used to optimise unfoldings
import {-# SOURCE #-} GHC.Core.SimpleOpt



mkFinalUnfolding :: UnfoldingOpts -> UnfoldingSource -> DmdSig -> CoreExpr -> Unfolding
-- "Final" in the sense that this is a GlobalId that will not be further
-- simplified; so the unfolding should be occurrence-analysed
mkFinalUnfolding opts src strict_sig expr = mkFinalUnfolding' opts src strict_sig expr Nothing

-- See Note [Tying the 'CoreUnfolding' knot] for why interfaces need
-- to pass a precomputed 'UnfoldingCache'
mkFinalUnfolding' :: UnfoldingOpts -> UnfoldingSource -> DmdSig -> CoreExpr -> Maybe UnfoldingCache -> Unfolding
-- "Final" in the sense that this is a GlobalId that will not be further
-- simplified; so the unfolding should be occurrence-analysed
mkFinalUnfolding' opts src strict_sig expr
  = mkUnfolding opts src
                True {- Top level -}
                (isDeadEndSig strict_sig)
                expr

-- | Same as 'mkCompulsoryUnfolding' but simplifies the unfolding first
mkCompulsoryUnfolding' :: SimpleOpts -> CoreExpr -> Unfolding
mkCompulsoryUnfolding' opts expr = mkCompulsoryUnfolding (simpleOptExpr opts expr)

-- | Used for things that absolutely must be unfolded
mkCompulsoryUnfolding :: CoreExpr -> Unfolding
mkCompulsoryUnfolding expr
  = mkCoreUnfolding CompulsorySrc True
                    expr Nothing
                    (UnfWhen { ug_arity = 0    -- Arity of unfolding doesn't matter
                             , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })

-- Note [Top-level flag on inline rules]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Slight hack: note that mk_inline_rules conservatively sets the
-- top-level flag to True.  It gets set more accurately by the simplifier
-- Simplify.simplUnfolding.

mkSimpleUnfolding :: UnfoldingOpts -> CoreExpr -> Unfolding
mkSimpleUnfolding !opts rhs
  = mkUnfolding opts VanillaSrc False False rhs Nothing

mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
mkDFunUnfolding bndrs con ops
  = DFunUnfolding { df_bndrs = bndrs
                  , df_con = con
                  , df_args = map occurAnalyseExpr ops }
                  -- See Note [Occurrence analysis of unfoldings]

mkDataConUnfolding :: CoreExpr -> Unfolding
-- Used for non-newtype data constructors with non-trivial wrappers
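-- (e.g. a constructor with strict or UNPACKed fields, whose wrapper
-- evaluates or unpacks its arguments before applying the worker constructor)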
mkDataConUnfolding expr
  = mkCoreUnfolding StableSystemSrc True expr Nothing guide
    -- No need to simplify the expression
  where
    guide = UnfWhen { ug_arity     = manifestArity expr
                    , ug_unsat_ok  = unSaturatedOk
                    , ug_boring_ok = False }

mkWrapperUnfolding :: SimpleOpts -> CoreExpr -> Arity -> Unfolding
-- Make the unfolding for the wrapper in a worker/wrapper split
-- after demand/CPR analysis
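-- A typical wrapper looks (schematically) like
--     f x = case x of I# x# -> case $wf x# of r# -> I# r#
-- i.e. a small expression that we always want to expose for inlining.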
mkWrapperUnfolding opts expr arity
  = mkCoreUnfolding StableSystemSrc True
                    (simpleOptExpr opts expr) Nothing
                    (UnfWhen { ug_arity     = arity
                             , ug_unsat_ok  = unSaturatedOk
                             , ug_boring_ok = boringCxtNotOk })

mkWorkerUnfolding :: SimpleOpts -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
-- See Note [Worker/wrapper for INLINABLE functions] in GHC.Core.Opt.WorkWrap
mkWorkerUnfolding opts work_fn
                  (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
                                 , uf_is_top = top_lvl })
  | isStableSource src
  = mkCoreUnfolding src top_lvl new_tmpl Nothing guidance
  where
    new_tmpl = simpleOptExpr opts (work_fn tmpl)
    guidance = calcUnfoldingGuidance (so_uf_opts opts) False new_tmpl

mkWorkerUnfolding _ _ _ = noUnfolding

-- | Make an INLINE unfolding that may be used unsaturated
-- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
-- manifest arity (the number of outer lambdas that applications will
-- resolve before doing any work).
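-- For example, a template of the form  \x y -> f (x,y)  is reported here
-- with arity 2.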
mkInlineUnfoldingNoArity :: SimpleOpts -> UnfoldingSource -> CoreExpr -> Unfolding
mkInlineUnfoldingNoArity opts src expr
  = mkCoreUnfolding src
                    True         -- Note [Top-level flag on inline rules]
                    expr' Nothing guide
  where
    expr' = simpleOptExpr opts expr
    guide = UnfWhen { ug_arity = manifestArity expr'
                    , ug_unsat_ok = unSaturatedOk
                    , ug_boring_ok = boring_ok }
    boring_ok = inlineBoringOk expr'

-- | Make an INLINE unfolding that will be used once the RHS has been saturated
-- to the given arity.
mkInlineUnfoldingWithArity :: SimpleOpts -> UnfoldingSource -> Arity -> CoreExpr -> Unfolding
mkInlineUnfoldingWithArity opts src arity expr
  = mkCoreUnfolding src
                    True         -- Note [Top-level flag on inline rules]
                    expr' Nothing guide
  where
    expr' = simpleOptExpr opts expr
    guide = UnfWhen { ug_arity = arity
                    , ug_unsat_ok = needSaturated
                    , ug_boring_ok = boring_ok }
    -- See Note [INLINE pragmas and boring contexts] as to why we need to look
    -- at the arity here.
    boring_ok | arity == 0 = True
              | otherwise  = inlineBoringOk expr'

mkInlinableUnfolding :: SimpleOpts -> UnfoldingSource -> CoreExpr -> Unfolding
mkInlinableUnfolding opts src expr
  = mkUnfolding (so_uf_opts opts) src False False expr' Nothing
  where
    expr' = simpleOptExpr opts expr

specUnfolding :: SimpleOpts
              -> [Var] -> (CoreExpr -> CoreExpr)
              -> [CoreArg]   -- LHS arguments in the RULE
              -> Unfolding -> Unfolding
-- See Note [Specialising unfoldings]
-- specUnfolding spec_bndrs spec_args unf
--   = \spec_bndrs. unf spec_args
--
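-- For example (illustrative only): when specialising
--     f :: forall a. Ord a => a -> a -> Bool
-- at Int with a known Ord Int dictionary d, roughly
--     spec_bndrs    = []
--     spec_app e    = e @Int d
--     rule_lhs_args = [@Int, d]
-- and the specialised unfolding becomes the old template applied to @Int d,
-- with one fewer value argument (the dictionary) in its arity.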
specUnfolding opts spec_bndrs spec_app rule_lhs_args
              df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
  = assertPpr (rule_lhs_args `equalLength` old_bndrs)
              (ppr df $$ ppr rule_lhs_args) $
           -- For this ASSERT see Note [Specialising DFuns] in GHC.Core.Opt.Specialise
    mkDFunUnfolding spec_bndrs con (map spec_arg args)
      -- For DFunUnfoldings we transform
      --       \obs. MkD <op1> ... <opn>
      -- to
      --       \sbs. MkD ((\obs. <op1>) spec_args) ... ditto <opn>
  where
    spec_arg arg = simpleOptExpr opts $
                   spec_app (mkLams old_bndrs arg)
                   -- The beta-redexes created by spec_app will be
                   -- simplified away by simpleOptExpr

specUnfolding opts spec_bndrs spec_app rule_lhs_args
              (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
                             , uf_is_top = top_lvl
                             , uf_guidance = old_guidance })
 | isStableSource src  -- See Note [Specialising unfoldings]
 , UnfWhen { ug_arity = old_arity } <- old_guidance
 = mkCoreUnfolding src top_lvl new_tmpl Nothing
                   (old_guidance { ug_arity = old_arity - arity_decrease })
 where
   new_tmpl = simpleOptExpr opts $
              mkLams spec_bndrs  $
              spec_app tmpl  -- The beta-redexes created by spec_app
                             -- will be simplified away by simpleOptExpr
   arity_decrease = count isValArg rule_lhs_args - count isId spec_bndrs
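      -- E.g. (hypothetical numbers) if the RULE LHS applies the function to
      -- one type argument and two value arguments, and spec_bndrs binds no
      -- value variables, the arity recorded in the guidance drops by 2.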


specUnfolding _ _ _ _ _ = noUnfolding

{- Note [Specialising unfoldings]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When we specialise a function for some given type-class arguments, we use
specUnfolding to specialise its unfolding.  Some important points:

* If the original function has a DFunUnfolding, the specialised one
  must do so too!  Otherwise we lose the magic rules that make it
  interact with ClassOps

* For a /stable/ CoreUnfolding, we specialise the unfolding, no matter
  how big, iff it has UnfWhen guidance.  This happens for INLINE
  functions, and for wrappers.  For these, it would be very odd if a
  function marked INLINE was specialised (because of some local use),
  and then forever after (including importing modules) the specialised
  version wasn't INLINEd!  After all, the programmer said INLINE.

* However, for a stable CoreUnfolding with guidance UnfIfGoodArgs,
  which arises from INLINABLE functions, we drop the unfolding.
  See #4874 for persuasive examples.  Suppose we have
    {-# INLINABLE f #-}
    f :: Ord a => [a] -> Int
    f xs = letrec f' = ...f'... in f'

  Then, when f is specialised and optimised we might get
    wgo :: [Int] -> Int#
    wgo = ...wgo...
    f_spec :: [Int] -> Int
    f_spec xs = case wgo xs of { r -> I# r }

  and we clearly want to inline f_spec at call sites.  But if we still
  have the big, un-optimised RHS of f (albeit specialised) captured in the
  stable unfolding for f_spec, we won't get that optimisation.

  This happens with Control.Monad.liftM3, and can cause a lot more
  allocation as a result (nofib n-body shows this).

  Moreover, keeping the stable unfolding isn't much help, because
  the specialised function (probably) isn't overloaded any more.

  TL;DR: we simply drop the stable unfolding when specialising. It's not
  really a complete solution; ignoring specialisation for now, INLINABLE
  functions don't get properly strictness-analysed, for example.
  Moreover, it means that the specialised function has an INLINABLE
  pragma, but no stable unfolding. But it works well for examples
  involving specialisation, which is the dominant use of INLINABLE.

Note [Honour INLINE on 0-ary bindings]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Consider

   x = <expensive>
   {-# INLINE x #-}

   f y = ...x...

The semantics of an INLINE pragma is

  inline x at every call site, provided it is saturated;
  that is, applied to at least as many arguments as appear
  on the LHS of the Haskell source definition.

(This source-code-derived arity is stored in the `ug_arity` field of
the `UnfoldingGuidance`.)

In the example, x's ug_arity is 0, so we should inline it at every use
site.  It's rare to have such an INLINE pragma (usually INLINE is on
functions), but it's occasionally very important (#15578, #15519).
In #15519 we had something like
   x = case (g a b) of I# r -> T r
   {-# INLINE x #-}
   f y = ...(h x)....

where h is strict.  So we got
   f y = ...(case g a b of I# r -> h (T r))...

and that in turn allowed SpecConstr to ramp up performance.

How do we deliver on this?  By adjusting the ug_boring_ok
flag in mkInlineUnfoldingWithArity; see
Note [INLINE pragmas and boring contexts]

NB: there is a real risk that full laziness will float it right back
out again. Consider again
  x = factorial 200
  {-# INLINE x #-}
  f y = ...x...

After inlining we get
  f y = ...(factorial 200)...

but it's entirely possible that full laziness will do
  lvl23 = factorial 200
  f y = ...lvl23...

That's a problem for another day.

Note [INLINE pragmas and boring contexts]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
An INLINE pragma uses mkInlineUnfoldingWithArity to build the
unfolding.  That sets the ug_boring_ok flag to False if the function
is not tiny (inlineBoringOk), so that even INLINE functions are not
inlined in an utterly boring context.  E.g.
     \x y. Just (f y x)
Nothing is gained by inlining f here, even if it has an INLINE
pragma.

But for 0-ary bindings, we want to inline regardless; see
Note [Honour INLINE on 0-ary bindings].

I'm a bit worried that it's possible for the same kind of problem
to arise for non-0-ary functions too, but let's wait and see.
-}

mkUnfolding :: UnfoldingOpts
            -> UnfoldingSource
            -> Bool       -- Is top-level
            -> Bool       -- Definitely a bottoming binding
                          -- (only relevant for top-level bindings)
            -> CoreExpr
            -> Maybe UnfoldingCache
            -> Unfolding
-- Calculates unfolding guidance
-- Occurrence-analyses the expression before capturing it
mkUnfolding opts src top_lvl is_bottoming expr cache
  = mkCoreUnfolding src top_lvl expr cache guidance
  where
    is_top_bottoming = top_lvl && is_bottoming
    guidance         = calcUnfoldingGuidance opts is_top_bottoming expr
        -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr expr))!
        -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]

mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
                -> Maybe UnfoldingCache -> UnfoldingGuidance -> Unfolding
-- Occurrence-analyses the expression before capturing it
mkCoreUnfolding src top_lvl expr precomputed_cache guidance
  = CoreUnfolding { uf_tmpl = cache `seq`
                              occurAnalyseExpr expr
      -- occurAnalyseExpr: see Note [Occurrence analysis of unfoldings]
      -- See #20905 for a discussion of this 'seq'.
      -- We are careful to make sure we only
      -- have one copy of an unfolding around at once.
      -- Note [Thoughtful forcing in mkCoreUnfolding]

                  , uf_src          = src
                  , uf_is_top       = top_lvl
                  , uf_cache        = cache
                  , uf_guidance     = guidance }
  where
    is_value      = exprIsHNF expr
    is_conlike    = exprIsConLike expr
    is_work_free  = exprIsWorkFree expr
    is_expandable = exprIsExpandable expr

    recomputed_cache = UnfoldingCache { uf_is_value = is_value
                                      , uf_is_conlike = is_conlike
                                      , uf_is_work_free = is_work_free
                                      , uf_expandable = is_expandable }

    cache = fromMaybe recomputed_cache precomputed_cache

----------------
certainlyWillInline :: UnfoldingOpts -> IdInfo -> CoreExpr -> Maybe Unfolding
-- ^ Sees if the unfolding is pretty certain to inline.
-- If so, return a *stable* unfolding for it, that will always inline.
-- The CoreExpr is the WW'd and simplified RHS. In contrast, the unfolding
-- template might not have been WW'd yet.
certainlyWillInline opts fn_info rhs'
  = case fn_unf of
      CoreUnfolding { uf_guidance = guidance, uf_src = src }
        | noinline -> Nothing       -- See Note [Worker/wrapper for NOINLINE functions]
        | otherwise
        -> case guidance of
             UnfNever   -> Nothing
             UnfWhen {} -> Just (fn_unf { uf_src = src', uf_tmpl = tmpl' })
                             -- INLINE functions have UnfWhen
             UnfIfGoodArgs { ug_size = size, ug_args = args }
                        -> do_cunf size args src' tmpl'
        where
          src' | isCompulsorySource src = src  -- Do not change InlineCompulsory!
               | otherwise              = StableSystemSrc

          tmpl' | isStableSource src = uf_tmpl fn_unf
                | otherwise          = occurAnalyseExpr rhs'
                -- Do not overwrite stable unfoldings!

      DFunUnfolding {} -> Just fn_unf  -- Don't w/w DFuns; it never makes sense
                                       -- to do so, and even if it is currently a
                                       -- loop breaker, it may not be later

      _other_unf       -> Nothing

  where
    noinline = isNoInlinePragma (inlinePragInfo fn_info)
    fn_unf   = unfoldingInfo fn_info -- NB: loop-breakers never inline

        -- The UnfIfGoodArgs case seems important.  If we w/w small functions,
        -- binary sizes go up by 10%!  (This is with SplitObjs.)
        -- I'm not totally sure why.
        -- INLINABLE functions come via this path
        --    See Note [certainlyWillInline: INLINABLE]
    do_cunf size args src' tmpl'
      | arityInfo fn_info > 0  -- See Note [certainlyWillInline: be careful of thunks]
      , not (isDeadEndSig (dmdSigInfo fn_info))
              -- Do not unconditionally inline a bottoming function even if
              -- it seems smallish. We've carefully lifted it out to top level,
              -- so we don't want to re-inline it.
      , let unf_arity = length args
      , size - (10 * (unf_arity + 1)) <= unfoldingUseThreshold opts
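              -- E.g. (hypothetical numbers) with unf_arity = 2 and an
              -- unfolding-use-threshold of 80, unfoldings of size up to 110
              -- pass this test: 110 - 10*(2+1) = 80.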
      = Just (fn_unf { uf_src      = src'
                     , uf_tmpl     = tmpl'
                     , uf_guidance = UnfWhen { ug_arity     = unf_arity
                                             , ug_unsat_ok  = unSaturatedOk
                                             , ug_boring_ok = inlineBoringOk tmpl' } })
             -- Note the "unSaturatedOk". A function like  f = \a b. a
             -- will certainly inline, even if partially applied (f e), so we'd
             -- better make sure that the transformed inlining has the same property
      | otherwise
      = Nothing

{- Note [certainlyWillInline: be careful of thunks]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Don't claim that thunks will certainly inline, because that risks work
duplication.  Even if the work duplication is not great (e.g. is_cheap
holds), it can make a big difference in an inner loop.  In #5623 we
found that the WorkWrap phase thought that
       y = case x of F# v -> F# (v +# v)
was certainlyWillInline, so the addition got duplicated.

Note that we check arityInfo instead of the arity of the unfolding to detect
this case. This is so that we don't accidentally fail to inline small partial
applications, like `f = g 42` (where `g` recurses into `f`) where g has arity 2
(say). Here there is no risk of work duplication, and the RHS is tiny, so
certainlyWillInline should succeed. But `unf_arity` is zero! However f's
arity, gotten from `arityInfo fn_info`, is 1.

Failing to say that `f` will inline forces W/W to generate a potentially huge
worker for f that will immediately cancel with `g`'s wrapper anyway, causing
unnecessary churn in the Simplifier while arriving at the same result.

Note [certainlyWillInline: INLINABLE]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
even though we have a stable inlining, so that strictness w/w takes
place.  It makes a big difference to efficiency, and the w/w pass knows
how to transfer the INLINABLE info to the worker; see WorkWrap
Note [Worker/wrapper for INLINABLE functions]

Note [Thoughtful forcing in mkCoreUnfolding]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Core expressions retained in unfoldings are one of the biggest uses of memory when compiling
a program. Therefore we have to be careful about retaining copies of old or redundant
templates (see !6202 for a particularly bad case).

With that in mind we want to maintain the invariant that each unfolding only references
a single CoreExpr. One place where we have to be careful is in mkCoreUnfolding.

* The template of the unfolding is the result of performing occurrence analysis
  (Note [Occurrence analysis of unfoldings])
* Predicates are applied to the unanalysed expression

Therefore, if we are not thoughtful about forcing, you can end up in a situation where the
template is forced but not all the predicates are forced, so the unfolding will retain
both the old and the analysed expressions.

I investigated this using ghc-debug and it was clear this situation did often arise:

```
(["ghc:GHC.Core:Lam","ghc-prim:GHC.Types:True","THUNK_1_0","THUNK_1_0","THUNK_1_0"],Count 4307)
```

Here the predicates are unforced but the template is forced.

Therefore we basically had two options in order to fix this:

1. Perform the predicates on the analysed expression.
2. Force the predicates whenever we force the template, so that the old expression is no longer retained.

Option 1 is bad because occurrence analysis is expensive and destroys any sharing of the unfolding
with the actual program. (Testing this approach showed a peak memory usage of 25G.)

Therefore we go for Option 2, which performs a little more work but compensates by
reducing memory pressure.
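
In mkCoreUnfolding above, this is the purpose of the `seq` on the cache in
the 'uf_tmpl' field:

   uf_tmpl = cache `seq` occurAnalyseExpr expr

By the time the occurrence-analysed template exists, the predicates over the
un-analysed expression have already been evaluated, so nothing in the
unfolding retains the original expression.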

Fixing this led to a 1G reduction in peak memory usage (12G -> 11G) when
compiling a very large module (peak 3 million terms). For more discussion see #20905.
-}