summaryrefslogtreecommitdiff
path: root/compiler/GHC/Cmm/Pipeline.hs
blob: ff61a2a7a41d74fb6c8e38f30ee52e23e3a41982 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
{-# LANGUAGE BangPatterns #-}

module GHC.Cmm.Pipeline (
  -- | Converts C-- with an implicit stack and native C-- calls into
  -- optimized, CPS converted and native-call-less C--.  The latter
  -- C-- can be used to generate assembly.
  cmmPipeline
) where

import GHC.Prelude

import GHC.Cmm
import GHC.Cmm.Lint
import GHC.Cmm.Info.Build
import GHC.Cmm.CommonBlockElim
import GHC.Cmm.Switch.Implement
import GHC.Cmm.ProcPoint
import GHC.Cmm.ContFlowOpt
import GHC.Cmm.LayoutStack
import GHC.Cmm.Sink
import GHC.Cmm.Dataflow.Collections

import GHC.Types.Unique.Supply
import GHC.Driver.Session
import GHC.Driver.Backend
import GHC.Utils.Error
import GHC.Utils.Logger
import GHC.Driver.Env
import Control.Monad
import GHC.Utils.Outputable
import GHC.Platform
import Data.Either (partitionEithers)

-----------------------------------------------------------------------------
-- | Top level driver for C-- pipeline
-----------------------------------------------------------------------------

cmmPipeline
 :: HscEnv               -- ^ Compilation env; supplies the logger and the
                         -- dynamic flags (e.g. -dcmm-lint, -ddump-cmm-cps)
 -> ModuleSRTInfo        -- ^ Info about SRTs generated so far
 -> CmmGroup             -- ^ Input C-- with procedures
 -> IO (ModuleSRTInfo, CmmGroupSRTs)
                         -- ^ Updated SRT info and the CPS-transformed C--
cmmPipeline hsc_env srtInfo prog =
  withTimingSilent logger (text "Cmm pipeline") forceRes $ do
     -- Run the per-declaration pipeline over every top-level declaration.
     tops <- {-# SCC "tops" #-} mapM (cpsTop logger platform dflags) prog

     -- 'cpsTop' returns Left for procedures and Right for data.
     let (procs, data_) = partitionEithers tops
     result@(_, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags srtInfo procs data_
     dumpWith logger Opt_D_dump_cmm_cps "Post CPS Cmm" FormatCMM (pdoc platform cmms)

     return result
  where
    logger   = hsc_logger hsc_env
    dflags   = hsc_dflags hsc_env
    platform = targetPlatform dflags

    -- Handed to 'withTimingSilent' as the forcing function: evaluates the
    -- SRT info and each resulting declaration to weak head normal form.
    forceRes (info, decls) = info `seq` foldr seq () decls


-- | Run the Cmm-to-Cmm pipeline on a single top-level declaration.
--
-- Data declarations ('CmmData') are returned unchanged as @Right@, paired
-- with the result of 'cafAnalData' on their statics.  Procedures are run
-- through the passes below, in order, and returned as @Left@ together with
-- the 'CAFEnv' computed by 'cafAnal'.  The procedure result is a list: the
-- proc-point splitting path yields a list of declarations, while the
-- non-splitting path wraps the single procedure in a singleton list.
--
-- Each intermediate graph is dumped under its own dump flag (and linted
-- first if Cmm linting is enabled, see 'dumpGraph').
cpsTop :: Logger -> Platform -> DynFlags -> CmmDecl -> IO (Either (CAFEnv, [CmmDecl]) (CAFSet, CmmDecl))
cpsTop _logger platform _ p@(CmmData _ statics) = return (Right (cafAnalData platform statics, p))
cpsTop logger platform dflags proc =
    do
      ----------- Control-flow optimisations ----------------------------------

      -- The first round of control-flow optimisation speeds up the
      -- later passes by removing lots of empty blocks, so we do it
      -- even when optimisation isn't turned on.
      --
      -- The CmmData case was handled by the equation above, so the result
      -- is matched as a CmmProc here.  'h' is the proc's TopInfo (see the
      -- strict pattern below) and 'g' its graph; 'g' is rebound by each
      -- subsequent pass.
      CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
           return $ cmmCfgOptsProc splitting_proc_points proc
      dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g

      -- Strictly pull out the two stack-info fields used by stack layout
      -- further down.
      let !TopInfo {stack_info=StackInfo { arg_space = entry_off
                                         , do_layout = do_layout }} = h

      ----------- Eliminate common blocks -------------------------------------
      -- Only runs when Opt_CmmElimCommonBlocks is set (see condPass).
      g <- {-# SCC "elimCommonBlocks" #-}
           condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
                         Opt_D_dump_cmm_cbe "Post common block elimination"

      -- Any work storing block Labels must be performed _after_
      -- elimCommonBlocks

      ----------- Implement switches ------------------------------------------
      g <- {-# SCC "createSwitchPlans" #-}
           runUniqSMIO $ cmmImplementSwitchPlans (backend dflags) platform g
      dump Opt_D_dump_cmm_switch "Post switch plan" g

      ----------- Proc points -------------------------------------------------
      let
        call_pps :: ProcPointSet -- LabelMap
        call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
      -- When splitting, compute the minimal proc point set starting from
      -- the call proc points; otherwise the call proc points are used as-is.
      proc_points <-
         if splitting_proc_points
            then do
              pp <- {-# SCC "minimalProcPointSet" #-} runUniqSMIO $
                 minimalProcPointSet platform call_pps g
              dumpWith logger Opt_D_dump_cmm_proc "Proc points"
                    FormatCMM (pdoc platform l $$ ppr pp $$ pdoc platform g)
              return pp
            else
              return call_pps

      ----------- Layout the stack and manifest Sp ----------------------------
      -- If the proc's stack info says not to do layout, the graph is left
      -- as it is and the stack map is empty.
      (g, stackmaps) <-
           {-# SCC "layoutStack" #-}
           if do_layout
              then runUniqSMIO $ cmmLayoutStack dflags proc_points entry_off g
              else return (g, mapEmpty)
      dump Opt_D_dump_cmm_sp "Layout Stack" g

      ----------- Sink and inline assignments  --------------------------------
      -- Only runs when Opt_CmmSink is set (see condPass).
      g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
           condPass Opt_CmmSink (cmmSink platform) g
                    Opt_D_dump_cmm_sink "Sink assignments"

      ------------- CAF analysis ----------------------------------------------
      -- Note that this is given call_pps, not proc_points.
      let cafEnv = {-# SCC "cafAnal" #-} cafAnal platform call_pps l g
      dumpWith logger Opt_D_dump_cmm_caf "CAFEnv" FormatText (pdoc platform cafEnv)

      -- From here on 'g' is a list of declarations rather than a graph.
      g <- if splitting_proc_points
           then do
             ------------- Split into separate procedures -----------------------
             let pp_map = {-# SCC "procPointAnalysis" #-}
                          procPointAnalysis proc_points g
             dumpWith logger Opt_D_dump_cmm_procmap "procpoint map"
                FormatCMM (ppr pp_map)
             g <- {-# SCC "splitAtProcPoints" #-} runUniqSMIO $
                  splitAtProcPoints platform l call_pps proc_points pp_map
                                    (CmmProc h l v g)
             dumps Opt_D_dump_cmm_split "Post splitting" g
             return g
           else
             -- attach info tables to return points
             return $ [attachContInfoTables call_pps (CmmProc h l v g)]

      ------------- Populate info tables with stack info -----------------
      g <- {-# SCC "setInfoTableStackMap" #-}
           return $ map (setInfoTableStackMap platform stackmaps) g
      dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g

      ----------- Control-flow optimisations -----------------------------
      -- The second round is only run when optLevel is at least 1;
      -- unreachable-block removal below runs regardless.
      g <- {-# SCC "cmmCfgOpts(2)" #-}
           return $ if optLevel dflags >= 1
                    then map (cmmCfgOptsProc splitting_proc_points) g
                    else g
      g <- return (map removeUnreachableBlocksProc g)
           -- See Note [unreachable blocks]
      dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g

      return (Left (cafEnv, g))

  where -- Dump (and optionally lint) a single graph.
        dump = dumpGraph logger platform dflags

        -- Dump every declaration in a list under the same flag and title.
        dumps flag name
           = mapM_ (dumpWith logger flag name FormatCMM . pdoc platform)

        -- Apply a pure graph pass and dump its result, but only when the
        -- given general flag is enabled; otherwise return the graph as-is
        -- without dumping.
        condPass flag pass g dumpflag dumpname =
            if gopt flag dflags
               then do
                    g <- return $ pass g
                    dump dumpflag dumpname g
                    return g
               else return g

        -- we don't need to split proc points for the NCG, unless
        -- tablesNextToCode is off.  The latter is because we have no
        -- label to put on info tables for basic blocks that are not
        -- the entry point.
        splitting_proc_points = backend dflags /= NCG
                             || not (platformTablesNextToCode platform)
                             || -- Note [inconsistent-pic-reg]
                                usingInconsistentPicReg
        -- True exactly when targeting x86 on Darwin with
        -- position-independent code enabled.
        usingInconsistentPicReg
           = case (platformArch platform, platformOS platform, positionIndependent dflags)
             of   (ArchX86, OSDarwin, pic) -> pic
                  _                        -> False

-- Note [Sinking after stack layout]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- In the past we considered running sinking pass also before stack
-- layout, but after making some measurements we realized that:
--
--   a) running sinking only before stack layout produces slower
--      code than running sinking only after stack layout
--
--   b) running sinking both before and after stack layout produces
--      code that has the same performance as when running sinking
--      only after stack layout.
--
-- In other words, sinking before stack layout doesn't buy us anything.
--
-- An interesting question is "why is it better to run sinking after
-- stack layout"? It seems that the major reason is the stores and loads
-- generated by stack layout. Consider this code before stack layout:
--
--  c1E:
--      _c1C::P64 = R3;
--      _c1B::P64 = R2;
--      _c1A::P64 = R1;
--      I64[(young<c1D> + 8)] = c1D;
--      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
--  c1D:
--      R3 = _c1C::P64;
--      R2 = _c1B::P64;
--      R1 = _c1A::P64;
--      call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
--
-- Stack layout pass will save all local variables live across a call
-- (_c1C, _c1B and _c1A in this example) on the stack just before
-- making a call and reload them from the stack after returning from a
-- call:
--
--  c1E:
--      _c1C::P64 = R3;
--      _c1B::P64 = R2;
--      _c1A::P64 = R1;
--      I64[Sp - 32] = c1D;
--      P64[Sp - 24] = _c1A::P64;
--      P64[Sp - 16] = _c1B::P64;
--      P64[Sp - 8] = _c1C::P64;
--      Sp = Sp - 32;
--      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
--  c1D:
--      _c1A::P64 = P64[Sp + 8];
--      _c1B::P64 = P64[Sp + 16];
--      _c1C::P64 = P64[Sp + 24];
--      R3 = _c1C::P64;
--      R2 = _c1B::P64;
--      R1 = _c1A::P64;
--      Sp = Sp + 32;
--      call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
--
-- If we don't run sinking pass after stack layout we are basically
-- left with such code. However, running sinking on this code can lead
-- to significant improvements:
--
--  c1E:
--      I64[Sp - 32] = c1D;
--      P64[Sp - 24] = R1;
--      P64[Sp - 16] = R2;
--      P64[Sp - 8] = R3;
--      Sp = Sp - 32;
--      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
--  c1D:
--      R3 = P64[Sp + 24];
--      R2 = P64[Sp + 16];
--      R1 = P64[Sp + 8];
--      Sp = Sp + 32;
--      call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
--
-- Now we only have 9 assignments instead of 15.
--
-- There is one case when running sinking before stack layout could
-- be beneficial. Consider this:
--
--   L1:
--      x = y
--      call f() returns L2
--   L2: ...x...y...
--
-- Since both x and y are live across a call to f, they will be stored
-- on the stack during stack layout and restored after the call:
--
--   L1:
--      x = y
--      P64[Sp - 24] = L2
--      P64[Sp - 16] = x
--      P64[Sp - 8]  = y
--      Sp = Sp - 24
--      call f() returns L2
--   L2:
--      y = P64[Sp + 16]
--      x = P64[Sp + 8]
--      Sp = Sp + 24
--      ...x...y...
--
-- However, if we run sinking before stack layout we would propagate x
-- to its usage place (both x and y must be local registers for this to
-- be possible - global registers cannot be floated past a call):
--
--   L1:
--      x = y
--      call f() returns L2
--   L2: ...y...y...
--
-- Thus making x dead at the call to f(). If we ran stack layout now
-- we would generate fewer stores and loads:
--
--   L1:
--      x = y
--      P64[Sp - 16] = L2
--      P64[Sp - 8]  = y
--      Sp = Sp - 16
--      call f() returns L2
--   L2:
--      y = P64[Sp + 8]
--      Sp = Sp + 16
--      ...y...y...
--
-- But since we don't see any benefits from running sinking before stack
-- layout, this situation probably doesn't arise too often in practice.
--

{- Note [inconsistent-pic-reg]

On x86/Darwin, PIC is implemented by inserting a sequence like

    call 1f
 1: popl %reg

at the proc entry point, and then referring to labels as offsets from
%reg.  If we don't split proc points, then we could have many entry
points in a proc that would need this sequence, and each entry point
would then get a different value for %reg.  If there are any join
points, then at the join point we don't have a consistent value for
%reg, so we don't know how to refer to labels.

Hence, on x86/Darwin, we have to split proc points, and then each proc
point will get its own PIC initialisation sequence.

This isn't an issue on x86/ELF, where the sequence is

    call 1f
 1: popl %reg
    addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg

so %reg always has a consistent value: the address of
_GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.

-}

{- Note [unreachable blocks]

The control-flow optimiser sometimes leaves unreachable blocks behind
containing junk code.  These aren't necessarily a problem, but
removing them is good because it might save time in the native code
generator later.

-}

-- | Dump a Cmm graph under the given dump flag and title.
--
-- When 'Opt_DoCmmLinting' is enabled the graph is linted first; a lint
-- error is reported via 'fatalErrorMsg' and followed by @ghcExit logger 1@.
dumpGraph :: Logger -> Platform -> DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
dumpGraph logger platform dflags flag name g = do
  when (gopt Opt_DoCmmLinting dflags) lintOrDie
  dumpWith logger flag name FormatCMM (pdoc platform g)
 where
  -- 'cmmLintGraph' yields 'Just' an error document on failure and
  -- 'Nothing' on success, so iterating over the result only acts on errors.
  lintOrDie = forM_ (cmmLintGraph platform g) $ \err ->
    fatalErrorMsg logger err >> ghcExit logger 1

-- | Emit one Cmm pipeline dump in up to three ways:
--
--   1. via 'putDumpFileMaybe' under the stage's own flag;
--   2. if that flag is not set but 'Opt_D_dump_cmm_verbose' is, via
--      'logDumpFile' under the stage's own flag;
--   3. via 'putDumpFileMaybe' under 'Opt_D_dump_cmm_verbose_by_proc'.
dumpWith :: Logger -> DumpFlag -> String -> DumpFormat -> SDoc -> IO ()
dumpWith logger flag txt fmt sdoc = do
  putDumpFileMaybe logger flag txt fmt sdoc
  -- If `-ddump-cmm-verbose -ddump-to-file` is specified,
  -- dump each Cmm pipeline stage output to a separate file.  #16930
  when verboseOnly $
    logDumpFile logger (mkDumpStyle alwaysQualify) flag txt fmt sdoc
  putDumpFileMaybe logger Opt_D_dump_cmm_verbose_by_proc txt fmt sdoc
 where
  -- The stage's own flag is off, but the verbose Cmm dump flag is on.
  verboseOnly =
    not (logHasDumpFlag logger flag)
      && logHasDumpFlag logger Opt_D_dump_cmm_verbose