1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
|
{-# LANGUAGE GADTs #-}
module CmmSink (
cmmSink
) where
import CodeGen.Platform (callerSaves)
import Cmm
import CmmOpt
import BlockId
import CmmLive
import CmmUtils
import Hoopl
import DynFlags
import UniqFM
import PprCmm ()
import Data.List (partition)
import qualified Data.Set as Set
-- -----------------------------------------------------------------------------
-- Sinking and inlining
-- This is an optimisation pass that
-- (a) moves assignments closer to their uses, to reduce register pressure
-- (b) pushes assignments into a single branch of a conditional if possible
-- (c) inlines assignments to registers that are mentioned only once
-- (d) discards dead assignments
--
-- This tightens up lots of register-heavy code. It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
-- x1 = R1
-- x2 = Sp[8]
-- x3 = Sp[16]
-- if (Sp - 32 < SpLim) then L1 else L2
--
-- we really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
-- * Start by doing liveness analysis.
--
-- * Keep a list of assignments A; earlier ones may refer to later ones
--
-- * Walk forwards through the graph, look at each node N:
-- * If any assignments in A (1) occur only once in N, and (2) are
-- not live after N, inline the assignment and remove it
-- from A.
-- * If N is an assignment:
-- * If the register is not live after N, discard it
-- * otherwise pick up the assignment and add it to A
-- * If N is a non-assignment node:
-- * remove any assignments from A that conflict with N, and
-- place them before N in the current block. (we call this
-- "dropping" the assignments).
-- * An assignment conflicts with N if it:
-- - assigns to a register mentioned in N
-- - mentions a register assigned by N
-- - reads from memory written by N
-- * do this recursively, dropping dependent assignments
-- * At a multi-way branch:
-- * drop any assignments that are live on more than one branch
-- * if any successor has more than one predecessor (a
-- join-point), drop everything live in that successor
--
-- As a side-effect we'll delete some dead assignments (transitively,
-- even). This isn't as good as removeDeadAssignments, but it's much
-- cheaper.
-- If we do this *before* stack layout, we might be able to avoid
-- saving some things across calls/procpoints.
--
-- *but*, that will invalidate the liveness analysis, and we'll have
-- to re-do it.
-- -----------------------------------------------------------------------------
-- things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
-- s2ay:
-- if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
-- c2gn:
-- R1 = _s2au::I64;
-- call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
-- c2gp:
-- _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
-- 4091);
-- _s2an::I64 = _s2an::I64 + 1;
-- _s2au::I64 = _s2cO::I64;
-- goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
-- x = ...
-- |
-- / \
-- A B
-- \ /
-- |
-- use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional. We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--
-- -----------------------------------------------------------------------------
type Assignment = (LocalReg, CmmExpr, AbsMem)
-- Assignment caches AbsMem, an abstraction of the memory read by
-- the RHS of the assignment.
cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
where
liveness = cmmLiveness graph
getLive l = mapFindWithDefault Set.empty l liveness
blocks = postorderDfs graph
join_pts = findJoinPoints blocks
sink :: BlockEnv [Assignment] -> [CmmBlock] -> [CmmBlock]
sink _ [] = []
sink sunk (b:bs) =
-- pprTrace "sink" (ppr lbl) $
blockJoin first final_middle final_last : sink sunk' bs
where
lbl = entryLabel b
(first, middle, last) = blockSplit b
succs = successors last
-- Annotate the middle nodes with the registers live *after*
-- the node. This will help us decide whether we can inline
-- an assignment in the current node or not.
live = Set.unions (map getLive succs)
live_middle = gen_kill last live
ann_middles = annotate live_middle (blockToList middle)
-- Now sink and inline in this block
(middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
fold_last = constantFold dflags last
(final_last, assigs') = tryToInline dflags live fold_last assigs
-- We cannot sink into join points (successors with more than
-- one predecessor), so identify the join points and the set
-- of registers live in them.
(joins, nonjoins) = partition (`mapMember` join_pts) succs
live_in_joins = Set.unions (map getLive joins)
-- We do not want to sink an assignment into multiple branches,
-- so identify the set of registers live in multiple successors.
-- This is made more complicated because when we sink an assignment
-- into one branch, this might change the set of registers that are
-- now live in multiple branches.
init_live_sets = map getLive nonjoins
live_in_multi live_sets r =
case filter (Set.member r) live_sets of
(_one:_two:_) -> True
_ -> False
-- Now, drop any assignments that we will not sink any further.
(dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'
drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
where
should_drop = conflicts dflags a final_last
|| {- not (isSmall rhs) && -} live_in_multi live_sets r
|| r `Set.member` live_in_joins
live_sets' | should_drop = live_sets
| otherwise = map upd live_sets
upd set | r `Set.member` set = set `Set.union` live_rhs
| otherwise = set
live_rhs = foldRegsUsed extendRegSet emptyRegSet rhs
final_middle = foldl blockSnoc middle' dropped_last
sunk' = mapUnion sunk $
mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
| l <- succs ]
{- TODO: enable this later, when we have some good tests in place to
measure the effect and tune it.
-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True -- not globals, we want to coalesce them instead
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
isTrivial :: CmmExpr -> Bool
isTrivial (CmmReg (CmmLocal _)) = True
isTrivial (CmmLit _) = True
isTrivial _ = False
-}
--
-- annotate each node with the set of registers live *after* the node
--
annotate :: RegSet -> [CmmNode O O] -> [(RegSet, CmmNode O O)]
annotate live nodes = snd $ foldr ann (live,[]) nodes
where ann n (live,nodes) = (gen_kill n live, (live,n) : nodes)
--
-- Find the blocks that have multiple successors (join points)
--
findJoinPoints :: [CmmBlock] -> BlockEnv Int
findJoinPoints blocks = mapFilter (>1) succ_counts
where
all_succs = concatMap successors blocks
succ_counts :: BlockEnv Int
succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> RegSet -> [Assignment] -> [Assignment]
filterAssignments dflags live assigs = reverse (go assigs [])
where go [] kept = kept
go (a@(r,_,_):as) kept | needed = go as (a:kept)
| otherwise = go as kept
where
needed = r `Set.member` live
|| any (conflicts dflags a) (map toNode kept)
-- Note that we must keep assignments that are
-- referred to by other assignments we have
-- already kept.
-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
walk :: DynFlags
-> [(RegSet, CmmNode O O)] -- nodes of the block, annotated with
-- the set of registers live *after*
-- this node.
-> [Assignment] -- The current list of
-- assignments we are sinking.
-- Later assignments may refer
-- to earlier ones.
-> ( Block CmmNode O O -- The new block
, [Assignment] -- Assignments to sink further
)
walk dflags nodes assigs = go nodes emptyBlock assigs
where
go [] block as = (block, as)
go ((live,node):ns) block as
| shouldDiscard node live = go ns block as
| Just a <- shouldSink dflags node2 = go ns block (a : as1)
| otherwise = go ns block' as'
where
node1 = constantFold dflags node
(node2, as1) = tryToInline dflags live node1 as
(dropped, as') = dropAssignmentsSimple dflags
(\a -> conflicts dflags a node2) as1
block' = foldl blockSnoc block dropped `blockSnoc` node2
constantFold :: DynFlags -> CmmNode e x -> CmmNode e x
constantFold dflags node = mapExpDeep f node
where f (CmmMachOp op args) = cmmMachOpFold dflags op args
f (CmmRegOff r 0) = CmmReg r
f e = e
--
-- Heuristic to decide whether to pick up and sink an assignment
-- Currently we pick up all assignments to local registers. It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing
--
-- discard dead assignments. This doesn't do as good a job as
-- removeDeadAsssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here. You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> RegSet -> Bool
shouldDiscard node live
= case node of
CmmAssign r (CmmReg r') | r == r' -> True
CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
_otherwise -> False
toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs
dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> [Assignment]
-> ([CmmNode O O], [Assignment])
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()
dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> [Assignment]
-> ([CmmNode O O], [Assignment])
dropAssignments dflags should_drop state assigs
= (dropped, reverse kept)
where
(dropped,kept) = go state assigs [] []
go _ [] dropped kept = (dropped, kept)
go state (assig : rest) dropped kept
| conflict = go state' rest (toNode assig : dropped) kept
| otherwise = go state' rest dropped (assig:kept)
where
(dropit, state') = should_drop assig state
conflict = dropit || any (conflicts dflags assig) dropped
-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
tryToInline
:: DynFlags
-> RegSet -- set of registers live after this
-- node. We cannot inline anything
-- that is live after the node, unless
-- it is small enough to duplicate.
-> CmmNode O x -- The node to inline into
-> [Assignment] -- Assignments to inline
-> (
CmmNode O x -- New node
, [Assignment] -- Remaining assignments
)
tryToInline dflags live node assigs = go usages node [] assigs
where
usages :: UniqFM Int
usages = foldRegsUsed addUsage emptyUFM node
go _usages node _skipped [] = (node, [])
go usages node skipped (a@(l,rhs,_) : rest)
| can_inline = inline_and_discard
| False {- isTiny rhs -} = inline_and_keep
-- ^^ seems to make things slightly worse
where
inline_and_discard = go usages' node' skipped rest
inline_and_keep = (node'', a : rest')
where (node'',rest') = go usages' node' (l:skipped) rest
can_inline =
not (l `elemRegSet` live)
&& not (skipped `regsUsedIn` rhs) -- Note [dependent assignments]
&& okToInline dflags rhs node
&& lookupUFM usages l == Just 1
usages' = foldRegsUsed addUsage usages rhs
node' = mapExpDeep inline node
where inline (CmmReg (CmmLocal l')) | l == l' = rhs
inline (CmmRegOff (CmmLocal l') off) | l == l'
= cmmOffset dflags rhs off
-- re-constant fold after inlining
inline (CmmMachOp op args) = cmmMachOpFold dflags op args
inline other = other
go usages node skipped (assig@(l,rhs,_) : rest)
= (node', assig : rest')
where (node', rest') = go usages' node (l:skipped) rest
usages' = foldRegsUsed (\m r -> addToUFM m r 2) usages rhs
-- we must not inline anything that is mentioned in the RHS
-- of a binding that we have already skipped, so we set the
-- usages of the regs on the RHS to 2.
-- Note [dependent assignments]
--
-- If our assignment list looks like
--
-- [ y = e, x = ... y ... ]
--
-- We cannot inline x. Remember this list is really in reverse order,
-- so it means x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage m r = addToUFM_C (+) m r 1
regsUsedIn :: [LocalReg] -> CmmExpr -> Bool
regsUsedIn [] _ = False
regsUsedIn ls e = wrapRecExpf f e False
where f (CmmReg (CmmLocal l)) _ | l `elem` ls = True
f (CmmRegOff (CmmLocal l) _) _ | l `elem` ls = True
f _ z = z
-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers. This is a HACK to avoid global registers
-- clashing with C argument-passing registers, really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it. See Note [Register parameter passing]
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr CmmUnsafeForeignCall{} = not (anyCallerSavesRegs dflags expr)
okToInline _ _ _ = True
-- -----------------------------------------------------------------------------
-- | @conflicts (r,e) stmt@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past @stmt@.
--
-- We only sink "r = G" assignments right now, so conflicts is very simple:
--
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node
-- (1) an assignment to a register conflicts with a use of the register
| CmmAssign reg _ <- node, reg `regUsedIn` rhs = True
| foldRegsUsed (\b r' -> r == r' || b) False node = True
-- (2) a store to an address conflicts with a read of the same memory
| CmmStore addr' e <- node
, memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True
-- (3) an assignment to Hp/Sp conflicts with a heap/stack read respectively
| HeapMem <- addr, CmmAssign (CmmGlobal Hp) _ <- node = True
| StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
| SpMem{} <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
-- (4) assignments that read caller-saves GlobalRegs conflict with a
-- foreign call. See Note [foreign calls clobber GlobalRegs].
| CmmUnsafeForeignCall{} <- node, anyCallerSavesRegs dflags rhs = True
-- (5) foreign calls clobber memory, but not heap/stack memory
| CmmUnsafeForeignCall{} <- node, AnyMem <- addr = True
-- (6) native calls clobber any memory
| CmmCall{} <- node, memConflicts addr AnyMem = True
-- (7) otherwise, no conflict
| otherwise = False
anyCallerSavesRegs :: DynFlags -> CmmExpr -> Bool
anyCallerSavesRegs dflags e = wrapRecExpf f e False
where f (CmmReg (CmmGlobal r)) _
| callerSaves (targetPlatform dflags) r = True
f _ z = z
-- An abstraction of memory read or written.
data AbsMem
= NoMem -- no memory accessed
| AnyMem -- arbitrary memory
| HeapMem -- definitely heap memory
| StackMem -- definitely stack memory
| SpMem -- <size>[Sp+n]
{-# UNPACK #-} !Int
{-# UNPACK #-} !Int
-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
-- x1 = [Sp + 8]
-- x2 = [Sp + 16]
-- ...
-- [Sp + 8] = xi
-- [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.
-- ToDo: this won't currently fix the following commonly occurring code:
-- x1 = [R1 + 8]
-- x2 = [R1 + 16]
-- ..
-- [Hp - 8] = x1
-- [Hp - 16] = x2
-- ..
-- because [R1 + 8] and [Hp - 8] are both HeapMem. We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down. What if we had
--
-- x = Hp + n
-- [x] = ...
--
-- the store to [x] should be "new heap", not "old heap".
-- Furthermore, you could imagine that if we started inlining
-- functions in Cmm then there might well be reads of heap memory
-- that was written in the same basic block. To take advantage of
-- non-aliasing of heap memory we will have to be more clever.
bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem x = x
bothMems x NoMem = x
bothMems HeapMem HeapMem = HeapMem
bothMems StackMem StackMem = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
| o1 == o2 = SpMem o1 (max w1 w2)
| otherwise = StackMem
bothMems SpMem{} StackMem = StackMem
bothMems StackMem SpMem{} = StackMem
bothMems _ _ = AnyMem
memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem _ = False
memConflicts _ NoMem = False
memConflicts HeapMem StackMem = False
memConflicts StackMem HeapMem = False
memConflicts SpMem{} HeapMem = False
memConflicts HeapMem SpMem{} = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
| o1 < o2 = o1 + w1 > o2
| otherwise = o2 + w2 > o1
memConflicts _ _ = True
exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w) = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es) = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _ _ = NoMem
loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
case e of
CmmReg r -> regAddr dflags r 0 w
CmmRegOff r i -> regAddr dflags r i w
_other | CmmGlobal Sp `regUsedIn` e -> StackMem
| otherwise -> AnyMem
regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _ (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _ (CmmGlobal Hp) _ _ = HeapMem
regAddr _ (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _ _ _ _ = AnyMem
|