Diffstat (limited to 'compiler/GHC/Cmm/Sink.hs')
-rw-r--r--  compiler/GHC/Cmm/Sink.hs | 854
1 file changed, 854 insertions, 0 deletions
diff --git a/compiler/GHC/Cmm/Sink.hs b/compiler/GHC/Cmm/Sink.hs
new file mode 100644
index 0000000000..8e231df300
--- /dev/null
+++ b/compiler/GHC/Cmm/Sink.hs
@@ -0,0 +1,854 @@
+{-# LANGUAGE GADTs #-}
+module GHC.Cmm.Sink (
+ cmmSink
+ ) where
+
+import GhcPrelude
+
+import GHC.Cmm
+import GHC.Cmm.Opt
+import GHC.Cmm.Liveness
+import GHC.Cmm.Utils
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Graph
+import GHC.Platform.Regs
+import GHC.Platform (isARM, platformArch)
+
+import DynFlags
+import Unique
+import UniqFM
+
+import qualified Data.IntSet as IntSet
+import Data.List (partition)
+import qualified Data.Set as Set
+import Data.Maybe
+
+-- Compact sets for membership tests of local variables.
+
+type LRegSet = IntSet.IntSet
+
+emptyLRegSet :: LRegSet
+emptyLRegSet = IntSet.empty
+
+nullLRegSet :: LRegSet -> Bool
+nullLRegSet = IntSet.null
+
+insertLRegSet :: LocalReg -> LRegSet -> LRegSet
+insertLRegSet l = IntSet.insert (getKey (getUnique l))
+
+elemLRegSet :: LocalReg -> LRegSet -> Bool
+elemLRegSet l = IntSet.member (getKey (getUnique l))
+
+-- -----------------------------------------------------------------------------
+-- Sinking and inlining
+
+-- This is an optimisation pass that
+-- (a) moves assignments closer to their uses, to reduce register pressure
+-- (b) pushes assignments into a single branch of a conditional if possible
+-- (c) inlines assignments to registers that are mentioned only once
+-- (d) discards dead assignments
+--
+-- This tightens up lots of register-heavy code. It is particularly
+-- helpful in the Cmm generated by the Stg->Cmm code generator, in
+-- which every function starts with a copyIn sequence like:
+--
+-- x1 = R1
+-- x2 = Sp[8]
+-- x3 = Sp[16]
+-- if (Sp - 32 < SpLim) then L1 else L2
+--
+-- we really want to push the x1..x3 assignments into the L2 branch.
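+--
+-- i.e. ideally (a sketch of the desired result, not actual compiler output):
+--
+-- if (Sp - 32 < SpLim) then L1 else L2
+-- L2:
+-- x1 = R1
+-- x2 = Sp[8]
+-- x3 = Sp[16]
+-- ...
+--
+-- so that the three assignments only happen on the path that actually
+-- uses them.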
+--
+-- Algorithm:
+--
+-- * Start by doing liveness analysis.
+--
+-- * Keep a list of assignments A; earlier ones may refer to later ones.
+-- Currently we only sink assignments to local registers, because we don't
+-- have liveness information about global registers.
+--
+-- * Walk forwards through the graph, look at each node N:
+--
+-- * If it is a dead assignment, i.e. assignment to a register that is
+-- not used after N, discard it.
+--
+-- * Try to inline based on the current list of assignments
+-- * If an assignment in A (1) occurs only once in N, and (2) is
+-- not live after N, inline the assignment and remove it
+-- from A.
+--
+-- * If an assignment in A is cheap (RHS is local register), then
+-- inline the assignment and keep it in A in case it is used afterwards.
+--
+-- * Otherwise don't inline.
+--
+-- * If N is an assignment to a local register, pick up the assignment
+-- and add it to A.
+--
+-- * If N is not an assignment to a local register:
+-- * remove any assignments from A that conflict with N, and
+-- place them before N in the current block. We call this
+-- "dropping" the assignments.
+--
+-- * An assignment conflicts with N if it:
+-- - assigns to a register mentioned in N
+-- - mentions a register assigned by N
+-- - reads from memory written by N
+-- * do this recursively, dropping dependent assignments
+--
+-- * At an exit node:
+-- * drop any assignments that are live on more than one successor
+-- and are not trivial
+-- * if any successor has more than one predecessor (a join-point),
+-- drop everything live in that successor. Since we only propagate
+-- assignments that are not dead at the successor, we will therefore
+-- eliminate all assignments dead at this point. Thus analysis of a
+-- join-point will always begin with an empty list of assignments.
+--
+--
+-- As a result of the above algorithm, sinking deletes some dead assignments
+-- (transitively, even). This isn't as good as removeDeadAssignments,
+-- but it's much cheaper.
+
+-- -----------------------------------------------------------------------------
+-- things that we aren't optimising very well yet.
+--
+-- -----------
+-- (1) From GHC's FastString.hashStr:
+--
+-- s2ay:
+-- if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
+-- c2gn:
+-- R1 = _s2au::I64;
+-- call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
+-- c2gp:
+-- _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
+-- 4091);
+-- _s2an::I64 = _s2an::I64 + 1;
+-- _s2au::I64 = _s2cO::I64;
+-- goto s2ay;
+--
+-- a nice loop, but we didn't eliminate the silly assignment at the end.
+-- See Note [dependent assignments], which would probably fix this.
+-- This is #8336.
+--
+-- -----------
+-- (2) From stg_atomically_frame in PrimOps.cmm
+--
+-- We have a diamond control flow:
+--
+-- x = ...
+-- |
+-- / \
+-- A B
+-- \ /
+-- |
+-- use of x
+--
+-- Now x won't be sunk down to its use, because we won't push it into
+-- both branches of the conditional. We certainly do have to check
+-- that we can sink it past all the code in both A and B, but having
+-- discovered that, we could sink it to its use.
+--
+
+-- -----------------------------------------------------------------------------
+
+type Assignment = (LocalReg, CmmExpr, AbsMem)
+ -- Assignment caches AbsMem, an abstraction of the memory read by
+ -- the RHS of the assignment.
+
+type Assignments = [Assignment]
+ -- A sequence of assignments; kept in *reverse* order
+ -- So the list [ x=e1, y=e2 ] means the sequence of assignments
+ -- y = e2
+ -- x = e1
+
+cmmSink :: DynFlags -> CmmGraph -> CmmGraph
+cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
+ where
+ liveness = cmmLocalLiveness dflags graph
+ getLive l = mapFindWithDefault Set.empty l liveness
+
+ blocks = revPostorder graph
+
+ join_pts = findJoinPoints blocks
+
+ sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
+ sink _ [] = []
+ sink sunk (b:bs) =
+ -- pprTrace "sink" (ppr lbl) $
+ blockJoin first final_middle final_last : sink sunk' bs
+ where
+ lbl = entryLabel b
+ (first, middle, last) = blockSplit b
+
+ succs = successors last
+
+ -- Annotate the middle nodes with the registers live *after*
+ -- the node. This will help us decide whether we can inline
+ -- an assignment in the current node or not.
+ live = Set.unions (map getLive succs)
+ live_middle = gen_kill dflags last live
+ ann_middles = annotate dflags live_middle (blockToList middle)
+
+ -- Now sink and inline in this block
+ (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
+ fold_last = constantFoldNode dflags last
+ (final_last, assigs') = tryToInline dflags live fold_last assigs
+
+ -- We cannot sink into join points (successors with more than
+ -- one predecessor), so identify the join points and the set
+ -- of registers live in them.
+ (joins, nonjoins) = partition (`mapMember` join_pts) succs
+ live_in_joins = Set.unions (map getLive joins)
+
+ -- We do not want to sink an assignment into multiple branches,
+ -- so identify the set of registers live in multiple successors.
+ -- This is made more complicated because when we sink an assignment
+ -- into one branch, this might change the set of registers that are
+ -- now live in multiple branches.
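+ -- For example (a sketch): if we sink x = y into one branch, then y
+ -- becomes live in that branch too; if y was already live in another
+ -- branch, y is now live in multiple branches and assignments to y
+ -- should no longer be sunk. The 'upd' function below accounts for
+ -- this by adding the registers of a sunk RHS to the relevant live set.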
+ init_live_sets = map getLive nonjoins
+ live_in_multi live_sets r =
+ case filter (Set.member r) live_sets of
+ (_one:_two:_) -> True
+ _ -> False
+
+ -- Now, drop any assignments that we will not sink any further.
+ (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'
+
+ drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
+ where
+ should_drop = conflicts dflags a final_last
+ || not (isTrivial dflags rhs) && live_in_multi live_sets r
+ || r `Set.member` live_in_joins
+
+ live_sets' | should_drop = live_sets
+ | otherwise = map upd live_sets
+
+ upd set | r `Set.member` set = set `Set.union` live_rhs
+ | otherwise = set
+
+ live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs
+
+ final_middle = foldl' blockSnoc middle' dropped_last
+
+ sunk' = mapUnion sunk $
+ mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
+ | l <- succs ]
+
+{- TODO: enable this later, when we have some good tests in place to
+ measure the effect and tune it.
+
+-- small: an expression we don't mind duplicating
+isSmall :: CmmExpr -> Bool
+isSmall (CmmReg (CmmLocal _)) = True --
+isSmall (CmmLit _) = True
+isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
+isSmall (CmmRegOff (CmmLocal _) _) = True
+isSmall _ = False
+-}
+
+--
+-- We allow duplication of trivial expressions: registers (both local and
+-- global) and literals.
+--
+isTrivial :: DynFlags -> CmmExpr -> Bool
+isTrivial _ (CmmReg (CmmLocal _)) = True
+isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
+ if isARM (platformArch (targetPlatform dflags))
+ then True -- CodeGen.Platform.ARM does not have globalRegMaybe
+ else isJust (globalRegMaybe (targetPlatform dflags) r)
+ -- GlobalRegs that are loads from BaseReg are not trivial
+isTrivial _ (CmmLit _) = True
+isTrivial _ _ = False
+
+--
+-- annotate each node with the set of registers live *after* the node
+--
+annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
+annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
+ where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)
+
+--
+-- Find the blocks that have more than one predecessor (join points)
+--
+findJoinPoints :: [CmmBlock] -> LabelMap Int
+findJoinPoints blocks = mapFilter (>1) succ_counts
+ where
+ all_succs = concatMap successors blocks
+
+ succ_counts :: LabelMap Int
+ succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
+
+--
+-- filter the list of assignments to remove any assignments that
+-- are not live in a continuation.
+--
+filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
+filterAssignments dflags live assigs = reverse (go assigs [])
+ where go [] kept = kept
+ go (a@(r,_,_):as) kept | needed = go as (a:kept)
+ | otherwise = go as kept
+ where
+ needed = r `Set.member` live
+ || any (conflicts dflags a) (map toNode kept)
+ -- Note that we must keep assignments that are
+ -- referred to by other assignments we have
+ -- already kept.
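+ -- For example (a sketch): with assigs = [ x = y + 1, y = R1 ]
+ -- (reverse order, so y = R1 comes first in the code) and only x
+ -- live in the continuation, we keep x = y + 1 because x is live,
+ -- and then we must also keep y = R1 because it conflicts with the
+ -- kept node x = y + 1: it defines a register (y) that the kept
+ -- assignment reads.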
+
+-- -----------------------------------------------------------------------------
+-- Walk through the nodes of a block, sinking and inlining assignments
+-- as we go.
+--
+-- On input we pass in:
+-- * the list of nodes in the block
+-- * a list of assignments that appeared *before* this block and
+-- that are being sunk.
+--
+-- On output we get:
+-- * a new block
+-- * a list of assignments that will be placed *after* that block.
+--
+
+walk :: DynFlags
+ -> [(LocalRegSet, CmmNode O O)] -- nodes of the block, annotated with
+ -- the set of registers live *after*
+ -- this node.
+
+ -> Assignments -- The current list of
+ -- assignments we are sinking.
+ -- Earlier assignments may refer
+ -- to later ones.
+
+ -> ( Block CmmNode O O -- The new block
+ , Assignments -- Assignments to sink further
+ )
+
+walk dflags nodes assigs = go nodes emptyBlock assigs
+ where
+ go [] block as = (block, as)
+ go ((live,node):ns) block as
+ | shouldDiscard node live = go ns block as
+ -- discard dead assignment
+ | Just a <- shouldSink dflags node2 = go ns block (a : as1)
+ | otherwise = go ns block' as'
+ where
+ node1 = constantFoldNode dflags node
+
+ (node2, as1) = tryToInline dflags live node1 as
+
+ (dropped, as') = dropAssignmentsSimple dflags
+ (\a -> conflicts dflags a node2) as1
+
+ block' = foldl' blockSnoc block dropped `blockSnoc` node2
+
+
+--
+-- Heuristic to decide whether to pick up and sink an assignment
+-- Currently we pick up all assignments to local registers. It might
+-- be profitable to sink assignments to global regs too, but the
+-- liveness analysis doesn't track those (yet) so we can't.
+--
+shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
+shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
+ where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
+shouldSink _ _other = Nothing
+
+--
+-- discard dead assignments. This doesn't do as good a job as
+-- removeDeadAssignments, because it would need multiple passes
+-- to get all the dead code, but it catches the common case of
+-- superfluous reloads from the stack that the stack allocator
+-- leaves behind.
+--
+-- Also we catch "r = r" here. You might think it would fall
+-- out of inlining, but the inliner will see that r is live
+-- after the instruction and choose not to inline r in the rhs.
+--
+shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
+shouldDiscard node live
+ = case node of
+ CmmAssign r (CmmReg r') | r == r' -> True
+ CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
+ _otherwise -> False
+
+
+toNode :: Assignment -> CmmNode O O
+toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs
+
+dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
+ -> ([CmmNode O O], Assignments)
+dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()
+
+dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
+ -> ([CmmNode O O], Assignments)
+dropAssignments dflags should_drop state assigs
+ = (dropped, reverse kept)
+ where
+ (dropped,kept) = go state assigs [] []
+
+ go _ [] dropped kept = (dropped, kept)
+ go state (assig : rest) dropped kept
+ | conflict = go state' rest (toNode assig : dropped) kept
+ | otherwise = go state' rest dropped (assig:kept)
+ where
+ (dropit, state') = should_drop assig state
+ conflict = dropit || any (conflicts dflags assig) dropped
+
+
+-- -----------------------------------------------------------------------------
+-- Try to inline assignments into a node.
+-- This also does constant folding for primpops, since
+-- inlining opens up opportunities for doing so.
+
+tryToInline
+ :: DynFlags
+ -> LocalRegSet -- set of registers live after this
+ -- node. We cannot inline anything
+ -- that is live after the node, unless
+ -- it is small enough to duplicate.
+ -> CmmNode O x -- The node to inline into
+ -> Assignments -- Assignments to inline
+ -> (
+ CmmNode O x -- New node
+ , Assignments -- Remaining assignments
+ )
+
+tryToInline dflags live node assigs = go usages node emptyLRegSet assigs
+ where
+ usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
+ usages = foldLocalRegsUsed dflags addUsage emptyUFM node
+
+ go _usages node _skipped [] = (node, [])
+
+ go usages node skipped (a@(l,rhs,_) : rest)
+ | cannot_inline = dont_inline
+ | occurs_none = discard -- Note [discard during inlining]
+ | occurs_once = inline_and_discard
+ | isTrivial dflags rhs = inline_and_keep
+ | otherwise = dont_inline
+ where
+ inline_and_discard = go usages' inl_node skipped rest
+ where usages' = foldLocalRegsUsed dflags addUsage usages rhs
+
+ discard = go usages node skipped rest
+
+ dont_inline = keep node -- don't inline the assignment, keep it
+ inline_and_keep = keep inl_node -- inline the assignment, keep it
+
+ keep node' = (final_node, a : rest')
+ where (final_node, rest') = go usages' node' (insertLRegSet l skipped) rest
+ usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
+ usages rhs
+ -- we must not inline anything that is mentioned in the RHS
+ -- of a binding that we have already skipped, so we set the
+ -- usages of the regs on the RHS to 2.
+
+ cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
+ || l `elemLRegSet` skipped
+ || not (okToInline dflags rhs node)
+
+ l_usages = lookupUFM usages l
+ l_live = l `elemRegSet` live
+
+ occurs_once = not l_live && l_usages == Just 1
+ occurs_none = not l_live && l_usages == Nothing
+
+ inl_node = improveConditional (mapExpDeep inl_exp node)
+
+ inl_exp :: CmmExpr -> CmmExpr
+ -- inl_exp is where the inlining actually takes place!
+ inl_exp (CmmReg (CmmLocal l')) | l == l' = rhs
+ inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
+ = cmmOffset dflags rhs off
+ -- re-constant fold after inlining
+ inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
+ inl_exp other = other
+
+
+{- Note [improveConditional]
+
+cmmMachOpFold tries to simplify conditionals to turn things like
+ (a == b) != 1
+into
+ (a != b)
+but there's one case it can't handle: when the comparison is over
+floating-point values, we can't invert it, because floating-point
+comparisons aren't invertible (because of NaNs).
+
+But we *can* optimise this conditional by swapping the true and false
+branches. Given
+ CmmCondBranch ((a >## b) != 1) t f
+we can turn it into
+ CmmCondBranch (a >## b) f t
+
+So here we catch conditionals that weren't optimised by cmmMachOpFold,
+and apply the above transformation to eliminate the comparison against 1.
+
+It's tempting to just turn every != into == and then let cmmMachOpFold
+do its thing, but that risks changing a nice fall-through conditional
+into one that requires two jumps (see swapcond_last in
+GHC.Cmm.ContFlowOpt), so instead we carefully look for just the cases
+where we can eliminate a comparison.
+-}
+improveConditional :: CmmNode O x -> CmmNode O x
+improveConditional
+ (CmmCondBranch (CmmMachOp mop [x, CmmLit (CmmInt 1 _)]) t f l)
+ | neLike mop, isComparisonExpr x
+ = CmmCondBranch x f t (fmap not l)
+ where
+ neLike (MO_Ne _) = True
+ neLike (MO_U_Lt _) = True -- (x<y) < 1 behaves like (x<y) != 1
+ neLike (MO_S_Lt _) = True -- (x<y) < 1 behaves like (x<y) != 1
+ neLike _ = False
+improveConditional other = other
+
+-- Note [dependent assignments]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- If our assignment list looks like
+--
+-- [ y = e, x = ... y ... ]
+--
+-- We cannot inline x. Remember this list is really in reverse order,
+-- so it means x = ... y ...; y = e
+--
+-- Hence if we inline x, the outer assignment to y will capture the
+-- reference in x's right hand side.
+--
+-- In this case we should rename the y in x's right-hand side,
+-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
+-- Now we can go ahead and inline x.
+--
+-- For now we do nothing, because this would require putting
+-- everything inside UniqSM.
+--
+-- One more variant of this (#7366):
+--
+-- [ y = e, y = z ]
+--
+-- If we don't want to inline y = e, because y is used many times, we
+-- might still be tempted to inline y = z (because we always inline
+-- trivial rhs's). But of course we can't, because y is equal to e,
+-- not z.
+
+-- Note [discard during inlining]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-- Opportunities to discard assignments sometimes appear after we've
+-- done some inlining. Here's an example:
+--
+-- x = R1;
+-- y = P64[x + 7];
+-- z = P64[x + 15];
+-- /* z is dead */
+-- R1 = y & (-8);
+--
+-- The x assignment is trivial, so we inline it in the RHS of y, and
+-- keep both x and y. z gets dropped because it is dead, then we
+-- inline y, and we have a dead assignment to x. If we don't notice
+-- that x is dead in tryToInline, we end up retaining it.
+
+addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
+addUsage m r = addToUFM_C (+) m r 1
+
+regsUsedIn :: LRegSet -> CmmExpr -> Bool
+regsUsedIn ls _ | nullLRegSet ls = False
+regsUsedIn ls e = wrapRecExpf f e False
+ where f (CmmReg (CmmLocal l)) _ | l `elemLRegSet` ls = True
+ f (CmmRegOff (CmmLocal l) _) _ | l `elemLRegSet` ls = True
+ f _ z = z
+
+-- we don't inline into CmmUnsafeForeignCall if the expression refers
+-- to global registers. This is a HACK to avoid global registers
+-- clashing with C argument-passing registers, really the back-end
+-- ought to be able to handle it properly, but currently neither PprC
+-- nor the NCG can do it. See Note [Register parameter passing]
+-- See also GHC.StgToCmm.Foreign.load_args_into_temps.
+okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
+okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
+ not (globalRegistersConflict dflags expr node)
+okToInline _ _ _ = True
+
+-- -----------------------------------------------------------------------------
+
+-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
+-- @r = e@ can be safely commuted past statement @node@.
+conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
+conflicts dflags (r, rhs, addr) node
+
+ -- (1) node defines registers used by rhs of assignment. This catches
+ -- assignments and all three kinds of calls. See Note [Sinking and calls]
+ | globalRegistersConflict dflags rhs node = True
+ | localRegistersConflict dflags rhs node = True
+
+ -- (2) node uses register defined by assignment
+ | foldRegsUsed dflags (\b r' -> r == r' || b) False node = True
+
+ -- (3) a store to an address conflicts with a read of the same memory
+ | CmmStore addr' e <- node
+ , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True
+
+ -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
+ | HeapMem <- addr, CmmAssign (CmmGlobal Hp) _ <- node = True
+ | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
+ | SpMem{} <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
+
+ -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
+ | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem = True
+
+ -- (6) native calls clobber any memory
+ | CmmCall{} <- node, memConflicts addr AnyMem = True
+
+ -- (7) otherwise, no conflict
+ | otherwise = False
+
+-- Returns True if node defines any global registers that are used in the
+-- Cmm expression
+globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
+globalRegistersConflict dflags expr node =
+ foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
+ False node
+
+-- Returns True if node defines any local registers that are used in the
+-- Cmm expression
+localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
+localRegistersConflict dflags expr node =
+ foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr)
+ False node
+
+-- Note [Sinking and calls]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
+-- and unsafe foreign (CmmUnsafeForeignCall). We perform the sinking pass after
+-- stack layout (see Note [Sinking after stack layout]), which leads to two
+-- invariants related to calls:
+--
+-- a) during stack layout phase all safe foreign calls are turned into
+-- unsafe foreign calls (see Note [Lower safe foreign calls]). This
+-- means that we will never encounter CmmForeignCall node when running
+-- sinking after stack layout
+--
+-- b) stack layout saves all variables live across a call on the stack
+-- just before making a call (remember we are not sinking assignments to
+-- stack):
+--
+-- L1:
+-- x = R1
+-- P64[Sp - 16] = L2
+-- P64[Sp - 8] = x
+-- Sp = Sp - 16
+-- call f() returns L2
+-- L2:
+--
+-- We will attempt to sink { x = R1 } but we will detect conflict with
+-- { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
+-- checking whether it conflicts with { call f() }. In this way we will
+-- never need to check any assignment conflicts with CmmCall. Remember
+-- that we still need to check for potential memory conflicts.
+--
+-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
+-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
+-- This assumption holds only when we do sinking after stack layout. If we run
+-- it before stack layout we need to check for possible conflicts with all three
+-- kinds of calls. Our `conflicts` function does that by using the generic
+-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
+-- UserOfRegs typeclasses.
+--
+
+-- An abstraction of memory read or written.
+data AbsMem
+ = NoMem -- no memory accessed
+ | AnyMem -- arbitrary memory
+ | HeapMem -- definitely heap memory
+ | StackMem -- definitely stack memory
+ | SpMem -- <size>[Sp+n]
+ {-# UNPACK #-} !Int -- the offset n from Sp
+ {-# UNPACK #-} !Int -- the width of the access, in bytes
+
+-- Having SpMem is important because it lets us float loads from Sp
+-- past stores to Sp as long as they don't overlap, and this helps to
+-- unravel some long sequences of
+-- x1 = [Sp + 8]
+-- x2 = [Sp + 16]
+-- ...
+-- [Sp + 8] = xi
+-- [Sp + 16] = xj
+--
+-- Note that SpMem is invalidated if Sp is changed, but the definition
+-- of 'conflicts' above handles that.
+
+-- ToDo: this won't currently fix the following commonly occurring code:
+-- x1 = [R1 + 8]
+-- x2 = [R1 + 16]
+-- ..
+-- [Hp - 8] = x1
+-- [Hp - 16] = x2
+-- ..
+
+-- because [R1 + 8] and [Hp - 8] are both HeapMem. We know that
+-- assignments to [Hp + n] do not conflict with any other heap memory,
+-- but this is tricky to nail down. What if we had
+--
+-- x = Hp + n
+-- [x] = ...
+--
+-- the store to [x] should be "new heap", not "old heap".
+-- Furthermore, you could imagine that if we started inlining
+-- functions in Cmm then there might well be reads of heap memory
+-- that was written in the same basic block. To take advantage of
+-- non-aliasing of heap memory we will have to be more clever.
+
+-- Note [Foreign calls clobber heap]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- It is tempting to say that foreign calls clobber only
+-- non-heap/stack memory, but unfortunately we break this invariant in
+-- the RTS. For example, in stg_catch_retry_frame we call
+-- stmCommitNestedTransaction() which modifies the contents of the
+-- TRec it is passed (this actually caused incorrect code to be
+-- generated).
+--
+-- Since the invariant is true for the majority of foreign calls,
+-- perhaps we ought to have a special annotation for calls that can
+-- modify heap/stack memory. For now we just use the conservative
+-- definition here.
+--
+-- Some CallishMachOps imply a memory barrier, e.g. AtomicRMW, and
+-- therefore we should never float any memory operations across one of
+-- these calls.
+
+
+bothMems :: AbsMem -> AbsMem -> AbsMem
+bothMems NoMem x = x
+bothMems x NoMem = x
+bothMems HeapMem HeapMem = HeapMem
+bothMems StackMem StackMem = StackMem
+bothMems (SpMem o1 w1) (SpMem o2 w2)
+ | o1 == o2 = SpMem o1 (max w1 w2)
+ | otherwise = StackMem
+bothMems SpMem{} StackMem = StackMem
+bothMems StackMem SpMem{} = StackMem
+bothMems _ _ = AnyMem
+
+memConflicts :: AbsMem -> AbsMem -> Bool
+memConflicts NoMem _ = False
+memConflicts _ NoMem = False
+memConflicts HeapMem StackMem = False
+memConflicts StackMem HeapMem = False
+memConflicts SpMem{} HeapMem = False
+memConflicts HeapMem SpMem{} = False
+memConflicts (SpMem o1 w1) (SpMem o2 w2)
+ | o1 < o2 = o1 + w1 > o2
+ | otherwise = o2 + w2 > o1
+memConflicts _ _ = True
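+
+-- A worked example of the equations above: a load of 8 bytes from
+-- Sp+8 is SpMem 8 8, and a store of 8 bytes to Sp+16 is SpMem 16 8;
+-- they do not conflict, because 8 + 8 > 16 is False, so the load can
+-- be floated past the store. By contrast, SpMem 8 16 does conflict
+-- with SpMem 16 8, because 8 + 16 > 16.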
+
+exprMem :: DynFlags -> CmmExpr -> AbsMem
+exprMem dflags (CmmLoad addr w) = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
+exprMem dflags (CmmMachOp _ es) = foldr bothMems NoMem (map (exprMem dflags) es)
+exprMem _ _ = NoMem
+
+loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
+loadAddr dflags e w =
+ case e of
+ CmmReg r -> regAddr dflags r 0 w
+ CmmRegOff r i -> regAddr dflags r i w
+ _other | regUsedIn dflags spReg e -> StackMem
+ | otherwise -> AnyMem
+
+regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
+regAddr _ (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
+regAddr _ (CmmGlobal Hp) _ _ = HeapMem
+regAddr _ (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
+regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
+regAddr _ _ _ _ = AnyMem
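+
+-- For example, following the definitions above: a load from Sp+8 at
+-- width W64 yields SpMem 8 8 via regAddr; addresses based on Hp,
+-- CurrentTSO, or a register of GC-pointer type yield HeapMem; any
+-- other address that mentions Sp is conservatively StackMem, and
+-- everything else is AnyMem.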
+
+{-
+Note [Inline GlobalRegs?]
+
+Should we freely inline GlobalRegs?
+
+Actually it doesn't make a huge amount of difference either way, so we
+*do* currently treat GlobalRegs as "trivial" and inline them
+everywhere, but for what it's worth, here is what I discovered when I
+(SimonM) looked into this:
+
+Common sense says we should not inline GlobalRegs, because when we
+have
+
+ x = R1
+
+the register allocator will coalesce this assignment, generating no
+code, and simply record the fact that x is bound to $rbx (or
+whatever). Furthermore, if we were to sink this assignment, then the
+range of code over which R1 is live increases, and the range of code
+over which x is live decreases. All things being equal, it is better
+for x to be live than R1, because R1 is a fixed register whereas x can
+live in any register. So we should neither sink nor inline 'x = R1'.
+
+However, not inlining GlobalRegs can have surprising
+consequences. e.g. (cgrun020)
+
+ c3EN:
+ _s3DB::P64 = R1;
+ _c3ES::P64 = _s3DB::P64 & 7;
+ if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
+ c3EU:
+ _s3DD::P64 = P64[_s3DB::P64 + 6];
+ _s3DE::P64 = P64[_s3DB::P64 + 14];
+ I64[Sp - 8] = c3F0;
+ R1 = _s3DE::P64;
+ P64[Sp] = _s3DD::P64;
+
+inlining the GlobalReg gives:
+
+ c3EN:
+ if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
+ c3EU:
+ I64[Sp - 8] = c3F0;
+ _s3DD::P64 = P64[R1 + 6];
+ R1 = P64[R1 + 14];
+ P64[Sp] = _s3DD::P64;
+
+but if we don't inline the GlobalReg, instead we get:
+
+ _s3DB::P64 = R1;
+ if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
+ c3EU:
+ I64[Sp - 8] = c3F0;
+ R1 = P64[_s3DB::P64 + 14];
+ P64[Sp] = P64[_s3DB::P64 + 6];
+
+This looks better - we managed to inline _s3DD - but in fact it
+generates an extra reg-reg move:
+
+.Lc3EU:
+ movq $c3F0_info,-8(%rbp)
+ movq %rbx,%rax
+ movq 14(%rbx),%rbx
+ movq 6(%rax),%rax
+ movq %rax,(%rbp)
+
+because _s3DB is now live across the R1 assignment, we lost the
+benefit of coalescing.
+
+Who is at fault here? Perhaps if we knew that _s3DB was an alias for
+R1, then we would not sink a reference to _s3DB past the R1
+assignment. Or perhaps we *should* do that - we might gain by sinking
+it, despite losing the coalescing opportunity.
+
+Sometimes not inlining global registers wins by virtue of the rule
+about not inlining into arguments of a foreign call, e.g. (T7163) this
+is what happens when we inline F1:
+
+ _s3L2::F32 = F1;
+ _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
+ (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(_c3O3::F32);
+
+but if we don't inline F1:
+
+ (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
+ 10.0 :: W32));
+-}