summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--compiler/GHC/Cmm/Alias.hs374
-rw-r--r--compiler/GHC/Cmm/Config.hs1
-rw-r--r--compiler/GHC/Cmm/Dataflow.hs84
-rw-r--r--compiler/GHC/Cmm/Expr.hs32
-rw-r--r--compiler/GHC/Cmm/Node.hs20
-rw-r--r--compiler/GHC/Cmm/Pipeline.hs3
-rw-r--r--compiler/GHC/Cmm/Sink.hs278
-rw-r--r--compiler/GHC/Driver/Config/Cmm.hs1
-rw-r--r--compiler/GHC/Driver/Flags.hs4
-rw-r--r--compiler/GHC/Driver/Session.hs2
-rw-r--r--compiler/ghc.cabal.in1
-rw-r--r--docs/users_guide/using-optimisation.rst11
-rw-r--r--testsuite/tests/cmm/should_compile/all.T4
-rw-r--r--testsuite/tests/cmm/should_compile/cmm_hp_aliases.cmm12
-rw-r--r--testsuite/tests/cmm/should_compile/cmm_hp_aliases.stderr16
-rw-r--r--testsuite/tests/cmm/should_compile/cmm_sink_hp.cmm22
-rw-r--r--testsuite/tests/cmm/should_compile/cmm_sink_hp.stderr29
17 files changed, 710 insertions, 184 deletions
diff --git a/compiler/GHC/Cmm/Alias.hs b/compiler/GHC/Cmm/Alias.hs
new file mode 100644
index 0000000000..c89f50be86
--- /dev/null
+++ b/compiler/GHC/Cmm/Alias.hs
@@ -0,0 +1,374 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE GADTs #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+
+module GHC.Cmm.Alias
+ ( AbsMem(..)
+ , bothHeaps, heapsConflict, bothMems
+ , memConflicts --, exprMem, loadAddr, storeAddr
+
+ , exprMem, loadAddr, storeAddr
+
+ , cmmHpAlias, node_exit_hps, HpSet(..), regAliasesHp
+ , sizeHpSet
+
+ )
+where
+
+import GHC.Prelude as Prelude
+
+import GHC.Platform
+import GHC.Cmm
+import GHC.Cmm.Ppr.Expr () -- For Outputable instances
+import GHC.Cmm.Ppr () -- For Outputable instances
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow
+import GHC.Cmm.Dataflow.Label
+
+import GHC.Utils.Outputable
+
+import Data.Set as Set
+import qualified Data.Semigroup
+import GHC.Cmm.Utils (regUsedIn)
+-- import GHC.Utils.Trace (pprTrace)
+
+-----------------------------------------------------------------------------
+-- Abstracting over memory access
+-----------------------------------------------------------------------------
+
+-- An abstraction of memory read or written.
+data AbsMem
+ = NoMem -- ^ no memory accessed
+ | AnyMem -- ^ arbitrary memory
+ | HeapMem !HeapType-- ^ heap memory
+ | StackMem -- ^ definitely stack memory
+ | SpMem -- ^ <size>[Sp+n] see Note [SpMem Aliasing]
+ {-# UNPACK #-} !Int
+ {-# UNPACK #-} !Int
+ deriving Show
+
+instance Outputable AbsMem where
+ ppr x = parens (text . show $ x)
+
+-- See Note [Heap Kinds]
+data HeapType = OldHeap | NewHeap | AnyHeap deriving (Show,Eq)
+
+bothHeaps :: HeapType -> HeapType -> HeapType
+bothHeaps h1 h2 | h1 == h2 = h1
+bothHeaps _ _ = AnyHeap
+
+heapsConflict :: HeapType -> HeapType -> Bool
+heapsConflict AnyHeap _ = True
+heapsConflict _ AnyHeap = True
+heapsConflict OldHeap OldHeap = True
+heapsConflict NewHeap NewHeap = False
+heapsConflict OldHeap NewHeap = False
+heapsConflict NewHeap OldHeap = False
+
+{- Note [Heap Kinds]
+~~~~~~~~~~~~~~~~~~~~
+Our goal is to allow sinking into assignments to Hp.
+That is for a sequence like:
+
+ c1 = [R1 + 8]
+ c2 = [R1 + 16]
+ [Hp-16] = c1
+ [Hp-8] = c2
+
+We want to inline the assignments to get:
+
+ [Hp-16] = [R1 + 8]
+ [Hp-8] = [R1 + 16]
+
+To achieve this we split heap memory references into three kinds.
+OldHeap, NewHeap, AnyHeap.
+
+AnyHeap is the conservative estimate of a reference where a write/read
+might conflight with any other write/read.
+
+OldHeap represents reads from memory where objects existing on entry to
+the current function are located.
+
+NewHeap represents the area of memory into which we allocate new objects.
+Since we only create *new* objects there it won't conflict with reading
+from already existing objects. And while we write to various Hp-relative
+memory locations by constructions none of these do conflict.
+
+* Reading from regular heap memory is defined to be OldHeap.
+* Writing to regular heap memory is defined to be AnyHeap.
+* Writing via HpReg is defined to be NewHeap. A write like this
+ always allocates a *new* object (by design) so it won't affect
+ reads from existing objects.
+* An expression depending on New+Old heap is treated as AnyHeap
+* Reading via HpReg (or an alias to it) is treated as AnyMem.
+
+New/OldHeap don't conflict. All other kinds of reference combinations do conflict.
+
+This means we can sink reads from `OldHeap` past writes to `NewHeap` (Hp)
+giving use better code as we can remove all the intermediate variables which
+sometimes used to get spilled to the C stack.
+
+This depends on Hp never being used to write to "old" heap. This
+isn't something our code generation ever does, so that is fine.
+
+Note [CmmCalls and Hp Aliasing]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Since we assume all foreign calls clopper heap/stack (see Note [Foreign calls clobber heap])
+we can relax the Hp aliasing slightly. In particular we will never sink memory accessing
+expressions across calls so using Hp and Hp-aliasing variables as arguments/targets for function
+calls is allowed.
+
+This is important if we have code like this:
+
+ // Allocate some closure
+ I64[Hp - 32] = x1_s5bz_info;
+ ...
+ hp_ptr::P64 = Hp - 32;
+ I64[Hp - n] = foo;
+ ...
+ if <cond> goto c6cS;
+ ...
+c6cS:
+ // Evaluate the thunk
+ call (I64[hp_ptr])(hp_ptr) returns to c6cQ, args: 8, res: 8, upd: 8;
+
+Is this code problematic for sink? Not really. While hp_ptr aliases to the
+same area of memory as Hp it's only used inside a call. And we currently
+never sink reads/writes across calls anyway.
+The end result being that using hp-aliasing variables as arguments/targets
+for function calls is fine.
+
+-----
+
+The other issue with calls are their results. Naturally a call might return a newly
+allocated heap object as result. But since we don't sink across calls we can assume
+any write to Hp after a call will write to different memory than where the call allocated
+the object. So even if technically the result can point to the nursery we will treat it
+as OldHeap after the call.
+
+--------------------------------------------------------------------------------
+
+Note [SpMem Aliasing]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Having SpMem is important because it lets us float loads from Sp
+past stores to Sp as long as they don't overlap, and this helps to
+unravel some long sequences of
+ x1 = [Sp + 8]
+ x2 = [Sp + 16]
+ ...
+ [Sp + 8] = xi
+ [Sp + 16] = xj
+
+Note that SpMem is invalidated if Sp is changed, but the definition
+of 'conflicts' above handles that.
+
+ToDo: this won't currently fix the following commonly occurring code:
+ x1 = [R1 + 8]
+ x2 = [R1 + 16]
+ ..
+ [Hp - 8] = x1
+ [Hp - 16] = x2
+ ..
+
+because [R1 + 8] and [Hp - 8] are both HeapMem. We know that
+assignments to [Hp + n] do not conflict with any other heap memory,
+but this is tricky to nail down. What if we had
+
+ x = Hp + n
+ [x] = ...
+
+ the store to [x] should be "new heap", not "old heap".
+ Furthermore, you could imagine that if we started inlining
+ functions in Cmm then there might well be reads of heap memory
+ that was written in the same basic block. To take advantage of
+ non-aliasing of heap memory we will have to be more clever.
+
+ NB: We now solved the last point. See Note [Heap Kinds].
+
+Note [Foreign calls clobber heap]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+It is tempting to say that foreign calls clobber only
+non-heap/stack memory, but unfortunately we break this invariant in
+the RTS. For example, in stg_catch_retry_frame we call
+stmCommitNestedTransaction() which modifies the contents of the
+TRec it is passed (this actually caused incorrect code to be
+generated).
+
+Since the invariant is true for the majority of foreign calls,
+perhaps we ought to have a special annotation for calls that can
+modify heap/stack memory. For now we just use the conservative
+definition here.
+
+Some CallishMachOp imply a memory barrier e.g. AtomicRMW and
+therefore we should never float any memory operations across one of
+these calls.
+
+`suspendThread` releases the capability used by the thread, hence we mustn't
+float accesses to heap, stack or virtual global registers stored in the
+capability (e.g. with unregisterised build, see #19237).
+
+-}
+
+bothMems :: AbsMem -> AbsMem -> AbsMem
+bothMems NoMem x = x
+bothMems x NoMem = x
+bothMems (HeapMem h1) (HeapMem h2) = HeapMem $! bothHeaps h1 h2
+bothMems StackMem StackMem = StackMem
+bothMems (SpMem o1 w1) (SpMem o2 w2)
+ | o1 == o2 = SpMem o1 (max w1 w2)
+ | otherwise = StackMem
+bothMems SpMem{} StackMem = StackMem
+bothMems StackMem SpMem{} = StackMem
+bothMems _ _ = AnyMem
+
+memConflicts :: AbsMem -> AbsMem -> Bool
+memConflicts NoMem _ = False
+memConflicts _ NoMem = False
+memConflicts HeapMem{} StackMem = False
+memConflicts StackMem HeapMem{} = False
+memConflicts SpMem{} HeapMem{} = False
+memConflicts HeapMem{} SpMem{} = False
+memConflicts (HeapMem h1) (HeapMem h2) = heapsConflict h1 h2
+memConflicts (SpMem o1 w1) (SpMem o2 w2)
+ | o1 < o2 = o1 + w1 > o2
+ | otherwise = o2 + w2 > o1
+memConflicts _ _ = True
+
+-----------------------------------------------------------------------------
+-- Abstracting over memory access - considering which registers might alias to Hp
+--
+-- Currently these will panic when trying to load values via Hp or Hp aliased
+-- expressions. If we ever allow use of Hp for memory reads then we need to return
+-- AnyHeap instead.
+-----------------------------------------------------------------------------
+
+exprMem :: Platform -> Maybe HpSet -> CmmExpr -> AbsMem
+exprMem platform hps (CmmLoad addr w _a) = bothMems (loadAddr platform hps addr (typeWidth w))
+ (exprMem platform hps addr)
+exprMem platform hps (CmmMachOp _ es) = let args = fmap (exprMem platform hps) es
+ in Prelude.foldr (\l r -> l `seq` r `seq` bothMems l r) NoMem args
+exprMem _ _ (CmmStackSlot {}) = AnyMem
+exprMem _ _ _ = NoMem
+
+-- We treat reading from Hp different than loading from Hp, hence the load/store distinction.
+-- See Note [Heap Kinds]
+loadAddr, storeAddr :: Platform -> Maybe HpSet -> CmmExpr -> Width -> AbsMem
+loadAddr p hps = refAddrHp p hps False
+storeAddr p hps = refAddrHp p hps True
+
+refAddrHp :: Platform -> Maybe HpSet -> Bool -> CmmExpr -> Width -> AbsMem
+refAddrHp platform hps is_store e w = -- pprTrace "refAddrHp" (ppr e) $
+ case e of
+ CmmReg r -> regAddrHp platform hps is_store r 0 w
+ CmmRegOff r i -> regAddrHp platform hps is_store r i w
+ _other | regUsedIn platform spReg e -> StackMem
+ | foldRegsUsed platform (\b r -> b || r `maybe_regAliasesHp` hps) False e -> trace_hp_mem (text "refAddrHp") (AnyMem)
+ | otherwise -> -- pprTrace "refAddrAny" (ppr e)
+ AnyMem
+
+regAddrHp :: Platform -> Maybe HpSet -> Bool -> CmmReg -> Int -> Width -> AbsMem
+regAddrHp _ _hps _store (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
+regAddrHp _ _hps is_store (CmmGlobal Hp) _ _
+ | is_store = HeapMem NewHeap
+ | otherwise = trace_hp_mem (text "HpStore") (HeapMem AnyHeap)
+regAddrHp _ _hps _store (CmmGlobal CurrentTSO) _ _ = HeapMem (AnyHeap) -- important for PrimOps
+regAddrHp platform hps is_store r _ _
+ | isGcPtrType (cmmRegType platform r)
+ = if is_store
+ then (HeapMem AnyHeap)
+ else if r `maybe_regAliasesHp` hps
+ then trace_hp_mem (text "Aliased HpRead") (HeapMem AnyHeap)
+ else (HeapMem OldHeap) -- yay! GCPtr pays for itself
+regAddrHp _ _hps _store _ _ _ = AnyMem
+
+trace_hp_mem :: SDoc -> a -> a
+trace_hp_mem _err x =
+ -- pprTrace "trace_hp_mem" err $
+ x
+
+-----------------------------------------------------------------------------
+-- Calculating what variables transitively depend on the value of Hp on block entry.
+-----------------------------------------------------------------------------
+
+-- | The variables aliased to HP on entry to a block
+data HpSet = HpSet { localSet :: !LocalRegSet, globalSet :: !GlobalRegSet }
+
+instance Outputable HpSet where
+ ppr (HpSet local global) = parens (text "HpSet" <+> text "local:" <+> ppr (regSetToList local) <+> ppr (regSetToList global))
+
+instance Semigroup HpSet where
+ (<>) = plusHpSet
+
+instance Monoid HpSet where
+ mempty = emptyHpSet
+
+sizeHpSet :: HpSet -> Int
+sizeHpSet (HpSet l g) = sizeRegSet l + sizeRegSet g
+
+plusHpSet :: HpSet -> HpSet -> HpSet
+plusHpSet (HpSet l1 g1) (HpSet l2 g2) = HpSet (plusRegSet l1 l2) (plusRegSet g1 g2) :: HpSet
+
+regAliasesHp :: CmmReg -> HpSet -> Bool
+regAliasesHp reg hp_set = go reg hp_set
+ where go (CmmLocal r) (HpSet l_set _g_set) = elemRegSet r l_set
+ go (CmmGlobal r) (HpSet _l_set g_set) = elemRegSet r g_set
+
+-- | If we have no information about aliasing we must assume everything can alias to Hp.
+maybe_regAliasesHp :: CmmReg -> Maybe HpSet -> Bool
+maybe_regAliasesHp _reg Nothing = True
+maybe_regAliasesHp reg (Just hps) = regAliasesHp reg hps
+
+
+emptyHpSet :: HpSet
+emptyHpSet = HpSet mempty mempty
+
+-- | The dataflow lattice
+hpLattice :: DataflowLattice (HpSet)
+hpLattice = DataflowLattice emptyHpSet add
+ where
+ add (OldFact old@(HpSet lold gold)) (NewFact (HpSet lnew gnew)) =
+ let !changed = (Set.size l_join + Set.size g_join > Set.size lold + Set.size gold)
+ join@(HpSet l_join g_join) = HpSet (Set.union lold lnew) (Set.union gold gnew)
+ in if changed then Changed join
+ else NotChanged old
+
+-- Given a set of registers aliasing to Hp compute the set of registers
+-- aliasing Hp after this node.
+node_exit_hps
+ :: ( OutputableP Platform (CmmNode e x)
+ )
+ => Platform -> (CmmNode e x) -> HpSet -> HpSet
+node_exit_hps platform node hp_set@(HpSet lset gset) =
+ let !result_aliases_hp =
+ case node of
+ -- See Note [CmmCalls and Hp Aliasing]
+ CmmCall{} -> False
+ CmmForeignCall{} -> False
+ -- Default (conservative) case. If the statement uses Hp assume it's result aliases Hp.
+ _default -> ( foldRegsUsed platform (\b r -> b || r == hpReg || aliasesHp r) False node)
+ where
+ aliasesHp r = r `regAliasesHp` hp_set
+
+ {-# INLINE update #-}
+ update :: forall r. (Ord r,Outputable r) => RegSet r -> r -> RegSet r
+ update s r = if result_aliases_hp
+ then -- pprTrace "Adding hp" (text "r:" <> ppr r <+> text "node:" <> pdoc platform node) $
+ extendRegSet s r
+ else deleteFromRegSet s r
+
+ g_hps = foldRegsDefd platform (\s reg -> update s reg) gset node :: GlobalRegSet
+ l_hps = foldRegsDefd platform (\s reg -> update s reg) lset node :: LocalRegSet
+
+ in (HpSet l_hps g_hps)
+
+-- | Compute hp aliasing registers at exit
+xferHp :: Platform -> TransferFun HpSet
+xferHp p = blockTransferFwd p hpLattice node_exit_hps
+
+-- | Compute a map from blocks a set of registers that alias to Hp on *entry* to that block.
+cmmHpAlias :: Platform -> CmmGraph -> LabelMap HpSet
+cmmHpAlias platform graph =
+ analyzeCmmFwd hpLattice (xferHp platform) graph mapEmpty
diff --git a/compiler/GHC/Cmm/Config.hs b/compiler/GHC/Cmm/Config.hs
index 415becd109..a3be9d163c 100644
--- a/compiler/GHC/Cmm/Config.hs
+++ b/compiler/GHC/Cmm/Config.hs
@@ -19,6 +19,7 @@ data CmmConfig = CmmConfig
, cmmDoLinting :: !Bool -- ^ Do Cmm Linting Optimization or not
, cmmOptElimCommonBlks :: !Bool -- ^ Eliminate common blocks or not
, cmmOptSink :: !Bool -- ^ Perform sink after stack layout or not
+ , cmmOptSinkAlias :: !Bool -- ^ Perform sink using more expensive aliasing analysis.
, cmmGenStackUnwindInstr :: !Bool -- ^ Generate stack unwinding instructions (for debugging)
, cmmExternalDynamicRefs :: !Bool -- ^ Generate code to link against dynamic libraries
, cmmDoCmmSwitchPlans :: !Bool -- ^ Should the Cmm pass replace Stg switch statements
diff --git a/compiler/GHC/Cmm/Dataflow.hs b/compiler/GHC/Cmm/Dataflow.hs
index edd91481e7..299a253e87 100644
--- a/compiler/GHC/Cmm/Dataflow.hs
+++ b/compiler/GHC/Cmm/Dataflow.hs
@@ -21,6 +21,7 @@ module GHC.Cmm.Dataflow
( C, O, Block
, lastNode, entryLabel
, foldNodesBwdOO
+ , foldNodesFwdOO
, foldRewriteNodesBwdOO
, DataflowLattice(..), OldFact(..), NewFact(..), JoinedFact(..)
, TransferFun, RewriteFun
@@ -31,6 +32,7 @@ module GHC.Cmm.Dataflow
, changedIf
, joinOutFacts
, joinFacts
+ , blockTransferFwd
)
where
@@ -49,6 +51,7 @@ import GHC.Cmm.Dataflow.Block
import GHC.Cmm.Dataflow.Graph
import GHC.Cmm.Dataflow.Collections
import GHC.Cmm.Dataflow.Label
+import GHC.Platform (Platform)
type family Fact (x :: Extensibility) f :: Type
type instance Fact C f = FactBase f
@@ -80,6 +83,7 @@ data DataflowLattice a = DataflowLattice
data Direction = Fwd | Bwd
+-- | Apply a block to the current fact base
type TransferFun f = CmmBlock -> FactBase f -> FactBase f
-- | `TransferFun` abstracted over `n` (the node type)
@@ -99,6 +103,8 @@ type RewriteFun f = CmmBlock -> FactBase f -> UniqSM (CmmBlock, FactBase f)
type RewriteFun' (n :: Extensibility -> Extensibility -> Type) f =
Block n C C -> FactBase f -> UniqSM (Block n C C, FactBase f)
+{-# INLINEABLE analyzeCmmBwd #-}
+{-# INLINEABLE analyzeCmmFwd #-}
analyzeCmmBwd, analyzeCmmFwd
:: (NonLocal node)
=> DataflowLattice f
@@ -109,6 +115,7 @@ analyzeCmmBwd, analyzeCmmFwd
analyzeCmmBwd = analyzeCmm Bwd
analyzeCmmFwd = analyzeCmm Fwd
+{-# INLINEABLE analyzeCmm #-}
analyzeCmm
:: (NonLocal node)
=> Direction
@@ -126,6 +133,7 @@ analyzeCmm dir lattice transfer cmmGraph initFact =
GMany NothingO bm NothingO -> bm
in fixpointAnalysis dir lattice transfer entry blockMap initFact
+{-# INLINEABLE fixpointAnalysis #-}
-- Fixpoint algorithm.
fixpointAnalysis
:: forall f node.
@@ -154,17 +162,17 @@ fixpointAnalysis direction lattice do_block entry blockmap = loop start
:: IntHeap -- Worklist, i.e., blocks to process
-> FactBase f -- Current result (increases monotonically)
-> FactBase f
- loop todo !fbase1 | Just (index, todo1) <- IntSet.minView todo =
+ loop todo !fbase_in | Just (index, todo1) <- IntSet.minView todo =
let block = block_arr ! index
- out_facts = {-# SCC "do_block" #-} do_block block fbase1
+ out_facts = {-# SCC "do_block" #-} do_block block fbase_in :: FactBase f
-- For each of the outgoing edges, we join it with the current
- -- information in fbase1 and (if something changed) we update it
+ -- information in fbase_in and (if something changed) we update it
-- and add the affected blocks to the worklist.
(todo2, fbase2) = {-# SCC "mapFoldWithKey" #-}
mapFoldlWithKey
- (updateFact join dep_blocks) (todo1, fbase1) out_facts
+ (updateFact join dep_blocks) (todo1, fbase_in) out_facts
in loop todo2 fbase2
- loop _ !fbase1 = fbase1
+ loop _ !fbase_in = fbase_in
rewriteCmmBwd
:: (NonLocal node)
@@ -222,7 +230,7 @@ fixpointRewrite dir lattice do_block entry blockmap = loop start blockmap
-> LabelMap (Block node C C) -- Rewritten blocks.
-> FactBase f -- Current facts.
-> UniqSM (LabelMap (Block node C C), FactBase f)
- loop todo !blocks1 !fbase1
+ loop todo !blocks1 !fbase_in
| Just (index, todo1) <- IntSet.minView todo = do
-- Note that we use the *original* block here. This is important.
-- We're optimistically rewriting blocks even before reaching the fixed
@@ -231,13 +239,13 @@ fixpointRewrite dir lattice do_block entry blockmap = loop start blockmap
-- into account the new facts).
let block = block_arr ! index
(new_block, out_facts) <- {-# SCC "do_block_rewrite" #-}
- do_block block fbase1
+ do_block block fbase_in
let blocks2 = mapInsert (entryLabel new_block) new_block blocks1
(todo2, fbase2) = {-# SCC "mapFoldWithKey_rewrite" #-}
mapFoldlWithKey
- (updateFact join dep_blocks) (todo1, fbase1) out_facts
+ (updateFact join dep_blocks) (todo1, fbase_in) out_facts
loop todo2 blocks2 fbase2
- loop _ !blocks1 !fbase1 = return (blocks1, fbase1)
+ loop _ !blocks1 !fbase_in = return (blocks1, fbase_in)
{-
@@ -340,13 +348,14 @@ mkDepBlocks Bwd blocks = go blocks 0 mapEmpty
-- | After some new facts have been generated by analysing a block, we
-- fold this function over them to generate (a) a list of block
-- indices to (re-)analyse, and (b) the new FactBase.
+{-# INLINEABLE updateFact #-}
updateFact
- :: JoinFun f
- -> LabelMap IntSet
- -> (IntHeap, FactBase f)
- -> Label
- -> f -- out fact
- -> (IntHeap, FactBase f)
+ :: JoinFun f -- ^ combinator
+ -> LabelMap IntSet -- ^ Block -> dependent blocks
+ -> (IntHeap, FactBase f) -- ^ (WorkList, InFacts)
+ -> Label -- ^ current block
+ -> f -- out fact -- ^ computed fact for current block
+ -> (IntHeap, FactBase f) -- ^ (updated worklist, updated FactBase)
updateFact fact_join dep_blocks (todo, fbase) lbl new_fact
= case lookupFact lbl fbase of
Nothing ->
@@ -357,8 +366,8 @@ updateFact fact_join dep_blocks (todo, fbase) lbl new_fact
(NotChanged _) -> (todo, fbase)
(Changed f) -> let !z = mapInsert lbl f fbase in (changed, z)
where
- changed = todo `IntSet.union`
- mapFindWithDefault IntSet.empty lbl dep_blocks
+ !changed = todo `IntSet.union`
+ mapFindWithDefault IntSet.empty lbl dep_blocks
{-
Note [No old fact]
@@ -374,13 +383,15 @@ out that always recording a change is faster.
----------------------------------------------------------------
-- Fact lookup: the fact `orelse` bottom
+{-# INLINEABLE getFact #-}
getFact :: DataflowLattice f -> Label -> FactBase f -> f
getFact lat l fb = case lookupFact l fb of Just f -> f
Nothing -> fact_bot lat
-- | Returns the result of joining the facts from all the successors of the
-- provided node or block.
-joinOutFacts :: (NonLocal n) => DataflowLattice f -> n e C -> FactBase f -> f
+{-# INLINEABLE joinOutFacts #-}
+joinOutFacts :: (NonLocal node) => DataflowLattice f -> node e C -> FactBase f -> f
joinOutFacts lattice nonLocal fact_base = foldl' join (fact_bot lattice) facts
where
join new old = getJoined $ fact_join lattice (OldFact old) (NewFact new)
@@ -391,12 +402,14 @@ joinOutFacts lattice nonLocal fact_base = foldl' join (fact_bot lattice) facts
, isJust fact
]
+{-# INLINEABLE joinFacts #-}
joinFacts :: DataflowLattice f -> [f] -> f
joinFacts lattice facts = foldl' join (fact_bot lattice) facts
where
join new old = getJoined $ fact_join lattice (OldFact old) (NewFact new)
-- | Returns the joined facts for each label.
+{-# INLINEABLE mkFactBase #-}
mkFactBase :: DataflowLattice f -> [(Label, f)] -> FactBase f
mkFactBase lattice = foldl' add mapEmpty
where
@@ -421,6 +434,41 @@ foldNodesBwdOO funOO = go
go BNil f = f
{-# INLINABLE foldNodesBwdOO #-}
+-- | Folds forwards over all nodes of an open-open block.
+-- Strict in the accumulator.
+foldNodesFwdOO :: (CmmNode O O -> f -> f) -> Block CmmNode O O -> f -> f
+foldNodesFwdOO funOO = go
+ where
+ go (BCat b1 b2) f = go b2 $! go b1 f
+ go (BSnoc h n) f = funOO n $! go h f
+ go (BCons n t) f = go t $! funOO n f
+ go (BMiddle n) f = funOO n f
+ go BNil f = f
+{-# INLINABLE foldNodesFwdOO #-}
+
+-- | Create a transfer function over blocks from a transfer function over nodes.
+--
+-- Useful if the facts are things that are true at *entry* to a block which we can
+-- determine by looking at predecessors.
+{-# INLINE blockTransferFwd #-} -- We *really* want this to specialize on value arguments.
+ -- So force inlining it.
+blockTransferFwd :: forall fact. Platform
+ -> (DataflowLattice fact)
+ -> (forall e x. Platform -> CmmNode e x -> fact -> fact)
+ -> TransferFun fact
+blockTransferFwd platform lattice node_transfer =
+ let transfer :: TransferFun fact
+ transfer (BlockCC eNode middle xNode) fBase =
+ let lbl = entryLabel eNode
+ block_info = getFact lattice lbl fBase
+ after_entry = (node_transfer platform) eNode block_info
+ after_middle = foldNodesFwdOO (node_transfer platform) middle after_entry
+ after_exit = node_transfer platform xNode after_middle
+ succs = successors xNode
+ in mapFromList $ map (\s -> (s,after_exit)) succs
+ in transfer
+
+
-- | Folds backward over all the nodes of an open-open block and allows
-- rewriting them. The accumulator is both the block of nodes and @f@ (usually
-- dataflow facts).
diff --git a/compiler/GHC/Cmm/Expr.hs b/compiler/GHC/Cmm/Expr.hs
index f910e65f04..3f910cd819 100644
--- a/compiler/GHC/Cmm/Expr.hs
+++ b/compiler/GHC/Cmm/Expr.hs
@@ -78,6 +78,10 @@ instance Eq CmmExpr where -- Equality ignores the types
CmmStackSlot a1 i1 == CmmStackSlot a2 i2 = a1==a2 && i1==i2
_e1 == _e2 = False
+-- Useful for debugging.
+instance Outputable CmmExpr where
+ ppr = text . show
+
data AlignmentSpec = NaturallyAligned | Unaligned
deriving (Eq, Ord, Show)
@@ -384,6 +388,34 @@ instance Ord r => UserOfRegs r r where
instance Ord r => DefinerOfRegs r r where
foldRegsDefd _ f z r = f z r
+-- | This is a instance of convenience.
+-- For performance sensitive code paths rather
+-- fold over local and global regs separately.
+instance UserOfRegs CmmReg GlobalReg where
+ {-# INLINEABLE foldRegsUsed #-}
+ foldRegsUsed _ f z reg = f z (CmmGlobal reg)
+
+-- | This is a instance of convenience.
+-- For performance sensitive code paths rather
+-- fold over local and global regs separately.
+instance DefinerOfRegs CmmReg GlobalReg where
+ foldRegsDefd _ f z reg = f z (CmmGlobal reg)
+
+-- | This is a instance of convenience.
+-- For performance sensitive code paths rather
+-- fold over local and global regs separately.
+instance UserOfRegs CmmReg LocalReg where
+ {-# INLINEABLE foldRegsUsed #-}
+ foldRegsUsed _ f z reg = f z (CmmLocal reg)
+
+-- | This is a instance of convenience.
+-- For performance sensitive code paths rather
+-- fold over local and global regs separately.
+instance DefinerOfRegs CmmReg LocalReg where
+ {-# INLINEABLE foldRegsDefd #-}
+ foldRegsDefd _ f z reg = f z (CmmLocal reg)
+
+
instance (Ord r, UserOfRegs r CmmReg) => UserOfRegs r CmmExpr where
-- The (Ord r) in the context is necessary here
-- See Note [Recursive superclasses] in GHC.Tc.TyCl.Instance
diff --git a/compiler/GHC/Cmm/Node.hs b/compiler/GHC/Cmm/Node.hs
index 841c726b14..020ba63dd2 100644
--- a/compiler/GHC/Cmm/Node.hs
+++ b/compiler/GHC/Cmm/Node.hs
@@ -46,6 +46,7 @@ import Data.List (tails,sortBy)
import GHC.Types.Unique (nonDetCmpUnique)
import GHC.Utils.Misc
+import GHC.Platform (Platform)
------------------------
-- CmmNode
@@ -348,6 +349,25 @@ instance UserOfRegs GlobalReg (CmmNode e x) where
=> (b -> GlobalReg -> b) -> b -> a -> b
fold f z n = foldRegsUsed platform f z n
+instance UserOfRegs CmmReg (CmmNode e x) where
+ {-# INLINE foldRegsUsed #-}
+ foldRegsUsed = foldRegsUsed_cmmReg_node
+
+{-# INLINEABLE foldRegsUsed_cmmReg_node #-}
+foldRegsUsed_cmmReg_node :: Platform -> (b->CmmReg->b) -> b -> CmmNode e x -> b
+foldRegsUsed_cmmReg_node platform f !z n = case n of
+ CmmAssign _ expr -> fold f z expr
+ CmmStore addr rval _align -> fold f (fold f z addr) rval
+ CmmUnsafeForeignCall t _ args -> fold f (fold f z t) args
+ CmmCondBranch expr _ _ _ -> fold f z expr
+ CmmSwitch expr _ -> fold f z expr
+ CmmCall {cml_target=tgt, cml_args_regs=args} -> fold f (fold f z args) tgt
+ CmmForeignCall {tgt=tgt, args=args} -> fold f (fold f z tgt) args
+ _ -> z
+ where fold :: forall a b. UserOfRegs CmmReg a
+ => (b -> CmmReg -> b) -> b -> a -> b
+ fold f z n = foldRegsUsed platform f z n
+
instance (Ord r, UserOfRegs r CmmReg) => UserOfRegs r ForeignTarget where
-- The (Ord r) in the context is necessary here
-- See Note [Recursive superclasses] in GHC.Tc.TyCl.Instance
diff --git a/compiler/GHC/Cmm/Pipeline.hs b/compiler/GHC/Cmm/Pipeline.hs
index 585606fcb2..39de30bb0d 100644
--- a/compiler/GHC/Cmm/Pipeline.hs
+++ b/compiler/GHC/Cmm/Pipeline.hs
@@ -114,8 +114,9 @@ cpsTop logger platform cfg proc =
dump Opt_D_dump_cmm_sp "Layout Stack" g
----------- Sink and inline assignments --------------------------------
+ let sink_aliasing = if cmmOptSinkAlias cfg then SinkWithAliasAnalysis else SinkWithoutAliasAnalysis
g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
- condPass (cmmOptSink cfg) (cmmSink platform) g
+ condPass (cmmOptSink cfg) (cmmSink platform sink_aliasing) g
Opt_D_dump_cmm_sink "Sink assignments"
------------- CAF analysis ----------------------------------------------
diff --git a/compiler/GHC/Cmm/Sink.hs b/compiler/GHC/Cmm/Sink.hs
index 84f9317f21..d2a2049f95 100644
--- a/compiler/GHC/Cmm/Sink.hs
+++ b/compiler/GHC/Cmm/Sink.hs
@@ -2,12 +2,14 @@
{-# LANGUAGE ScopedTypeVariables #-}
module GHC.Cmm.Sink (
- cmmSink
- ) where
+ cmmSink
+ , SinkAliasFlag(..)
+ ) where
import GHC.Prelude
import GHC.Cmm
+import GHC.Cmm.Alias
import GHC.Cmm.Opt
import GHC.Cmm.Liveness
import GHC.Cmm.LRegSet
@@ -22,11 +24,15 @@ import GHC.Platform
import GHC.Types.Unique.FM
import qualified Data.IntSet as IntSet
-import Data.List (partition)
+import Data.List (partition, mapAccumL)
import Data.Maybe
import GHC.Exts (inline)
+import GHC.Utils.Outputable
+import GHC.Utils.Misc (fstOf3, thdOf3)
+import GHC.Plugins (sndOf3)
+
-- -----------------------------------------------------------------------------
-- Sinking and inlining
@@ -49,7 +55,7 @@ import GHC.Exts (inline)
--
-- Algorithm:
--
--- * Start by doing liveness analysis.
+-- * Start by doing liveness and Hp-aliasing analysis.
--
-- * Keep a list of assignments A; earlier ones may refer to later ones.
-- Currently we only sink assignments to local registers, because we don't
@@ -151,15 +157,24 @@ type Assignments = [Assignment]
-- y = e2
-- x = e1
-cmmSink :: Platform -> CmmGraph -> CmmGraph
-cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
+-- | Should we be more aggresive when sinking by running a aliasing analysis.
+data SinkAliasFlag = SinkWithAliasAnalysis | SinkWithoutAliasAnalysis deriving Eq
+
+cmmSink :: Platform -> SinkAliasFlag -> CmmGraph -> CmmGraph
+cmmSink platform alias_flag graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
where
+ do_alias = alias_flag == SinkWithAliasAnalysis
liveness = cmmLocalLivenessL platform graph
+ hp_aliases = if do_alias
+ then Just $ cmmHpAlias platform graph
+ else Nothing
getLive l = mapFindWithDefault emptyLRegSet l liveness
+ getHps :: Label -> Maybe HpSet
+ getHps l = mapFindWithDefault mempty l <$> hp_aliases -- Things which alias to Hp on entry.
blocks = revPostorder graph
- join_pts = findJoinPoints blocks
+ join_pts = findJoinPoints blocks :: LabelMap Int -- Block -> Number of Predecessors
sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
sink _ [] = []
@@ -176,11 +191,12 @@ cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
-- the node. This will help us decide whether we can inline
-- an assignment in the current node or not.
live = IntSet.unions (map getLive succs)
+ hps = getHps lbl :: Maybe HpSet
live_middle = gen_killL platform last live
- ann_middles = annotate platform live_middle (blockToList middle)
+ ann_middle = annotate platform live_middle hps (blockToList middle) :: AnnotatedNodes
-- Now sink and inline in this block
- (middle', assigs) = walk platform ann_middles (mapFindWithDefault [] lbl sunk)
+ (middle', assigs) = walk platform ann_middle (mapFindWithDefault [] lbl sunk)
fold_last = constantFoldNode platform last
(final_last, assigs') = tryToInline platform live fold_last assigs
@@ -202,13 +218,13 @@ cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
_ -> False
-- Now, drop any assignments that we will not sink any further.
- (dropped_last, assigs'') = dropAssignments platform drop_if init_live_sets assigs'
+ (dropped_last, assigs'') = dropAssignments platform (finalHpsFromAnnotatedNodes ann_middle) drop_if init_live_sets assigs'
drop_if :: (LocalReg, CmmExpr, AbsMem)
-> [LRegSet] -> (Bool, [LRegSet])
drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
where
- should_drop = conflicts platform a final_last
+ should_drop = conflicts platform hps a final_last
|| not (isTrivial platform rhs) && live_in_multi live_sets r
|| r `elemLRegSet` live_in_joins
@@ -223,7 +239,7 @@ cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
final_middle = foldl' blockSnoc middle' dropped_last
sunk' = mapUnion sunk $
- mapFromList [ (l, filterAssignments platform (getLive l) assigs'')
+ mapFromList [ (l, filterAssignments platform (getHps l) (getLive l) assigs'')
| l <- succs ]
{- TODO: enable this later, when we have some good tests in place to
@@ -252,36 +268,52 @@ isTrivial platform (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
isTrivial _ (CmmLit _) = True
isTrivial _ _ = False
---
--- annotate each node with the set of registers live *after* the node
---
-annotate :: Platform -> LRegSet -> [CmmNode O O] -> [(LRegSet, CmmNode O O)]
-annotate platform live nodes = snd $ foldr ann (live,[]) nodes
+-- | We don't use HpAliasing at below -O2 by default so we only annotate nodes with aliases if needed.
+type AnnotatedNodes = Either LiveAnnotated LiveHpAliasAnnotated
+type LiveAnnotated = [(LRegSet, CmmNode O O)] -- ^ Annotated with registers live *after* entry.
+type LiveHpAliasAnnotated = (HpSet, [(LRegSet, HpSet,CmmNode O O)]) -- ^ Annotated with registers live/HpAliases *after* entry.
+
+finalHpsFromAnnotatedNodes :: Either a1 (HpSet, b) -> Maybe HpSet
+finalHpsFromAnnotatedNodes (Left _) = Nothing
+finalHpsFromAnnotatedNodes (Right (final_hps,_)) = Just final_hps
+
+
+annotate :: Platform -> LRegSet -> Maybe HpSet -> [CmmNode O O] -> AnnotatedNodes
+annotate platform live hp_set nodes = annotateHps platform hp_set $ annotateLive platform live nodes
+
+-- | annotate each node with the set of registers live *after* the node
+annotateLive :: Platform -> LRegSet -> [CmmNode O O] -> LiveAnnotated
+annotateLive platform live nodes = snd $ foldr ann (live,[]) nodes
where ann n (live,nodes) = (gen_killL platform n live, (live,n) : nodes)
+annotateHps :: Platform -> Maybe HpSet -> [(LRegSet, CmmNode O O)] -> AnnotatedNodes
+annotateHps _platform Nothing nodes = Left nodes
+annotateHps platform (Just hp_set) nodes = Right $ mapAccumL ann hp_set nodes
+ where ann hps (live,n) = (node_exit_hps platform n hps, (live,hps,n))
+
--
-- Find the blocks that have multiple successors (join points)
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
where
- all_succs = concatMap successors blocks
+ all_succs = concatMap successors blocks :: [Label]
succ_counts :: LabelMap Int
- succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
+ succ_counts = foldl' (\m l -> mapInsertWith (+) l 1 m) mapEmpty all_succs
--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
-filterAssignments :: Platform -> LRegSet -> Assignments -> Assignments
-filterAssignments platform live assigs = reverse (go assigs [])
+filterAssignments :: Platform -> Maybe HpSet -> LRegSet -> Assignments -> Assignments
+filterAssignments platform hps live assigs = reverse (go assigs [])
where go [] kept = kept
go (a@(r,_,_):as) kept | needed = go as (a:kept)
| otherwise = go as kept
where
needed = r `elemLRegSet` live
- || any (conflicts platform a) (map toNode kept)
+ || any (conflicts platform hps a) (map toNode kept)
-- Note that we must keep assignments that are
-- referred to by other assignments we have
-- already kept.
@@ -301,10 +333,28 @@ filterAssignments platform live assigs = reverse (go assigs [])
--
walk :: Platform
- -> [(LRegSet, CmmNode O O)] -- nodes of the block, annotated with
- -- the set of registers live *after*
- -- this node.
+ -> AnnotatedNodes
+ -> Assignments -- The current list of
+ -- assignments we are sinking.
+ -- Earlier assignments may refer
+ -- to later ones.
+ -> ( Block CmmNode O O -- The new block
+ , Assignments -- Assignments to sink further
+ )
+walk platform nodes assigs =
+ case nodes of
+ Left nodes' -> walk' platform nodes' snd fst (const Nothing) assigs
+ Right (_hps_exit,nodes') -> walk' platform nodes' thdOf3 fstOf3 (Just . sndOf3) assigs
+
+{-# INLINE walk' #-} -- We want the function arguments to result in known calls so we force inlining.
+walk' :: Platform
+ -> [a] -- nodes of the block, annotated with
+ -- the set of registers live/things aliasing Hp *after*
+ -- this node.
+ -> (a -> CmmNode O O)
+ -> (a -> LRegSet)
+ -> (a -> Maybe HpSet)
-> Assignments -- The current list of
-- assignments we are sinking.
-- Earlier assignments may refer
@@ -314,20 +364,23 @@ walk :: Platform
, Assignments -- Assignments to sink further
)
-walk platform nodes assigs = go nodes emptyBlock assigs
+walk' platform nodes getNode getLiveSet getHpAliases assigs = go nodes emptyBlock assigs
where
go [] block as = (block, as)
- go ((live,node):ns) block as
+ go (node_info:ns) block as
-- discard nodes representing dead assignment
| shouldDiscard node live = go ns block as
-- sometimes only after simplification we can tell we can discard the node.
-- See Note [Discard simplified nodes]
| noOpAssignment node2 = go ns block as
-- Pick up interesting assignments
- | Just a <- shouldSink platform node2 = go ns block (a : as1)
+ | Just a <- shouldSink platform hps node2 = go ns block (a : as1)
-- Try inlining, drop assignments and move on
| otherwise = go ns block' as'
where
+ live = getLiveSet node_info
+ hps = getHpAliases node_info
+ node = getNode node_info
-- Simplify node
node1 = constantFoldNode platform node
@@ -336,7 +389,7 @@ walk platform nodes assigs = go nodes emptyBlock assigs
-- Drop any earlier assignments conflicting with node2
(dropped, as') = dropAssignmentsSimple platform
- (\a -> conflicts platform a node2) as1
+ (\a -> conflicts platform hps a node2) hps as1
-- Walk over the rest of the block. Includes dropped assignments
block' = foldl' blockSnoc block dropped `blockSnoc` node2
@@ -383,10 +436,10 @@ cmm_sink_sp.
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
-shouldSink :: Platform -> CmmNode e x -> Maybe Assignment
-shouldSink platform (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem platform e)
+shouldSink :: Platform -> Maybe HpSet -> CmmNode e x -> Maybe Assignment
+shouldSink platform hps (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem platform hps e)
where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
-shouldSink _ _other = Nothing
+shouldSink _ _ _other = Nothing
--
-- discard dead assignments. This doesn't do as good a job as
@@ -419,13 +472,13 @@ noOpAssignment node
toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs
-dropAssignmentsSimple :: Platform -> (Assignment -> Bool) -> Assignments
+dropAssignmentsSimple :: Platform -> (Assignment -> Bool) -> Maybe HpSet -> Assignments
-> ([CmmNode O O], Assignments)
-dropAssignmentsSimple platform f = dropAssignments platform (\a _ -> (f a, ())) ()
+dropAssignmentsSimple platform f hps = dropAssignments platform hps (\a _ -> (f a, ())) ()
-dropAssignments :: Platform -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
+dropAssignments :: Platform -> Maybe HpSet -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
-> ([CmmNode O O], Assignments)
-dropAssignments platform should_drop state assigs
+dropAssignments platform hps should_drop state assigs
= (dropped, reverse kept)
where
(dropped,kept) = go state assigs [] []
@@ -438,7 +491,7 @@ dropAssignments platform should_drop state assigs
| otherwise = go state' rest dropped (assig:kept)
where
(dropit, state') = should_drop assig state
- conflict = dropit || any (conflicts platform assig) dropped
+ conflict = dropit || any (conflicts platform hps assig) dropped
-- -----------------------------------------------------------------------------
@@ -448,7 +501,7 @@ dropAssignments platform should_drop state assigs
tryToInline
:: forall x. Platform
- -> LRegSet -- set of registers live after this
+ -> LRegSet -- set of registers live after this
-- node. We cannot inline anything
-- that is live after the node, unless
-- it is small enough to duplicate.
@@ -461,7 +514,9 @@ tryToInline
tryToInline platform liveAfter node assigs =
-- pprTrace "tryToInline assig length:" (ppr $ length assigs) $
- go usages liveAfter node emptyLRegSet assigs
+ let (n,as) = go usages liveAfter node emptyLRegSet assigs
+ in -- pprTrace "inlined:" (pdoc platform n $$ ppr as)
+ (n,as)
where
usages :: UniqFM LocalReg Int -- Maps each LocalReg to a count of how often it is used
usages = foldLocalRegsUsed platform addUsage emptyUFM node
@@ -651,40 +706,47 @@ okToInline _ _ _ = True
-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
-conflicts :: Platform -> Assignment -> CmmNode O x -> Bool
-conflicts platform (r, rhs, addr) node
+conflicts :: Platform -> Maybe HpSet -> Assignment -> CmmNode O x -> Bool
+conflicts platform hps (r, rhs, addr) node
-- (1) node defines registers used by rhs of assignment. This catches
-- assignments and all three kinds of calls. See Note [Sinking and calls]
- | globalRegistersConflict platform rhs node = True
- | localRegistersConflict platform rhs node = True
+ | globalRegistersConflict platform rhs node = traceConflicts "conflicts1" (ppr r) True
+ | localRegistersConflict platform rhs node = traceConflicts "conflicts2" (ppr r) True
-- (2) node uses register defined by assignment
- | foldRegsUsed platform (\b r' -> r == r' || b) False node = True
+ | foldRegsUsed platform (\b r' -> r == r' || b) False node = traceConflicts "conflicts3" (ppr r) True
-- (3) a store to an address conflicts with a read of the same memory
- | CmmStore addr' e _ <- node
- , memConflicts addr (loadAddr platform addr' (cmmExprWidth platform e)) = True
+ | CmmStore addr' e _a <- node
+ , memConflicts addr (storeAddr platform hps addr' (cmmExprWidth platform e))
+ = traceConflicts "conflicts4" (ppr r <+> ppr addr <+> ppr (storeAddr platform hps addr' (cmmExprWidth platform e)) $$
+ pdoc platform node)
+ True
-- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
- | HeapMem <- addr, CmmAssign (CmmGlobal Hp) _ <- node = True
- | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
- | SpMem{} <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
+ | HeapMem{} <- addr, CmmAssign (CmmGlobal Hp) _ <- node = traceConflicts "conflicts6.1" (ppr r) True
+ | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = traceConflicts "conflicts6" (ppr r) True
+ | SpMem{} <- addr, CmmAssign (CmmGlobal Sp) _ <- node = traceConflicts "conflicts7" (ppr r) True
-- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
- | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem = True
+ | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem = traceConflicts "conflicts8" (ppr r) True
-- (6) suspendThread clobbers every global register not backed by a real
-- register. It also clobbers heap and stack but this is handled by (5)
| CmmUnsafeForeignCall (PrimTarget MO_SuspendThread) _ _ <- node
, foldRegsUsed platform (\b g -> globalRegMaybe platform g == Nothing || b) False rhs
- = True
+ = traceConflicts "conflicts9" (ppr r) True
-- (7) native calls clobber any memory
- | CmmCall{} <- node, memConflicts addr AnyMem = True
+ | CmmCall{} <- node, memConflicts addr AnyMem = traceConflicts "conflicts10" (ppr r) True
-- (8) otherwise, no conflict
- | otherwise = False
+ | otherwise = traceConflicts "NoConflict" (ppr r)
+ False
+ where
+ -- traceConflicts s d = pprTrace s d
+ traceConflicts = \_s _d -> id
{- Note [Inlining foldRegsDefd]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -754,116 +816,6 @@ localRegistersConflict platform expr node =
-- UserOfRegs typeclasses.
--
--- An abstraction of memory read or written.
-data AbsMem
- = NoMem -- no memory accessed
- | AnyMem -- arbitrary memory
- | HeapMem -- definitely heap memory
- | StackMem -- definitely stack memory
- | SpMem -- <size>[Sp+n]
- {-# UNPACK #-} !Int
- {-# UNPACK #-} !Int
-
--- Having SpMem is important because it lets us float loads from Sp
--- past stores to Sp as long as they don't overlap, and this helps to
--- unravel some long sequences of
--- x1 = [Sp + 8]
--- x2 = [Sp + 16]
--- ...
--- [Sp + 8] = xi
--- [Sp + 16] = xj
---
--- Note that SpMem is invalidated if Sp is changed, but the definition
--- of 'conflicts' above handles that.
-
--- ToDo: this won't currently fix the following commonly occurring code:
--- x1 = [R1 + 8]
--- x2 = [R1 + 16]
--- ..
--- [Hp - 8] = x1
--- [Hp - 16] = x2
--- ..
-
--- because [R1 + 8] and [Hp - 8] are both HeapMem. We know that
--- assignments to [Hp + n] do not conflict with any other heap memory,
--- but this is tricky to nail down. What if we had
---
--- x = Hp + n
--- [x] = ...
---
--- the store to [x] should be "new heap", not "old heap".
--- Furthermore, you could imagine that if we started inlining
--- functions in Cmm then there might well be reads of heap memory
--- that was written in the same basic block. To take advantage of
--- non-aliasing of heap memory we will have to be more clever.
-
--- Note [Foreign calls clobber heap]
--- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- It is tempting to say that foreign calls clobber only
--- non-heap/stack memory, but unfortunately we break this invariant in
--- the RTS. For example, in stg_catch_retry_frame we call
--- stmCommitNestedTransaction() which modifies the contents of the
--- TRec it is passed (this actually caused incorrect code to be
--- generated).
---
--- Since the invariant is true for the majority of foreign calls,
--- perhaps we ought to have a special annotation for calls that can
--- modify heap/stack memory. For now we just use the conservative
--- definition here.
---
--- Some CallishMachOp imply a memory barrier e.g. AtomicRMW and
--- therefore we should never float any memory operations across one of
--- these calls.
---
--- `suspendThread` releases the capability used by the thread, hence we mustn't
--- float accesses to heap, stack or virtual global registers stored in the
--- capability (e.g. with unregisterised build, see #19237).
-
-
-bothMems :: AbsMem -> AbsMem -> AbsMem
-bothMems NoMem x = x
-bothMems x NoMem = x
-bothMems HeapMem HeapMem = HeapMem
-bothMems StackMem StackMem = StackMem
-bothMems (SpMem o1 w1) (SpMem o2 w2)
- | o1 == o2 = SpMem o1 (max w1 w2)
- | otherwise = StackMem
-bothMems SpMem{} StackMem = StackMem
-bothMems StackMem SpMem{} = StackMem
-bothMems _ _ = AnyMem
-
-memConflicts :: AbsMem -> AbsMem -> Bool
-memConflicts NoMem _ = False
-memConflicts _ NoMem = False
-memConflicts HeapMem StackMem = False
-memConflicts StackMem HeapMem = False
-memConflicts SpMem{} HeapMem = False
-memConflicts HeapMem SpMem{} = False
-memConflicts (SpMem o1 w1) (SpMem o2 w2)
- | o1 < o2 = o1 + w1 > o2
- | otherwise = o2 + w2 > o1
-memConflicts _ _ = True
-
-exprMem :: Platform -> CmmExpr -> AbsMem
-exprMem platform (CmmLoad addr w _) = bothMems (loadAddr platform addr (typeWidth w)) (exprMem platform addr)
-exprMem platform (CmmMachOp _ es) = foldr bothMems NoMem (map (exprMem platform) es)
-exprMem _ _ = NoMem
-
-loadAddr :: Platform -> CmmExpr -> Width -> AbsMem
-loadAddr platform e w =
- case e of
- CmmReg r -> regAddr platform r 0 w
- CmmRegOff r i -> regAddr platform r i w
- _other | regUsedIn platform spReg e -> StackMem
- | otherwise -> AnyMem
-
-regAddr :: Platform -> CmmReg -> Int -> Width -> AbsMem
-regAddr _ (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
-regAddr _ (CmmGlobal Hp) _ _ = HeapMem
-regAddr _ (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
-regAddr platform r _ _ | isGcPtrType (cmmRegType platform r) = HeapMem -- yay! GCPtr pays for itself
-regAddr _ _ _ _ = AnyMem
-
{-
Note [Inline GlobalRegs?]
~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/compiler/GHC/Driver/Config/Cmm.hs b/compiler/GHC/Driver/Config/Cmm.hs
index 38bab62048..a141e89e7c 100644
--- a/compiler/GHC/Driver/Config/Cmm.hs
+++ b/compiler/GHC/Driver/Config/Cmm.hs
@@ -19,6 +19,7 @@ initCmmConfig dflags = CmmConfig
, cmmDoLinting = gopt Opt_DoCmmLinting dflags
, cmmOptElimCommonBlks = gopt Opt_CmmElimCommonBlocks dflags
, cmmOptSink = gopt Opt_CmmSink dflags
+ , cmmOptSinkAlias = gopt Opt_CmmSinkAlias dflags
, cmmGenStackUnwindInstr = debugLevel dflags > 0
, cmmExternalDynamicRefs = gopt Opt_ExternalDynamicRefs dflags
, cmmDoCmmSwitchPlans = not . backendSupportsSwitch . backend $ dflags
diff --git a/compiler/GHC/Driver/Flags.hs b/compiler/GHC/Driver/Flags.hs
index 209e6d1776..66c308312e 100644
--- a/compiler/GHC/Driver/Flags.hs
+++ b/compiler/GHC/Driver/Flags.hs
@@ -216,7 +216,8 @@ data GeneralFlag
| Opt_LlvmTBAA -- Use LLVM TBAA infrastructure for improving AA (hidden flag)
| Opt_LlvmFillUndefWithGarbage -- Testing for undef bugs (hidden flag)
| Opt_IrrefutableTuples
- | Opt_CmmSink
+ | Opt_CmmSink -- ^ Run cmm sinking pass.
+ | Opt_CmmSinkAlias -- ^ When doing cmm sinking, perform aliasing analysis to improve sinking.
| Opt_CmmStaticPred
| Opt_CmmElimCommonBlocks
| Opt_CmmControlFlow
@@ -446,6 +447,7 @@ optimisationFlags = EnumSet.fromList
, Opt_LlvmFillUndefWithGarbage
, Opt_IrrefutableTuples
, Opt_CmmSink
+ , Opt_CmmSinkAlias
, Opt_CmmElimCommonBlocks
, Opt_AsmShortcutting
, Opt_OmitYields
diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs
index b78d141061..fa3457cf81 100644
--- a/compiler/GHC/Driver/Session.hs
+++ b/compiler/GHC/Driver/Session.hs
@@ -3393,6 +3393,7 @@ fFlagsDeps = [
flagSpec "case-folding" Opt_CaseFolding,
flagSpec "cmm-elim-common-blocks" Opt_CmmElimCommonBlocks,
flagSpec "cmm-sink" Opt_CmmSink,
+ flagSpec "cmm-sink-alias" Opt_CmmSinkAlias,
flagSpec "cmm-static-pred" Opt_CmmStaticPred,
flagSpec "cse" Opt_CSE,
flagSpec "stg-cse" Opt_StgCSE,
@@ -3989,6 +3990,7 @@ optLevelFlags -- see Note [Documenting optimisation flags]
, ([1,2], Opt_CmmElimCommonBlocks)
, ([2], Opt_AsmShortcutting)
, ([1,2], Opt_CmmSink)
+ , ([2], Opt_CmmSinkAlias)
, ([1,2], Opt_CmmStaticPred)
, ([1,2], Opt_CSE)
, ([1,2], Opt_StgCSE)
diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in
index f4c1a41dd3..ddc78cbca8 100644
--- a/compiler/ghc.cabal.in
+++ b/compiler/ghc.cabal.in
@@ -194,6 +194,7 @@ Library
GHC.Cmm.Lexer
GHC.Cmm.Lint
GHC.Cmm.Liveness
+ GHC.Cmm.Alias
GHC.Cmm.MachOp
GHC.Cmm.Node
GHC.Cmm.Opt
diff --git a/docs/users_guide/using-optimisation.rst b/docs/users_guide/using-optimisation.rst
index ab49f08ade..2db1455186 100644
--- a/docs/users_guide/using-optimisation.rst
+++ b/docs/users_guide/using-optimisation.rst
@@ -225,6 +225,17 @@ by saying ``-fno-wombat``.
to their usage sites. It also inlines simple expressions like
literals or registers.
+.. ghc-flag:: -fcmm-sink-alias
+ :shortdesc: Enable more complex aliasing analysis to improve the sinking pass. Implied by :ghc-flag:`-O2`.
+ :type: dynamic
+ :reverse: -fno-cmm-sink
+ :category:
+
+ :default: off but enabled with :ghc-flag:`-O2`.
+
+ This allows us to remove more intermediate variables in the Cmm pass.
+ For some programs this can avoid excessive spilling.
+
.. ghc-flag:: -fcmm-static-pred
:shortdesc: Enable static control flow prediction. Implied by :ghc-flag:`-O`.
:type: dynamic
diff --git a/testsuite/tests/cmm/should_compile/all.T b/testsuite/tests/cmm/should_compile/all.T
index 40813f01ec..88169eb720 100644
--- a/testsuite/tests/cmm/should_compile/all.T
+++ b/testsuite/tests/cmm/should_compile/all.T
@@ -1,6 +1,8 @@
#
test('selfloop', [cmm_src], compile, ['-no-hs-main'])
-test('cmm_sink_sp', [ only_ways(['optasm']), grep_errmsg('(\[Sp.*\]).*(=).*(\[.*R1.*\]).*;',[1,2,3]), cmm_src], compile, ['-no-hs-main -ddump-cmm -dsuppress-uniques -O'])
+test('cmm_sink_sp', [ when(wordsize(32), skip), only_ways(['optasm']), grep_errmsg('(\[Sp.*\]).*(=).*(\[.*R1.*\]).*;',[1,2,3]), cmm_src], compile, ['-no-hs-main -ddump-cmm -dsuppress-uniques -O -dsuppress-all'])
+test('cmm_sink_hp', [when(wordsize(32), skip), cmm_src,grep_errmsg('(I64\[.*Hp.*=.*;)',[1])], compile, ['-no-hs-main -O -ddump-cmm -fcmm-sink-alias'])
+test('cmm_hp_aliases', [ when(wordsize(32), skip), cmm_src,grep_errmsg('(I64\[.*Hp.*=.*;)',[1])], compile, ['-no-hs-main -O -ddump-cmm -fcmm-sink-alias'])
test('T16930', normal, makefile_test, ['T16930'])
test('T17442', normal, compile, [''])
diff --git a/testsuite/tests/cmm/should_compile/cmm_hp_aliases.cmm b/testsuite/tests/cmm/should_compile/cmm_hp_aliases.cmm
new file mode 100644
index 0000000000..dd7c9ac2d3
--- /dev/null
+++ b/testsuite/tests/cmm/should_compile/cmm_hp_aliases.cmm
@@ -0,0 +1,12 @@
+#include "Cmm.h"
+
+violate_hp_invariant ()
+{
+ W_ x1, x2;
+ x1 = Hp + 8;
+ x2 = W_[x1+8];
+ W_[Hp+8] = 1;
+ W_[Hp+16] = x2; // We shouldn't inline the alignment to x2 here because it conflichts with the write above.
+ // If we end up with something like W_[Hp+16] = W_[Hp+8] it's broken.
+ return (x2);
+}
diff --git a/testsuite/tests/cmm/should_compile/cmm_hp_aliases.stderr b/testsuite/tests/cmm/should_compile/cmm_hp_aliases.stderr
new file mode 100644
index 0000000000..810f5c955d
--- /dev/null
+++ b/testsuite/tests/cmm/should_compile/cmm_hp_aliases.stderr
@@ -0,0 +1,16 @@
+
+==================== Output Cmm ====================
+[violate_hp_invariant() { // []
+ { info_tbls: []
+ stack_info: arg_space: 8
+ }
+ {offset
+ c3: _c2::I64 = I64[Hp + 16];
+ I64[Hp + 8] = 1;
+ I64[Hp + 16] = _c2::I64;
+ R1 = _c2::I64;
+ call (P64[Sp])(R1) args: 8, res: 0, upd: 8;
+ }
+ }]
+
+
diff --git a/testsuite/tests/cmm/should_compile/cmm_sink_hp.cmm b/testsuite/tests/cmm/should_compile/cmm_sink_hp.cmm
new file mode 100644
index 0000000000..b226bb7331
--- /dev/null
+++ b/testsuite/tests/cmm/should_compile/cmm_sink_hp.cmm
@@ -0,0 +1,22 @@
+#include "Cmm.h"
+
+stg_sink_hp ( P_ r1 )
+{
+ // Should produce a series of loads that are sunk into
+ // stores to Hp like this:
+ // W_[Hp + 16] = W_[x1 + 1]; // CmmStore
+ // W_[Hp + 24] = W_[x1 + 2]; // CmmStore
+
+ P_ x1,y1, y2, y3,y4;
+ x1 = P_[r1];
+ y1 = W_[x1];
+ y2 = W_[x1+8];
+ y3 = W_[x1+16];
+ y4 = W_[x1+24];
+
+ W_[Hp+8] = y1;
+ W_[Hp+16] = y2;
+ W_[Hp+24] = y3;
+ W_[Hp+32] = y4;
+ return (x1);
+} \ No newline at end of file
diff --git a/testsuite/tests/cmm/should_compile/cmm_sink_hp.stderr b/testsuite/tests/cmm/should_compile/cmm_sink_hp.stderr
new file mode 100644
index 0000000000..b22654169b
--- /dev/null
+++ b/testsuite/tests/cmm/should_compile/cmm_sink_hp.stderr
@@ -0,0 +1,29 @@
+
+==================== Output Cmm ====================
+[stg_sink_hp() { // [R1]
+ { info_tbls: []
+ stack_info: arg_space: 8
+ }
+ {offset
+ c7: // global
+ //tick src<cmm_sink_hp.cmm:(4,1)-(22,1)>
+ //tick src<cmm_sink_hp.cmm:11:8-19>
+ //tick src<cmm_sink_hp.cmm:12:8-20>
+ //tick src<cmm_sink_hp.cmm:13:8-22>
+ //tick src<cmm_sink_hp.cmm:14:8-23>
+ //tick src<cmm_sink_hp.cmm:15:8-23>
+ //tick src<cmm_sink_hp.cmm:17:11-22>
+ _c2::P64 = P64[R1];
+ I64[Hp + 8] = I64[_c2::P64];
+ //tick src<cmm_sink_hp.cmm:18:11-23>
+ I64[Hp + 16] = I64[_c2::P64 + 8];
+ //tick src<cmm_sink_hp.cmm:19:11-23>
+ I64[Hp + 24] = I64[_c2::P64 + 16];
+ //tick src<cmm_sink_hp.cmm:20:11-23>
+ I64[Hp + 32] = I64[_c2::P64 + 24];
+ R1 = _c2::P64;
+ call (P64[Sp])(R1) args: 8, res: 0, upd: 8;
+ }
+ }]
+
+