diff options
-rw-r--r-- | compiler/GHC/Cmm/Alias.hs | 374 | ||||
-rw-r--r-- | compiler/GHC/Cmm/Config.hs | 1 | ||||
-rw-r--r-- | compiler/GHC/Cmm/Dataflow.hs | 84 | ||||
-rw-r--r-- | compiler/GHC/Cmm/Expr.hs | 32 | ||||
-rw-r--r-- | compiler/GHC/Cmm/Node.hs | 20 | ||||
-rw-r--r-- | compiler/GHC/Cmm/Pipeline.hs | 3 | ||||
-rw-r--r-- | compiler/GHC/Cmm/Sink.hs | 278 | ||||
-rw-r--r-- | compiler/GHC/Driver/Config/Cmm.hs | 1 | ||||
-rw-r--r-- | compiler/GHC/Driver/Flags.hs | 4 | ||||
-rw-r--r-- | compiler/GHC/Driver/Session.hs | 2 | ||||
-rw-r--r-- | compiler/ghc.cabal.in | 1 | ||||
-rw-r--r-- | docs/users_guide/using-optimisation.rst | 11 | ||||
-rw-r--r-- | testsuite/tests/cmm/should_compile/all.T | 4 | ||||
-rw-r--r-- | testsuite/tests/cmm/should_compile/cmm_hp_aliases.cmm | 12 | ||||
-rw-r--r-- | testsuite/tests/cmm/should_compile/cmm_hp_aliases.stderr | 16 | ||||
-rw-r--r-- | testsuite/tests/cmm/should_compile/cmm_sink_hp.cmm | 22 | ||||
-rw-r--r-- | testsuite/tests/cmm/should_compile/cmm_sink_hp.stderr | 29 |
17 files changed, 710 insertions, 184 deletions
diff --git a/compiler/GHC/Cmm/Alias.hs b/compiler/GHC/Cmm/Alias.hs new file mode 100644 index 0000000000..c89f50be86 --- /dev/null +++ b/compiler/GHC/Cmm/Alias.hs @@ -0,0 +1,374 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} + +module GHC.Cmm.Alias + ( AbsMem(..) + , bothHeaps, heapsConflict, bothMems + , memConflicts --, exprMem, loadAddr, storeAddr + + , exprMem, loadAddr, storeAddr + + , cmmHpAlias, node_exit_hps, HpSet(..), regAliasesHp + , sizeHpSet + + ) +where + +import GHC.Prelude as Prelude + +import GHC.Platform +import GHC.Cmm +import GHC.Cmm.Ppr.Expr () -- For Outputable instances +import GHC.Cmm.Ppr () -- For Outputable instances +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow +import GHC.Cmm.Dataflow.Label + +import GHC.Utils.Outputable + +import Data.Set as Set +import qualified Data.Semigroup +import GHC.Cmm.Utils (regUsedIn) +-- import GHC.Utils.Trace (pprTrace) + +----------------------------------------------------------------------------- +-- Abstracting over memory access +----------------------------------------------------------------------------- + +-- An abstraction of memory read or written. +data AbsMem + = NoMem -- ^ no memory accessed + | AnyMem -- ^ arbitrary memory + | HeapMem !HeapType-- ^ heap memory + | StackMem -- ^ definitely stack memory + | SpMem -- ^ <size>[Sp+n] see Note [SpMem Aliasing] + {-# UNPACK #-} !Int + {-# UNPACK #-} !Int + deriving Show + +instance Outputable AbsMem where + ppr x = parens (text . show $ x) + +-- See Note [Heap Kinds] +data HeapType = OldHeap | NewHeap | AnyHeap deriving (Show,Eq) + +bothHeaps :: HeapType -> HeapType -> HeapType +bothHeaps h1 h2 | h1 == h2 = h1 +bothHeaps _ _ = AnyHeap + +heapsConflict :: HeapType -> HeapType -> Bool +heapsConflict AnyHeap _ = True +heapsConflict _ AnyHeap = True +heapsConflict OldHeap OldHeap = True +heapsConflict NewHeap NewHeap = False +heapsConflict OldHeap NewHeap = False +heapsConflict NewHeap OldHeap = False + +{- Note [Heap Kinds] +~~~~~~~~~~~~~~~~~~~~ +Our goal is to allow sinking into assignments to Hp. +That is for a sequence like: + + c1 = [R1 + 8] + c2 = [R1 + 16] + [Hp-16] = c1 + [Hp-8] = c2 + +We want to inline the assignments to get: + + [Hp-16] = [R1 + 8] + [Hp-8] = [R1 + 16] + +To achieve this we split heap memory references into three kinds. +OldHeap, NewHeap, AnyHeap. + +AnyHeap is the conservative estimate of a reference where a write/read +might conflight with any other write/read. + +OldHeap represents reads from memory where objects existing on entry to +the current function are located. + +NewHeap represents the area of memory into which we allocate new objects. +Since we only create *new* objects there it won't conflict with reading +from already existing objects. And while we write to various Hp-relative +memory locations by constructions none of these do conflict. + +* Reading from regular heap memory is defined to be OldHeap. +* Writing to regular heap memory is defined to be AnyHeap. +* Writing via HpReg is defined to be NewHeap. A write like this + always allocates a *new* object (by design) so it won't affect + reads from existing objects. +* An expression depending on New+Old heap is treated as AnyHeap +* Reading via HpReg (or an alias to it) is treated as AnyMem. + +New/OldHeap don't conflict. All other kinds of reference combinations do conflict. + +This means we can sink reads from `OldHeap` past writes to `NewHeap` (Hp) +giving use better code as we can remove all the intermediate variables which +sometimes used to get spilled to the C stack. + +This depends on Hp never being used to write to "old" heap. This +isn't something our code generation ever does, so that is fine. + +Note [CmmCalls and Hp Aliasing] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Since we assume all foreign calls clopper heap/stack (see Note [Foreign calls clobber heap]) +we can relax the Hp aliasing slightly. In particular we will never sink memory accessing +expressions across calls so using Hp and Hp-aliasing variables as arguments/targets for function +calls is allowed. + +This is important if we have code like this: + + // Allocate some closure + I64[Hp - 32] = x1_s5bz_info; + ... + hp_ptr::P64 = Hp - 32; + I64[Hp - n] = foo; + ... + if <cond> goto c6cS; + ... +c6cS: + // Evaluate the thunk + call (I64[hp_ptr])(hp_ptr) returns to c6cQ, args: 8, res: 8, upd: 8; + +Is this code problematic for sink? Not really. While hp_ptr aliases to the +same area of memory as Hp it's only used inside a call. And we currently +never sink reads/writes across calls anyway. +The end result being that using hp-aliasing variables as arguments/targets +for function calls is fine. + +----- + +The other issue with calls are their results. Naturally a call might return a newly +allocated heap object as result. But since we don't sink across calls we can assume +any write to Hp after a call will write to different memory than where the call allocated +the object. So even if technically the result can point to the nursery we will treat it +as OldHeap after the call. + +-------------------------------------------------------------------------------- + +Note [SpMem Aliasing] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Having SpMem is important because it lets us float loads from Sp +past stores to Sp as long as they don't overlap, and this helps to +unravel some long sequences of + x1 = [Sp + 8] + x2 = [Sp + 16] + ... + [Sp + 8] = xi + [Sp + 16] = xj + +Note that SpMem is invalidated if Sp is changed, but the definition +of 'conflicts' above handles that. + +ToDo: this won't currently fix the following commonly occurring code: + x1 = [R1 + 8] + x2 = [R1 + 16] + .. + [Hp - 8] = x1 + [Hp - 16] = x2 + .. + +because [R1 + 8] and [Hp - 8] are both HeapMem. We know that +assignments to [Hp + n] do not conflict with any other heap memory, +but this is tricky to nail down. What if we had + + x = Hp + n + [x] = ... + + the store to [x] should be "new heap", not "old heap". + Furthermore, you could imagine that if we started inlining + functions in Cmm then there might well be reads of heap memory + that was written in the same basic block. To take advantage of + non-aliasing of heap memory we will have to be more clever. + + NB: We now solved the last point. See Note [Heap Kinds]. + +Note [Foreign calls clobber heap] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +It is tempting to say that foreign calls clobber only +non-heap/stack memory, but unfortunately we break this invariant in +the RTS. For example, in stg_catch_retry_frame we call +stmCommitNestedTransaction() which modifies the contents of the +TRec it is passed (this actually caused incorrect code to be +generated). + +Since the invariant is true for the majority of foreign calls, +perhaps we ought to have a special annotation for calls that can +modify heap/stack memory. For now we just use the conservative +definition here. + +Some CallishMachOp imply a memory barrier e.g. AtomicRMW and +therefore we should never float any memory operations across one of +these calls. + +`suspendThread` releases the capability used by the thread, hence we mustn't +float accesses to heap, stack or virtual global registers stored in the +capability (e.g. with unregisterised build, see #19237). + +-} + +bothMems :: AbsMem -> AbsMem -> AbsMem +bothMems NoMem x = x +bothMems x NoMem = x +bothMems (HeapMem h1) (HeapMem h2) = HeapMem $! bothHeaps h1 h2 +bothMems StackMem StackMem = StackMem +bothMems (SpMem o1 w1) (SpMem o2 w2) + | o1 == o2 = SpMem o1 (max w1 w2) + | otherwise = StackMem +bothMems SpMem{} StackMem = StackMem +bothMems StackMem SpMem{} = StackMem +bothMems _ _ = AnyMem + +memConflicts :: AbsMem -> AbsMem -> Bool +memConflicts NoMem _ = False +memConflicts _ NoMem = False +memConflicts HeapMem{} StackMem = False +memConflicts StackMem HeapMem{} = False +memConflicts SpMem{} HeapMem{} = False +memConflicts HeapMem{} SpMem{} = False +memConflicts (HeapMem h1) (HeapMem h2) = heapsConflict h1 h2 +memConflicts (SpMem o1 w1) (SpMem o2 w2) + | o1 < o2 = o1 + w1 > o2 + | otherwise = o2 + w2 > o1 +memConflicts _ _ = True + +----------------------------------------------------------------------------- +-- Abstracting over memory access - considering which registers might alias to Hp +-- +-- Currently these will panic when trying to load values via Hp or Hp aliased +-- expressions. If we ever allow use of Hp for memory reads then we need to return +-- AnyHeap instead. +----------------------------------------------------------------------------- + +exprMem :: Platform -> Maybe HpSet -> CmmExpr -> AbsMem +exprMem platform hps (CmmLoad addr w _a) = bothMems (loadAddr platform hps addr (typeWidth w)) + (exprMem platform hps addr) +exprMem platform hps (CmmMachOp _ es) = let args = fmap (exprMem platform hps) es + in Prelude.foldr (\l r -> l `seq` r `seq` bothMems l r) NoMem args +exprMem _ _ (CmmStackSlot {}) = AnyMem +exprMem _ _ _ = NoMem + +-- We treat reading from Hp different than loading from Hp, hence the load/store distinction. +-- See Note [Heap Kinds] +loadAddr, storeAddr :: Platform -> Maybe HpSet -> CmmExpr -> Width -> AbsMem +loadAddr p hps = refAddrHp p hps False +storeAddr p hps = refAddrHp p hps True + +refAddrHp :: Platform -> Maybe HpSet -> Bool -> CmmExpr -> Width -> AbsMem +refAddrHp platform hps is_store e w = -- pprTrace "refAddrHp" (ppr e) $ + case e of + CmmReg r -> regAddrHp platform hps is_store r 0 w + CmmRegOff r i -> regAddrHp platform hps is_store r i w + _other | regUsedIn platform spReg e -> StackMem + | foldRegsUsed platform (\b r -> b || r `maybe_regAliasesHp` hps) False e -> trace_hp_mem (text "refAddrHp") (AnyMem) + | otherwise -> -- pprTrace "refAddrAny" (ppr e) + AnyMem + +regAddrHp :: Platform -> Maybe HpSet -> Bool -> CmmReg -> Int -> Width -> AbsMem +regAddrHp _ _hps _store (CmmGlobal Sp) i w = SpMem i (widthInBytes w) +regAddrHp _ _hps is_store (CmmGlobal Hp) _ _ + | is_store = HeapMem NewHeap + | otherwise = trace_hp_mem (text "HpStore") (HeapMem AnyHeap) +regAddrHp _ _hps _store (CmmGlobal CurrentTSO) _ _ = HeapMem (AnyHeap) -- important for PrimOps +regAddrHp platform hps is_store r _ _ + | isGcPtrType (cmmRegType platform r) + = if is_store + then (HeapMem AnyHeap) + else if r `maybe_regAliasesHp` hps + then trace_hp_mem (text "Aliased HpRead") (HeapMem AnyHeap) + else (HeapMem OldHeap) -- yay! GCPtr pays for itself +regAddrHp _ _hps _store _ _ _ = AnyMem + +trace_hp_mem :: SDoc -> a -> a +trace_hp_mem _err x = + -- pprTrace "trace_hp_mem" err $ + x + +----------------------------------------------------------------------------- +-- Calculating what variables transitively depend on the value of Hp on block entry. +----------------------------------------------------------------------------- + +-- | The variables aliased to HP on entry to a block +data HpSet = HpSet { localSet :: !LocalRegSet, globalSet :: !GlobalRegSet } + +instance Outputable HpSet where + ppr (HpSet local global) = parens (text "HpSet" <+> text "local:" <+> ppr (regSetToList local) <+> ppr (regSetToList global)) + +instance Semigroup HpSet where + (<>) = plusHpSet + +instance Monoid HpSet where + mempty = emptyHpSet + +sizeHpSet :: HpSet -> Int +sizeHpSet (HpSet l g) = sizeRegSet l + sizeRegSet g + +plusHpSet :: HpSet -> HpSet -> HpSet +plusHpSet (HpSet l1 g1) (HpSet l2 g2) = HpSet (plusRegSet l1 l2) (plusRegSet g1 g2) :: HpSet + +regAliasesHp :: CmmReg -> HpSet -> Bool +regAliasesHp reg hp_set = go reg hp_set + where go (CmmLocal r) (HpSet l_set _g_set) = elemRegSet r l_set + go (CmmGlobal r) (HpSet _l_set g_set) = elemRegSet r g_set + +-- | If we have no information about aliasing we must assume everything can alias to Hp. +maybe_regAliasesHp :: CmmReg -> Maybe HpSet -> Bool +maybe_regAliasesHp _reg Nothing = True +maybe_regAliasesHp reg (Just hps) = regAliasesHp reg hps + + +emptyHpSet :: HpSet +emptyHpSet = HpSet mempty mempty + +-- | The dataflow lattice +hpLattice :: DataflowLattice (HpSet) +hpLattice = DataflowLattice emptyHpSet add + where + add (OldFact old@(HpSet lold gold)) (NewFact (HpSet lnew gnew)) = + let !changed = (Set.size l_join + Set.size g_join > Set.size lold + Set.size gold) + join@(HpSet l_join g_join) = HpSet (Set.union lold lnew) (Set.union gold gnew) + in if changed then Changed join + else NotChanged old + +-- Given a set of registers aliasing to Hp compute the set of registers +-- aliasing Hp after this node. +node_exit_hps + :: ( OutputableP Platform (CmmNode e x) + ) + => Platform -> (CmmNode e x) -> HpSet -> HpSet +node_exit_hps platform node hp_set@(HpSet lset gset) = + let !result_aliases_hp = + case node of + -- See Note [CmmCalls and Hp Aliasing] + CmmCall{} -> False + CmmForeignCall{} -> False + -- Default (conservative) case. If the statement uses Hp assume it's result aliases Hp. + _default -> ( foldRegsUsed platform (\b r -> b || r == hpReg || aliasesHp r) False node) + where + aliasesHp r = r `regAliasesHp` hp_set + + {-# INLINE update #-} + update :: forall r. (Ord r,Outputable r) => RegSet r -> r -> RegSet r + update s r = if result_aliases_hp + then -- pprTrace "Adding hp" (text "r:" <> ppr r <+> text "node:" <> pdoc platform node) $ + extendRegSet s r + else deleteFromRegSet s r + + g_hps = foldRegsDefd platform (\s reg -> update s reg) gset node :: GlobalRegSet + l_hps = foldRegsDefd platform (\s reg -> update s reg) lset node :: LocalRegSet + + in (HpSet l_hps g_hps) + +-- | Compute hp aliasing registers at exit +xferHp :: Platform -> TransferFun HpSet +xferHp p = blockTransferFwd p hpLattice node_exit_hps + +-- | Compute a map from blocks a set of registers that alias to Hp on *entry* to that block. +cmmHpAlias :: Platform -> CmmGraph -> LabelMap HpSet +cmmHpAlias platform graph = + analyzeCmmFwd hpLattice (xferHp platform) graph mapEmpty diff --git a/compiler/GHC/Cmm/Config.hs b/compiler/GHC/Cmm/Config.hs index 415becd109..a3be9d163c 100644 --- a/compiler/GHC/Cmm/Config.hs +++ b/compiler/GHC/Cmm/Config.hs @@ -19,6 +19,7 @@ data CmmConfig = CmmConfig , cmmDoLinting :: !Bool -- ^ Do Cmm Linting Optimization or not , cmmOptElimCommonBlks :: !Bool -- ^ Eliminate common blocks or not , cmmOptSink :: !Bool -- ^ Perform sink after stack layout or not + , cmmOptSinkAlias :: !Bool -- ^ Perform sink using more expensive aliasing analysis. , cmmGenStackUnwindInstr :: !Bool -- ^ Generate stack unwinding instructions (for debugging) , cmmExternalDynamicRefs :: !Bool -- ^ Generate code to link against dynamic libraries , cmmDoCmmSwitchPlans :: !Bool -- ^ Should the Cmm pass replace Stg switch statements diff --git a/compiler/GHC/Cmm/Dataflow.hs b/compiler/GHC/Cmm/Dataflow.hs index edd91481e7..299a253e87 100644 --- a/compiler/GHC/Cmm/Dataflow.hs +++ b/compiler/GHC/Cmm/Dataflow.hs @@ -21,6 +21,7 @@ module GHC.Cmm.Dataflow ( C, O, Block , lastNode, entryLabel , foldNodesBwdOO + , foldNodesFwdOO , foldRewriteNodesBwdOO , DataflowLattice(..), OldFact(..), NewFact(..), JoinedFact(..) , TransferFun, RewriteFun @@ -31,6 +32,7 @@ module GHC.Cmm.Dataflow , changedIf , joinOutFacts , joinFacts + , blockTransferFwd ) where @@ -49,6 +51,7 @@ import GHC.Cmm.Dataflow.Block import GHC.Cmm.Dataflow.Graph import GHC.Cmm.Dataflow.Collections import GHC.Cmm.Dataflow.Label +import GHC.Platform (Platform) type family Fact (x :: Extensibility) f :: Type type instance Fact C f = FactBase f @@ -80,6 +83,7 @@ data DataflowLattice a = DataflowLattice data Direction = Fwd | Bwd +-- | Apply a block to the current fact base type TransferFun f = CmmBlock -> FactBase f -> FactBase f -- | `TransferFun` abstracted over `n` (the node type) @@ -99,6 +103,8 @@ type RewriteFun f = CmmBlock -> FactBase f -> UniqSM (CmmBlock, FactBase f) type RewriteFun' (n :: Extensibility -> Extensibility -> Type) f = Block n C C -> FactBase f -> UniqSM (Block n C C, FactBase f) +{-# INLINEABLE analyzeCmmBwd #-} +{-# INLINEABLE analyzeCmmFwd #-} analyzeCmmBwd, analyzeCmmFwd :: (NonLocal node) => DataflowLattice f @@ -109,6 +115,7 @@ analyzeCmmBwd, analyzeCmmFwd analyzeCmmBwd = analyzeCmm Bwd analyzeCmmFwd = analyzeCmm Fwd +{-# INLINEABLE analyzeCmm #-} analyzeCmm :: (NonLocal node) => Direction @@ -126,6 +133,7 @@ analyzeCmm dir lattice transfer cmmGraph initFact = GMany NothingO bm NothingO -> bm in fixpointAnalysis dir lattice transfer entry blockMap initFact +{-# INLINEABLE fixpointAnalysis #-} -- Fixpoint algorithm. fixpointAnalysis :: forall f node. @@ -154,17 +162,17 @@ fixpointAnalysis direction lattice do_block entry blockmap = loop start :: IntHeap -- Worklist, i.e., blocks to process -> FactBase f -- Current result (increases monotonically) -> FactBase f - loop todo !fbase1 | Just (index, todo1) <- IntSet.minView todo = + loop todo !fbase_in | Just (index, todo1) <- IntSet.minView todo = let block = block_arr ! index - out_facts = {-# SCC "do_block" #-} do_block block fbase1 + out_facts = {-# SCC "do_block" #-} do_block block fbase_in :: FactBase f -- For each of the outgoing edges, we join it with the current - -- information in fbase1 and (if something changed) we update it + -- information in fbase_in and (if something changed) we update it -- and add the affected blocks to the worklist. (todo2, fbase2) = {-# SCC "mapFoldWithKey" #-} mapFoldlWithKey - (updateFact join dep_blocks) (todo1, fbase1) out_facts + (updateFact join dep_blocks) (todo1, fbase_in) out_facts in loop todo2 fbase2 - loop _ !fbase1 = fbase1 + loop _ !fbase_in = fbase_in rewriteCmmBwd :: (NonLocal node) @@ -222,7 +230,7 @@ fixpointRewrite dir lattice do_block entry blockmap = loop start blockmap -> LabelMap (Block node C C) -- Rewritten blocks. -> FactBase f -- Current facts. -> UniqSM (LabelMap (Block node C C), FactBase f) - loop todo !blocks1 !fbase1 + loop todo !blocks1 !fbase_in | Just (index, todo1) <- IntSet.minView todo = do -- Note that we use the *original* block here. This is important. -- We're optimistically rewriting blocks even before reaching the fixed @@ -231,13 +239,13 @@ fixpointRewrite dir lattice do_block entry blockmap = loop start blockmap -- into account the new facts). let block = block_arr ! index (new_block, out_facts) <- {-# SCC "do_block_rewrite" #-} - do_block block fbase1 + do_block block fbase_in let blocks2 = mapInsert (entryLabel new_block) new_block blocks1 (todo2, fbase2) = {-# SCC "mapFoldWithKey_rewrite" #-} mapFoldlWithKey - (updateFact join dep_blocks) (todo1, fbase1) out_facts + (updateFact join dep_blocks) (todo1, fbase_in) out_facts loop todo2 blocks2 fbase2 - loop _ !blocks1 !fbase1 = return (blocks1, fbase1) + loop _ !blocks1 !fbase_in = return (blocks1, fbase_in) {- @@ -340,13 +348,14 @@ mkDepBlocks Bwd blocks = go blocks 0 mapEmpty -- | After some new facts have been generated by analysing a block, we -- fold this function over them to generate (a) a list of block -- indices to (re-)analyse, and (b) the new FactBase. +{-# INLINEABLE updateFact #-} updateFact - :: JoinFun f - -> LabelMap IntSet - -> (IntHeap, FactBase f) - -> Label - -> f -- out fact - -> (IntHeap, FactBase f) + :: JoinFun f -- ^ combinator + -> LabelMap IntSet -- ^ Block -> dependent blocks + -> (IntHeap, FactBase f) -- ^ (WorkList, InFacts) + -> Label -- ^ current block + -> f -- out fact -- ^ computed fact for current block + -> (IntHeap, FactBase f) -- ^ (updated worklist, updated FactBase) updateFact fact_join dep_blocks (todo, fbase) lbl new_fact = case lookupFact lbl fbase of Nothing -> @@ -357,8 +366,8 @@ updateFact fact_join dep_blocks (todo, fbase) lbl new_fact (NotChanged _) -> (todo, fbase) (Changed f) -> let !z = mapInsert lbl f fbase in (changed, z) where - changed = todo `IntSet.union` - mapFindWithDefault IntSet.empty lbl dep_blocks + !changed = todo `IntSet.union` + mapFindWithDefault IntSet.empty lbl dep_blocks {- Note [No old fact] @@ -374,13 +383,15 @@ out that always recording a change is faster. ---------------------------------------------------------------- -- Fact lookup: the fact `orelse` bottom +{-# INLINEABLE getFact #-} getFact :: DataflowLattice f -> Label -> FactBase f -> f getFact lat l fb = case lookupFact l fb of Just f -> f Nothing -> fact_bot lat -- | Returns the result of joining the facts from all the successors of the -- provided node or block. -joinOutFacts :: (NonLocal n) => DataflowLattice f -> n e C -> FactBase f -> f +{-# INLINEABLE joinOutFacts #-} +joinOutFacts :: (NonLocal node) => DataflowLattice f -> node e C -> FactBase f -> f joinOutFacts lattice nonLocal fact_base = foldl' join (fact_bot lattice) facts where join new old = getJoined $ fact_join lattice (OldFact old) (NewFact new) @@ -391,12 +402,14 @@ joinOutFacts lattice nonLocal fact_base = foldl' join (fact_bot lattice) facts , isJust fact ] +{-# INLINEABLE joinFacts #-} joinFacts :: DataflowLattice f -> [f] -> f joinFacts lattice facts = foldl' join (fact_bot lattice) facts where join new old = getJoined $ fact_join lattice (OldFact old) (NewFact new) -- | Returns the joined facts for each label. +{-# INLINEABLE mkFactBase #-} mkFactBase :: DataflowLattice f -> [(Label, f)] -> FactBase f mkFactBase lattice = foldl' add mapEmpty where @@ -421,6 +434,41 @@ foldNodesBwdOO funOO = go go BNil f = f {-# INLINABLE foldNodesBwdOO #-} +-- | Folds forwards over all nodes of an open-open block. +-- Strict in the accumulator. +foldNodesFwdOO :: (CmmNode O O -> f -> f) -> Block CmmNode O O -> f -> f +foldNodesFwdOO funOO = go + where + go (BCat b1 b2) f = go b2 $! go b1 f + go (BSnoc h n) f = funOO n $! go h f + go (BCons n t) f = go t $! funOO n f + go (BMiddle n) f = funOO n f + go BNil f = f +{-# INLINABLE foldNodesFwdOO #-} + +-- | Create a transfer function over blocks from a transfer function over nodes. +-- +-- Useful if the facts are things that are true at *entry* to a block which we can +-- determine by looking at predecessors. +{-# INLINE blockTransferFwd #-} -- We *really* want this to specialize on value arguments. + -- So force inlining it. +blockTransferFwd :: forall fact. Platform + -> (DataflowLattice fact) + -> (forall e x. Platform -> CmmNode e x -> fact -> fact) + -> TransferFun fact +blockTransferFwd platform lattice node_transfer = + let transfer :: TransferFun fact + transfer (BlockCC eNode middle xNode) fBase = + let lbl = entryLabel eNode + block_info = getFact lattice lbl fBase + after_entry = (node_transfer platform) eNode block_info + after_middle = foldNodesFwdOO (node_transfer platform) middle after_entry + after_exit = node_transfer platform xNode after_middle + succs = successors xNode + in mapFromList $ map (\s -> (s,after_exit)) succs + in transfer + + -- | Folds backward over all the nodes of an open-open block and allows -- rewriting them. The accumulator is both the block of nodes and @f@ (usually -- dataflow facts). diff --git a/compiler/GHC/Cmm/Expr.hs b/compiler/GHC/Cmm/Expr.hs index f910e65f04..3f910cd819 100644 --- a/compiler/GHC/Cmm/Expr.hs +++ b/compiler/GHC/Cmm/Expr.hs @@ -78,6 +78,10 @@ instance Eq CmmExpr where -- Equality ignores the types CmmStackSlot a1 i1 == CmmStackSlot a2 i2 = a1==a2 && i1==i2 _e1 == _e2 = False +-- Useful for debugging. +instance Outputable CmmExpr where + ppr = text . show + data AlignmentSpec = NaturallyAligned | Unaligned deriving (Eq, Ord, Show) @@ -384,6 +388,34 @@ instance Ord r => UserOfRegs r r where instance Ord r => DefinerOfRegs r r where foldRegsDefd _ f z r = f z r +-- | This is a instance of convenience. +-- For performance sensitive code paths rather +-- fold over local and global regs separately. +instance UserOfRegs CmmReg GlobalReg where + {-# INLINEABLE foldRegsUsed #-} + foldRegsUsed _ f z reg = f z (CmmGlobal reg) + +-- | This is a instance of convenience. +-- For performance sensitive code paths rather +-- fold over local and global regs separately. +instance DefinerOfRegs CmmReg GlobalReg where + foldRegsDefd _ f z reg = f z (CmmGlobal reg) + +-- | This is a instance of convenience. +-- For performance sensitive code paths rather +-- fold over local and global regs separately. +instance UserOfRegs CmmReg LocalReg where + {-# INLINEABLE foldRegsUsed #-} + foldRegsUsed _ f z reg = f z (CmmLocal reg) + +-- | This is a instance of convenience. +-- For performance sensitive code paths rather +-- fold over local and global regs separately. +instance DefinerOfRegs CmmReg LocalReg where + {-# INLINEABLE foldRegsDefd #-} + foldRegsDefd _ f z reg = f z (CmmLocal reg) + + instance (Ord r, UserOfRegs r CmmReg) => UserOfRegs r CmmExpr where -- The (Ord r) in the context is necessary here -- See Note [Recursive superclasses] in GHC.Tc.TyCl.Instance diff --git a/compiler/GHC/Cmm/Node.hs b/compiler/GHC/Cmm/Node.hs index 841c726b14..020ba63dd2 100644 --- a/compiler/GHC/Cmm/Node.hs +++ b/compiler/GHC/Cmm/Node.hs @@ -46,6 +46,7 @@ import Data.List (tails,sortBy) import GHC.Types.Unique (nonDetCmpUnique) import GHC.Utils.Misc +import GHC.Platform (Platform) ------------------------ -- CmmNode @@ -348,6 +349,25 @@ instance UserOfRegs GlobalReg (CmmNode e x) where => (b -> GlobalReg -> b) -> b -> a -> b fold f z n = foldRegsUsed platform f z n +instance UserOfRegs CmmReg (CmmNode e x) where + {-# INLINE foldRegsUsed #-} + foldRegsUsed = foldRegsUsed_cmmReg_node + +{-# INLINEABLE foldRegsUsed_cmmReg_node #-} +foldRegsUsed_cmmReg_node :: Platform -> (b->CmmReg->b) -> b -> CmmNode e x -> b +foldRegsUsed_cmmReg_node platform f !z n = case n of + CmmAssign _ expr -> fold f z expr + CmmStore addr rval _align -> fold f (fold f z addr) rval + CmmUnsafeForeignCall t _ args -> fold f (fold f z t) args + CmmCondBranch expr _ _ _ -> fold f z expr + CmmSwitch expr _ -> fold f z expr + CmmCall {cml_target=tgt, cml_args_regs=args} -> fold f (fold f z args) tgt + CmmForeignCall {tgt=tgt, args=args} -> fold f (fold f z tgt) args + _ -> z + where fold :: forall a b. UserOfRegs CmmReg a + => (b -> CmmReg -> b) -> b -> a -> b + fold f z n = foldRegsUsed platform f z n + instance (Ord r, UserOfRegs r CmmReg) => UserOfRegs r ForeignTarget where -- The (Ord r) in the context is necessary here -- See Note [Recursive superclasses] in GHC.Tc.TyCl.Instance diff --git a/compiler/GHC/Cmm/Pipeline.hs b/compiler/GHC/Cmm/Pipeline.hs index 585606fcb2..39de30bb0d 100644 --- a/compiler/GHC/Cmm/Pipeline.hs +++ b/compiler/GHC/Cmm/Pipeline.hs @@ -114,8 +114,9 @@ cpsTop logger platform cfg proc = dump Opt_D_dump_cmm_sp "Layout Stack" g ----------- Sink and inline assignments -------------------------------- + let sink_aliasing = if cmmOptSinkAlias cfg then SinkWithAliasAnalysis else SinkWithoutAliasAnalysis g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout] - condPass (cmmOptSink cfg) (cmmSink platform) g + condPass (cmmOptSink cfg) (cmmSink platform sink_aliasing) g Opt_D_dump_cmm_sink "Sink assignments" ------------- CAF analysis ---------------------------------------------- diff --git a/compiler/GHC/Cmm/Sink.hs b/compiler/GHC/Cmm/Sink.hs index 84f9317f21..d2a2049f95 100644 --- a/compiler/GHC/Cmm/Sink.hs +++ b/compiler/GHC/Cmm/Sink.hs @@ -2,12 +2,14 @@ {-# LANGUAGE ScopedTypeVariables #-} module GHC.Cmm.Sink ( - cmmSink - ) where + cmmSink + , SinkAliasFlag(..) + ) where import GHC.Prelude import GHC.Cmm +import GHC.Cmm.Alias import GHC.Cmm.Opt import GHC.Cmm.Liveness import GHC.Cmm.LRegSet @@ -22,11 +24,15 @@ import GHC.Platform import GHC.Types.Unique.FM import qualified Data.IntSet as IntSet -import Data.List (partition) +import Data.List (partition, mapAccumL) import Data.Maybe import GHC.Exts (inline) +import GHC.Utils.Outputable +import GHC.Utils.Misc (fstOf3, thdOf3) +import GHC.Plugins (sndOf3) + -- ----------------------------------------------------------------------------- -- Sinking and inlining @@ -49,7 +55,7 @@ import GHC.Exts (inline) -- -- Algorithm: -- --- * Start by doing liveness analysis. +-- * Start by doing liveness and Hp-aliasing analysis. -- -- * Keep a list of assignments A; earlier ones may refer to later ones. -- Currently we only sink assignments to local registers, because we don't @@ -151,15 +157,24 @@ type Assignments = [Assignment] -- y = e2 -- x = e1 -cmmSink :: Platform -> CmmGraph -> CmmGraph -cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks +-- | Should we be more aggresive when sinking by running a aliasing analysis. +data SinkAliasFlag = SinkWithAliasAnalysis | SinkWithoutAliasAnalysis deriving Eq + +cmmSink :: Platform -> SinkAliasFlag -> CmmGraph -> CmmGraph +cmmSink platform alias_flag graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks where + do_alias = alias_flag == SinkWithAliasAnalysis liveness = cmmLocalLivenessL platform graph + hp_aliases = if do_alias + then Just $ cmmHpAlias platform graph + else Nothing getLive l = mapFindWithDefault emptyLRegSet l liveness + getHps :: Label -> Maybe HpSet + getHps l = mapFindWithDefault mempty l <$> hp_aliases -- Things which alias to Hp on entry. blocks = revPostorder graph - join_pts = findJoinPoints blocks + join_pts = findJoinPoints blocks :: LabelMap Int -- Block -> Number of Predecessors sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock] sink _ [] = [] @@ -176,11 +191,12 @@ cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks -- the node. This will help us decide whether we can inline -- an assignment in the current node or not. live = IntSet.unions (map getLive succs) + hps = getHps lbl :: Maybe HpSet live_middle = gen_killL platform last live - ann_middles = annotate platform live_middle (blockToList middle) + ann_middle = annotate platform live_middle hps (blockToList middle) :: AnnotatedNodes -- Now sink and inline in this block - (middle', assigs) = walk platform ann_middles (mapFindWithDefault [] lbl sunk) + (middle', assigs) = walk platform ann_middle (mapFindWithDefault [] lbl sunk) fold_last = constantFoldNode platform last (final_last, assigs') = tryToInline platform live fold_last assigs @@ -202,13 +218,13 @@ cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks _ -> False -- Now, drop any assignments that we will not sink any further. - (dropped_last, assigs'') = dropAssignments platform drop_if init_live_sets assigs' + (dropped_last, assigs'') = dropAssignments platform (finalHpsFromAnnotatedNodes ann_middle) drop_if init_live_sets assigs' drop_if :: (LocalReg, CmmExpr, AbsMem) -> [LRegSet] -> (Bool, [LRegSet]) drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets') where - should_drop = conflicts platform a final_last + should_drop = conflicts platform hps a final_last || not (isTrivial platform rhs) && live_in_multi live_sets r || r `elemLRegSet` live_in_joins @@ -223,7 +239,7 @@ cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks final_middle = foldl' blockSnoc middle' dropped_last sunk' = mapUnion sunk $ - mapFromList [ (l, filterAssignments platform (getLive l) assigs'') + mapFromList [ (l, filterAssignments platform (getHps l) (getLive l) assigs'') | l <- succs ] {- TODO: enable this later, when we have some good tests in place to @@ -252,36 +268,52 @@ isTrivial platform (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?] isTrivial _ (CmmLit _) = True isTrivial _ _ = False --- --- annotate each node with the set of registers live *after* the node --- -annotate :: Platform -> LRegSet -> [CmmNode O O] -> [(LRegSet, CmmNode O O)] -annotate platform live nodes = snd $ foldr ann (live,[]) nodes +-- | We don't use HpAliasing at below -O2 by default so we only annotate nodes with aliases if needed. +type AnnotatedNodes = Either LiveAnnotated LiveHpAliasAnnotated +type LiveAnnotated = [(LRegSet, CmmNode O O)] -- ^ Annotated with registers live *after* entry. +type LiveHpAliasAnnotated = (HpSet, [(LRegSet, HpSet,CmmNode O O)]) -- ^ Annotated with registers live/HpAliases *after* entry. + +finalHpsFromAnnotatedNodes :: Either a1 (HpSet, b) -> Maybe HpSet +finalHpsFromAnnotatedNodes (Left _) = Nothing +finalHpsFromAnnotatedNodes (Right (final_hps,_)) = Just final_hps + + +annotate :: Platform -> LRegSet -> Maybe HpSet -> [CmmNode O O] -> AnnotatedNodes +annotate platform live hp_set nodes = annotateHps platform hp_set $ annotateLive platform live nodes + +-- | annotate each node with the set of registers live *after* the node +annotateLive :: Platform -> LRegSet -> [CmmNode O O] -> LiveAnnotated +annotateLive platform live nodes = snd $ foldr ann (live,[]) nodes where ann n (live,nodes) = (gen_killL platform n live, (live,n) : nodes) +annotateHps :: Platform -> Maybe HpSet -> [(LRegSet, CmmNode O O)] -> AnnotatedNodes +annotateHps _platform Nothing nodes = Left nodes +annotateHps platform (Just hp_set) nodes = Right $ mapAccumL ann hp_set nodes + where ann hps (live,n) = (node_exit_hps platform n hps, (live,hps,n)) + -- -- Find the blocks that have multiple successors (join points) -- findJoinPoints :: [CmmBlock] -> LabelMap Int findJoinPoints blocks = mapFilter (>1) succ_counts where - all_succs = concatMap successors blocks + all_succs = concatMap successors blocks :: [Label] succ_counts :: LabelMap Int - succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs + succ_counts = foldl' (\m l -> mapInsertWith (+) l 1 m) mapEmpty all_succs -- -- filter the list of assignments to remove any assignments that -- are not live in a continuation. -- -filterAssignments :: Platform -> LRegSet -> Assignments -> Assignments -filterAssignments platform live assigs = reverse (go assigs []) +filterAssignments :: Platform -> Maybe HpSet -> LRegSet -> Assignments -> Assignments +filterAssignments platform hps live assigs = reverse (go assigs []) where go [] kept = kept go (a@(r,_,_):as) kept | needed = go as (a:kept) | otherwise = go as kept where needed = r `elemLRegSet` live - || any (conflicts platform a) (map toNode kept) + || any (conflicts platform hps a) (map toNode kept) -- Note that we must keep assignments that are -- referred to by other assignments we have -- already kept. @@ -301,10 +333,28 @@ filterAssignments platform live assigs = reverse (go assigs []) -- walk :: Platform - -> [(LRegSet, CmmNode O O)] -- nodes of the block, annotated with - -- the set of registers live *after* - -- this node. + -> AnnotatedNodes + -> Assignments -- The current list of + -- assignments we are sinking. + -- Earlier assignments may refer + -- to later ones. + -> ( Block CmmNode O O -- The new block + , Assignments -- Assignments to sink further + ) +walk platform nodes assigs = + case nodes of + Left nodes' -> walk' platform nodes' snd fst (const Nothing) assigs + Right (_hps_exit,nodes') -> walk' platform nodes' thdOf3 fstOf3 (Just . sndOf3) assigs + +{-# INLINE walk' #-} -- We want the function arguments to result in known calls so we force inlining. +walk' :: Platform + -> [a] -- nodes of the block, annotated with + -- the set of registers live/things aliasing Hp *after* + -- this node. + -> (a -> CmmNode O O) + -> (a -> LRegSet) + -> (a -> Maybe HpSet) -> Assignments -- The current list of -- assignments we are sinking. -- Earlier assignments may refer @@ -314,20 +364,23 @@ walk :: Platform , Assignments -- Assignments to sink further ) -walk platform nodes assigs = go nodes emptyBlock assigs +walk' platform nodes getNode getLiveSet getHpAliases assigs = go nodes emptyBlock assigs where go [] block as = (block, as) - go ((live,node):ns) block as + go (node_info:ns) block as -- discard nodes representing dead assignment | shouldDiscard node live = go ns block as -- sometimes only after simplification we can tell we can discard the node. -- See Note [Discard simplified nodes] | noOpAssignment node2 = go ns block as -- Pick up interesting assignments - | Just a <- shouldSink platform node2 = go ns block (a : as1) + | Just a <- shouldSink platform hps node2 = go ns block (a : as1) -- Try inlining, drop assignments and move on | otherwise = go ns block' as' where + live = getLiveSet node_info + hps = getHpAliases node_info + node = getNode node_info -- Simplify node node1 = constantFoldNode platform node @@ -336,7 +389,7 @@ walk platform nodes assigs = go nodes emptyBlock assigs -- Drop any earlier assignments conflicting with node2 (dropped, as') = dropAssignmentsSimple platform - (\a -> conflicts platform a node2) as1 + (\a -> conflicts platform hps a node2) hps as1 -- Walk over the rest of the block. Includes dropped assignments block' = foldl' blockSnoc block dropped `blockSnoc` node2 @@ -383,10 +436,10 @@ cmm_sink_sp. -- be profitable to sink assignments to global regs too, but the -- liveness analysis doesn't track those (yet) so we can't. -- -shouldSink :: Platform -> CmmNode e x -> Maybe Assignment -shouldSink platform (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem platform e) +shouldSink :: Platform -> Maybe HpSet -> CmmNode e x -> Maybe Assignment +shouldSink platform hps (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem platform hps e) where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e -shouldSink _ _other = Nothing +shouldSink _ _ _other = Nothing -- -- discard dead assignments. This doesn't do as good a job as @@ -419,13 +472,13 @@ noOpAssignment node toNode :: Assignment -> CmmNode O O toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs -dropAssignmentsSimple :: Platform -> (Assignment -> Bool) -> Assignments +dropAssignmentsSimple :: Platform -> (Assignment -> Bool) -> Maybe HpSet -> Assignments -> ([CmmNode O O], Assignments) -dropAssignmentsSimple platform f = dropAssignments platform (\a _ -> (f a, ())) () +dropAssignmentsSimple platform f hps = dropAssignments platform hps (\a _ -> (f a, ())) () -dropAssignments :: Platform -> (Assignment -> s -> (Bool, s)) -> s -> Assignments +dropAssignments :: Platform -> Maybe HpSet -> (Assignment -> s -> (Bool, s)) -> s -> Assignments -> ([CmmNode O O], Assignments) -dropAssignments platform should_drop state assigs +dropAssignments platform hps should_drop state assigs = (dropped, reverse kept) where (dropped,kept) = go state assigs [] [] @@ -438,7 +491,7 @@ dropAssignments platform should_drop state assigs | otherwise = go state' rest dropped (assig:kept) where (dropit, state') = should_drop assig state - conflict = dropit || any (conflicts platform assig) dropped + conflict = dropit || any (conflicts platform hps assig) dropped -- ----------------------------------------------------------------------------- @@ -448,7 +501,7 @@ dropAssignments platform should_drop state assigs tryToInline :: forall x. Platform - -> LRegSet -- set of registers live after this + -> LRegSet -- set of registers live after this -- node. We cannot inline anything -- that is live after the node, unless -- it is small enough to duplicate. @@ -461,7 +514,9 @@ tryToInline tryToInline platform liveAfter node assigs = -- pprTrace "tryToInline assig length:" (ppr $ length assigs) $ - go usages liveAfter node emptyLRegSet assigs + let (n,as) = go usages liveAfter node emptyLRegSet assigs + in -- pprTrace "inlined:" (pdoc platform n $$ ppr as) + (n,as) where usages :: UniqFM LocalReg Int -- Maps each LocalReg to a count of how often it is used usages = foldLocalRegsUsed platform addUsage emptyUFM node @@ -651,40 +706,47 @@ okToInline _ _ _ = True -- | @conflicts (r,e) node@ is @False@ if and only if the assignment -- @r = e@ can be safely commuted past statement @node@. -conflicts :: Platform -> Assignment -> CmmNode O x -> Bool -conflicts platform (r, rhs, addr) node +conflicts :: Platform -> Maybe HpSet -> Assignment -> CmmNode O x -> Bool +conflicts platform hps (r, rhs, addr) node -- (1) node defines registers used by rhs of assignment. This catches -- assignments and all three kinds of calls. See Note [Sinking and calls] - | globalRegistersConflict platform rhs node = True - | localRegistersConflict platform rhs node = True + | globalRegistersConflict platform rhs node = traceConflicts "conflicts1" (ppr r) True + | localRegistersConflict platform rhs node = traceConflicts "conflicts2" (ppr r) True -- (2) node uses register defined by assignment - | foldRegsUsed platform (\b r' -> r == r' || b) False node = True + | foldRegsUsed platform (\b r' -> r == r' || b) False node = traceConflicts "conflicts3" (ppr r) True -- (3) a store to an address conflicts with a read of the same memory - | CmmStore addr' e _ <- node - , memConflicts addr (loadAddr platform addr' (cmmExprWidth platform e)) = True + | CmmStore addr' e _a <- node + , memConflicts addr (storeAddr platform hps addr' (cmmExprWidth platform e)) + = traceConflicts "conflicts4" (ppr r <+> ppr addr <+> ppr (storeAddr platform hps addr' (cmmExprWidth platform e)) $$ + pdoc platform node) + True -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively - | HeapMem <- addr, CmmAssign (CmmGlobal Hp) _ <- node = True - | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True - | SpMem{} <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True + | HeapMem{} <- addr, CmmAssign (CmmGlobal Hp) _ <- node = traceConflicts "conflicts6.1" (ppr r) True + | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = traceConflicts "conflicts6" (ppr r) True + | SpMem{} <- addr, CmmAssign (CmmGlobal Sp) _ <- node = traceConflicts "conflicts7" (ppr r) True -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap] - | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem = True + | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem = traceConflicts "conflicts8" (ppr r) True -- (6) suspendThread clobbers every global register not backed by a real -- register. It also clobbers heap and stack but this is handled by (5) | CmmUnsafeForeignCall (PrimTarget MO_SuspendThread) _ _ <- node , foldRegsUsed platform (\b g -> globalRegMaybe platform g == Nothing || b) False rhs - = True + = traceConflicts "conflicts9" (ppr r) True -- (7) native calls clobber any memory - | CmmCall{} <- node, memConflicts addr AnyMem = True + | CmmCall{} <- node, memConflicts addr AnyMem = traceConflicts "conflicts10" (ppr r) True -- (8) otherwise, no conflict - | otherwise = False + | otherwise = traceConflicts "NoConflict" (ppr r) + False + where + -- traceConflicts s d = pprTrace s d + traceConflicts = \_s _d -> id {- Note [Inlining foldRegsDefd] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -754,116 +816,6 @@ localRegistersConflict platform expr node = -- UserOfRegs typeclasses. -- --- An abstraction of memory read or written. -data AbsMem - = NoMem -- no memory accessed - | AnyMem -- arbitrary memory - | HeapMem -- definitely heap memory - | StackMem -- definitely stack memory - | SpMem -- <size>[Sp+n] - {-# UNPACK #-} !Int - {-# UNPACK #-} !Int - --- Having SpMem is important because it lets us float loads from Sp --- past stores to Sp as long as they don't overlap, and this helps to --- unravel some long sequences of --- x1 = [Sp + 8] --- x2 = [Sp + 16] --- ... --- [Sp + 8] = xi --- [Sp + 16] = xj --- --- Note that SpMem is invalidated if Sp is changed, but the definition --- of 'conflicts' above handles that. - --- ToDo: this won't currently fix the following commonly occurring code: --- x1 = [R1 + 8] --- x2 = [R1 + 16] --- .. --- [Hp - 8] = x1 --- [Hp - 16] = x2 --- .. - --- because [R1 + 8] and [Hp - 8] are both HeapMem. We know that --- assignments to [Hp + n] do not conflict with any other heap memory, --- but this is tricky to nail down. What if we had --- --- x = Hp + n --- [x] = ... --- --- the store to [x] should be "new heap", not "old heap". --- Furthermore, you could imagine that if we started inlining --- functions in Cmm then there might well be reads of heap memory --- that was written in the same basic block. To take advantage of --- non-aliasing of heap memory we will have to be more clever. - --- Note [Foreign calls clobber heap] --- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- It is tempting to say that foreign calls clobber only --- non-heap/stack memory, but unfortunately we break this invariant in --- the RTS. For example, in stg_catch_retry_frame we call --- stmCommitNestedTransaction() which modifies the contents of the --- TRec it is passed (this actually caused incorrect code to be --- generated). --- --- Since the invariant is true for the majority of foreign calls, --- perhaps we ought to have a special annotation for calls that can --- modify heap/stack memory. For now we just use the conservative --- definition here. --- --- Some CallishMachOp imply a memory barrier e.g. AtomicRMW and --- therefore we should never float any memory operations across one of --- these calls. --- --- `suspendThread` releases the capability used by the thread, hence we mustn't --- float accesses to heap, stack or virtual global registers stored in the --- capability (e.g. with unregisterised build, see #19237). - - -bothMems :: AbsMem -> AbsMem -> AbsMem -bothMems NoMem x = x -bothMems x NoMem = x -bothMems HeapMem HeapMem = HeapMem -bothMems StackMem StackMem = StackMem -bothMems (SpMem o1 w1) (SpMem o2 w2) - | o1 == o2 = SpMem o1 (max w1 w2) - | otherwise = StackMem -bothMems SpMem{} StackMem = StackMem -bothMems StackMem SpMem{} = StackMem -bothMems _ _ = AnyMem - -memConflicts :: AbsMem -> AbsMem -> Bool -memConflicts NoMem _ = False -memConflicts _ NoMem = False -memConflicts HeapMem StackMem = False -memConflicts StackMem HeapMem = False -memConflicts SpMem{} HeapMem = False -memConflicts HeapMem SpMem{} = False -memConflicts (SpMem o1 w1) (SpMem o2 w2) - | o1 < o2 = o1 + w1 > o2 - | otherwise = o2 + w2 > o1 -memConflicts _ _ = True - -exprMem :: Platform -> CmmExpr -> AbsMem -exprMem platform (CmmLoad addr w _) = bothMems (loadAddr platform addr (typeWidth w)) (exprMem platform addr) -exprMem platform (CmmMachOp _ es) = foldr bothMems NoMem (map (exprMem platform) es) -exprMem _ _ = NoMem - -loadAddr :: Platform -> CmmExpr -> Width -> AbsMem -loadAddr platform e w = - case e of - CmmReg r -> regAddr platform r 0 w - CmmRegOff r i -> regAddr platform r i w - _other | regUsedIn platform spReg e -> StackMem - | otherwise -> AnyMem - -regAddr :: Platform -> CmmReg -> Int -> Width -> AbsMem -regAddr _ (CmmGlobal Sp) i w = SpMem i (widthInBytes w) -regAddr _ (CmmGlobal Hp) _ _ = HeapMem -regAddr _ (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps -regAddr platform r _ _ | isGcPtrType (cmmRegType platform r) = HeapMem -- yay! GCPtr pays for itself -regAddr _ _ _ _ = AnyMem - {- Note [Inline GlobalRegs?] ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/compiler/GHC/Driver/Config/Cmm.hs b/compiler/GHC/Driver/Config/Cmm.hs index 38bab62048..a141e89e7c 100644 --- a/compiler/GHC/Driver/Config/Cmm.hs +++ b/compiler/GHC/Driver/Config/Cmm.hs @@ -19,6 +19,7 @@ initCmmConfig dflags = CmmConfig , cmmDoLinting = gopt Opt_DoCmmLinting dflags , cmmOptElimCommonBlks = gopt Opt_CmmElimCommonBlocks dflags , cmmOptSink = gopt Opt_CmmSink dflags + , cmmOptSinkAlias = gopt Opt_CmmSinkAlias dflags , cmmGenStackUnwindInstr = debugLevel dflags > 0 , cmmExternalDynamicRefs = gopt Opt_ExternalDynamicRefs dflags , cmmDoCmmSwitchPlans = not . backendSupportsSwitch . backend $ dflags diff --git a/compiler/GHC/Driver/Flags.hs b/compiler/GHC/Driver/Flags.hs index 209e6d1776..66c308312e 100644 --- a/compiler/GHC/Driver/Flags.hs +++ b/compiler/GHC/Driver/Flags.hs @@ -216,7 +216,8 @@ data GeneralFlag | Opt_LlvmTBAA -- Use LLVM TBAA infrastructure for improving AA (hidden flag) | Opt_LlvmFillUndefWithGarbage -- Testing for undef bugs (hidden flag) | Opt_IrrefutableTuples - | Opt_CmmSink + | Opt_CmmSink -- ^ Run cmm sinking pass. + | Opt_CmmSinkAlias -- ^ When doing cmm sinking, perform aliasing analysis to improve sinking. | Opt_CmmStaticPred | Opt_CmmElimCommonBlocks | Opt_CmmControlFlow @@ -446,6 +447,7 @@ optimisationFlags = EnumSet.fromList , Opt_LlvmFillUndefWithGarbage , Opt_IrrefutableTuples , Opt_CmmSink + , Opt_CmmSinkAlias , Opt_CmmElimCommonBlocks , Opt_AsmShortcutting , Opt_OmitYields diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs index b78d141061..fa3457cf81 100644 --- a/compiler/GHC/Driver/Session.hs +++ b/compiler/GHC/Driver/Session.hs @@ -3393,6 +3393,7 @@ fFlagsDeps = [ flagSpec "case-folding" Opt_CaseFolding, flagSpec "cmm-elim-common-blocks" Opt_CmmElimCommonBlocks, flagSpec "cmm-sink" Opt_CmmSink, + flagSpec "cmm-sink-alias" Opt_CmmSinkAlias, flagSpec "cmm-static-pred" Opt_CmmStaticPred, flagSpec "cse" Opt_CSE, flagSpec "stg-cse" Opt_StgCSE, @@ -3989,6 +3990,7 @@ optLevelFlags -- see Note [Documenting optimisation flags] , ([1,2], Opt_CmmElimCommonBlocks) , ([2], Opt_AsmShortcutting) , ([1,2], Opt_CmmSink) + , ([2], Opt_CmmSinkAlias) , ([1,2], Opt_CmmStaticPred) , ([1,2], Opt_CSE) , ([1,2], Opt_StgCSE) diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in index f4c1a41dd3..ddc78cbca8 100644 --- a/compiler/ghc.cabal.in +++ b/compiler/ghc.cabal.in @@ -194,6 +194,7 @@ Library GHC.Cmm.Lexer GHC.Cmm.Lint GHC.Cmm.Liveness + GHC.Cmm.Alias GHC.Cmm.MachOp GHC.Cmm.Node GHC.Cmm.Opt diff --git a/docs/users_guide/using-optimisation.rst b/docs/users_guide/using-optimisation.rst index ab49f08ade..2db1455186 100644 --- a/docs/users_guide/using-optimisation.rst +++ b/docs/users_guide/using-optimisation.rst @@ -225,6 +225,17 @@ by saying ``-fno-wombat``. to their usage sites. It also inlines simple expressions like literals or registers. +.. ghc-flag:: -fcmm-sink-alias + :shortdesc: Enable more complex aliasing analysis to improve the sinking pass. Implied by :ghc-flag:`-O2`. + :type: dynamic + :reverse: -fno-cmm-sink + :category: + + :default: off but enabled with :ghc-flag:`-O2`. + + This allows us to remove more intermediate variables in the Cmm pass. + For some programs this can avoid excessive spilling. + .. ghc-flag:: -fcmm-static-pred :shortdesc: Enable static control flow prediction. Implied by :ghc-flag:`-O`. :type: dynamic diff --git a/testsuite/tests/cmm/should_compile/all.T b/testsuite/tests/cmm/should_compile/all.T index 40813f01ec..88169eb720 100644 --- a/testsuite/tests/cmm/should_compile/all.T +++ b/testsuite/tests/cmm/should_compile/all.T @@ -1,6 +1,8 @@ # test('selfloop', [cmm_src], compile, ['-no-hs-main']) -test('cmm_sink_sp', [ only_ways(['optasm']), grep_errmsg('(\[Sp.*\]).*(=).*(\[.*R1.*\]).*;',[1,2,3]), cmm_src], compile, ['-no-hs-main -ddump-cmm -dsuppress-uniques -O']) +test('cmm_sink_sp', [ when(wordsize(32), skip), only_ways(['optasm']), grep_errmsg('(\[Sp.*\]).*(=).*(\[.*R1.*\]).*;',[1,2,3]), cmm_src], compile, ['-no-hs-main -ddump-cmm -dsuppress-uniques -O -dsuppress-all']) +test('cmm_sink_hp', [when(wordsize(32), skip), cmm_src,grep_errmsg('(I64\[.*Hp.*=.*;)',[1])], compile, ['-no-hs-main -O -ddump-cmm -fcmm-sink-alias']) +test('cmm_hp_aliases', [ when(wordsize(32), skip), cmm_src,grep_errmsg('(I64\[.*Hp.*=.*;)',[1])], compile, ['-no-hs-main -O -ddump-cmm -fcmm-sink-alias']) test('T16930', normal, makefile_test, ['T16930']) test('T17442', normal, compile, ['']) diff --git a/testsuite/tests/cmm/should_compile/cmm_hp_aliases.cmm b/testsuite/tests/cmm/should_compile/cmm_hp_aliases.cmm new file mode 100644 index 0000000000..dd7c9ac2d3 --- /dev/null +++ b/testsuite/tests/cmm/should_compile/cmm_hp_aliases.cmm @@ -0,0 +1,12 @@ +#include "Cmm.h" + +violate_hp_invariant () +{ + W_ x1, x2; + x1 = Hp + 8; + x2 = W_[x1+8]; + W_[Hp+8] = 1; + W_[Hp+16] = x2; // We shouldn't inline the alignment to x2 here because it conflichts with the write above. + // If we end up with something like W_[Hp+16] = W_[Hp+8] it's broken. + return (x2); +} diff --git a/testsuite/tests/cmm/should_compile/cmm_hp_aliases.stderr b/testsuite/tests/cmm/should_compile/cmm_hp_aliases.stderr new file mode 100644 index 0000000000..810f5c955d --- /dev/null +++ b/testsuite/tests/cmm/should_compile/cmm_hp_aliases.stderr @@ -0,0 +1,16 @@ + +==================== Output Cmm ==================== +[violate_hp_invariant() { // [] + { info_tbls: [] + stack_info: arg_space: 8 + } + {offset + c3: _c2::I64 = I64[Hp + 16]; + I64[Hp + 8] = 1; + I64[Hp + 16] = _c2::I64; + R1 = _c2::I64; + call (P64[Sp])(R1) args: 8, res: 0, upd: 8; + } + }] + + diff --git a/testsuite/tests/cmm/should_compile/cmm_sink_hp.cmm b/testsuite/tests/cmm/should_compile/cmm_sink_hp.cmm new file mode 100644 index 0000000000..b226bb7331 --- /dev/null +++ b/testsuite/tests/cmm/should_compile/cmm_sink_hp.cmm @@ -0,0 +1,22 @@ +#include "Cmm.h" + +stg_sink_hp ( P_ r1 ) +{ + // Should produce a series of loads that are sunk into + // stores to Hp like this: + // W_[Hp + 16] = W_[x1 + 1]; // CmmStore + // W_[Hp + 24] = W_[x1 + 2]; // CmmStore + + P_ x1,y1, y2, y3,y4; + x1 = P_[r1]; + y1 = W_[x1]; + y2 = W_[x1+8]; + y3 = W_[x1+16]; + y4 = W_[x1+24]; + + W_[Hp+8] = y1; + W_[Hp+16] = y2; + W_[Hp+24] = y3; + W_[Hp+32] = y4; + return (x1); +}
\ No newline at end of file diff --git a/testsuite/tests/cmm/should_compile/cmm_sink_hp.stderr b/testsuite/tests/cmm/should_compile/cmm_sink_hp.stderr new file mode 100644 index 0000000000..b22654169b --- /dev/null +++ b/testsuite/tests/cmm/should_compile/cmm_sink_hp.stderr @@ -0,0 +1,29 @@ + +==================== Output Cmm ==================== +[stg_sink_hp() { // [R1] + { info_tbls: [] + stack_info: arg_space: 8 + } + {offset + c7: // global + //tick src<cmm_sink_hp.cmm:(4,1)-(22,1)> + //tick src<cmm_sink_hp.cmm:11:8-19> + //tick src<cmm_sink_hp.cmm:12:8-20> + //tick src<cmm_sink_hp.cmm:13:8-22> + //tick src<cmm_sink_hp.cmm:14:8-23> + //tick src<cmm_sink_hp.cmm:15:8-23> + //tick src<cmm_sink_hp.cmm:17:11-22> + _c2::P64 = P64[R1]; + I64[Hp + 8] = I64[_c2::P64]; + //tick src<cmm_sink_hp.cmm:18:11-23> + I64[Hp + 16] = I64[_c2::P64 + 8]; + //tick src<cmm_sink_hp.cmm:19:11-23> + I64[Hp + 24] = I64[_c2::P64 + 16]; + //tick src<cmm_sink_hp.cmm:20:11-23> + I64[Hp + 32] = I64[_c2::P64 + 24]; + R1 = _c2::P64; + call (P64[Sp])(R1) args: 8, res: 0, upd: 8; + } + }] + + |