diff options
Diffstat (limited to 'compiler/nativeGen')
-rw-r--r-- | compiler/nativeGen/AsmCodeGen.hs | 9 | ||||
-rw-r--r-- | compiler/nativeGen/BlockLayout.hs | 635 | ||||
-rw-r--r-- | compiler/nativeGen/CFG.hs | 748 | ||||
-rw-r--r-- | compiler/nativeGen/RegAlloc/Graph/SpillCost.hs | 101 | ||||
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 2 |
5 files changed, 1131 insertions, 364 deletions
diff --git a/compiler/nativeGen/AsmCodeGen.hs b/compiler/nativeGen/AsmCodeGen.hs index 6b7727a426..4c883e7185 100644 --- a/compiler/nativeGen/AsmCodeGen.hs +++ b/compiler/nativeGen/AsmCodeGen.hs @@ -562,7 +562,7 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count Opt_D_dump_asm_native "Native code" (vcat $ map (pprNatCmmDecl ncgImpl) native) - dumpIfSet_dyn dflags + when (not $ null nativeCfgWeights) $ dumpIfSet_dyn dflags Opt_D_dump_cfg_weights "CFG Weights" (pprEdgeWeights nativeCfgWeights) @@ -691,7 +691,7 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count {-# SCC "generateJumpTables" #-} generateJumpTables ncgImpl alloced - dumpIfSet_dyn dflags + when (not $ null nativeCfgWeights) $ dumpIfSet_dyn dflags Opt_D_dump_cfg_weights "CFG Update information" ( text "stack:" <+> ppr stack_updt_blks $$ text "linearAlloc:" <+> ppr cfgRegAllocUpdates ) @@ -705,8 +705,9 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count optimizedCFG = optimizeCFG (cfgWeightInfo dflags) cmm <$!> postShortCFG - maybe (return ()) - (dumpIfSet_dyn dflags Opt_D_dump_cfg_weights "CFG Final Weights" . pprEdgeWeights) + maybe (return ()) (\cfg-> + dumpIfSet_dyn dflags Opt_D_dump_cfg_weights "CFG Final Weights" + ( pprEdgeWeights cfg )) optimizedCFG --TODO: Partially check validity of the cfg. diff --git a/compiler/nativeGen/BlockLayout.hs b/compiler/nativeGen/BlockLayout.hs index 7a39071541..56e3177dd8 100644 --- a/compiler/nativeGen/BlockLayout.hs +++ b/compiler/nativeGen/BlockLayout.hs @@ -6,6 +6,8 @@ {-# LANGUAGE DataKinds #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE FlexibleContexts #-} module BlockLayout ( sequenceTop ) @@ -22,7 +24,6 @@ import BlockId import Cmm import Hoopl.Collections import Hoopl.Label -import Hoopl.Block import DynFlags (gopt, GeneralFlag(..), DynFlags, backendMaintainsCfg) import UniqFM @@ -41,11 +42,30 @@ import ListSetOps (removeDups) import OrdList import Data.List import Data.Foldable (toList) -import Hoopl.Graph import qualified Data.Set as Set +import Data.STRef +import Control.Monad.ST.Strict +import Control.Monad (foldM) {- + Note [CFG based code layout] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + The major steps in placing blocks are as follow: + * Compute a CFG based on the Cmm AST, see getCfgProc. + This CFG will have edge weights representing a guess + on how important they are. + * After we convert Cmm to Asm we run `optimizeCFG` which + adds a few more "educated guesses" to the equation. + * Then we run loop analysis on the CFG (`loopInfo`) which tells us + about loop headers, loop nesting levels and the sort. + * Based on the CFG and loop information refine the edge weights + in the CFG and normalize them relative to the most often visited + node. (See `mkGlobalWeights`) + * Feed this CFG into the block layout code (`sequenceTop`) in this + module. Which will then produce a code layout based on the input weights. + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~ Note [Chain based CFG serialization] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -60,8 +80,8 @@ import qualified Data.Set as Set but also how much a block would benefit from being placed sequentially after it's predecessor. For example blocks which are preceeded by an info table are more likely to end - up in a different cache line than their predecessor. So there is less benefit - in placing them sequentially. + up in a different cache line than their predecessor and we can't eliminate the jump + so there is less benefit to placing them sequentially. For example consider this example: @@ -81,56 +101,83 @@ import qualified Data.Set as Set Eg for our example we might end up with two chains like: [A->B->C->X],[D]. Blocks inside chains will always be placed sequentially. However there is no particular order in which chains are placed since - (hopefully) the blocks for which sequentially is important have already + (hopefully) the blocks for which sequentiality is important have already been placed in the same chain. ----------------------------------------------------------------------------- - First try to create a lists of good chains. + 1) First try to create a list of good chains. ----------------------------------------------------------------------------- - We do so by taking a block not yet placed in a chain and - looking at these cases: + Good chains are these which allow us to eliminate jump instructions. + Which further eliminate often executed jumps first. + + We do so by: + + *) Ignore edges which represent instructions which can not be replaced + by fall through control flow. Primarily calls and edges to blocks which + are prefixed by a info table we have to jump across. + + *) Then process remaining edges in order of frequency taken and: + + +) If source and target have not been placed build a new chain from them. + + +) If source and target have been placed, and are ends of differing chains + try to merge the two chains. - *) Check if the best predecessor of the block is at the end of a chain. - If so add the current block to the end of that chain. + +) If one side of the edge is a end/front of a chain, add the other block of + to edge to the same chain - Eg if we look at block C and already have the chain (A -> B) - then we extend the chain to (A -> B -> C). + Eg if we look at edge (B -> C) and already have the chain (A -> B) + then we extend the chain to (A -> B -> C). - Combined with the fact that we process blocks in reverse post order - this means loop bodies and trivially sequential control flow already - ends up as a single chain. + +) If the edge was used to modify or build a new chain remove the edge from + our working list. - *) Otherwise we create a singleton chain from the block we are looking at. - Eg if we have from the example above already constructed (A->B) - and look at D we create the chain (D) resulting in the chains [A->B, D] + *) If there any blocks not being placed into a chain after these steps we place + them into a chain consisting of only this block. + + Ranking edges by their taken frequency, if + two edges compete for fall through on the same target block, the one taken + more often will automatically win out. Resulting in fewer instructions being + executed. + + Creating singleton chains is required for situations where we have code of the + form: + + A: goto B: + <infoTable> + B: goto C: + <infoTable> + C: ... + + As the code in block B is only connected to the rest of the program via edges + which will be ignored in this step we make sure that B still ends up in a chain + this way. ----------------------------------------------------------------------------- - We then try to fuse chains. + 2) We also try to fuse chains. ----------------------------------------------------------------------------- - There are edge cases which result in two chains being created which trivially - represent linear control flow. For example we might have the chains - [(A-B-C),(D-E)] with an cfg triangle: + As a result from the above step we still end up with multiple chains which + represent sequential control flow chunks. But they are not yet suitable for + code layout as we need to place *all* blocks into a single sequence. - A----->C->D->E - \->B-/ + In this step we combine chains result from the above step via these steps: - We also get three independent chains if two branches end with a jump - to a common successor. + *) Look at the ranked list of *all* edges, including calls/jumps across info tables + and the like. - We take care of these cases by fusing chains which are connected by an - edge. + *) Look at each edge and - We do so by looking at the list of edges sorted by weight. - Given the edge (C -> D) we try to find two chains such that: - * C is at the end of chain one. - * D is in front of chain two. - * If two such chains exist we fuse them. - We then remove the edge and repeat the process for the rest of the edges. + +) Given an edge (A -> B) try to find two chains for which + * Block A is at the end of one chain + * Block B is at the front of the other chain. + +) If we find such a chain we "fuse" them into a single chain, remove the + edge from working set and continue. + +) If we can't find such chains we skip the edge and continue. ----------------------------------------------------------------------------- - Place indirect successors (neighbours) after each other + 3) Place indirect successors (neighbours) after each other ----------------------------------------------------------------------------- We might have chains [A,B,C,X],[E] in a CFG of the sort: @@ -141,15 +188,11 @@ import qualified Data.Set as Set While E does not follow X it's still beneficial to place them near each other. This can be advantageous if eg C,X,E will end up in the same cache line. - TODO: If we remove edges as we use them (eg if we build up A->B remove A->B - from the list) we could save some more work in later phases. - - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~ Note [Triangle Control Flow] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Checking if an argument is already evaluating leads to a somewhat + Checking if an argument is already evaluated leads to a somewhat special case which looks like this: A: @@ -204,11 +247,6 @@ import qualified Data.Set as Set neighbourOverlapp :: Int neighbourOverlapp = 2 --- | Only edges heavier than this are considered --- for fusing two chains into a single chain. -fuseEdgeThreshold :: EdgeWeight -fuseEdgeThreshold = 0 - -- | Maps blocks near the end of a chain to it's chain AND -- the other blocks near the end. -- [A,B,C,D,E] Gives entries like (B -> ([A,B], [A,B,C,D,E])) @@ -224,40 +262,24 @@ type FrontierMap = LabelMap ([BlockId],BlockChain) newtype BlockChain = BlockChain { chainBlocks :: (OrdList BlockId) } -instance Eq (BlockChain) where - (BlockChain blks1) == (BlockChain blks2) - = fromOL blks1 == fromOL blks2 +-- All chains are constructed the same way so comparison +-- including structure is faster. +instance Eq BlockChain where + BlockChain b1 == BlockChain b2 = strictlyEqOL b1 b2 -- Useful for things like sets and debugging purposes, sorts by blocks -- in the chain. instance Ord (BlockChain) where (BlockChain lbls1) `compare` (BlockChain lbls2) - = (fromOL lbls1) `compare` (fromOL lbls2) + = ASSERT(toList lbls1 /= toList lbls2 || lbls1 `strictlyEqOL` lbls2) + strictlyOrdOL lbls1 lbls2 instance Outputable (BlockChain) where ppr (BlockChain blks) = parens (text "Chain:" <+> ppr (fromOL $ blks) ) -data WeightedEdge = WeightedEdge !BlockId !BlockId EdgeWeight deriving (Eq) - - --- | Non deterministic! (Uniques) Sorts edges by weight and nodes. -instance Ord WeightedEdge where - compare (WeightedEdge from1 to1 weight1) - (WeightedEdge from2 to2 weight2) - | weight1 < weight2 || weight1 == weight2 && from1 < from2 || - weight1 == weight2 && from1 == from2 && to1 < to2 - = LT - | from1 == from2 && to1 == to2 && weight1 == weight2 - = EQ - | otherwise - = GT - -instance Outputable WeightedEdge where - ppr (WeightedEdge from to info) = - ppr from <> text "->" <> ppr to <> brackets (ppr info) - -type WeightedEdgeList = [WeightedEdge] +chainFoldl :: (b -> BlockId -> b) -> b -> BlockChain -> b +chainFoldl f z (BlockChain blocks) = foldl' f z blocks noDups :: [BlockChain] -> Bool noDups chains = @@ -270,19 +292,21 @@ inFront :: BlockId -> BlockChain -> Bool inFront bid (BlockChain seq) = headOL seq == bid -chainMember :: BlockId -> BlockChain -> Bool -chainMember bid chain - = elem bid $ fromOL . chainBlocks $ chain --- = setMember bid . chainMembers $ chain - chainSingleton :: BlockId -> BlockChain chainSingleton lbl = BlockChain (unitOL lbl) +chainFromList :: [BlockId] -> BlockChain +chainFromList = BlockChain . toOL + chainSnoc :: BlockChain -> BlockId -> BlockChain chainSnoc (BlockChain blks) lbl = BlockChain (blks `snocOL` lbl) +chainCons :: BlockId -> BlockChain -> BlockChain +chainCons lbl (BlockChain blks) + = BlockChain (lbl `consOL` blks) + chainConcat :: BlockChain -> BlockChain -> BlockChain chainConcat (BlockChain blks1) (BlockChain blks2) = BlockChain (blks1 `appOL` blks2) @@ -311,52 +335,14 @@ takeL :: Int -> BlockChain -> [BlockId] takeL n (BlockChain blks) = take n . fromOL $ blks --- | For a given list of chains try to fuse chains with strong --- edges between them into a single chain. --- Returns the list of fused chains together with a set of --- used edges. The set of edges is indirectly encoded in the --- chains so doesn't need to be considered for later passes. -fuseChains :: WeightedEdgeList -> LabelMap BlockChain - -> (LabelMap BlockChain, Set.Set WeightedEdge) -fuseChains weights chains - = let fronts = mapFromList $ - map (\chain -> (headOL . chainBlocks $ chain,chain)) $ - mapElems chains :: LabelMap BlockChain - (chains', used, _) = applyEdges weights chains fronts Set.empty - in (chains', used) - where - applyEdges :: WeightedEdgeList -> LabelMap BlockChain - -> LabelMap BlockChain -> Set.Set WeightedEdge - -> (LabelMap BlockChain, Set.Set WeightedEdge, LabelMap BlockChain) - applyEdges [] chainsEnd chainsFront used - = (chainsEnd, used, chainsFront) - applyEdges (edge@(WeightedEdge from to w):edges) chainsEnd chainsFront used - --Since we order edges descending by weight we can stop here - | w <= fuseEdgeThreshold - = ( chainsEnd, used, chainsFront) - --Fuse the two chains - | Just c1 <- mapLookup from chainsEnd - , Just c2 <- mapLookup to chainsFront - , c1 /= c2 - = let newChain = chainConcat c1 c2 - front = headOL . chainBlocks $ newChain - end = lastOL . chainBlocks $ newChain - chainsFront' = mapInsert front newChain $ - mapDelete to chainsFront - chainsEnd' = mapInsert end newChain $ - mapDelete from chainsEnd - in applyEdges edges chainsEnd' chainsFront' - (Set.insert edge used) - | otherwise - --Check next edge - = applyEdges edges chainsEnd chainsFront used - +-- Note [Combining neighborhood chains] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- See also Note [Chain based CFG serialization] -- We have the chains (A-B-C-D) and (E-F) and an Edge C->E. -- --- While placing the later after the former doesn't result in sequential --- control flow it is still be benefical since block C and E might end +-- While placing the latter after the former doesn't result in sequential +-- control flow it is still benefical. As block C and E might end -- up in the same cache line. -- -- So we place these chains next to each other even if we can't fuse them. @@ -365,7 +351,7 @@ fuseChains weights chains -- v -- - -> E -> F ... -- --- Simple heuristic to chose which chains we want to combine: +-- A simple heuristic to chose which chains we want to combine: -- * Process edges in descending priority. -- * Check if there is a edge near the end of one chain which goes -- to a block near the start of another edge. @@ -375,14 +361,22 @@ fuseChains weights chains -- us to find all edges between two chains, check the distance for all edges, -- rank them based on the distance and and only then we can select two chains -- to combine. Which would add a lot of complexity for little gain. +-- +-- So instead we just rank by the strength of the edge and use the first pair we +-- find. -- | For a given list of chains and edges try to combine chains with strong -- edges between them. -combineNeighbourhood :: WeightedEdgeList -> [BlockChain] - -> [BlockChain] +combineNeighbourhood :: [CfgEdge] -- ^ Edges to consider + -> [BlockChain] -- ^ Current chains of blocks + -> ([BlockChain], Set.Set (BlockId,BlockId)) + -- ^ Resulting list of block chains, and a set of edges which + -- were used to fuse chains and as such no longer need to be + -- considered. combineNeighbourhood edges chains = -- pprTraceIt "Neigbours" $ - applyEdges edges endFrontier startFrontier + -- pprTrace "combineNeighbours" (ppr edges) $ + applyEdges edges endFrontier startFrontier (Set.empty) where --Build maps from chain ends to chains endFrontier, startFrontier :: FrontierMap @@ -396,14 +390,14 @@ combineNeighbourhood edges chains let front = getFronts chain entry = (front,chain) in map (\x -> (x,entry)) front) chains - applyEdges :: WeightedEdgeList -> FrontierMap -> FrontierMap - -> [BlockChain] - applyEdges [] chainEnds _chainFronts = - ordNub $ map snd $ mapElems chainEnds - applyEdges ((WeightedEdge from to _w):edges) chainEnds chainFronts + applyEdges :: [CfgEdge] -> FrontierMap -> FrontierMap -> Set.Set (BlockId, BlockId) + -> ([BlockChain], Set.Set (BlockId,BlockId)) + applyEdges [] chainEnds _chainFronts combined = + (ordNub $ map snd $ mapElems chainEnds, combined) + applyEdges ((CfgEdge from to _w):edges) chainEnds chainFronts combined | Just (c1_e,c1) <- mapLookup from chainEnds , Just (c2_f,c2) <- mapLookup to chainFronts - , c1 /= c2 -- Avoid trying to concat a short chain with itself. + , c1 /= c2 -- Avoid trying to concat a chain with itself. = let newChain = chainConcat c1 c2 newChainFrontier = getFronts newChain newChainEnds = getEnds newChain @@ -437,165 +431,299 @@ combineNeighbourhood edges chains -- text "fronts" <+> ppr newFronts $$ -- text "ends" <+> ppr newEnds -- ) - applyEdges edges newEnds newFronts + applyEdges edges newEnds newFronts (Set.insert (from,to) combined) | otherwise - = --pprTrace "noNeigbours" (ppr ()) $ - applyEdges edges chainEnds chainFronts + = applyEdges edges chainEnds chainFronts combined where getFronts chain = takeL neighbourOverlapp chain getEnds chain = takeR neighbourOverlapp chain - - --- See [Chain based CFG serialization] -buildChains :: CFG -> [BlockId] - -> ( LabelMap BlockChain -- Resulting chains. +-- In the last stop we combine all chains into a single one. +-- Trying to place chains with strong edges next to each other. +mergeChains :: [CfgEdge] -> [BlockChain] + -> (BlockChain) +mergeChains edges chains + = -- pprTrace "combine" (ppr edges) $ + runST $ do + let addChain m0 chain = do + ref <- newSTRef chain + return $ chainFoldl (\m' b -> mapInsert b ref m') m0 chain + chainMap' <- foldM (\m0 c -> addChain m0 c) mapEmpty chains + merge edges chainMap' + where + -- We keep a map from ALL blocks to their respective chain (sigh) + -- This is required since when looking at an edge we need to find + -- the associated chains quickly. + -- We use a map of STRefs, maintaining a invariant of one STRef per chain. + -- When merging chains we can update the + -- STRef of one chain once (instead of writing to the map for each block). + -- We then overwrite the STRefs for the other chain so there is again only + -- a single STRef for the combined chain. + -- The difference in terms of allocations saved is ~0.2% with -O so actually + -- significant compared to using a regular map. + + merge :: forall s. [CfgEdge] -> LabelMap (STRef s BlockChain) -> ST s BlockChain + merge [] chains = do + chains' <- ordNub <$> (mapM readSTRef $ mapElems chains) :: ST s [BlockChain] + return $ foldl' chainConcat (head chains') (tail chains') + merge ((CfgEdge from to _):edges) chains + -- | pprTrace "merge" (ppr (from,to) <> ppr chains) False + -- = undefined + | cFrom == cTo + = merge edges chains + | otherwise + = do + chains' <- mergeComb cFrom cTo + merge edges chains' + where + mergeComb :: STRef s BlockChain -> STRef s BlockChain -> ST s (LabelMap (STRef s BlockChain)) + mergeComb refFrom refTo = do + cRight <- readSTRef refTo + chain <- pure chainConcat <*> readSTRef refFrom <*> pure cRight + writeSTRef refFrom chain + return $ chainFoldl (\m b -> mapInsert b refFrom m) chains cRight + + cFrom = expectJust "mergeChains:chainMap:from" $ mapLookup from chains + cTo = expectJust "mergeChains:chainMap:to" $ mapLookup to chains + + +-- See Note [Chain based CFG serialization] for the general idea. +-- This creates and fuses chains at the same time for performance reasons. + +-- Try to build chains from a list of edges. +-- Edges must be sorted **descending** by their priority. +-- Returns the constructed chains, along with all edges which +-- are irrelevant past this point, this information doesn't need +-- to be complete - it's only used to speed up the process. +-- An Edge is irrelevant if the ends are part of the same chain. +-- We say these edges are already linked +buildChains :: [CfgEdge] -> [BlockId] + -> ( LabelMap BlockChain -- Resulting chains, indexd by end if chain. , Set.Set (BlockId, BlockId)) --List of fused edges. -buildChains succWeights blocks - = let (_, fusedEdges, chains) = buildNext setEmpty mapEmpty blocks Set.empty - in (chains, fusedEdges) +buildChains edges blocks + = runST $ buildNext setEmpty mapEmpty mapEmpty edges Set.empty where - -- We keep a map from the last block in a chain to the chain itself. - -- This we we can easily check if an block should be appened to an + -- buildNext builds up chains from edges one at a time. + + -- We keep a map from the ends of chains to the chains. + -- This we we can easily check if an block should be appended to an -- existing chain! - buildNext :: LabelSet - -> LabelMap BlockChain -- Map from last element to chain. - -> [BlockId] -- Blocks to place - -> Set.Set (BlockId, BlockId) - -> ( [BlockChain] -- Placed Blocks - , Set.Set (BlockId, BlockId) --List of fused edges - , LabelMap BlockChain - ) - buildNext _placed chains [] linked = - ([], linked, chains) - buildNext placed chains (block:todo) linked - | setMember block placed - = buildNext placed chains todo linked + -- We store them using STRefs so we don't have to rebuild the spine of both + -- maps every time we update a chain. + buildNext :: forall s. LabelSet + -> LabelMap (STRef s BlockChain) -- Map from end of chain to chain. + -> LabelMap (STRef s BlockChain) -- Map from start of chain to chain. + -> [CfgEdge] -- Edges to check - ordered by decreasing weight + -> Set.Set (BlockId, BlockId) -- Used edges + -> ST s ( LabelMap BlockChain -- Chains by end + , Set.Set (BlockId, BlockId) --List of fused edges + ) + buildNext placed _chainStarts chainEnds [] linked = do + ends' <- sequence $ mapMap readSTRef chainEnds :: ST s (LabelMap BlockChain) + -- Any remaining blocks have to be made to singleton chains. + -- They might be combined with other chains later on outside this function. + let unplaced = filter (\x -> not (setMember x placed)) blocks + singletons = map (\x -> (x,chainSingleton x)) unplaced :: [(BlockId,BlockChain)] + return (foldl' (\m (k,v) -> mapInsert k v m) ends' singletons , linked) + buildNext placed chainStarts chainEnds (edge:todo) linked + | from == to + -- We skip self edges + = buildNext placed chainStarts chainEnds todo (Set.insert (from,to) linked) + | not (alreadyPlaced from) && + not (alreadyPlaced to) + = do + --pprTraceM "Edge-Chain:" (ppr edge) + chain' <- newSTRef $ chainFromList [from,to] + buildNext + (setInsert to (setInsert from placed)) + (mapInsert from chain' chainStarts) + (mapInsert to chain' chainEnds) + todo + (Set.insert (from,to) linked) + + | (alreadyPlaced from) && + (alreadyPlaced to) + , Just predChain <- mapLookup from chainEnds + , Just succChain <- mapLookup to chainStarts + , predChain /= succChain -- Otherwise we try to create a cycle. + = do + -- pprTraceM "Fusing edge" (ppr edge) + fuseChain predChain succChain + + | (alreadyPlaced from) && + (alreadyPlaced to) + = --pprTraceM "Skipping:" (ppr edge) >> + buildNext placed chainStarts chainEnds todo linked + | otherwise - = buildNext placed' chains' todo linked' + = do -- pprTraceM "Finding chain for:" (ppr edge $$ + -- text "placed" <+> ppr placed) + findChain where - placed' = (foldl' (flip setInsert) placed placedBlocks) - linked' = Set.union linked linkedEdges - (placedBlocks, chains', linkedEdges) = findChain block - - --Add the block to a existing or new chain - --Returns placed blocks, list of resulting chains - --and fused edges - findChain :: BlockId - -> ([BlockId],LabelMap BlockChain, Set.Set (BlockId, BlockId)) - findChain block - -- B) place block at end of existing chain if - -- there is no better block to append. - | (pred:_) <- preds - , alreadyPlaced pred - , Just predChain <- mapLookup pred chains - , (best:_) <- filter (not . alreadyPlaced) $ getSuccs pred - , best == lbl - = --pprTrace "B.2)" (ppr (pred,lbl)) $ - let newChain = chainSnoc predChain block - chainMap = mapInsert lbl newChain $ mapDelete pred chains - in ( [lbl] - , chainMap - , Set.singleton (pred,lbl) ) - + from = edgeFrom edge + to = edgeTo edge + alreadyPlaced blkId = (setMember blkId placed) + + -- Combine two chains into a single one. + fuseChain :: STRef s BlockChain -> STRef s BlockChain + -> ST s ( LabelMap BlockChain -- Chains by end + , Set.Set (BlockId, BlockId) --List of fused edges + ) + fuseChain fromRef toRef = do + fromChain <- readSTRef fromRef + toChain <- readSTRef toRef + let newChain = chainConcat fromChain toChain + ref <- newSTRef newChain + let start = head $ takeL 1 newChain + let end = head $ takeR 1 newChain + -- chains <- sequence $ mapMap readSTRef chainStarts + -- pprTraceM "pre-fuse chains:" $ ppr chains + buildNext + placed + (mapInsert start ref $ mapDelete to $ chainStarts) + (mapInsert end ref $ mapDelete from $ chainEnds) + todo + (Set.insert (from,to) linked) + + + --Add the block to a existing chain or creates a new chain + findChain :: ST s ( LabelMap BlockChain -- Chains by end + , Set.Set (BlockId, BlockId) --List of fused edges + ) + findChain + -- We can attach the block to the end of a chain + | alreadyPlaced from + , Just predChain <- mapLookup from chainEnds + = do + chain <- readSTRef predChain + let newChain = chainSnoc chain to + writeSTRef predChain newChain + let chainEnds' = mapInsert to predChain $ mapDelete from chainEnds + -- chains <- sequence $ mapMap readSTRef chainStarts + -- pprTraceM "from chains:" $ ppr chains + buildNext (setInsert to placed) chainStarts chainEnds' todo (Set.insert (from,to) linked) + -- We can attack it to the front of a chain + | alreadyPlaced to + , Just succChain <- mapLookup to chainStarts + = do + chain <- readSTRef succChain + let newChain = from `chainCons` chain + writeSTRef succChain newChain + let chainStarts' = mapInsert from succChain $ mapDelete to chainStarts + -- chains <- sequence $ mapMap readSTRef chainStarts' + -- pprTraceM "to chains:" $ ppr chains + buildNext (setInsert from placed) chainStarts' chainEnds todo (Set.insert (from,to) linked) + -- The placed end of the edge is part of a chain already and not an end. | otherwise - = --pprTrace "single" (ppr lbl) - ( [lbl] - , mapInsert lbl (chainSingleton lbl) chains - , Set.empty) + = do + let block = if alreadyPlaced to then from else to + --pprTraceM "Singleton" $ ppr block + let newChain = chainSingleton block + ref <- newSTRef newChain + buildNext (setInsert block placed) (mapInsert block ref chainStarts) + (mapInsert block ref chainEnds) todo (linked) where alreadyPlaced blkId = (setMember blkId placed) - lbl = block - getSuccs = map fst . getSuccEdgesSorted succWeights - preds = map fst $ getSuccEdgesSorted predWeights lbl - --For efficiency we also create the map to look up predecessors here - predWeights = reverseEdges succWeights - - --- We make the CFG a Hoopl Graph, so we can reuse revPostOrder. -newtype BlockNode (e :: Extensibility) (x :: Extensibility) = BN (BlockId,[BlockId]) -instance NonLocal (BlockNode) where - entryLabel (BN (lbl,_)) = lbl - successors (BN (_,succs)) = succs - -fromNode :: BlockNode C C -> BlockId -fromNode (BN x) = fst x - -sequenceChain :: forall a i. (Instruction i, Outputable i) => LabelMap a -> CFG - -> [GenBasicBlock i] -> [GenBasicBlock i] +-- | Place basic blocks based on the given CFG. +-- See Note [Chain based CFG serialization] +sequenceChain :: forall a i. (Instruction i, Outputable i) + => LabelMap a -- ^ Keys indicate an info table on the block. + -> CFG -- ^ Control flow graph and some meta data. + -> [GenBasicBlock i] -- ^ List of basic blocks to be placed. + -> [GenBasicBlock i] -- ^ Blocks placed in sequence. sequenceChain _info _weights [] = [] sequenceChain _info _weights [x] = [x] sequenceChain info weights' blocks@((BasicBlock entry _):_) = - --Optimization, delete edges of weight <= 0. - --This significantly improves performance whenever - --we iterate over all edges, which is a few times! let weights :: CFG - weights - = filterEdges (\_f _t edgeInfo -> edgeWeight edgeInfo > 0) weights' + weights = --pprTrace "cfg'" (pprEdgeWeights cfg') + cfg' + where + (_, globalEdgeWeights) = {-# SCC mkGlobalWeights #-} mkGlobalWeights entry weights' + cfg' = {-# SCC rewriteEdges #-} + mapFoldlWithKey + (\cfg from m -> + mapFoldlWithKey + (\cfg to w -> setEdgeWeight cfg (EdgeWeight w) from to ) + cfg m ) + weights' + globalEdgeWeights + + directEdges :: [CfgEdge] + directEdges = sortBy (flip compare) $ catMaybes . map relevantWeight $ (infoEdgeList weights) + where + relevantWeight :: CfgEdge -> Maybe CfgEdge + relevantWeight edge@(CfgEdge from to edgeInfo) + | (EdgeInfo CmmSource { trans_cmmNode = CmmCall {} } _) <- edgeInfo + -- Ignore edges across calls + = Nothing + | mapMember to info + , w <- edgeWeight edgeInfo + -- The payoff is small if we jump over an info table + = Just (CfgEdge from to edgeInfo { edgeWeight = w/8 }) + | otherwise + = Just edge + blockMap :: LabelMap (GenBasicBlock i) blockMap = foldl' (\m blk@(BasicBlock lbl _ins) -> mapInsert lbl blk m) mapEmpty blocks - toNode :: BlockId -> BlockNode C C - toNode bid = - -- sorted such that heavier successors come first. - BN (bid,map fst . getSuccEdgesSorted weights' $ bid) - - orderedBlocks :: [BlockId] - orderedBlocks - = map fromNode $ - revPostorderFrom (fmap (toNode . blockId) blockMap) entry - (builtChains, builtEdges) = {-# SCC "buildChains" #-} --pprTraceIt "generatedChains" $ - --pprTrace "orderedBlocks" (ppr orderedBlocks) $ - buildChains weights orderedBlocks + --pprTrace "blocks" (ppr (mapKeys blockMap)) $ + buildChains directEdges (mapKeys blockMap) - rankedEdges :: WeightedEdgeList - -- Sort edges descending, remove fused eges + rankedEdges :: [CfgEdge] + -- Sort descending by weight, remove fused edges rankedEdges = - map (\(from, to, weight) -> WeightedEdge from to weight) . - filter (\(from, to, _) - -> not (Set.member (from,to) builtEdges)) . - sortWith (\(_,_,w) -> - w) $ weightedEdgeList weights + filter (\edge -> not (Set.member (edgeFrom edge,edgeTo edge) builtEdges)) $ + directEdges - (fusedChains, fusedEdges) + (neighbourChains, combined) = ASSERT(noDups $ mapElems builtChains) - {-# SCC "fuseChains" #-} - --(pprTrace "RankedEdges" $ ppr rankedEdges) $ - --pprTraceIt "FusedChains" $ - fuseChains rankedEdges builtChains - - rankedEdges' = - filter (\edge -> not $ Set.member edge fusedEdges) $ rankedEdges - - neighbourChains - = ASSERT(noDups $ mapElems fusedChains) {-# SCC "groupNeighbourChains" #-} - --pprTraceIt "ResultChains" $ - combineNeighbourhood rankedEdges' (mapElems fusedChains) + -- pprTraceIt "NeighbourChains" $ + combineNeighbourhood rankedEdges (mapElems builtChains) + + + allEdges :: [CfgEdge] + allEdges = {-# SCC allEdges #-} + sortOn (relevantWeight) $ filter (not . deadEdge) $ (infoEdgeList weights) + where + deadEdge :: CfgEdge -> Bool + deadEdge (CfgEdge from to _) = let e = (from,to) in Set.member e combined || Set.member e builtEdges + relevantWeight :: CfgEdge -> EdgeWeight + relevantWeight (CfgEdge _ _ edgeInfo) + | EdgeInfo (CmmSource { trans_cmmNode = CmmCall {}}) _ <- edgeInfo + -- Penalize edges across calls + = weight/(64.0) + | otherwise + = weight + where + -- negate to sort descending + weight = negate (edgeWeight edgeInfo) + + masterChain = + {-# SCC "mergeChains" #-} + -- pprTraceIt "MergedChains" $ + mergeChains allEdges neighbourChains --Make sure the first block stays first - ([entryChain],chains') - = ASSERT(noDups $ neighbourChains) - partition (chainMember entry) neighbourChains - (entryChain':entryRest) - | inFront entry entryChain = [entryChain] - | (rest,entry) <- breakChainAt entry entryChain + prepedChains + | inFront entry masterChain + = [masterChain] + | (rest,entry) <- breakChainAt entry masterChain = [entry,rest] | otherwise = pprPanic "Entry point eliminated" $ - ppr ([entryChain],chains') + ppr masterChain - prepedChains - = entryChain':(entryRest++chains') :: [BlockChain] blockList - -- = (concatMap chainToBlocks prepedChains) - = (concatMap fromOL $ map chainBlocks prepedChains) + = ASSERT(noDups [masterChain]) + (concatMap fromOL $ map chainBlocks prepedChains) --chainPlaced = setFromList $ map blockId blockList :: LabelSet chainPlaced = setFromList $ blockList :: LabelSet @@ -605,14 +733,22 @@ sequenceChain info weights' blocks@((BasicBlock entry _):_) = in filter (\block -> not (isPlaced block)) blocks placedBlocks = + -- We want debug builds to catch this as it's a good indicator for + -- issues with CFG invariants. But we don't want to blow up production + -- builds if something slips through. + ASSERT(null unplaced) --pprTraceIt "placedBlocks" $ - blockList ++ unplaced + -- ++ [] is stil kinda expensive + if null unplaced then blockList else blockList ++ unplaced getBlock bid = expectJust "Block placment" $ mapLookup bid blockMap in --Assert we placed all blocks given as input ASSERT(all (\bid -> mapMember bid blockMap) placedBlocks) dropJumps info $ map getBlock placedBlocks +{-# SCC dropJumps #-} +-- | Remove redundant jumps between blocks when we can rely on +-- fall through. dropJumps :: forall a i. Instruction i => LabelMap a -> [GenBasicBlock i] -> [GenBasicBlock i] dropJumps _ [] = [] @@ -641,7 +777,8 @@ sequenceTop => DynFlags -- Determine which layout algo to use -> NcgImpl statics instr jumpDest -> Maybe CFG -- ^ CFG if we have one. - -> NatCmmDecl statics instr -> NatCmmDecl statics instr + -> NatCmmDecl statics instr -- ^ Function to serialize + -> NatCmmDecl statics instr sequenceTop _ _ _ top@(CmmData _ _) = top sequenceTop dflags ncgImpl edgeWeights @@ -650,11 +787,13 @@ sequenceTop dflags ncgImpl edgeWeights --Use chain based algorithm , Just cfg <- edgeWeights = CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $ + {-# SCC layoutBlocks #-} sequenceChain info cfg blocks ) | otherwise --Use old algorithm = let cfg = if dontUseCfg then Nothing else edgeWeights in CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $ + {-# SCC layoutBlocks #-} sequenceBlocks cfg info blocks) where dontUseCfg = gopt Opt_WeightlessBlocklayout dflags || diff --git a/compiler/nativeGen/CFG.hs b/compiler/nativeGen/CFG.hs index fee47188ac..8eb69a9dbf 100644 --- a/compiler/nativeGen/CFG.hs +++ b/compiler/nativeGen/CFG.hs @@ -6,31 +6,40 @@ {-# LANGUAGE TupleSections #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE CPP #-} +{-# LANGUAGE Rank2Types #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE DataKinds #-} module CFG ( CFG, CfgEdge(..), EdgeInfo(..), EdgeWeight(..) , TransitionSource(..) --Modify the CFG - , addWeightEdge, addEdge, delEdge + , addWeightEdge, addEdge + , delEdge, delNode , addNodesBetween, shortcutWeightMap , reverseEdges, filterEdges , addImmediateSuccessor - , mkWeightInfo, adjustEdgeWeight + , mkWeightInfo, adjustEdgeWeight, setEdgeWeight --Query the CFG , infoEdgeList, edgeList , getSuccessorEdges, getSuccessors - , getSuccEdgesSorted, weightedEdgeList + , getSuccEdgesSorted , getEdgeInfo , getCfgNodes, hasNode - , loopMembers + + -- Loop Information + , loopMembers, loopLevels, loopInfo --Construction/Misc , getCfg, getCfgProc, pprEdgeWeights, sanityCheckCfg --Find backedges and update their weight - , optimizeCFG ) + , optimizeCFG + , mkGlobalWeights + + ) where #include "HsVersions.h" @@ -38,9 +47,8 @@ where import GhcPrelude import BlockId -import Cmm ( RawCmmDecl, GenCmmDecl( .. ), CmmBlock, succ, g_entry - , CmmGraph ) -import CmmNode +import Cmm + import CmmUtils import CmmSwitch import Hoopl.Collections @@ -50,10 +58,24 @@ import qualified Hoopl.Graph as G import Util import Digraph +import Maybes + +import Unique +import qualified Dominators as Dom +import Data.IntMap.Strict (IntMap) +import Data.IntSet (IntSet) + +import qualified Data.IntMap.Strict as IM +import qualified Data.Map as M +import qualified Data.IntSet as IS +import qualified Data.Set as S +import Data.Tree +import Data.Bifunctor import Outputable -- DEBUGGING ONLY --import Debug +-- import Debug.Trace --import OrdList --import Debug.Trace import PprCmm () -- For Outputable instances @@ -61,17 +83,28 @@ import qualified DynFlags as D import Data.List --- import qualified Data.IntMap.Strict as M --TODO: LabelMap +import Data.STRef.Strict +import Control.Monad.ST + +import Data.Array.MArray +import Data.Array.ST +import Data.Array.IArray +import Data.Array.Unsafe (unsafeFreeze) +import Data.Array.Base (unsafeRead, unsafeWrite) + +import Control.Monad + +type Prob = Double type Edge = (BlockId, BlockId) type Edges = [Edge] newtype EdgeWeight - = EdgeWeight Int - deriving (Eq,Ord,Enum,Num,Real,Integral) + = EdgeWeight { weightToDouble :: Double } + deriving (Eq,Ord,Enum,Num,Real,Fractional) instance Outputable EdgeWeight where - ppr (EdgeWeight w) = ppr w + ppr (EdgeWeight w) = doublePrec 5 w type EdgeInfoMap edgeInfo = LabelMap (LabelMap edgeInfo) @@ -108,15 +141,28 @@ instance Outputable CfgEdge where = parens (ppr from1 <+> text "-(" <> ppr edgeInfo <> text ")->" <+> ppr to1) -- | Can we trace back a edge to a specific Cmm Node --- or has it been introduced for codegen. We use this to maintain +-- or has it been introduced during assembly codegen. We use this to maintain -- some information which would otherwise be lost during the -- Cmm <-> asm transition. -- See also Note [Inverting Conditional Branches] data TransitionSource - = CmmSource (CmmNode O C) + = CmmSource { trans_cmmNode :: (CmmNode O C) + , trans_info :: BranchInfo } | AsmCodeGen deriving (Eq) +data BranchInfo = NoInfo -- ^ Unknown, but not heap or stack check. + | HeapStackCheck -- ^ Heap or stack check + deriving Eq + +instance Outputable BranchInfo where + ppr NoInfo = text "regular" + ppr HeapStackCheck = text "heap/stack" + +isHeapOrStackCheck :: TransitionSource -> Bool +isHeapOrStackCheck (CmmSource { trans_info = HeapStackCheck}) = True +isHeapOrStackCheck _ = False + -- | Information about edges data EdgeInfo = EdgeInfo @@ -127,12 +173,10 @@ data EdgeInfo instance Outputable EdgeInfo where ppr edgeInfo = text "weight:" <+> ppr (edgeWeight edgeInfo) --- Allow specialization -{-# INLINEABLE mkWeightInfo #-} -- | Convenience function, generate edge info based -- on weight not originating from cmm. -mkWeightInfo :: Integral n => n -> EdgeInfo -mkWeightInfo = EdgeInfo AsmCodeGen . fromIntegral +mkWeightInfo :: EdgeWeight -> EdgeInfo +mkWeightInfo = EdgeInfo AsmCodeGen -- | Adjust the weight between the blocks using the given function. -- If there is no such edge returns the original map. @@ -140,12 +184,25 @@ adjustEdgeWeight :: CFG -> (EdgeWeight -> EdgeWeight) -> BlockId -> BlockId -> CFG adjustEdgeWeight cfg f from to | Just info <- getEdgeInfo from to cfg - , weight <- edgeWeight info - = addEdge from to (info { edgeWeight = f weight}) cfg + , !weight <- edgeWeight info + , !newWeight <- f weight + = addEdge from to (info { edgeWeight = newWeight}) cfg + | otherwise = cfg + +-- | Set the weight between the blocks to the given weight. +-- If there is no such edge returns the original map. +setEdgeWeight :: CFG -> EdgeWeight + -> BlockId -> BlockId -> CFG +setEdgeWeight cfg !weight from to + | Just info <- getEdgeInfo from to cfg + = addEdge from to (info { edgeWeight = weight}) cfg | otherwise = cfg + + getCfgNodes :: CFG -> LabelSet -getCfgNodes m = mapFoldMapWithKey (\k v -> setFromList (k:mapKeys v)) m +getCfgNodes m = + mapFoldlWithKey (\s k toMap -> mapFoldlWithKey (\s k _ -> setInsert k s) (setInsert k s) toMap ) setEmpty m hasNode :: CFG -> BlockId -> Bool hasNode m node = mapMember node m || any (mapMember node) m @@ -294,6 +351,11 @@ delEdge from to m = remDest Nothing = Nothing remDest (Just wm) = Just $ mapDelete to wm +delNode :: BlockId -> CFG -> CFG +delNode node cfg = + fmap (mapDelete node) -- < Edges to the node + (mapDelete node cfg) -- < Edges from the node + -- | Destinations from bid ordered by weight (descending) getSuccEdgesSorted :: CFG -> BlockId -> [(BlockId,EdgeInfo)] getSuccEdgesSorted m bid = @@ -315,36 +377,54 @@ getEdgeInfo from to m | otherwise = Nothing +getEdgeWeight :: CFG -> BlockId -> BlockId -> EdgeWeight +getEdgeWeight cfg from to = + edgeWeight $ expectJust "Edgeweight for noexisting block" $ + getEdgeInfo from to cfg + +getTransitionSource :: BlockId -> BlockId -> CFG -> TransitionSource +getTransitionSource from to cfg = transitionSource $ expectJust "Source info for noexisting block" $ + getEdgeInfo from to cfg + reverseEdges :: CFG -> CFG -reverseEdges cfg = foldr add mapEmpty flatElems +reverseEdges cfg = mapFoldlWithKey (\cfg from toMap -> go (addNode cfg from) from toMap) mapEmpty cfg where - elems = mapToList $ fmap mapToList cfg :: [(BlockId,[(BlockId,EdgeInfo)])] - flatElems = - concatMap (\(from,ws) -> map (\(to,info) -> (to,from,info)) ws ) elems - add (to,from,info) m = addEdge to from info m + -- We preserve nodes without outgoing edges! + addNode :: CFG -> BlockId -> CFG + addNode cfg b = mapInsertWith mapUnion b mapEmpty cfg + go :: CFG -> BlockId -> (LabelMap EdgeInfo) -> CFG + go cfg from toMap = mapFoldlWithKey (\cfg to info -> addEdge to from info cfg) cfg toMap :: CFG + -- | Returns a unordered list of all edges with info infoEdgeList :: CFG -> [CfgEdge] infoEdgeList m = - mapFoldMapWithKey - (\from toMap -> - map (\(to,info) -> CfgEdge from to info) (mapToList toMap)) - m - --- | Unordered list of edges with weight as Tuple (from,to,weight) -weightedEdgeList :: CFG -> [(BlockId,BlockId,EdgeWeight)] -weightedEdgeList m = - mapFoldMapWithKey - (\from toMap -> - map (\(to,info) -> - (from,to, edgeWeight info)) (mapToList toMap)) - m - -- (\(from, tos) -> map (\(to,info) -> (from,to, edgeWeight info)) tos ) + go (mapToList m) [] + where + -- We avoid foldMap to avoid thunk buildup + go :: [(BlockId,LabelMap EdgeInfo)] -> [CfgEdge] -> [CfgEdge] + go [] acc = acc + go ((from,toMap):xs) acc + = go' xs from (mapToList toMap) acc + go' :: [(BlockId,LabelMap EdgeInfo)] -> BlockId -> [(BlockId,EdgeInfo)] -> [CfgEdge] -> [CfgEdge] + go' froms _ [] acc = go froms acc + go' froms from ((to,info):tos) acc + = go' froms from tos (CfgEdge from to info : acc) -- | Returns a unordered list of all edges without weights edgeList :: CFG -> [Edge] edgeList m = - mapFoldMapWithKey (\from toMap -> fmap (from,) (mapKeys toMap)) m + go (mapToList m) [] + where + -- We avoid foldMap to avoid thunk buildup + go :: [(BlockId,LabelMap EdgeInfo)] -> [Edge] -> [Edge] + go [] acc = acc + go ((from,toMap):xs) acc + = go' xs from (mapKeys toMap) acc + go' :: [(BlockId,LabelMap EdgeInfo)] -> BlockId -> [BlockId] -> [Edge] -> [Edge] + go' froms _ [] acc = go froms acc + go' froms from (to:tos) acc + = go' froms from tos ((from,to) : acc) -- | Get successors of a given node without edge weights. getSuccessors :: CFG -> BlockId -> [BlockId] @@ -355,8 +435,8 @@ getSuccessors m bid pprEdgeWeights :: CFG -> SDoc pprEdgeWeights m = - let edges = sort $ weightedEdgeList m - printEdge (from, to, weight) + let edges = sort $ infoEdgeList m :: [CfgEdge] + printEdge (CfgEdge from to (EdgeInfo { edgeWeight = weight })) = text "\t" <> ppr from <+> text "->" <+> ppr to <> text "[label=\"" <> ppr weight <> text "\",weight=\"" <> ppr weight <> text "\"];\n" @@ -365,7 +445,7 @@ pprEdgeWeights m = --to immediately see it when it does. printNode node = text "\t" <> ppr node <> text ";\n" - getEdgeNodes (from, to, _weight) = [from,to] + getEdgeNodes (CfgEdge from to _) = [from,to] edgeNodes = setFromList $ concatMap getEdgeNodes edges :: LabelSet nodes = filter (\n -> (not . setMember n) edgeNodes) . mapKeys $ mapFilter null m in @@ -378,8 +458,8 @@ pprEdgeWeights m = updateEdgeWeight :: (EdgeWeight -> EdgeWeight) -> Edge -> CFG -> CFG updateEdgeWeight f (from, to) cfg | Just oldInfo <- getEdgeInfo from to cfg - = let oldWeight = edgeWeight oldInfo - newWeight = f oldWeight + = let !oldWeight = edgeWeight oldInfo + !newWeight = f oldWeight in addEdge from to (oldInfo {edgeWeight = newWeight}) cfg | otherwise = panic "Trying to update invalid edge" @@ -448,9 +528,7 @@ addNodesBetween m updates = Should A or B be placed in front of C? The block layout algorithm decides this based on which edge (A,C)/(B,C) is heavier. So we - make a educated guess how often execution will transer control - along each edge as well as how much we gain by placing eg A before - C. + make a educated guess on which branch should be preferred. We rank edges in this order: * Unconditional Control Transfer - They will always @@ -479,7 +557,6 @@ addNodesBetween m updates = address. This reduces the chance that we return to the same cache line further. - -} -- | Generate weights for a Cmm proc based on some simple heuristics. getCfgProc :: D.CfgWeights -> RawCmmDecl -> CFG @@ -515,13 +592,24 @@ getCfg weights graph = getBlockEdges block = case branch of CmmBranch dest -> [mkEdge dest uncondWeight] - CmmCondBranch _c t f l + CmmCondBranch cond t f l | l == Nothing -> [mkEdge f condBranchWeight, mkEdge t condBranchWeight] | l == Just True -> [mkEdge f unlikelyCondWeight, mkEdge t likelyCondWeight] | l == Just False -> [mkEdge f likelyCondWeight, mkEdge t unlikelyCondWeight] + where + mkEdgeInfo = -- pprTrace "Info" (ppr branchInfo <+> ppr cond) + EdgeInfo (CmmSource branch branchInfo) . fromIntegral + mkEdge target weight = ((bid,target), mkEdgeInfo weight) + branchInfo = + foldRegsUsed + (panic "foldRegsDynFlags") + (\info r -> if r == SpLim || r == HpLim || r == BaseReg + then HeapStackCheck else info) + NoInfo cond + (CmmSwitch _e ids) -> let switchTargets = switchTargetsToList ids --Compiler performance hack - for very wide switches don't @@ -539,7 +627,7 @@ getCfg weights graph = map (\x -> ((bid,x),mkEdgeInfo 0)) $ G.successors other where bid = G.entryLabel block - mkEdgeInfo = EdgeInfo (CmmSource branch) . fromIntegral + mkEdgeInfo = EdgeInfo (CmmSource branch NoInfo) . fromIntegral mkEdge target weight = ((bid,target), mkEdgeInfo weight) branch = lastNode block :: CmmNode O C @@ -561,6 +649,11 @@ findBackEdges root cfg = optimizeCFG :: D.CfgWeights -> RawCmmDecl -> CFG -> CFG optimizeCFG _ (CmmData {}) cfg = cfg optimizeCFG weights (CmmProc info _lab _live graph) cfg = + {-# SCC optimizeCFG #-} + -- pprTrace "Initial:" (pprEdgeWeights cfg) $ + -- pprTrace "Initial:" (ppr $ mkGlobalWeights (g_entry graph) cfg) $ + + -- pprTrace "LoopInfo:" (ppr $ loopInfo cfg (g_entry graph)) $ favourFewerPreds . penalizeInfoTables info . increaseBackEdgeWeight (g_entry graph) $ cfg @@ -590,12 +683,8 @@ optimizeCFG weights (CmmProc info _lab _live graph) cfg = = weight - (fromIntegral $ D.infoTablePenalty weights) | otherwise = weight - -{- Note [Optimize for Fallthrough] - --} -- | If a block has two successors, favour the one with fewer - -- predecessors. (As that one is more likely to become a fallthrough) + -- predecessors and/or the one allowing fall through. favourFewerPreds :: CFG -> CFG favourFewerPreds cfg = let @@ -612,16 +701,17 @@ optimizeCFG weights (CmmProc info _lab _live graph) cfg = | preds1 == preds2 = ( 0, 0) | otherwise = (-1, 1) + update :: CFG -> BlockId -> CFG update cfg node | [(s1,e1),(s2,e2)] <- getSuccessorEdges cfg node - , w1 <- edgeWeight e1 - , w2 <- edgeWeight e2 + , !w1 <- edgeWeight e1 + , !w2 <- edgeWeight e2 --Only change the weights if there isn't already a ordering. , w1 == w2 , (mod1,mod2) <- modifiers (predCount s1) (predCount s2) = (\cfg' -> (adjustEdgeWeight cfg' (+mod2) node s2)) - (adjustEdgeWeight cfg (+mod1) node s1) + (adjustEdgeWeight cfg (+mod1) node s1) | otherwise = cfg in setFoldl update cfg nodes @@ -630,13 +720,12 @@ optimizeCFG weights (CmmProc info _lab _live graph) cfg = fallthroughTarget to (EdgeInfo source _weight) | mapMember to info = False | AsmCodeGen <- source = True - | CmmSource (CmmBranch {}) <- source = True - | CmmSource (CmmCondBranch {}) <- source = True + | CmmSource { trans_cmmNode = CmmBranch {} } <- source = True + | CmmSource { trans_cmmNode = CmmCondBranch {} } <- source = True | otherwise = False -- | Determine loop membership of blocks based on SCC analysis --- Ideally we would replace this with a variant giving us loop --- levels instead but the SCC code will do for now. +-- This is faster but only gives yes/no answers. loopMembers :: CFG -> LabelMap Bool loopMembers cfg = foldl' (flip setLevel) mapEmpty sccs @@ -650,3 +739,534 @@ loopMembers cfg = setLevel :: SCC BlockId -> LabelMap Bool -> LabelMap Bool setLevel (AcyclicSCC bid) m = mapInsert bid False m setLevel (CyclicSCC bids) m = foldl' (\m k -> mapInsert k True m) m bids + +loopLevels :: CFG -> BlockId -> LabelMap Int +loopLevels cfg root = liLevels $ loopInfo cfg root + +data LoopInfo = LoopInfo + { liBackEdges :: [(Edge)] -- ^ List of back edges + , liLevels :: LabelMap Int -- ^ BlockId -> LoopLevel mapping + , liLoops :: [(Edge, LabelSet)] -- ^ (backEdge, loopBody), body includes header + } + +instance Outputable LoopInfo where + ppr (LoopInfo _ _lvls loops) = + text "Loops:(backEdge, bodyNodes)" $$ + (vcat $ map ppr loops) + +-- | Determine loop membership of blocks based on Dominator analysis. +-- This is slower but gives loop levels instead of just loop membership. +-- However it only detects natural loops. Irreducible control flow is not +-- recognized even if it loops. But that is rare enough that we don't have +-- to care about that special case. +loopInfo :: CFG -> BlockId -> LoopInfo +loopInfo cfg root = LoopInfo { liBackEdges = backEdges + , liLevels = mapFromList loopCounts + , liLoops = loopBodies } + where + revCfg = reverseEdges cfg + graph = fmap (setFromList . mapKeys ) cfg :: LabelMap LabelSet + + --TODO - This should be a no op: Export constructors? Use unsafeCoerce? ... + rooted = ( fromBlockId root + , toIntMap $ fmap toIntSet graph) :: (Int, IntMap IntSet) + -- rooted = unsafeCoerce (root, graph) + tree = fmap toBlockId $ Dom.domTree rooted :: Tree BlockId + + -- Map from Nodes to their dominators + domMap :: LabelMap LabelSet + domMap = mkDomMap tree + + edges = edgeList cfg :: [(BlockId, BlockId)] + -- We can't recompute this from the edges, there might be blocks not connected via edges. + nodes = getCfgNodes cfg :: LabelSet + + -- identify back edges + isBackEdge (from,to) + | Just doms <- mapLookup from domMap + , setMember to doms + = True + | otherwise = False + + -- determine the loop body for a back edge + findBody edge@(tail, head) + = ( edge, setInsert head $ go (setSingleton tail) (setSingleton tail) ) + where + -- The reversed cfg makes it easier to look up predecessors + cfg' = delNode head revCfg + go :: LabelSet -> LabelSet -> LabelSet + go found current + | setNull current = found + | otherwise = go (setUnion newSuccessors found) + newSuccessors + where + newSuccessors = setFilter (\n -> not $ setMember n found) successors :: LabelSet + successors = setFromList $ concatMap + (getSuccessors cfg') + (setElems current) :: LabelSet + + backEdges = filter isBackEdge edges + loopBodies = map findBody backEdges :: [(Edge, LabelSet)] + + -- Block b is part of n loop bodies => loop nest level of n + loopCounts = + let bodies = map (first snd) loopBodies -- [(Header, Body)] + loopCount n = length $ nub . map fst . filter (setMember n . snd) $ bodies + in map (\n -> (n, loopCount n)) $ setElems nodes :: [(BlockId, Int)] + + toIntSet :: LabelSet -> IntSet + toIntSet s = IS.fromList . map fromBlockId . setElems $ s + toIntMap :: LabelMap a -> IntMap a + toIntMap m = IM.fromList $ map (\(x,y) -> (fromBlockId x,y)) $ mapToList m + + mkDomMap :: Tree BlockId -> LabelMap LabelSet + mkDomMap root = mapFromList $ go setEmpty root + where + go :: LabelSet -> Tree BlockId -> [(Label,LabelSet)] + go parents (Node lbl []) + = [(lbl, parents)] + go parents (Node _ leaves) + = let nodes = map rootLabel leaves + entries = map (\x -> (x,parents)) nodes + in entries ++ concatMap + (\n -> go (setInsert (rootLabel n) parents) n) + leaves + + fromBlockId :: BlockId -> Int + fromBlockId = getKey . getUnique + + toBlockId :: Int -> BlockId + toBlockId = mkBlockId . mkUniqueGrimily + +-- We make the CFG a Hoopl Graph, so we can reuse revPostOrder. +newtype BlockNode (e :: Extensibility) (x :: Extensibility) = BN (BlockId,[BlockId]) + +instance G.NonLocal (BlockNode) where + entryLabel (BN (lbl,_)) = lbl + successors (BN (_,succs)) = succs + +revPostorderFrom :: CFG -> BlockId -> [BlockId] +revPostorderFrom cfg root = + map fromNode $ G.revPostorderFrom hooplGraph root + where + nodes = getCfgNodes cfg + hooplGraph = setFoldl (\m n -> mapInsert n (toNode n) m) mapEmpty nodes + + fromNode :: BlockNode C C -> BlockId + fromNode (BN x) = fst x + + toNode :: BlockId -> BlockNode C C + toNode bid = + BN (bid,getSuccessors cfg $ bid) + + +-- | We take in a CFG which has on its edges weights which are +-- relative only to other edges originating from the same node. +-- +-- We return a CFG for which each edge represents a GLOBAL weight. +-- This means edge weights are comparable across the whole graph. +-- +-- For irreducible control flow results might be imprecise, otherwise they +-- are reliable. +-- +-- The algorithm is based on the Paper +-- "Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus +-- The only big change is that we go over the nodes in the body of loops in +-- reverse post order. Which is required for diamond control flow to work probably. +-- +-- We also apply a few prediction heuristics (based on the same paper) + +{-# SCC mkGlobalWeights #-} +mkGlobalWeights :: BlockId -> CFG -> (LabelMap Double, LabelMap (LabelMap Double)) +mkGlobalWeights root localCfg + | null localCfg = panic "Error - Empty CFG" + | otherwise + = --pprTrace "revOrder" (ppr revOrder) $ + -- undefined --propagate (mapSingleton root 1) (revOrder) + (blockFreqs', edgeFreqs') + where + -- Calculate fixpoints + (blockFreqs, edgeFreqs) = calcFreqs nodeProbs backEdges' bodies' revOrder' + blockFreqs' = mapFromList $ map (first fromVertex) (assocs blockFreqs) :: LabelMap Double + edgeFreqs' = fmap fromVertexMap $ fromVertexMap edgeFreqs + + fromVertexMap :: IM.IntMap x -> LabelMap x + fromVertexMap m = mapFromList . map (first fromVertex) $ IM.toList m + + revOrder = revPostorderFrom localCfg root :: [BlockId] + loopinfo@(LoopInfo backedges _levels bodies) = loopInfo localCfg root + + revOrder' = map toVertex revOrder + backEdges' = map (bimap toVertex toVertex) backedges + bodies' = map calcBody bodies + + estimatedCfg = staticBranchPrediction root loopinfo localCfg + -- Normalize the weights to probabilities and apply heuristics + nodeProbs = cfgEdgeProbabilities estimatedCfg toVertex + + -- By mapping vertices to numbers in reverse post order we can bring any subset into reverse post + -- order simply by sorting. + -- TODO: The sort is redundant if we can guarantee that setElems returns elements ascending + calcBody (backedge, blocks) = + (toVertex $ snd backedge, sort . map toVertex $ (setElems blocks)) + + vertexMapping = mapFromList $ zip revOrder [0..] :: LabelMap Int + blockMapping = listArray (0,mapSize vertexMapping - 1) revOrder :: Array Int BlockId + -- Map from blockId to indicies starting at zero + toVertex :: BlockId -> Int + toVertex blockId = expectJust "mkGlobalWeights" $ mapLookup blockId vertexMapping + -- Map from indicies starting at zero to blockIds + fromVertex :: Int -> BlockId + fromVertex vertex = blockMapping ! vertex + +{- Note [Static Branch Prediction] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The work here has been based on the paper +"Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus. + +The primary differences are that if we branch on the result of a heap +check we do not apply any of the heuristics. +The reason is simple: They look like loops in the control flow graph +but are usually never entered, and if at most once. + +Currently implemented is a heuristic to predict that we do not exit +loops (lehPredicts) and one to predict that backedges are more likely +than any other edge. + +The back edge case is special as it superceeds any other heuristic if it +applies. + +Do NOT rely solely on nofib results for benchmarking this. I recommend at least +comparing megaparsec and container benchmarks. Nofib does not seeem to have +many instances of "loopy" Cmm where these make a difference. + +TODO: +* The paper containers more benchmarks which should be implemented. +* If we turn the likelyhood on if/else branches into a probability + instead of true/false we could implement this as a Cmm pass. + + The complete Cmm code still exists and can be accessed by the heuristics + + There is no chance of register allocation/codegen inserting branches/blocks + + making the TransitionSource info wrong. + + potential to use this information in CmmPasses. + - Requires refactoring of all the code relying on the binary nature of likelyhood. + - Requires refactoring `loopInfo` to work on both, Cmm Graphs and the backend CFG. +-} + +-- | Combination of target node id and information about the branch +-- we are looking at. +type TargetNodeInfo = (BlockId, EdgeInfo) + + +-- | Update branch weights based on certain heuristics. +-- See Note [Static Branch Prediction] +-- TODO: This should be combined with optimizeCFG +{-# SCC staticBranchPrediction #-} +staticBranchPrediction :: BlockId -> LoopInfo -> CFG -> CFG +staticBranchPrediction _root (LoopInfo l_backEdges loopLevels l_loops) cfg = + -- pprTrace "staticEstimatesOn" (ppr (cfg)) $ + setFoldl update cfg nodes + where + nodes = getCfgNodes cfg + backedges = S.fromList $ l_backEdges + -- Loops keyed by their back edge + loops = M.fromList $ l_loops :: M.Map Edge LabelSet + loopHeads = S.fromList $ map snd $ M.keys loops + + update :: CFG -> BlockId -> CFG + update cfg node + -- No successors, nothing to do. + | null successors = cfg + + -- Mix of backedges and others: + -- Always predict the backedges. + | not (null m) && length m < length successors + -- Heap/Stack checks "loop", but only once. + -- So we simply exclude any case involving them. + , not $ any (isHeapOrStackCheck . transitionSource . snd) successors + = let loopChance = repeat $! pred_LBH / (fromIntegral $ length m) + exitChance = repeat $! (1 - pred_LBH) / fromIntegral (length not_m) + updates = zip (map fst m) loopChance ++ zip (map fst not_m) exitChance + in -- pprTrace "mix" (ppr (node,successors)) $ + foldl' (\cfg (to,weight) -> setEdgeWeight cfg weight node to) cfg updates + + -- For (regular) non-binary branches we keep the weights from the STG -> Cmm translation. + | length successors /= 2 + = cfg + + -- Only backedges - no need to adjust + | length m > 0 + = cfg + + -- A regular binary branch, we can plug addition predictors in here. + | [(s1,s1_info),(s2,s2_info)] <- successors + , not $ any (isHeapOrStackCheck . transitionSource . snd) successors + = -- Normalize weights to total of 1 + let !w1 = max (edgeWeight s1_info) (0) + !w2 = max (edgeWeight s2_info) (0) + -- Of both weights are <= 0 we set both to 0.5 + normalizeWeight w = if w1 + w2 == 0 then 0.5 else w/(w1+w2) + !cfg' = setEdgeWeight cfg (normalizeWeight w1) node s1 + !cfg'' = setEdgeWeight cfg' (normalizeWeight w2) node s2 + + -- Figure out which heuristics apply to these successors + heuristics = map ($ ((s1,s1_info),(s2,s2_info))) + [lehPredicts, phPredicts, ohPredicts, ghPredicts, lhhPredicts, chPredicts + , shPredicts, rhPredicts] + -- Apply result of a heuristic. Argument is the likelyhood + -- predicted for s1. + applyHeuristic :: CFG -> Maybe Prob -> CFG + applyHeuristic cfg Nothing = cfg + applyHeuristic cfg (Just (s1_pred :: Double)) + | s1_old == 0 || s2_old == 0 || + isHeapOrStackCheck (transitionSource s1_info) || + isHeapOrStackCheck (transitionSource s2_info) + = cfg + | otherwise = + let -- Predictions from heuristic + s1_prob = EdgeWeight s1_pred :: EdgeWeight + s2_prob = 1.0 - s1_prob + -- Update + d = (s1_old * s1_prob) + (s2_old * s2_prob) :: EdgeWeight + s1_prob' = s1_old * s1_prob / d + !s2_prob' = s2_old * s2_prob / d + !cfg_s1 = setEdgeWeight cfg s1_prob' node s1 + in -- pprTrace "Applying heuristic!" (ppr (node,s1,s2) $$ ppr (s1_prob', s2_prob')) $ + setEdgeWeight cfg_s1 s2_prob' node s2 + where + -- Old weights + s1_old = getEdgeWeight cfg node s1 + s2_old = getEdgeWeight cfg node s2 + + in + -- pprTraceIt "RegularCfgResult" $ + foldl' applyHeuristic cfg'' heuristics + + -- Branch on heap/stack check + | otherwise = cfg + + where + -- Chance that loops are taken. + pred_LBH = 0.875 + -- successors + successors = getSuccessorEdges cfg node + -- backedges + (m,not_m) = partition (\succ -> S.member (node, fst succ) backedges) successors + + -- Heuristics return nothing if they don't say anything about this branch + -- or Just (prob_s1) where prob_s1 is the likelyhood for s1 to be the + -- taken branch. s1 is the branch in the true case. + + -- Loop exit heuristic. + -- We are unlikely to leave a loop unless it's to enter another one. + pred_LEH = 0.75 + -- If and only if no successor is a loopheader, + -- then we will likely not exit the current loop body. + lehPredicts :: (TargetNodeInfo,TargetNodeInfo) -> Maybe Prob + lehPredicts ((s1,_s1_info),(s2,_s2_info)) + | S.member s1 loopHeads || S.member s2 loopHeads + = Nothing + + | otherwise + = --pprTrace "lehPredict:" (ppr $ compare s1Level s2Level) $ + case compare s1Level s2Level of + EQ -> Nothing + LT -> Just (1-pred_LEH) --s1 exits to a shallower loop level (exits loop) + GT -> Just (pred_LEH) --s1 exits to a deeper loop level + where + s1Level = mapLookup s1 loopLevels + s2Level = mapLookup s2 loopLevels + + -- Comparing to a constant is unlikely to be equal. + ohPredicts (s1,_s2) + | CmmSource { trans_cmmNode = src1 } <- getTransitionSource node (fst s1) cfg + , CmmCondBranch cond ltrue _lfalse likely <- src1 + , likely == Nothing + , CmmMachOp mop args <- cond + , MO_Eq {} <- mop + , not (null [x | x@CmmLit{} <- args]) + = if fst s1 == ltrue then Just 0.3 else Just 0.7 + + | otherwise + = Nothing + + -- TODO: These are all the other heuristics from the paper. + -- Not all will apply, for now we just stub them out as Nothing. + phPredicts = const Nothing + ghPredicts = const Nothing + lhhPredicts = const Nothing + chPredicts = const Nothing + shPredicts = const Nothing + rhPredicts = const Nothing + +-- We normalize all edge weights as probabilities between 0 and 1. +-- Ignoring rounding errors all outgoing edges sum up to 1. +cfgEdgeProbabilities :: CFG -> (BlockId -> Int) -> IM.IntMap (IM.IntMap Prob) +cfgEdgeProbabilities cfg toVertex + = mapFoldlWithKey foldEdges IM.empty cfg + where + foldEdges = (\m from toMap -> IM.insert (toVertex from) (normalize toMap) m) + + normalize :: (LabelMap EdgeInfo) -> (IM.IntMap Prob) + normalize weightMap + | edgeCount <= 1 = mapFoldlWithKey (\m k _ -> IM.insert (toVertex k) 1.0 m) IM.empty weightMap + | otherwise = mapFoldlWithKey (\m k _ -> IM.insert (toVertex k) (normalWeight k) m) IM.empty weightMap + where + edgeCount = mapSize weightMap + -- Negative weights are generally allowed but are mapped to zero. + -- We then check if there is at least one non-zero edge and if not + -- assign uniform weights to all branches. + minWeight = 0 :: Prob + weightMap' = fmap (\w -> max (weightToDouble . edgeWeight $ w) minWeight) weightMap + totalWeight = sum weightMap' + + normalWeight :: BlockId -> Prob + normalWeight bid + | totalWeight == 0 + = 1.0 / fromIntegral edgeCount + | Just w <- mapLookup bid weightMap' + = w/totalWeight + | otherwise = panic "impossible" + +-- This is the fixpoint algorithm from +-- "Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus +-- The adaption to Haskell is my own. +calcFreqs :: IM.IntMap (IM.IntMap Prob) -> [(Int,Int)] -> [(Int, [Int])] -> [Int] + -> (Array Int Double, IM.IntMap (IM.IntMap Prob)) +calcFreqs graph backEdges loops revPostOrder = runST $ do + visitedNodes <- newArray (0,nodeCount-1) False :: ST s (STUArray s Int Bool) + blockFreqs <- newArray (0,nodeCount-1) 0.0 :: ST s (STUArray s Int Double) + edgeProbs <- newSTRef graph + edgeBackProbs <- newSTRef graph + + -- let traceArray a = do + -- vs <- forM [0..nodeCount-1] $ \i -> readArray a i >>= (\v -> return (i,v)) + -- trace ("array: " ++ show vs) $ return () + + let -- See #1600, we need to inline or unboxing makes perf worse. + -- {-# INLINE getFreq #-} + {-# INLINE visited #-} + visited b = unsafeRead visitedNodes b + getFreq b = unsafeRead blockFreqs b + -- setFreq :: forall s. Int -> Double -> ST s () + setFreq b f = unsafeWrite blockFreqs b f + -- setVisited :: forall s. Node -> ST s () + setVisited b = unsafeWrite visitedNodes b True + -- Frequency/probability that edge is taken. + getProb' arr b1 b2 = readSTRef arr >>= + (\graph -> + return . + fromMaybe (error "getFreq 1") . + IM.lookup b2 . + fromMaybe (error "getFreq 2") $ + (IM.lookup b1 graph) + ) + setProb' arr b1 b2 prob = do + g <- readSTRef arr + let !m = fromMaybe (error "Foo") $ IM.lookup b1 g + !m' = IM.insert b2 prob m + writeSTRef arr $! (IM.insert b1 m' g) + + getEdgeFreq b1 b2 = getProb' edgeProbs b1 b2 + setEdgeFreq b1 b2 = setProb' edgeProbs b1 b2 + getProb b1 b2 = fromMaybe (error "getProb") $ do + m' <- IM.lookup b1 graph + IM.lookup b2 m' + + getBackProb b1 b2 = getProb' edgeBackProbs b1 b2 + setBackProb b1 b2 = setProb' edgeBackProbs b1 b2 + + + let -- calcOutFreqs :: Node -> ST s () + calcOutFreqs bhead block = do + !f <- getFreq block + forM (successors block) $ \bi -> do + let !prob = getProb block bi + let !succFreq = f * prob + setEdgeFreq block bi succFreq + -- traceM $ "SetOut: " ++ show (block, bi, f, prob, succFreq) + when (bi == bhead) $ setBackProb block bi succFreq + + + let propFreq block head = do + -- traceM ("prop:" ++ show (block,head)) + -- traceShowM block + + !v <- visited block + if v then + return () --Dont look at nodes twice + else if block == head then + setFreq block 1.0 -- Loop header frequency is always 1 + else do + let preds = IS.elems $ predecessors block + irreducible <- (fmap or) $ forM preds $ \bp -> do + !bp_visited <- visited bp + let bp_backedge = isBackEdge bp block + return (not bp_visited && not bp_backedge) + + if irreducible + then return () -- Rare we don't care + else do + setFreq block 0 + !cycleProb <- sum <$> (forM preds $ \pred -> do + if isBackEdge pred block + then + getBackProb pred block + else do + !f <- getFreq block + !prob <- getEdgeFreq pred block + setFreq block $! f + prob + return 0) + -- traceM $ "cycleProb:" ++ show cycleProb + let limit = 1 - 1/512 -- Paper uses 1 - epsilon, but this works. + -- determines how large likelyhoods in loops can grow. + !cycleProb <- return $ min cycleProb limit -- <- return $ if cycleProb > limit then limit else cycleProb + -- traceM $ "cycleProb:" ++ show cycleProb + + !f <- getFreq block + setFreq block (f / (1.0 - cycleProb)) + + setVisited block + calcOutFreqs head block + + -- Loops, by nesting, inner to outer + forM_ loops $ \(head, body) -> do + forM_ [0 .. nodeCount - 1] (\i -> unsafeWrite visitedNodes i True) -- Mark all nodes as visited. + forM_ body (\i -> unsafeWrite visitedNodes i False) -- Mark all blocks reachable from head as not visited + forM_ body $ \block -> propFreq block head + + -- After dealing with all loops, deal with non-looping parts of the CFG + forM_ [0 .. nodeCount - 1] (\i -> unsafeWrite visitedNodes i False) -- Everything in revPostOrder is reachable + forM_ revPostOrder $ \block -> propFreq block (head revPostOrder) + + -- trace ("Final freqs:") $ return () + -- let freqString = pprFreqs freqs + -- trace (unlines freqString) $ return () + -- trace (pprFre) $ return () + graph' <- readSTRef edgeProbs + freqs' <- unsafeFreeze blockFreqs + + return (freqs', graph') + where + predecessors :: Int -> IS.IntSet + predecessors b = fromMaybe IS.empty $ IM.lookup b revGraph + successors b = fromMaybe (lookupError "succ" b graph)$ IM.keys <$> IM.lookup b graph + lookupError s b g = pprPanic ("Lookup error " ++ s) $ + ( text "node" <+> ppr b $$ + text "graph" <+> + vcat (map (\(k,m) -> ppr (k,m :: IM.IntMap Double)) $ IM.toList g) + ) + + nodeCount = IM.foldl' (\count toMap -> IM.foldlWithKey' countTargets count toMap) (IM.size graph) graph + where + countTargets = (\count k _ -> countNode k + count ) + countNode n = if IM.member n graph then 0 else 1 + + isBackEdge from to = S.member (from,to) backEdgeSet + backEdgeSet = S.fromList backEdges + + revGraph :: IntMap IntSet + revGraph = IM.foldlWithKey' (\m from toMap -> addEdges m from toMap) IM.empty graph + where + addEdges m0 from toMap = IM.foldlWithKey' (\m k _ -> addEdge m from k) m0 toMap + addEdge m0 from to = IM.insertWith IS.union to (IS.singleton from) m0 diff --git a/compiler/nativeGen/RegAlloc/Graph/SpillCost.hs b/compiler/nativeGen/RegAlloc/Graph/SpillCost.hs index 9c6e24d320..52f590948a 100644 --- a/compiler/nativeGen/RegAlloc/Graph/SpillCost.hs +++ b/compiler/nativeGen/RegAlloc/Graph/SpillCost.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ScopedTypeVariables, GADTs, BangPatterns #-} module RegAlloc.Graph.SpillCost ( SpillCostRecord, plusSpillCostRecord, @@ -23,6 +23,7 @@ import Reg import GraphBase import Hoopl.Collections (mapLookup) +import Hoopl.Label import Cmm import UniqFM import UniqSet @@ -49,9 +50,6 @@ type SpillCostRecord type SpillCostInfo = UniqFM SpillCostRecord --- | Block membership in a loop -type LoopMember = Bool - type SpillCostState = State (UniqFM SpillCostRecord) () -- | An empty map of spill costs. @@ -88,45 +86,49 @@ slurpSpillCostInfo platform cfg cmm where countCmm CmmData{} = return () countCmm (CmmProc info _ _ sccs) - = mapM_ (countBlock info) + = mapM_ (countBlock info freqMap) $ flattenSCCs sccs + where + LiveInfo _ entries _ _ = info + freqMap = (fst . mkGlobalWeights (head entries)) <$> cfg -- Lookup the regs that are live on entry to this block in -- the info table from the CmmProc. - countBlock info (BasicBlock blockId instrs) + countBlock info freqMap (BasicBlock blockId instrs) | LiveInfo _ _ blockLive _ <- info , Just rsLiveEntry <- mapLookup blockId blockLive , rsLiveEntry_virt <- takeVirtuals rsLiveEntry - = countLIs (loopMember blockId) rsLiveEntry_virt instrs + = countLIs (ceiling $ blockFreq freqMap blockId) rsLiveEntry_virt instrs | otherwise = error "RegAlloc.SpillCost.slurpSpillCostInfo: bad block" - countLIs :: LoopMember -> UniqSet VirtualReg -> [LiveInstr instr] -> SpillCostState + + countLIs :: Int -> UniqSet VirtualReg -> [LiveInstr instr] -> SpillCostState countLIs _ _ [] = return () -- Skip over comment and delta pseudo instrs. - countLIs inLoop rsLive (LiveInstr instr Nothing : lis) + countLIs scale rsLive (LiveInstr instr Nothing : lis) | isMetaInstr instr - = countLIs inLoop rsLive lis + = countLIs scale rsLive lis | otherwise = pprPanic "RegSpillCost.slurpSpillCostInfo" $ text "no liveness information on instruction " <> ppr instr - countLIs inLoop rsLiveEntry (LiveInstr instr (Just live) : lis) + countLIs scale rsLiveEntry (LiveInstr instr (Just live) : lis) = do -- Increment the lifetime counts for regs live on entry to this instr. - mapM_ (incLifetime (loopCount inLoop)) $ nonDetEltsUniqSet rsLiveEntry + mapM_ incLifetime $ nonDetEltsUniqSet rsLiveEntry -- This is non-deterministic but we do not -- currently support deterministic code-generation. -- See Note [Unique Determinism and code generation] -- Increment counts for what regs were read/written from. let (RU read written) = regUsageOfInstr platform instr - mapM_ (incUses (loopCount inLoop)) $ catMaybes $ map takeVirtualReg $ nub read - mapM_ (incDefs (loopCount inLoop)) $ catMaybes $ map takeVirtualReg $ nub written + mapM_ (incUses scale) $ catMaybes $ map takeVirtualReg $ nub read + mapM_ (incDefs scale) $ catMaybes $ map takeVirtualReg $ nub written -- Compute liveness for entry to next instruction. let liveDieRead_virt = takeVirtuals (liveDieRead live) @@ -140,21 +142,18 @@ slurpSpillCostInfo platform cfg cmm = (rsLiveAcross `unionUniqSets` liveBorn_virt) `minusUniqSet` liveDieWrite_virt - countLIs inLoop rsLiveNext lis + countLIs scale rsLiveNext lis - loopCount inLoop - | inLoop = 10 - | otherwise = 1 incDefs count reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, count, 0, 0) incUses count reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, 0, count, 0) - incLifetime count reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, 0, 0, count) + incLifetime reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, 0, 0, 1) - loopBlocks = CFG.loopMembers <$> cfg - loopMember bid - | Just isMember <- join (mapLookup bid <$> loopBlocks) - = isMember + blockFreq :: Maybe (LabelMap Double) -> Label -> Double + blockFreq freqs bid + | Just freq <- join (mapLookup bid <$> freqs) + = max 1.0 (10000 * freq) | otherwise - = False + = 1.0 -- Only if no cfg given -- | Take all the virtual registers from this set. takeVirtuals :: UniqSet Reg -> UniqSet VirtualReg @@ -215,31 +214,39 @@ chooseSpill info graph -- Without live range splitting, its's better to spill from the outside -- in so set the cost of very long live ranges to zero -- -{- -spillCost_chaitin - :: SpillCostInfo - -> Graph Reg RegClass Reg - -> Reg - -> Float -spillCost_chaitin info graph reg - -- Spilling a live range that only lives for 1 instruction - -- isn't going to help us at all - and we definitely want to avoid - -- trying to re-spill previously inserted spill code. - | lifetime <= 1 = 1/0 - - -- It's unlikely that we'll find a reg for a live range this long - -- better to spill it straight up and not risk trying to keep it around - -- and have to go through the build/color cycle again. - | lifetime > allocatableRegsInClass (regClass reg) * 10 - = 0 +-- spillCost_chaitin +-- :: SpillCostInfo +-- -> Graph VirtualReg RegClass RealReg +-- -> VirtualReg +-- -> Float + +-- spillCost_chaitin info graph reg +-- -- Spilling a live range that only lives for 1 instruction +-- -- isn't going to help us at all - and we definitely want to avoid +-- -- trying to re-spill previously inserted spill code. +-- | lifetime <= 1 = 1/0 + +-- -- It's unlikely that we'll find a reg for a live range this long +-- -- better to spill it straight up and not risk trying to keep it around +-- -- and have to go through the build/color cycle again. + +-- -- To facility this we scale down the spill cost of long ranges. +-- -- This makes sure long ranges are still spilled first. +-- -- But this way spill cost remains relevant for long live +-- -- ranges. +-- | lifetime >= 128 +-- = (spillCost / conflicts) / 10.0 + + +-- -- Otherwise revert to chaitin's regular cost function. +-- | otherwise = (spillCost / conflicts) +-- where +-- !spillCost = fromIntegral (uses + defs) :: Float +-- conflicts = fromIntegral (nodeDegree classOfVirtualReg graph reg) +-- (_, defs, uses, lifetime) +-- = fromMaybe (reg, 0, 0, 0) $ lookupUFM info reg - -- Otherwise revert to chaitin's regular cost function. - | otherwise = fromIntegral (uses + defs) - / fromIntegral (nodeDegree graph reg) - where (_, defs, uses, lifetime) - = fromMaybe (reg, 0, 0, 0) $ lookupUFM info reg --} -- Just spill the longest live range. spillCost_length diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 7a2d59993b..b1dd9c58ad 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -3529,7 +3529,7 @@ invertCondBranches (Just cfg) keep bs = , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg -- Both jumps come from the same cmm statement , transitionSource edgeInfo1 == transitionSource edgeInfo2 - , (CmmSource cmmCondBranch) <- transitionSource edgeInfo1 + , CmmSource {trans_cmmNode = cmmCondBranch} <- transitionSource edgeInfo1 --Int comparisons are invertable , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch |