author    | Andreas Klebinger <klebinger.andreas@gmx.at> | 2019-10-15 00:58:12 +0200
committer | Andreas Klebinger <klebinger.andreas@gmx.at> | 2019-11-12 15:31:27 +0100
commit    | ec962cb95fb6f288f9a449f3c5da8998afb636ba
tree      | 1ecd5b3a612f5650d6a3d7d56fdaaaeebc8a6853
parent    | dde5c06a9ac205b4edbff679c682a4ed0fedbc2e
Backport fixes from !1953 fixing #17334. (branch: wip/andreask/1953_backport)
-rw-r--r-- | compiler/nativeGen/AsmCodeGen.hs                 |  47
-rw-r--r-- | compiler/nativeGen/BlockLayout.hs                |  28
-rw-r--r-- | compiler/nativeGen/CFG.hs                        |  78
-rw-r--r-- | compiler/nativeGen/NCGMonad.hs                   |  15
-rw-r--r-- | compiler/nativeGen/RegAlloc/Liveness.hs          |   2
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs                | 682
-rw-r--r-- | compiler/nativeGen/X86/Instr.hs                  |   4
-rw-r--r-- | testsuite/tests/codeGen/should_compile/T17334.hs | 240
8 files changed, 830 insertions, 266 deletions
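The heart of the patch (see Note [Keeping track of the current block] in the X86/CodeGen.hs hunks below) is that statement translation now reports the basic block in which subsequent statements will be placed, so the separately maintained CFG stays in lockstep with the emitted instructions. A condensed sketch of that shape, simplified from the new stmtsToInstrs in the diff below (not a verbatim copy of the patch):

    -- Each statement may open new basic blocks (e.g. the cmpxchg retry loop
    -- generated for MO_AtomicRMW). If it does, all following statements are
    -- translated relative to the block it ends in.
    stmtsToInstrs :: BlockId                    -- block the statements start in
                  -> [CmmNode O O]
                  -> NatM (InstrBlock, BlockId) -- instructions, final block
    stmtsToInstrs bid stmts = go bid stmts nilOL
      where
        go bid []       acc = return (acc, bid)
        go bid (s:rest) acc = do
          (is, mbid) <- stmtToInstrs bid s
          -- Continue in the new block if the statement introduced one.
          go (fromMaybe bid mbid) rest (acc `appOL` is)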
diff --git a/compiler/nativeGen/AsmCodeGen.hs b/compiler/nativeGen/AsmCodeGen.hs index 0c21bc0641..cb2fb5476a 100644 --- a/compiler/nativeGen/AsmCodeGen.hs +++ b/compiler/nativeGen/AsmCodeGen.hs @@ -550,6 +550,10 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count = do let platform = targetPlatform dflags + let proc_name = case cmm of + (CmmProc _ entry_label _ _) -> ppr entry_label + _ -> text "DataChunk" + -- rewrite assignments to global regs let fixed_cmm = {-# SCC "fixStgRegisters" #-} @@ -579,12 +583,11 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count Opt_D_dump_asm_native "Native code" (vcat $ map (pprNatCmmDecl ncgImpl) native) - dumpIfSet_dyn dflags - Opt_D_dump_cfg_weights "CFG Weights" - (pprEdgeWeights nativeCfgWeights) + maybeDumpCfg dflags (Just nativeCfgWeights) "CFG Weights - Native" proc_name -- tag instructions with register liveness information - -- also drops dead code + -- also drops dead code. We don't keep the cfg in sync on + -- some backends, so don't use it there. let livenessCfg = if (backendMaintainsCfg dflags) then Just nativeCfgWeights else Nothing @@ -697,12 +700,13 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count cfgRegAllocUpdates = (concatMap Linear.ra_fixupList raStats) let cfgWithFixupBlks = - addNodesBetween nativeCfgWeights cfgRegAllocUpdates + pure addNodesBetween <*> livenessCfg <*> pure cfgRegAllocUpdates -- Insert stack update blocks - let postRegCFG = - foldl' (\m (from,to) -> addImmediateSuccessor from to m ) - cfgWithFixupBlks stack_updt_blks + let postRegCFG :: Maybe CFG + postRegCFG = + pure (foldl' (\m (from,to) -> addImmediateSuccessor from to m )) <*> + cfgWithFixupBlks <*> pure stack_updt_blks ---- x86fp_kludge. This pass inserts ffree instructions to clear ---- the FPU stack on x86. The x86 ABI requires that the FPU stack @@ -729,11 +733,9 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count shortcutBranches dflags ncgImpl tabled postRegCFG let optimizedCFG = - optimizeCFG (cfgWeightInfo dflags) cmm postShortCFG + optimizeCFG (cfgWeightInfo dflags) cmm <$> postShortCFG - dumpIfSet_dyn dflags - Opt_D_dump_cfg_weights "CFG Final Weights" - ( pprEdgeWeights optimizedCFG ) + maybeDumpCfg dflags optimizedCFG "CFG Weights - Final" proc_name --TODO: Partially check validity of the cfg. let getBlks (CmmProc _info _lbl _live (ListGraph blocks)) = blocks @@ -743,8 +745,8 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count (gopt Opt_DoAsmLinting dflags || debugIsOn )) $ do let blocks = concatMap getBlks shorted let labels = setFromList $ fmap blockId blocks :: LabelSet - return $! seq (sanityCheckCfg optimizedCFG labels $ - text "cfg not in lockstep") () + return $! 
seq (pure sanityCheckCfg <*> optimizedCFG <*> pure labels <*> + pure (text "cfg not in lockstep")) () ---- sequence blocks let sequenced :: [NatCmmDecl statics instr] @@ -761,6 +763,8 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count {-# SCC "invertCondBranches" #-} map invert sequenced where + invertConds :: LabelMap CmmStatics -> [NatBasicBlock instr] + -> [NatBasicBlock instr] invertConds = (invertCondBranches ncgImpl) optimizedCFG invert top@CmmData {} = top invert (CmmProc info lbl live (ListGraph blocks)) = @@ -793,6 +797,15 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count , ppr_raStatsLinear , unwinds ) +maybeDumpCfg :: DynFlags -> Maybe CFG -> String -> SDoc -> IO () +maybeDumpCfg _dflags Nothing _ _ = return () +maybeDumpCfg dflags (Just cfg) msg proc_name + | null cfg = return () + | otherwise + = dumpIfSet_dyn + dflags Opt_D_dump_cfg_weights msg + (proc_name <> char ':' $$ pprEdgeWeights cfg) + -- | Make sure all blocks we want the layout algorithm to place have been placed. checkLayout :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr] -> [NatCmmDecl statics instr] @@ -917,13 +930,13 @@ shortcutBranches :: forall statics instr jumpDest. (Outputable jumpDest) => DynFlags -> NcgImpl statics instr jumpDest -> [NatCmmDecl statics instr] - -> CFG - -> ([NatCmmDecl statics instr],CFG) + -> Maybe CFG + -> ([NatCmmDecl statics instr],Maybe CFG) shortcutBranches dflags ncgImpl tops weights | gopt Opt_AsmShortcutting dflags = ( map (apply_mapping ncgImpl mapping) tops' - , shortcutWeightMap weights mappingBid ) + , shortcutWeightMap mappingBid <$!> weights ) | otherwise = (tops, weights) where diff --git a/compiler/nativeGen/BlockLayout.hs b/compiler/nativeGen/BlockLayout.hs index 72aea5bf10..8dff0952fc 100644 --- a/compiler/nativeGen/BlockLayout.hs +++ b/compiler/nativeGen/BlockLayout.hs @@ -639,29 +639,31 @@ dropJumps info ((BasicBlock lbl ins):todo) sequenceTop :: (Instruction instr, Outputable instr) - => DynFlags --Use new layout code - -> NcgImpl statics instr jumpDest -> CFG - -> NatCmmDecl statics instr -> NatCmmDecl statics instr + => DynFlags -- Determine which layout algo to use + -> NcgImpl statics instr jumpDest + -> Maybe CFG -- ^ CFG if we have one. 
+ -> NatCmmDecl statics instr -- ^ Function to serialize + -> NatCmmDecl statics instr sequenceTop _ _ _ top@(CmmData _ _) = top sequenceTop dflags ncgImpl edgeWeights (CmmProc info lbl live (ListGraph blocks)) | (gopt Opt_CfgBlocklayout dflags) && backendMaintainsCfg dflags --Use chain based algorithm + , Just cfg <- edgeWeights = CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $ - sequenceChain info edgeWeights blocks ) + {-# SCC layoutBlocks #-} + sequenceChain info cfg blocks ) | otherwise --Use old algorithm - = CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $ - sequenceBlocks cfg info blocks) + = let cfg = if dontUseCfg then Nothing else edgeWeights + in CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $ + {-# SCC layoutBlocks #-} + sequenceBlocks cfg info blocks) where - cfg - | (gopt Opt_WeightlessBlocklayout dflags) || - (not $ backendMaintainsCfg dflags) - -- Don't make use of cfg in the old algorithm - = Nothing - -- Use cfg in the old algorithm - | otherwise = Just edgeWeights + dontUseCfg = gopt Opt_WeightlessBlocklayout dflags || + (not $ backendMaintainsCfg dflags) + -- The old algorithm: -- It is very simple (and stupid): We make a graph out of diff --git a/compiler/nativeGen/CFG.hs b/compiler/nativeGen/CFG.hs index 155e5bcac4..cf155364e7 100644 --- a/compiler/nativeGen/CFG.hs +++ b/compiler/nativeGen/CFG.hs @@ -61,8 +61,6 @@ import qualified DynFlags as D import Data.List --- import qualified Data.IntMap.Strict as M --TODO: LabelMap - type Edge = (BlockId, BlockId) type Edges = [Edge] @@ -76,6 +74,13 @@ instance Outputable EdgeWeight where type EdgeInfoMap edgeInfo = LabelMap (LabelMap edgeInfo) -- | A control flow graph where edges have been annotated with a weight. +-- Implemented as IntMap (IntMap <edgeData>) +-- We must uphold the invariant that for each edge A -> B we must have: +-- A entry B in the outer map. +-- A entry B in the map we get when looking up A. +-- Maintaining this invariant is useful as any failed lookup now indicates +-- an actual error in code which might go unnoticed for a while +-- otherwise. type CFG = EdgeInfoMap EdgeInfo data CfgEdge @@ -144,11 +149,20 @@ adjustEdgeWeight cfg f from to = addEdge from to (info { edgeWeight = f weight}) cfg | otherwise = cfg + getCfgNodes :: CFG -> LabelSet getCfgNodes m = mapFoldMapWithKey (\k v -> setFromList (k:mapKeys v)) m +-- | Is this block part of this graph? hasNode :: CFG -> BlockId -> Bool -hasNode m node = mapMember node m || any (mapMember node) m +hasNode m node = + -- Check the invariant that each node must exist in the first map or not at all. + ASSERT( found || not (any (mapMember node) m)) + found + where + found = mapMember node m + + -- | Check if the nodes in the cfg and the set of blocks are the same. -- In a case of a missmatch we panic and show the difference. @@ -160,7 +174,7 @@ sanityCheckCfg m blockSet msg pprPanic "Block list and cfg nodes don't match" ( text "difference:" <+> ppr diff $$ text "blocks:" <+> ppr blockSet $$ - text "cfg:" <+> ppr m $$ + text "cfg:" <+> pprEdgeWeights m $$ msg ) False where @@ -224,8 +238,8 @@ This function (shortcutWeightMap) takes the same mapping and applies the mapping to the CFG in the way layed out above. 
-} -shortcutWeightMap :: CFG -> LabelMap (Maybe BlockId) -> CFG -shortcutWeightMap cfg cuts = +shortcutWeightMap :: LabelMap (Maybe BlockId) -> CFG -> CFG +shortcutWeightMap cuts cfg = foldl' applyMapping cfg $ mapToList cuts where -- takes the tuple (B,C) from the notation in [Updating the CFG during shortcutting] @@ -259,7 +273,7 @@ shortcutWeightMap cfg cuts = -- \ \ -- -> C => -> C -- -addImmediateSuccessor :: BlockId -> BlockId -> CFG -> CFG +addImmediateSuccessor :: HasDebugCallStack => BlockId -> BlockId -> CFG -> CFG addImmediateSuccessor node follower cfg = updateEdges . addWeightEdge node follower uncondWeight $ cfg where @@ -275,10 +289,16 @@ addImmediateSuccessor node follower cfg -- | Adds a new edge, overwrites existing edges if present addEdge :: BlockId -> BlockId -> EdgeInfo -> CFG -> CFG addEdge from to info cfg = - mapAlter addDest from cfg + mapAlter addFromToEdge from $ + mapAlter addDestNode to cfg where - addDest Nothing = Just $ mapSingleton to info - addDest (Just wm) = Just $ mapInsert to info wm + -- Simply insert the edge into the edge list. + addFromToEdge Nothing = Just $ mapSingleton to info + addFromToEdge (Just wm) = Just $ mapInsert to info wm + -- We must add the destination node explicitly as well + addDestNode Nothing = Just $ mapEmpty + addDestNode n@(Just _) = n + -- | Adds a edge with the given weight to the cfg -- If there already existed an edge it is overwritten. @@ -304,8 +324,11 @@ getSuccEdgesSorted m bid = sortedEdges -- | Get successors of a given node with edge weights. -getSuccessorEdges :: CFG -> BlockId -> [(BlockId,EdgeInfo)] -getSuccessorEdges m bid = maybe [] mapToList $ mapLookup bid m +getSuccessorEdges :: HasDebugCallStack => CFG -> BlockId -> [(BlockId,EdgeInfo)] +getSuccessorEdges m bid = maybe lookupError mapToList (mapLookup bid m) + where + lookupError = pprPanic "getSuccessorEdges: Block does not exist" $ + ppr bid $$ text "CFG:" <+> pprEdgeWeights m getEdgeInfo :: BlockId -> BlockId -> CFG -> Maybe EdgeInfo getEdgeInfo from to m @@ -316,12 +339,13 @@ getEdgeInfo from to m = Nothing reverseEdges :: CFG -> CFG -reverseEdges cfg = foldr add mapEmpty flatElems +reverseEdges cfg = mapFoldlWithKey (\cfg from toMap -> go (addNode cfg from) from toMap) mapEmpty cfg where - elems = mapToList $ fmap mapToList cfg :: [(BlockId,[(BlockId,EdgeInfo)])] - flatElems = - concatMap (\(from,ws) -> map (\(to,info) -> (to,from,info)) ws ) elems - add (to,from,info) m = addEdge to from info m + -- We preserve nodes without outgoing edges! + addNode :: CFG -> BlockId -> CFG + addNode cfg b = mapInsertWith mapUnion b mapEmpty cfg + go :: CFG -> BlockId -> (LabelMap EdgeInfo) -> CFG + go cfg from toMap = mapFoldlWithKey (\cfg to info -> addEdge to from info cfg) cfg toMap :: CFG -- | Returns a unordered list of all edges with info infoEdgeList :: CFG -> [CfgEdge] @@ -347,11 +371,14 @@ edgeList m = mapFoldMapWithKey (\from toMap -> fmap (from,) (mapKeys toMap)) m -- | Get successors of a given node without edge weights. 
-getSuccessors :: CFG -> BlockId -> [BlockId] +getSuccessors :: HasDebugCallStack => CFG -> BlockId -> [BlockId] getSuccessors m bid | Just wm <- mapLookup bid m = mapKeys wm - | otherwise = [] + | otherwise = lookupError + where + lookupError = pprPanic "getSuccessors: Block does not exist" $ + ppr bid <+> pprEdgeWeights m pprEdgeWeights :: CFG -> SDoc pprEdgeWeights m = @@ -375,6 +402,7 @@ pprEdgeWeights m = text "}\n" {-# INLINE updateEdgeWeight #-} --Allows eliminating the tuple when possible +-- | Invariant: The edge **must** exist already in the graph. updateEdgeWeight :: (EdgeWeight -> EdgeWeight) -> Edge -> CFG -> CFG updateEdgeWeight f (from, to) cfg | Just oldInfo <- getEdgeInfo from to cfg @@ -422,7 +450,8 @@ addNodesBetween m updates = | otherwise = pprPanic "Can't find weight for edge that should have one" ( text "triple" <+> ppr (from,between,old) $$ - text "updates" <+> ppr updates ) + text "updates" <+> ppr updates $$ + text "cfg:" <+> pprEdgeWeights m ) updateWeight :: CFG -> (BlockId,BlockId,BlockId,EdgeInfo) -> CFG updateWeight m (from,between,old,edgeInfo) = addEdge from between edgeInfo . @@ -550,7 +579,7 @@ getCfg weights graph = blocks = revPostorder graph :: [CmmBlock] --Find back edges by BFS -findBackEdges :: BlockId -> CFG -> Edges +findBackEdges :: HasDebugCallStack => BlockId -> CFG -> Edges findBackEdges root cfg = --pprTraceIt "Backedges:" $ map fst . @@ -562,7 +591,7 @@ findBackEdges root cfg = classifyEdges root getSuccs edges :: [((BlockId,BlockId),EdgeType)] -optimizeCFG :: D.CfgWeights -> RawCmmDecl -> CFG -> CFG +optimizeCFG :: HasDebugCallStack => D.CfgWeights -> RawCmmDecl -> CFG -> CFG optimizeCFG _ (CmmData {}) cfg = cfg optimizeCFG weights (CmmProc info _lab _live graph) cfg = favourFewerPreds . @@ -641,16 +670,17 @@ optimizeCFG weights (CmmProc info _lab _live graph) cfg = -- | Determine loop membership of blocks based on SCC analysis -- Ideally we would replace this with a variant giving us loop -- levels instead but the SCC code will do for now. 
-loopMembers :: CFG -> LabelMap Bool +loopMembers :: HasDebugCallStack => CFG -> LabelMap Bool loopMembers cfg = foldl' (flip setLevel) mapEmpty sccs where mkNode :: BlockId -> Node BlockId BlockId mkNode bid = DigraphNode bid bid (getSuccessors cfg bid) - nodes = map mkNode (setElems $ getCfgNodes cfg) + nodes = map mkNode $ setElems (getCfgNodes cfg) sccs = stronglyConnCompFromEdgedVerticesOrd nodes setLevel :: SCC BlockId -> LabelMap Bool -> LabelMap Bool setLevel (AcyclicSCC bid) m = mapInsert bid False m setLevel (CyclicSCC bids) m = foldl' (\m k -> mapInsert k True m) m bids + diff --git a/compiler/nativeGen/NCGMonad.hs b/compiler/nativeGen/NCGMonad.hs index c22a656d2a..998361a2e3 100644 --- a/compiler/nativeGen/NCGMonad.hs +++ b/compiler/nativeGen/NCGMonad.hs @@ -1,4 +1,6 @@ {-# LANGUAGE CPP #-} +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE BangPatterns #-} -- ----------------------------------------------------------------------------- -- @@ -18,7 +20,7 @@ module NCGMonad ( addNodeBetweenNat, addImmediateSuccessorNat, updateCfgNat, - getUniqueNat, + getUniqueNat, getCfgNat, mapAccumLNat, setDeltaNat, getDeltaNat, @@ -65,6 +67,7 @@ import Instruction import Outputable (SDoc, pprPanic, ppr) import Cmm (RawCmmDecl, CmmStatics) import CFG +import Util data NcgImpl statics instr jumpDest = NcgImpl { cmmTopCodeGen :: RawCmmDecl -> NatM [NatCmmDecl statics instr], @@ -88,7 +91,7 @@ data NcgImpl statics instr jumpDest = NcgImpl { -- the block's 'UnwindPoint's -- See Note [What is this unwinding business?] in Debug -- and Note [Unwinding information in the NCG] in this module. - invertCondBranches :: CFG -> LabelMap CmmStatics -> [NatBasicBlock instr] + invertCondBranches :: Maybe CFG -> LabelMap CmmStatics -> [NatBasicBlock instr] -> [NatBasicBlock instr] -- ^ Turn the sequence of `jcc l1; jmp l2` into `jncc l2; <block_l1>` -- when possible. @@ -206,7 +209,11 @@ addImportNat imp updateCfgNat :: (CFG -> CFG) -> NatM () updateCfgNat f - = NatM $ \ st -> ((), st { natm_cfg = f (natm_cfg st) }) + = NatM $ \ st -> let !cfg' = f (natm_cfg st) + in ((), st { natm_cfg = cfg'}) + +getCfgNat :: NatM CFG +getCfgNat = NatM $ \ st -> (natm_cfg st, st) -- | Record that we added a block between `from` and `old`. addNodeBetweenNat :: BlockId -> BlockId -> BlockId -> NatM () @@ -231,7 +238,7 @@ addNodeBetweenNat from between to -- | Place `succ` after `block` and change any edges -- block -> X to `succ` -> X -addImmediateSuccessorNat :: BlockId -> BlockId -> NatM () +addImmediateSuccessorNat :: HasDebugCallStack => BlockId -> BlockId -> NatM () addImmediateSuccessorNat block succ = updateCfgNat (addImmediateSuccessor block succ) diff --git a/compiler/nativeGen/RegAlloc/Liveness.hs b/compiler/nativeGen/RegAlloc/Liveness.hs index b7f8d1c871..9d68ca12c1 100644 --- a/compiler/nativeGen/RegAlloc/Liveness.hs +++ b/compiler/nativeGen/RegAlloc/Liveness.hs @@ -705,7 +705,7 @@ sccBlocks blocks entries mcfg = map (fmap node_payload) sccs reachable :: LabelSet reachable | Just cfg <- mcfg - -- Our CFG only contains reachable nodes by construction. + -- Our CFG only contains reachable nodes by construction at this point. 
= getCfgNodes cfg | otherwise = setFromList $ [ node_key node | node <- reachablesG g1 roots ] diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 69ab7b202d..11e79acebc 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1,4 +1,5 @@ {-# LANGUAGE CPP, GADTs, NondecreasingIndentation #-} +{-# LANGUAGE TupleSections #-} -- The default iteration limit is a bit too low for the definitions -- in this module. @@ -36,6 +37,7 @@ import GhcPrelude import X86.Instr import X86.Cond import X86.Regs +import X86.Ppr ( ) import X86.RegInfo --TODO: Remove - Just for development/debugging @@ -130,6 +132,56 @@ cmmTopCodeGen (CmmProc info lab live graph) = do cmmTopCodeGen (CmmData sec dat) = do return [CmmData sec (1, dat)] -- no translation, we just use CmmStatic +{- Note [Verifying basic blocks] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + We want to guarantee a few things about the results + of instruction selection. + + Namely that each basic blocks consists of: + * A (potentially empty) sequence of straight line instructions + followed by + * A (potentially empty) sequence of jump like instructions. + + We can verify this by going through the instructions and + making sure that any non-jumpish instruction can't appear + after a jumpish instruction. + + There are gotchas however: + * CALLs are strictly speaking control flow but here we care + not about them. Hence we treat them as regular instructions. + + It's safe for them to appear inside a basic block + as (ignoring side effects inside the call) they will result in + straight line code. + + * NEWBLOCK marks the start of a new basic block so can + be followed by any instructions. +-} + +-- Verifying basic blocks is cheap, but not cheap enough to enable it unconditionally. +verifyBasicBlock :: [Instr] -> () +verifyBasicBlock instrs + | debugIsOn = go False instrs + | otherwise = () + where + go _ [] = () + go atEnd (i:instr) + = case i of + -- Start a new basic block + NEWBLOCK {} -> go False instr + -- Calls are not viable block terminators + CALL {} | atEnd -> faultyBlockWith i + | not atEnd -> go atEnd instr + -- All instructions ok, check if we reached the end and continue. + _ | not atEnd -> go (isJumpishInstr i) instr + -- Only jumps allowed at the end of basic blocks. + | otherwise -> if isJumpishInstr i + then go True instr + else faultyBlockWith i + faultyBlockWith i + = pprPanic "Non control flow instructions after end of basic block." + (ppr i <+> text "in:" $$ vcat (map ppr instrs)) basicBlockCodeGen :: CmmBlock @@ -148,9 +200,10 @@ basicBlockCodeGen block = do let line = srcSpanStartLine span; col = srcSpanStartCol span return $ unitOL $ LOCATION fileId line col name _ -> return nilOL - mid_instrs <- stmtsToInstrs id stmts - tail_instrs <- stmtToInstrs id tail + (mid_instrs,mid_bid) <- stmtsToInstrs id stmts + (tail_instrs,_) <- stmtToInstrs mid_bid tail let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs + return $! verifyBasicBlock (fromOL instrs) instrs' <- fold <$> traverse addSpUnwindings instrs -- code generation may introduce new basic block boundaries, which -- are indicated by the NEWBLOCK instruction. 
We must split up the @@ -180,60 +233,137 @@ addSpUnwindings instr@(DELTA d) = do else return (unitOL instr) addSpUnwindings instr = return $ unitOL instr -stmtsToInstrs :: BlockId -> [CmmNode e x] -> NatM InstrBlock -stmtsToInstrs bid stmts - = do instrss <- mapM (stmtToInstrs bid) stmts - return (concatOL instrss) +{- Note [Keeping track of the current block] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When generating instructions for Cmm we sometimes require +the current block for things like retry loops. + +We also sometimes change the current block, if a MachOP +results in branching control flow. + +Issues arise if we have two statements in the same block, +which both depend on the current block id *and* change the +basic block after them. This happens for atomic primops +in the X86 backend where we want to update the CFG data structure +when introducing new basic blocks. + +For example in #17334 we got this Cmm code: + + c3Bf: // global + (_s3t1::I64) = call MO_AtomicRMW W64 AMO_And(_s3sQ::P64 + 88, 18); + (_s3t4::I64) = call MO_AtomicRMW W64 AMO_Or(_s3sQ::P64 + 88, 0); + _s3sT::I64 = _s3sV::I64; + goto c3B1; + +This resulted in two new basic blocks being inserted: + + c3Bf: + movl $18,%vI_n3Bo + movq 88(%vI_s3sQ),%rax + jmp _n3Bp + n3Bp: + ... + cmpxchgq %vI_n3Bq,88(%vI_s3sQ) + jne _n3Bp + ... + jmp _n3Bs + n3Bs: + ... + cmpxchgq %vI_n3Bt,88(%vI_s3sQ) + jne _n3Bs + ... + jmp _c3B1 + ... + +Based on the Cmm we called stmtToInstrs we translated both atomic operations under +the assumption they would be placed into their Cmm basic block `c3Bf`. +However for the retry loop we introduce new labels, so this is not the case +for the second statement. +This resulted in a desync between the explicit control flow graph +we construct as a separate data type and the actual control flow graph in the code. + +Instead we now return the new basic block if a statement causes a change +in the current block and use the block for all following statements. + +For this reason genCCall is also split into two parts. +One for calls which *won't* change the basic blocks in +which successive instructions will be placed. +A different one for calls which *are* known to change the +basic block. + +-} + +-- See Note [Keeping track of the current block] for why +-- we pass the BlockId. +stmtsToInstrs :: BlockId -- ^ Basic block these statement will start to be placed in. + -> [CmmNode O O] -- ^ Cmm Statement + -> NatM (InstrBlock, BlockId) -- ^ Resulting instruction +stmtsToInstrs bid stmts = + go bid stmts nilOL + where + go bid [] instrs = return (instrs,bid) + go bid (s:stmts) instrs = do + (instrs',bid') <- stmtToInstrs bid s + -- If the statement introduced a new block, we use that one + let newBid = fromMaybe bid bid' + go newBid stmts (instrs `appOL` instrs') -- | `bid` refers to the current block and is used to update the CFG -- if new blocks are inserted in the control flow. -stmtToInstrs :: BlockId -> CmmNode e x -> NatM InstrBlock +-- See Note [Keeping track of the current block] for more details. +stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in. + -> CmmNode e x + -> NatM (InstrBlock, Maybe BlockId) + -- ^ Instructions, and bid of new block if successive + -- statements are placed in a different basic block. 
stmtToInstrs bid stmt = do dflags <- getDynFlags is32Bit <- is32BitPlatform case stmt of - CmmComment s -> return (unitOL (COMMENT s)) - CmmTick {} -> return nilOL - - CmmUnwind regs -> do - let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable - to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr) - case foldMap to_unwind_entry regs of - tbl | M.null tbl -> return nilOL - | otherwise -> do - lbl <- mkAsmTempLabel <$> getUniqueM - return $ unitOL $ UNWIND lbl tbl - - CmmAssign reg src - | isFloatType ty -> assignReg_FltCode format reg src - | is32Bit && isWord64 ty -> assignReg_I64Code reg src - | otherwise -> assignReg_IntCode format reg src - where ty = cmmRegType dflags reg - format = cmmTypeFormat ty - - CmmStore addr src - | isFloatType ty -> assignMem_FltCode format addr src - | is32Bit && isWord64 ty -> assignMem_I64Code addr src - | otherwise -> assignMem_IntCode format addr src - where ty = cmmExprType dflags src - format = cmmTypeFormat ty - CmmUnsafeForeignCall target result_regs args -> genCCall dflags is32Bit target result_regs args bid - CmmBranch id -> return $ genBranch id - - --We try to arrange blocks such that the likely branch is the fallthrough - --in CmmContFlowOpt. So we can assume the condition is likely false here. - CmmCondBranch arg true false _ -> genCondBranch bid true false arg - CmmSwitch arg ids -> do dflags <- getDynFlags - genSwitch dflags arg ids - CmmCall { cml_target = arg - , cml_args_regs = gregs } -> do - dflags <- getDynFlags - genJump arg (jumpRegs dflags gregs) - _ -> - panic "stmtToInstrs: statement should have been cps'd away" + _ -> (,Nothing) <$> case stmt of + CmmComment s -> return (unitOL (COMMENT s)) + CmmTick {} -> return nilOL + + CmmUnwind regs -> do + let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable + to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr) + case foldMap to_unwind_entry regs of + tbl | M.null tbl -> return nilOL + | otherwise -> do + lbl <- mkAsmTempLabel <$> getUniqueM + return $ unitOL $ UNWIND lbl tbl + + CmmAssign reg src + | isFloatType ty -> assignReg_FltCode format reg src + | is32Bit && isWord64 ty -> assignReg_I64Code reg src + | otherwise -> assignReg_IntCode format reg src + where ty = cmmRegType dflags reg + format = cmmTypeFormat ty + + CmmStore addr src + | isFloatType ty -> assignMem_FltCode format addr src + | is32Bit && isWord64 ty -> assignMem_I64Code addr src + | otherwise -> assignMem_IntCode format addr src + where ty = cmmExprType dflags src + format = cmmTypeFormat ty + + CmmBranch id -> return $ genBranch id + + --We try to arrange blocks such that the likely branch is the fallthrough + --in CmmContFlowOpt. So we can assume the condition is likely false here. 
+ CmmCondBranch arg true false _ -> genCondBranch bid true false arg + CmmSwitch arg ids -> do dflags <- getDynFlags + genSwitch dflags arg ids + CmmCall { cml_target = arg + , cml_args_regs = gregs } -> do + dflags <- getDynFlags + genJump arg (jumpRegs dflags gregs) + _ -> + panic "stmtToInstrs: statement should have been cps'd away" jumpRegs :: DynFlags -> [GlobalReg] -> [Reg] @@ -1772,6 +1902,109 @@ genCondBranch' _ bid id false bool = do updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false) return (cond_code `appOL` code) +{- Note [Introducing cfg edges inside basic blocks] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + During instruction selection a statement `s` + in a block B with control of the sort: B -> C + will sometimes result in control + flow of the sort: + + ┌ < ┐ + v ^ + B -> B1 ┴ -> C + + as is the case for some atomic operations. + + Now to keep the CFG in sync when introducing B1 we clearly + want to insert it between B and C. However there is + a catch when we have to deal with self loops. + + We might start with code and a CFG of these forms: + + loop: + stmt1 ┌ < ┐ + .... v ^ + stmtX loop ┘ + stmtY + .... + goto loop: + + Now we introduce B1: + ┌ ─ ─ ─ ─ ─┐ + loop: │ ┌ < ┐ │ + instrs v │ │ ^ + .... loop ┴ B1 ┴ ┘ + instrsFromX + stmtY + goto loop: + + This is simple, all outgoing edges from loop now simply + start from B1 instead and the code generator knows which + new edges it introduced for the self loop of B1. + + Disaster strikes if the statement Y follows the same pattern. + If we apply the same rule that all outgoing edges change then + we end up with: + + loop ─> B1 ─> B2 ┬─┐ + │ │ └─<┤ │ + │ └───<───┘ │ + └───────<────────┘ + + This is problematic. The edge B1->B1 is modified as expected. + However the modification is wrong! + + The assembly in this case looked like this: + + _loop: + <instrs> + _B1: + ... + cmpxchgq ... + jne _B1 + <instrs> + <end _B1> + _B2: + ... + cmpxchgq ... + jne _B2 + <instrs> + jmp loop + + There is no edge _B2 -> _B1 here. It's still a self loop onto _B1. + + The problem here is that really B1 should be two basic blocks. + Otherwise we have control flow in the *middle* of a basic block. + A contradiction! + + So to account for this we add yet another basic block marker: + + _B: + <instrs> + _B1: + ... + cmpxchgq ... + jne _B1 + jmp _B1' + _B1': + <instrs> + <end _B1> + _B2: + ... + + Now when inserting B2 we will only look at the outgoing edges of B1' and + everything will work out nicely. + + You might also wonder why we don't insert jumps at the end of _B1'. There is + no way another block ends up jumping to the labels _B1 or _B2 since they are + essentially invisible to other blocks. View them as control flow labels local + to the basic block if you'd like. + + Not doing this ultimately caused (part 2 of) #17334. +-} + + -- ----------------------------------------------------------------------------- -- Generating C calls @@ -1789,14 +2022,168 @@ genCCall -> [CmmFormal] -- where to put the result -> [CmmActual] -- arguments (of mixed type) -> BlockId -- The block we are in - -> NatM InstrBlock + -> NatM (InstrBlock, Maybe BlockId) + +-- First we deal with cases which might introduce new blocks in the stream. 
+ +genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop)) + [dst] [addr, n] bid = do + use_sse2 <- sse2Enabled + Amode amode addr_code <- + if amop `elem` [AMO_Add, AMO_Sub] + then getAmode addr + else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg + arg <- getNewRegNat format + arg_code <- getAnyReg n + let platform = targetPlatform dflags + dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) + (code, lbl) <- op_code dst_r arg amode + return (addr_code `appOL` arg_code arg `appOL` code, Just lbl) + where + -- Code for the operation + op_code :: Reg -- Destination reg + -> Reg -- Register containing argument + -> AddrMode -- Address of location to mutate + -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId + op_code dst_r arg amode = case amop of + -- In the common case where dst_r is a virtual register the + -- final move should go away, because it's the last use of arg + -- and the first use of dst_r. + AMO_Add -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode)) + , MOV format (OpReg arg) (OpReg dst_r) + ], bid) + AMO_Sub -> return $ (toOL [ NEGI format (OpReg arg) + , LOCK (XADD format (OpReg arg) (OpAddr amode)) + , MOV format (OpReg arg) (OpReg dst_r) + ], bid) + -- In these cases we need a new block id, and have to return it so + -- that later instruction selection can reference it. + AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst) + AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst + , NOT format dst + ]) + AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst) + AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst) + where + -- Simulate operation that lacks a dedicated instruction using + -- cmpxchg. + cmpxchg_code :: (Operand -> Operand -> OrdList Instr) + -> NatM (OrdList Instr, BlockId) + cmpxchg_code instrs = do + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + tmp <- getNewRegNat format + + --Record inserted blocks + -- We turn A -> B into A -> A' -> A'' -> B + -- with a self loop on A'. + addImmediateSuccessorNat bid lbl1 + addImmediateSuccessorNat lbl1 lbl2 + updateCfgNat (addWeightEdge lbl1 lbl1 0) + + return $ (toOL + [ MOV format (OpAddr amode) (OpReg eax) + , JXX ALWAYS lbl1 + , NEWBLOCK lbl1 + -- Keep old value so we can return it: + , MOV format (OpReg eax) (OpReg dst_r) + , MOV format (OpReg eax) (OpReg tmp) + ] + `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL + [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode)) + , JXX NE lbl1 + -- See Note [Introducing cfg edges inside basic blocks] + -- why this basic block is required. + , JXX ALWAYS lbl2 + , NEWBLOCK lbl2 + ], + lbl2) + format = intFormat width + +genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid + | is32Bit, width == W64 = do + ChildCode64 vcode rlo <- iselExpr64 src + use_sse2 <- sse2Enabled + let rhi = getHiVRegFromLo rlo + dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + let format = if width == W8 then II16 else intFormat width + tmp_r <- getNewRegNat format + + -- New CFG Edges: + -- bid -> lbl2 + -- bid -> lbl1 -> lbl2 + -- We also changes edges originating at bid to start at lbl2 instead. + updateCfgNat (addWeightEdge bid lbl1 110 . + addWeightEdge lbl1 lbl2 110 . + addImmediateSuccessor bid lbl2) + + -- The following instruction sequence corresponds to the pseudo-code + -- + -- if (src) { + -- dst = src.lo32 ? 
BSF(src.lo32) : (BSF(src.hi32) + 32); + -- } else { + -- dst = 64; + -- } + let instrs = vcode `appOL` toOL + ([ MOV II32 (OpReg rhi) (OpReg tmp_r) + , OR II32 (OpReg rlo) (OpReg tmp_r) + , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r) + , JXX EQQ lbl2 + , JXX ALWAYS lbl1 + + , NEWBLOCK lbl1 + , BSF II32 (OpReg rhi) dst_r + , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r) + , BSF II32 (OpReg rlo) tmp_r + , CMOV NE II32 (OpReg tmp_r) dst_r + , JXX ALWAYS lbl2 + + , NEWBLOCK lbl2 + ]) + return (instrs, Just lbl2) + + | otherwise = do + code_src <- getAnyReg src + use_sse2 <- sse2Enabled + let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) + + -- The following insn sequence makes sure 'ctz 0' has a defined value. + -- starting with Haswell, one could use the TZCNT insn instead. + let format = if width == W8 then II16 else intFormat width + src_r <- getNewRegNat format + tmp_r <- getNewRegNat format + let instrs = code_src src_r `appOL` toOL + ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ + [ BSF format (OpReg src_r) tmp_r + , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r) + , CMOV NE format (OpReg tmp_r) dst_r + ]) -- NB: We don't need to zero-extend the result for the + -- W8/W16 cases because the 'MOV' insn already + -- took care of implicitly clearing the upper bits + return (instrs, Nothing) + where + bw = widthInBits width + platform = targetPlatform dflags + +genCCall dflags bits mop dst args bid = do + instr <- genCCall' dflags bits mop dst args bid + return (instr, Nothing) --- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +-- genCCall' handles cases not introducing new code blocks. +genCCall' + :: DynFlags + -> Bool -- 32 bit platform? + -> ForeignTarget -- function to call + -> [CmmFormal] -- where to put the result + -> [CmmActual] -- arguments (of mixed type) + -> BlockId -- The block we are in + -> NatM InstrBlock --- Unroll memcpy calls if the source and destination pointers are at --- least DWORD aligned and the number of bytes to copy isn't too +-- Unroll memcpy calls if the number of bytes to copy isn't too -- large. Otherwise, call C's memcpy. -genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _ +genCCall' dflags is32Bit (PrimTarget (MO_Memcpy align)) _ [dst, src, CmmLit (CmmInt n _)] _ | fromInteger insns <= maxInlineMemcpyInsns dflags && align .&. 3 == 0 = do code_dst <- getAnyReg dst @@ -1843,7 +2230,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - i)) -genCCall dflags _ (PrimTarget (MO_Memset align)) _ +genCCall' dflags _ (PrimTarget (MO_Memset align)) _ [dst, CmmLit (CmmInt c _), CmmLit (CmmInt n _)] @@ -1888,14 +2275,14 @@ genCCall dflags _ (PrimTarget (MO_Memset align)) _ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - i)) -genCCall _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL -genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL +genCCall' _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL +genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL -- barriers compile to no code on x86/x86-64; -- we keep it this long in order to prevent earlier optimisations. 
-genCCall _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL +genCCall' _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL -genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ = +genCCall' _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ = case n of 0 -> genPrefetch src $ PREFETCH NTA format 1 -> genPrefetch src $ PREFETCH Lvl2 format @@ -1916,9 +2303,10 @@ genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ = ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) )) -- prefetch always takes an address -genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do +genCCall' dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do let platform = targetPlatform dflags - let dst_r = getRegisterReg platform False (CmmLocal dst) + use_sse2 <- sse2Enabled + let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) case width of W64 | is32Bit -> do ChildCode64 vcode rlo <- iselExpr64 src @@ -1938,7 +2326,7 @@ genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do where format = intFormat width -genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] +genCCall' dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] args@[src] bid = do sse4_2 <- sse4_2Enabled let platform = targetPlatform dflags @@ -1964,20 +2352,21 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall dflags is32Bit target dest_regs args bid + genCCall' dflags is32Bit target dest_regs args bid where format = intFormat width lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst] +genCCall' dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst] args@[src, mask] bid = do let platform = targetPlatform dflags + use_sse2 <- sse2Enabled if isBmi2Enabled dflags then do code_src <- getAnyReg src code_mask <- getAnyReg mask src_r <- getNewRegNat format mask_r <- getNewRegNat format - let dst_r = getRegisterReg platform False (CmmLocal dst) + let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) return $ code_src src_r `appOL` code_mask mask_r `appOL` (if width == W8 then -- The PDEP instruction doesn't take a r/m8 @@ -1997,20 +2386,21 @@ genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst] let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall dflags is32Bit target dest_regs args bid + genCCall' dflags is32Bit target dest_regs args bid where format = intFormat width lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst] +genCCall' dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst] args@[src, mask] bid = do let platform = targetPlatform dflags + use_sse2 <- sse2Enabled if isBmi2Enabled dflags then do code_src <- getAnyReg src code_mask <- getAnyReg mask src_r <- getNewRegNat format mask_r <- getNewRegNat format - let dst_r = getRegisterReg platform False (CmmLocal dst) + let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) return $ code_src src_r `appOL` code_mask mask_r `appOL` (if width == W8 then -- The PEXT instruction doesn't take a r/m8 @@ -2030,19 +2420,19 @@ genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst] let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall dflags is32Bit target dest_regs args bid + 
genCCall' dflags is32Bit target dest_regs args bid where format = intFormat width lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid +genCCall' dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid | is32Bit && width == W64 = do -- Fallback to `hs_clz64` on i386 targetExpr <- cmmMakeDynamicReference dflags CallReference lbl let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall dflags is32Bit target dest_regs args bid + genCCall' dflags is32Bit target dest_regs args bid | otherwise = do code_src <- getAnyReg src @@ -2067,162 +2457,37 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] b format = if width == W8 then II16 else intFormat width lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid - | is32Bit, width == W64 = do - ChildCode64 vcode rlo <- iselExpr64 src - let rhi = getHiVRegFromLo rlo - dst_r = getRegisterReg platform False (CmmLocal dst) - lbl1 <- getBlockIdNat - lbl2 <- getBlockIdNat - tmp_r <- getNewRegNat format - - -- New CFG Edges: - -- bid -> lbl2 - -- bid -> lbl1 -> lbl2 - -- We also changes edges originating at bid to start at lbl2 instead. - updateCfgNat (addWeightEdge bid lbl1 110 . - addWeightEdge lbl1 lbl2 110 . - addImmediateSuccessor bid lbl2) - - -- The following instruction sequence corresponds to the pseudo-code - -- - -- if (src) { - -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32); - -- } else { - -- dst = 64; - -- } - return $ vcode `appOL` toOL - ([ MOV II32 (OpReg rhi) (OpReg tmp_r) - , OR II32 (OpReg rlo) (OpReg tmp_r) - , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r) - , JXX EQQ lbl2 - , JXX ALWAYS lbl1 - - , NEWBLOCK lbl1 - , BSF II32 (OpReg rhi) dst_r - , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r) - , BSF II32 (OpReg rlo) tmp_r - , CMOV NE II32 (OpReg tmp_r) dst_r - , JXX ALWAYS lbl2 - - , NEWBLOCK lbl2 - ]) - - | otherwise = do - code_src <- getAnyReg src - src_r <- getNewRegNat format - tmp_r <- getNewRegNat format - let dst_r = getRegisterReg platform False (CmmLocal dst) - - -- The following insn sequence makes sure 'ctz 0' has a defined value. - -- starting with Haswell, one could use the TZCNT insn instead. 
- return $ code_src src_r `appOL` toOL - ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ - [ BSF format (OpReg src_r) tmp_r - , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r) - , CMOV NE format (OpReg tmp_r) dst_r - ]) -- NB: We don't need to zero-extend the result for the - -- W8/W16 cases because the 'MOV' insn already - -- took care of implicitly clearing the upper bits - where - bw = widthInBits width - platform = targetPlatform dflags - format = if width == W8 then II16 else intFormat width - -genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do +genCCall' dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do targetExpr <- cmmMakeDynamicReference dflags CallReference lbl let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall dflags is32Bit target dest_regs args bid + genCCall' dflags is32Bit target dest_regs args bid where lbl = mkCmmCodeLabel primUnitId (fsLit (word2FloatLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop)) - [dst] [addr, n] bid = do - Amode amode addr_code <- - if amop `elem` [AMO_Add, AMO_Sub] - then getAmode addr - else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg - arg <- getNewRegNat format - arg_code <- getAnyReg n - use_sse2 <- sse2Enabled - let platform = targetPlatform dflags - dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) - code <- op_code dst_r arg amode - return $ addr_code `appOL` arg_code arg `appOL` code - where - -- Code for the operation - op_code :: Reg -- Destination reg - -> Reg -- Register containing argument - -> AddrMode -- Address of location to mutate - -> NatM (OrdList Instr) - op_code dst_r arg amode = case amop of - -- In the common case where dst_r is a virtual register the - -- final move should go away, because it's the last use of arg - -- and the first use of dst_r. - AMO_Add -> return $ toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode)) - , MOV format (OpReg arg) (OpReg dst_r) - ] - AMO_Sub -> return $ toOL [ NEGI format (OpReg arg) - , LOCK (XADD format (OpReg arg) (OpAddr amode)) - , MOV format (OpReg arg) (OpReg dst_r) - ] - AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst) - AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst - , NOT format dst - ]) - AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst) - AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst) - where - -- Simulate operation that lacks a dedicated instruction using - -- cmpxchg. 
- cmpxchg_code :: (Operand -> Operand -> OrdList Instr) - -> NatM (OrdList Instr) - cmpxchg_code instrs = do - lbl <- getBlockIdNat - tmp <- getNewRegNat format - - --Record inserted blocks - addImmediateSuccessorNat bid lbl - updateCfgNat (addWeightEdge lbl lbl 0) - - return $ toOL - [ MOV format (OpAddr amode) (OpReg eax) - , JXX ALWAYS lbl - , NEWBLOCK lbl - -- Keep old value so we can return it: - , MOV format (OpReg eax) (OpReg dst_r) - , MOV format (OpReg eax) (OpReg tmp) - ] - `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL - [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode)) - , JXX NE lbl - ] - - format = intFormat width - -genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do +genCCall' dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do load_code <- intLoadCode (MOV (intFormat width)) addr let platform = targetPlatform dflags use_sse2 <- sse2Enabled + return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst))) -genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do +genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do code <- assignMem_IntCode (intFormat width) addr val return $ code `snocOL` MFENCE -genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do +genCCall' dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do -- On x86 we don't have enough registers to use cmpxchg with a -- complicated addressing mode, so on that architecture we -- pre-compute the address first. + use_sse2 <- sse2Enabled Amode amode addr_code <- getSimpleAmode dflags is32Bit addr newval <- getNewRegNat format newval_code <- getAnyReg new oldval <- getNewRegNat format oldval_code <- getAnyReg old - use_sse2 <- sse2Enabled let platform = targetPlatform dflags dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) code = toOL @@ -2235,7 +2500,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ where format = intFormat width -genCCall _ is32Bit target dest_regs args bid = do +genCCall' _ is32Bit target dest_regs args bid = do dflags <- getDynFlags let platform = targetPlatform dflags sse2 = isSse2Enabled dflags @@ -2853,7 +3118,8 @@ outOfLineCmmOp bid mop res args let target = ForeignTarget targetExpr (ForeignConvention CCallConv [] [] CmmMayReturn) - stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args) + (instrs, _) <- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args) + return instrs where -- Assume we can call these functions directly, and that they're not in a dynamic library. -- TODO: Why is this ok? Under linux this code will be in libm.so @@ -3426,10 +3692,14 @@ needLlvm = -- | This works on the invariant that all jumps in the given blocks are required. -- Starting from there we try to make a few more jumps redundant by reordering -- them. -invertCondBranches :: CFG -> LabelMap a -> [NatBasicBlock Instr] +-- We depend on the information in the CFG to do so. Without a given CFG +-- we do nothing. 
+invertCondBranches :: Maybe CFG -- ^ CFG if present + -> LabelMap a -- ^ Blocks with info tables + -> [NatBasicBlock Instr] -- ^ List of basic blocks -> [NatBasicBlock Instr] -invertCondBranches cfg keep bs = - --trace "Foo" $ +invertCondBranches Nothing _ bs = bs +invertCondBranches (Just cfg) keep bs = invert bs where invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr] @@ -3448,7 +3718,7 @@ invertCondBranches cfg keep bs = , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg -- Both jumps come from the same cmm statement , transitionSource edgeInfo1 == transitionSource edgeInfo2 - , (CmmSource cmmCondBranch) <- transitionSource edgeInfo1 + , CmmSource cmmCondBranch <- transitionSource edgeInfo1 --Int comparisons are invertable , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs index 9f4a25b23e..bddd4fdc74 100644 --- a/compiler/nativeGen/X86/Instr.hs +++ b/compiler/nativeGen/X86/Instr.hs @@ -325,7 +325,9 @@ data Instr [Maybe JumpDest] -- Targets of the jump table Section -- Data section jump table should be put in CLabel -- Label of jump table - | CALL (Either Imm Reg) [Reg] + -- | X86 call instruction + | CALL (Either Imm Reg) -- ^ Jump target + [Reg] -- ^ Arguments (required for register allocation) -- Other things. | CLTD Format -- sign extend %eax into %edx:%eax diff --git a/testsuite/tests/codeGen/should_compile/T17334.hs b/testsuite/tests/codeGen/should_compile/T17334.hs new file mode 100644 index 0000000000..6ad6d347e8 --- /dev/null +++ b/testsuite/tests/codeGen/should_compile/T17334.hs @@ -0,0 +1,240 @@ +-- Reproducer for T17334 +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UnboxedTuples #-} + +--Reproducer uses 64bit literals in reverseWord. 
+--It's ok to truncate those in x86 +{-# OPTIONS_GHC -Wno-overflowed-literals #-} + +module Bug (reverseInPlace) where + +import Control.Monad.ST +import Data.Bits +import GHC.Exts +import GHC.ST (ST(..)) +import Data.Kind + +reverseInPlace :: PrimMonad m => UMVector (PrimState m) Bit -> m () +reverseInPlace xs | len == 0 = pure () + | otherwise = loop 0 + where + len = ulength xs + + loop !i + | i' <= j' = do + x <- readWord xs i + y <- readWord xs j' + + writeWord xs i (reverseWord y) + writeWord xs j' (reverseWord x) + + loop i' + | i' < j = do + let w = (j - i) `shiftR` 1 + k = j - w + x <- readWord xs i + y <- readWord xs k + + writeWord xs i (meld w (reversePartialWord w y) x) + writeWord xs k (meld w (reversePartialWord w x) y) + + loop i' + | otherwise = do + let w = j - i + x <- readWord xs i + writeWord xs i (meld w (reversePartialWord w x) x) + where + !j = len - i + !i' = i + wordSize + !j' = j - wordSize +{-# SPECIALIZE reverseInPlace :: UMVector s Bit -> ST s () #-} + +newtype Bit = Bit { unBit :: Bool } + +instance Unbox Bit + +data instance UMVector s Bit = BitMVec !Int !Int !(MutableByteArray s) +data instance UVector Bit = BitVec !Int !Int !ByteArray + +readWord :: PrimMonad m => UMVector (PrimState m) Bit -> Int -> m Word +readWord !(BitMVec _ 0 _) _ = pure 0 +readWord !(BitMVec off len' arr) !i' = do + let len = off + len' + i = off + i' + nMod = modWordSize i + loIx = divWordSize i + loWord <- readByteArray arr loIx + + if nMod == 0 + then pure loWord + else if loIx == divWordSize (len - 1) + then pure (loWord `unsafeShiftR` nMod) + else do + hiWord <- readByteArray arr (loIx + 1) + pure + $ (loWord `unsafeShiftR` nMod) + .|. (hiWord `unsafeShiftL` (wordSize - nMod)) +{-# SPECIALIZE readWord :: UMVector s Bit -> Int -> ST s Word #-} +{-# INLINE readWord #-} + +writeWord :: PrimMonad m => UMVector (PrimState m) Bit -> Int -> Word -> m () +writeWord !(BitMVec _ 0 _) _ _ = pure () +writeWord !(BitMVec off len' arr@(MutableByteArray mba)) !i' !x@(W# x#) = do + let len = off + len' + lenMod = modWordSize len + i = off + i' + nMod = modWordSize i + loIx@(I# loIx#) = divWordSize i + + if nMod == 0 + then if len >= i + wordSize + then primitive $ \state -> + (# atomicWriteIntArray# mba loIx# (word2Int# x#) state, () #) + else do + let W# andMask# = hiMask lenMod + W# orMask# = x .&. loMask lenMod + primitive $ \state -> + let !(# state', _ #) = fetchAndIntArray# mba loIx# (word2Int# andMask#) state in + let !(# state'', _ #) = fetchOrIntArray# mba loIx# (word2Int# orMask#) state' in + (# state'', () #) + else if loIx == divWordSize (len - 1) + then do + loWord <- readByteArray arr loIx + if lenMod == 0 + then + writeByteArray arr loIx + $ (loWord .&. loMask nMod) + .|. (x `unsafeShiftL` nMod) + else + writeByteArray arr loIx + $ (loWord .&. (loMask nMod .|. hiMask lenMod)) + .|. ((x `unsafeShiftL` nMod) .&. loMask lenMod) + else do + loWord <- readByteArray arr loIx + writeByteArray arr loIx + $ (loWord .&. loMask nMod) + .|. (x `unsafeShiftL` nMod) + hiWord <- readByteArray arr (loIx + 1) + writeByteArray arr (loIx + 1) + $ (hiWord .&. hiMask nMod) + .|. 
(x `unsafeShiftR` (wordSize - nMod)) +{-# SPECIALIZE writeWord :: UMVector s Bit -> Int -> Word -> ST s () #-} +{-# INLINE writeWord #-} + +instance GMVector UMVector Bit where + {-# INLINE basicLength #-} + basicLength (BitMVec _ n _) = n + +instance GVector UVector Bit where + +wordSize :: Int +wordSize = finiteBitSize (0 :: Word) + +lgWordSize :: Int +lgWordSize = case wordSize of + 32 -> 5 + 64 -> 6 + _ -> error "wordsToBytes: unknown architecture" + +divWordSize :: Bits a => a -> a +divWordSize x = unsafeShiftR x lgWordSize +{-# INLINE divWordSize #-} + +modWordSize :: Int -> Int +modWordSize x = x .&. (wordSize - 1) +{-# INLINE modWordSize #-} + +mask :: Int -> Word +mask b = m + where + m | b >= finiteBitSize m = complement 0 + | b < 0 = 0 + | otherwise = bit b - 1 + +meld :: Int -> Word -> Word -> Word +meld b lo hi = (lo .&. m) .|. (hi .&. complement m) where m = mask b +{-# INLINE meld #-} + +reverseWord :: Word -> Word +reverseWord x0 = x6 + where + x1 = ((x0 .&. 0x5555555555555555) `shiftL` 1) .|. ((x0 .&. 0xAAAAAAAAAAAAAAAA) `shiftR` 1) + x2 = ((x1 .&. 0x3333333333333333) `shiftL` 2) .|. ((x1 .&. 0xCCCCCCCCCCCCCCCC) `shiftR` 2) + x3 = ((x2 .&. 0x0F0F0F0F0F0F0F0F) `shiftL` 4) .|. ((x2 .&. 0xF0F0F0F0F0F0F0F0) `shiftR` 4) + x4 = ((x3 .&. 0x00FF00FF00FF00FF) `shiftL` 8) .|. ((x3 .&. 0xFF00FF00FF00FF00) `shiftR` 8) + x5 = ((x4 .&. 0x0000FFFF0000FFFF) `shiftL` 16) .|. ((x4 .&. 0xFFFF0000FFFF0000) `shiftR` 16) + x6 = ((x5 .&. 0x00000000FFFFFFFF) `shiftL` 32) .|. ((x5 .&. 0xFFFFFFFF00000000) `shiftR` 32) + +reversePartialWord :: Int -> Word -> Word +reversePartialWord n w | n >= wordSize = reverseWord w + | otherwise = reverseWord w `shiftR` (wordSize - n) + +loMask :: Int -> Word +loMask n = 1 `unsafeShiftL` n - 1 +{-# INLINE loMask #-} + +hiMask :: Int -> Word +hiMask n = complement (1 `unsafeShiftL` n - 1) +{-# INLINE hiMask #-} + +class GMVector v a where + basicLength :: v s a -> Int + +glength :: GMVector v a => v s a -> Int +{-# INLINE glength #-} +glength = basicLength + +type family GMutable (v :: Type -> Type) :: Type -> Type -> Type +class GMVector (GMutable v) a => GVector v a +data family UMVector s a +data family UVector a +class (GVector UVector a, GMVector UMVector a) => Unbox a +type instance GMutable UVector = UMVector + +ulength :: Unbox a => UMVector s a -> Int +{-# INLINE ulength #-} +ulength = glength + +data ByteArray = ByteArray ByteArray# +data MutableByteArray s = MutableByteArray (MutableByteArray# s) + +readByteArray + :: (Prim a, PrimMonad m) => MutableByteArray (PrimState m) -> Int -> m a +{-# INLINE readByteArray #-} +readByteArray (MutableByteArray arr#) (I# i#) + = primitive (readByteArray# arr# i#) + +writeByteArray + :: (Prim a, PrimMonad m) => MutableByteArray (PrimState m) -> Int -> a -> m () +{-# INLINE writeByteArray #-} +writeByteArray (MutableByteArray arr#) (I# i#) x + = primitive_ (writeByteArray# arr# i# x) + +class Prim a where + readByteArray# :: MutableByteArray# s -> Int# -> State# s -> (# State# s, a #) + writeByteArray# :: MutableByteArray# s -> Int# -> a -> State# s -> State# s + +instance Prim Word where + readByteArray# arr# i# s# = case readWordArray# arr# i# s# of + (# s1#, x# #) -> (# s1#, W# x# #) + writeByteArray# arr# i# (W# x#) s# = writeWordArray# arr# i# x# s# + +class Monad m => PrimMonad m where + type PrimState m + primitive :: (State# (PrimState m) -> (# State# (PrimState m), a #)) -> m a + +instance PrimMonad (ST s) where + type PrimState (ST s) = s + primitive = ST + {-# INLINE primitive #-} + +primitive_ :: PrimMonad 
m + => (State# (PrimState m) -> State# (PrimState m)) -> m () +{-# INLINE primitive_ #-} +primitive_ f = primitive (\s# -> + case f s# of + s'# -> (# s'#, () #))
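On backends that do not maintain a CFG, the patch threads the graph as Maybe CFG, and cmmNativeGen applies updates through the Maybe Applicative (pure f <*> mcfg <*> pure x), so they silently become no-ops when there is no graph. A minimal standalone illustration of that pattern, with a hypothetical helper name that does not appear in the patch:

    -- Apply a CFG update only when a CFG is actually being tracked:
    --   pure f <*> Just cfg <*> pure x  ==  Just (f cfg x)
    --   pure f <*> Nothing  <*> pure x  ==  Nothing
    updateIfPresent :: (cfg -> update -> cfg) -> Maybe cfg -> update -> Maybe cfg
    updateIfPresent f mcfg x = pure f <*> mcfg <*> pure x

    -- e.g. updateIfPresent addNodesBetween livenessCfg fixupBlocks corresponds
    -- to the `pure addNodesBetween <*> livenessCfg <*> pure cfgRegAllocUpdates`
    -- call in cmmNativeGen above.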