author     Andreas Klebinger <klebinger.andreas@gmx.at>  2019-10-15 00:58:12 +0200
committer  Andreas Klebinger <klebinger.andreas@gmx.at>  2019-11-12 15:31:27 +0100
commit     ec962cb95fb6f288f9a449f3c5da8998afb636ba (patch)
tree       1ecd5b3a612f5650d6a3d7d56fdaaaeebc8a6853
parent     dde5c06a9ac205b4edbff679c682a4ed0fedbc2e (diff)
download   haskell-wip/andreask/1953_backport.tar.gz
Backport fixes from !1953 fixing #17334. (wip/andreask/1953_backport)
-rw-r--r--  compiler/nativeGen/AsmCodeGen.hs                  |  47
-rw-r--r--  compiler/nativeGen/BlockLayout.hs                 |  28
-rw-r--r--  compiler/nativeGen/CFG.hs                         |  78
-rw-r--r--  compiler/nativeGen/NCGMonad.hs                    |  15
-rw-r--r--  compiler/nativeGen/RegAlloc/Liveness.hs           |   2
-rw-r--r--  compiler/nativeGen/X86/CodeGen.hs                 | 682
-rw-r--r--  compiler/nativeGen/X86/Instr.hs                   |   4
-rw-r--r--  testsuite/tests/codeGen/should_compile/T17334.hs  | 240
8 files changed, 830 insertions, 266 deletions
diff --git a/compiler/nativeGen/AsmCodeGen.hs b/compiler/nativeGen/AsmCodeGen.hs
index 0c21bc0641..cb2fb5476a 100644
--- a/compiler/nativeGen/AsmCodeGen.hs
+++ b/compiler/nativeGen/AsmCodeGen.hs
@@ -550,6 +550,10 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
= do
let platform = targetPlatform dflags
+ let proc_name = case cmm of
+ (CmmProc _ entry_label _ _) -> ppr entry_label
+ _ -> text "DataChunk"
+
-- rewrite assignments to global regs
let fixed_cmm =
{-# SCC "fixStgRegisters" #-}
@@ -579,12 +583,11 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
Opt_D_dump_asm_native "Native code"
(vcat $ map (pprNatCmmDecl ncgImpl) native)
- dumpIfSet_dyn dflags
- Opt_D_dump_cfg_weights "CFG Weights"
- (pprEdgeWeights nativeCfgWeights)
+ maybeDumpCfg dflags (Just nativeCfgWeights) "CFG Weights - Native" proc_name
-- tag instructions with register liveness information
- -- also drops dead code
+ -- also drops dead code. We don't keep the cfg in sync on
+ -- some backends, so don't use it there.
let livenessCfg = if (backendMaintainsCfg dflags)
then Just nativeCfgWeights
else Nothing
@@ -697,12 +700,13 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
cfgRegAllocUpdates = (concatMap Linear.ra_fixupList raStats)
let cfgWithFixupBlks =
- addNodesBetween nativeCfgWeights cfgRegAllocUpdates
+ pure addNodesBetween <*> livenessCfg <*> pure cfgRegAllocUpdates
-- Insert stack update blocks
- let postRegCFG =
- foldl' (\m (from,to) -> addImmediateSuccessor from to m )
- cfgWithFixupBlks stack_updt_blks
+ let postRegCFG :: Maybe CFG
+ postRegCFG =
+ pure (foldl' (\m (from,to) -> addImmediateSuccessor from to m )) <*>
+ cfgWithFixupBlks <*> pure stack_updt_blks
---- x86fp_kludge. This pass inserts ffree instructions to clear
---- the FPU stack on x86. The x86 ABI requires that the FPU stack
@@ -729,11 +733,9 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
shortcutBranches dflags ncgImpl tabled postRegCFG
let optimizedCFG =
- optimizeCFG (cfgWeightInfo dflags) cmm postShortCFG
+ optimizeCFG (cfgWeightInfo dflags) cmm <$> postShortCFG
- dumpIfSet_dyn dflags
- Opt_D_dump_cfg_weights "CFG Final Weights"
- ( pprEdgeWeights optimizedCFG )
+ maybeDumpCfg dflags optimizedCFG "CFG Weights - Final" proc_name
--TODO: Partially check validity of the cfg.
let getBlks (CmmProc _info _lbl _live (ListGraph blocks)) = blocks
@@ -743,8 +745,8 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
(gopt Opt_DoAsmLinting dflags || debugIsOn )) $ do
let blocks = concatMap getBlks shorted
let labels = setFromList $ fmap blockId blocks :: LabelSet
- return $! seq (sanityCheckCfg optimizedCFG labels $
- text "cfg not in lockstep") ()
+ return $! seq (pure sanityCheckCfg <*> optimizedCFG <*> pure labels <*>
+ pure (text "cfg not in lockstep")) ()
---- sequence blocks
let sequenced :: [NatCmmDecl statics instr]
@@ -761,6 +763,8 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
{-# SCC "invertCondBranches" #-}
map invert sequenced
where
+ invertConds :: LabelMap CmmStatics -> [NatBasicBlock instr]
+ -> [NatBasicBlock instr]
invertConds = (invertCondBranches ncgImpl) optimizedCFG
invert top@CmmData {} = top
invert (CmmProc info lbl live (ListGraph blocks)) =
@@ -793,6 +797,15 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
, ppr_raStatsLinear
, unwinds )
+maybeDumpCfg :: DynFlags -> Maybe CFG -> String -> SDoc -> IO ()
+maybeDumpCfg _dflags Nothing _ _ = return ()
+maybeDumpCfg dflags (Just cfg) msg proc_name
+ | null cfg = return ()
+ | otherwise
+ = dumpIfSet_dyn
+ dflags Opt_D_dump_cfg_weights msg
+ (proc_name <> char ':' $$ pprEdgeWeights cfg)
+
-- | Make sure all blocks we want the layout algorithm to place have been placed.
checkLayout :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr]
-> [NatCmmDecl statics instr]
@@ -917,13 +930,13 @@ shortcutBranches
:: forall statics instr jumpDest. (Outputable jumpDest) => DynFlags
-> NcgImpl statics instr jumpDest
-> [NatCmmDecl statics instr]
- -> CFG
- -> ([NatCmmDecl statics instr],CFG)
+ -> Maybe CFG
+ -> ([NatCmmDecl statics instr],Maybe CFG)
shortcutBranches dflags ncgImpl tops weights
| gopt Opt_AsmShortcutting dflags
= ( map (apply_mapping ncgImpl mapping) tops'
- , shortcutWeightMap weights mappingBid )
+ , shortcutWeightMap mappingBid <$!> weights )
| otherwise
= (tops, weights)
where
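
The hunks above thread the native CFG as a Maybe CFG and apply passes to it
through the Applicative instance (pure f <*> mCfg <*> pure x). Below is a
minimal, self-contained sketch of that pattern; the types and names are
stand-ins of my own, not GHC code, and addFixupEdges is a placeholder for a
pass such as addNodesBetween:

    type Cfg = [(Int, Int)]                       -- stand-in for GHC's CFG type

    -- Placeholder for a pass such as addNodesBetween.
    addFixupEdges :: Cfg -> [(Int, Int)] -> Cfg
    addFixupEdges cfg updates = cfg ++ updates

    -- Apply the pass only when the backend actually maintains a CFG.
    threadCfg :: Maybe Cfg -> [(Int, Int)] -> Maybe Cfg
    threadCfg mcfg updates = pure addFixupEdges <*> mcfg <*> pure updates
    -- equivalent to: fmap (\cfg -> addFixupEdges cfg updates) mcfg

    main :: IO ()
    main = do
      print (threadCfg (Just [(1, 2)]) [(2, 3)])  -- Just [(1,2),(2,3)]
      print (threadCfg Nothing         [(2, 3)])  -- Nothing
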
diff --git a/compiler/nativeGen/BlockLayout.hs b/compiler/nativeGen/BlockLayout.hs
index 72aea5bf10..8dff0952fc 100644
--- a/compiler/nativeGen/BlockLayout.hs
+++ b/compiler/nativeGen/BlockLayout.hs
@@ -639,29 +639,31 @@ dropJumps info ((BasicBlock lbl ins):todo)
sequenceTop
:: (Instruction instr, Outputable instr)
- => DynFlags --Use new layout code
- -> NcgImpl statics instr jumpDest -> CFG
- -> NatCmmDecl statics instr -> NatCmmDecl statics instr
+ => DynFlags -- Determine which layout algo to use
+ -> NcgImpl statics instr jumpDest
+ -> Maybe CFG -- ^ CFG if we have one.
+ -> NatCmmDecl statics instr -- ^ Function to serialize
+ -> NatCmmDecl statics instr
sequenceTop _ _ _ top@(CmmData _ _) = top
sequenceTop dflags ncgImpl edgeWeights
(CmmProc info lbl live (ListGraph blocks))
| (gopt Opt_CfgBlocklayout dflags) && backendMaintainsCfg dflags
--Use chain based algorithm
+ , Just cfg <- edgeWeights
= CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $
- sequenceChain info edgeWeights blocks )
+ {-# SCC layoutBlocks #-}
+ sequenceChain info cfg blocks )
| otherwise
--Use old algorithm
- = CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $
- sequenceBlocks cfg info blocks)
+ = let cfg = if dontUseCfg then Nothing else edgeWeights
+ in CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $
+ {-# SCC layoutBlocks #-}
+ sequenceBlocks cfg info blocks)
where
- cfg
- | (gopt Opt_WeightlessBlocklayout dflags) ||
- (not $ backendMaintainsCfg dflags)
- -- Don't make use of cfg in the old algorithm
- = Nothing
- -- Use cfg in the old algorithm
- | otherwise = Just edgeWeights
+ dontUseCfg = gopt Opt_WeightlessBlocklayout dflags ||
+ (not $ backendMaintainsCfg dflags)
+
-- The old algorithm:
-- It is very simple (and stupid): We make a graph out of
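
To make the control flow of the rewritten sequenceTop explicit, here is a
hedged, standalone sketch of the dispatch it now performs. The types are
stand-ins and this is not GHC code; the flags referred to in the comments are
the ones appearing in the diff above:

    type Cfg = [(Int, Int)]                  -- stand-in for GHC's CFG

    data Layout = ChainBased Cfg             -- new, CFG-driven algorithm
                | Old (Maybe Cfg)            -- old algorithm, optionally CFG-assisted
                deriving Show

    pickLayout :: Bool        -- Opt_CfgBlocklayout set?
               -> Bool        -- Opt_WeightlessBlocklayout set?
               -> Bool        -- does the backend maintain a CFG?
               -> Maybe Cfg
               -> Layout
    pickLayout cfgLayout weightless backendCfg mcfg
      | cfgLayout && backendCfg
      , Just cfg <- mcfg             = ChainBased cfg
      | weightless || not backendCfg = Old Nothing
      | otherwise                    = Old mcfg
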
diff --git a/compiler/nativeGen/CFG.hs b/compiler/nativeGen/CFG.hs
index 155e5bcac4..cf155364e7 100644
--- a/compiler/nativeGen/CFG.hs
+++ b/compiler/nativeGen/CFG.hs
@@ -61,8 +61,6 @@ import qualified DynFlags as D
import Data.List
--- import qualified Data.IntMap.Strict as M --TODO: LabelMap
-
type Edge = (BlockId, BlockId)
type Edges = [Edge]
@@ -76,6 +74,13 @@ instance Outputable EdgeWeight where
type EdgeInfoMap edgeInfo = LabelMap (LabelMap edgeInfo)
-- | A control flow graph where edges have been annotated with a weight.
+-- Implemented as IntMap (IntMap <edgeData>)
+-- We must uphold the invariant that for each edge A -> B we must have:
+-- An entry B in the outer map.
+-- An entry B in the map we get when looking up A.
+-- Maintaining this invariant is useful, as any failed lookup then indicates
+-- an actual error in the code which might otherwise go unnoticed for a
+-- while.
type CFG = EdgeInfoMap EdgeInfo
data CfgEdge
@@ -144,11 +149,20 @@ adjustEdgeWeight cfg f from to
= addEdge from to (info { edgeWeight = f weight}) cfg
| otherwise = cfg
+
getCfgNodes :: CFG -> LabelSet
getCfgNodes m = mapFoldMapWithKey (\k v -> setFromList (k:mapKeys v)) m
+-- | Is this block part of this graph?
hasNode :: CFG -> BlockId -> Bool
-hasNode m node = mapMember node m || any (mapMember node) m
+hasNode m node =
+ -- Check the invariant that each node must exist in the first map or not at all.
+ ASSERT( found || not (any (mapMember node) m))
+ found
+ where
+ found = mapMember node m
+
+
-- | Check if the nodes in the cfg and the set of blocks are the same.
-- In case of a mismatch we panic and show the difference.
@@ -160,7 +174,7 @@ sanityCheckCfg m blockSet msg
pprPanic "Block list and cfg nodes don't match" (
text "difference:" <+> ppr diff $$
text "blocks:" <+> ppr blockSet $$
- text "cfg:" <+> ppr m $$
+ text "cfg:" <+> pprEdgeWeights m $$
msg )
False
where
@@ -224,8 +238,8 @@ This function (shortcutWeightMap) takes the same mapping and
applies the mapping to the CFG in the way layed out above.
-}
-shortcutWeightMap :: CFG -> LabelMap (Maybe BlockId) -> CFG
-shortcutWeightMap cfg cuts =
+shortcutWeightMap :: LabelMap (Maybe BlockId) -> CFG -> CFG
+shortcutWeightMap cuts cfg =
foldl' applyMapping cfg $ mapToList cuts
where
-- takes the tuple (B,C) from the notation in [Updating the CFG during shortcutting]
@@ -259,7 +273,7 @@ shortcutWeightMap cfg cuts =
-- \ \
-- -> C => -> C
--
-addImmediateSuccessor :: BlockId -> BlockId -> CFG -> CFG
+addImmediateSuccessor :: HasDebugCallStack => BlockId -> BlockId -> CFG -> CFG
addImmediateSuccessor node follower cfg
= updateEdges . addWeightEdge node follower uncondWeight $ cfg
where
@@ -275,10 +289,16 @@ addImmediateSuccessor node follower cfg
-- | Adds a new edge, overwrites existing edges if present
addEdge :: BlockId -> BlockId -> EdgeInfo -> CFG -> CFG
addEdge from to info cfg =
- mapAlter addDest from cfg
+ mapAlter addFromToEdge from $
+ mapAlter addDestNode to cfg
where
- addDest Nothing = Just $ mapSingleton to info
- addDest (Just wm) = Just $ mapInsert to info wm
+ -- Simply insert the edge into the edge list.
+ addFromToEdge Nothing = Just $ mapSingleton to info
+ addFromToEdge (Just wm) = Just $ mapInsert to info wm
+ -- We must add the destination node explicitly as well
+ addDestNode Nothing = Just $ mapEmpty
+ addDestNode n@(Just _) = n
+
-- | Adds a edge with the given weight to the cfg
-- If there already existed an edge it is overwritten.
@@ -304,8 +324,11 @@ getSuccEdgesSorted m bid =
sortedEdges
-- | Get successors of a given node with edge weights.
-getSuccessorEdges :: CFG -> BlockId -> [(BlockId,EdgeInfo)]
-getSuccessorEdges m bid = maybe [] mapToList $ mapLookup bid m
+getSuccessorEdges :: HasDebugCallStack => CFG -> BlockId -> [(BlockId,EdgeInfo)]
+getSuccessorEdges m bid = maybe lookupError mapToList (mapLookup bid m)
+ where
+ lookupError = pprPanic "getSuccessorEdges: Block does not exist" $
+ ppr bid $$ text "CFG:" <+> pprEdgeWeights m
getEdgeInfo :: BlockId -> BlockId -> CFG -> Maybe EdgeInfo
getEdgeInfo from to m
@@ -316,12 +339,13 @@ getEdgeInfo from to m
= Nothing
reverseEdges :: CFG -> CFG
-reverseEdges cfg = foldr add mapEmpty flatElems
+reverseEdges cfg = mapFoldlWithKey (\cfg from toMap -> go (addNode cfg from) from toMap) mapEmpty cfg
where
- elems = mapToList $ fmap mapToList cfg :: [(BlockId,[(BlockId,EdgeInfo)])]
- flatElems =
- concatMap (\(from,ws) -> map (\(to,info) -> (to,from,info)) ws ) elems
- add (to,from,info) m = addEdge to from info m
+ -- We preserve nodes without outgoing edges!
+ addNode :: CFG -> BlockId -> CFG
+ addNode cfg b = mapInsertWith mapUnion b mapEmpty cfg
+ go :: CFG -> BlockId -> (LabelMap EdgeInfo) -> CFG
+ go cfg from toMap = mapFoldlWithKey (\cfg to info -> addEdge to from info cfg) cfg toMap :: CFG
-- | Returns an unordered list of all edges with info
infoEdgeList :: CFG -> [CfgEdge]
@@ -347,11 +371,14 @@ edgeList m =
mapFoldMapWithKey (\from toMap -> fmap (from,) (mapKeys toMap)) m
-- | Get successors of a given node without edge weights.
-getSuccessors :: CFG -> BlockId -> [BlockId]
+getSuccessors :: HasDebugCallStack => CFG -> BlockId -> [BlockId]
getSuccessors m bid
| Just wm <- mapLookup bid m
= mapKeys wm
- | otherwise = []
+ | otherwise = lookupError
+ where
+ lookupError = pprPanic "getSuccessors: Block does not exist" $
+ ppr bid <+> pprEdgeWeights m
pprEdgeWeights :: CFG -> SDoc
pprEdgeWeights m =
@@ -375,6 +402,7 @@ pprEdgeWeights m =
text "}\n"
{-# INLINE updateEdgeWeight #-} --Allows eliminating the tuple when possible
+-- | Invariant: The edge **must** exist already in the graph.
updateEdgeWeight :: (EdgeWeight -> EdgeWeight) -> Edge -> CFG -> CFG
updateEdgeWeight f (from, to) cfg
| Just oldInfo <- getEdgeInfo from to cfg
@@ -422,7 +450,8 @@ addNodesBetween m updates =
| otherwise
= pprPanic "Can't find weight for edge that should have one" (
text "triple" <+> ppr (from,between,old) $$
- text "updates" <+> ppr updates )
+ text "updates" <+> ppr updates $$
+ text "cfg:" <+> pprEdgeWeights m )
updateWeight :: CFG -> (BlockId,BlockId,BlockId,EdgeInfo) -> CFG
updateWeight m (from,between,old,edgeInfo)
= addEdge from between edgeInfo .
@@ -550,7 +579,7 @@ getCfg weights graph =
blocks = revPostorder graph :: [CmmBlock]
--Find back edges by BFS
-findBackEdges :: BlockId -> CFG -> Edges
+findBackEdges :: HasDebugCallStack => BlockId -> CFG -> Edges
findBackEdges root cfg =
--pprTraceIt "Backedges:" $
map fst .
@@ -562,7 +591,7 @@ findBackEdges root cfg =
classifyEdges root getSuccs edges :: [((BlockId,BlockId),EdgeType)]
-optimizeCFG :: D.CfgWeights -> RawCmmDecl -> CFG -> CFG
+optimizeCFG :: HasDebugCallStack => D.CfgWeights -> RawCmmDecl -> CFG -> CFG
optimizeCFG _ (CmmData {}) cfg = cfg
optimizeCFG weights (CmmProc info _lab _live graph) cfg =
favourFewerPreds .
@@ -641,16 +670,17 @@ optimizeCFG weights (CmmProc info _lab _live graph) cfg =
-- | Determine loop membership of blocks based on SCC analysis
-- Ideally we would replace this with a variant giving us loop
-- levels instead but the SCC code will do for now.
-loopMembers :: CFG -> LabelMap Bool
+loopMembers :: HasDebugCallStack => CFG -> LabelMap Bool
loopMembers cfg =
foldl' (flip setLevel) mapEmpty sccs
where
mkNode :: BlockId -> Node BlockId BlockId
mkNode bid = DigraphNode bid bid (getSuccessors cfg bid)
- nodes = map mkNode (setElems $ getCfgNodes cfg)
+ nodes = map mkNode $ setElems (getCfgNodes cfg)
sccs = stronglyConnCompFromEdgedVerticesOrd nodes
setLevel :: SCC BlockId -> LabelMap Bool -> LabelMap Bool
setLevel (AcyclicSCC bid) m = mapInsert bid False m
setLevel (CyclicSCC bids) m = foldl' (\m k -> mapInsert k True m) m bids
+
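
The addEdge and hasNode changes above exist to maintain the invariant now
documented at the CFG type: every edge destination must itself be a key of the
outer map, so a failed lookup is always a real error. The following is a
hedged sketch of both the maintenance and the check, using Data.Map as a
stand-in for GHC's LabelMap; it is an illustration, not the GHC implementation:

    import qualified Data.Map.Strict as M    -- stand-in for GHC's LabelMap

    type Cfg = M.Map Int (M.Map Int Int)     -- node -> (successor -> edge weight)

    -- Insert the edge and make sure the destination exists as a node,
    -- mirroring addFromToEdge/addDestNode above.
    addEdge :: Int -> Int -> Int -> Cfg -> Cfg
    addEdge from to w =
        M.insertWith M.union from (M.singleton to w)   -- record the edge itself
      . M.insertWith (\_new old -> old) to M.empty     -- ensure the destination node exists

    -- The property that hasNode's ASSERT relies on.
    invariantHolds :: Cfg -> Bool
    invariantHolds cfg =
      and [ to `M.member` cfg | succs <- M.elems cfg, to <- M.keys succs ]
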
diff --git a/compiler/nativeGen/NCGMonad.hs b/compiler/nativeGen/NCGMonad.hs
index c22a656d2a..998361a2e3 100644
--- a/compiler/nativeGen/NCGMonad.hs
+++ b/compiler/nativeGen/NCGMonad.hs
@@ -1,4 +1,6 @@
{-# LANGUAGE CPP #-}
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE BangPatterns #-}
-- -----------------------------------------------------------------------------
--
@@ -18,7 +20,7 @@ module NCGMonad (
addNodeBetweenNat,
addImmediateSuccessorNat,
updateCfgNat,
- getUniqueNat,
+ getUniqueNat, getCfgNat,
mapAccumLNat,
setDeltaNat,
getDeltaNat,
@@ -65,6 +67,7 @@ import Instruction
import Outputable (SDoc, pprPanic, ppr)
import Cmm (RawCmmDecl, CmmStatics)
import CFG
+import Util
data NcgImpl statics instr jumpDest = NcgImpl {
cmmTopCodeGen :: RawCmmDecl -> NatM [NatCmmDecl statics instr],
@@ -88,7 +91,7 @@ data NcgImpl statics instr jumpDest = NcgImpl {
-- the block's 'UnwindPoint's
-- See Note [What is this unwinding business?] in Debug
-- and Note [Unwinding information in the NCG] in this module.
- invertCondBranches :: CFG -> LabelMap CmmStatics -> [NatBasicBlock instr]
+ invertCondBranches :: Maybe CFG -> LabelMap CmmStatics -> [NatBasicBlock instr]
-> [NatBasicBlock instr]
-- ^ Turn the sequence of `jcc l1; jmp l2` into `jncc l2; <block_l1>`
-- when possible.
@@ -206,7 +209,11 @@ addImportNat imp
updateCfgNat :: (CFG -> CFG) -> NatM ()
updateCfgNat f
- = NatM $ \ st -> ((), st { natm_cfg = f (natm_cfg st) })
+ = NatM $ \ st -> let !cfg' = f (natm_cfg st)
+ in ((), st { natm_cfg = cfg'})
+
+getCfgNat :: NatM CFG
+getCfgNat = NatM $ \ st -> (natm_cfg st, st)
-- | Record that we added a block between `from` and `old`.
addNodeBetweenNat :: BlockId -> BlockId -> BlockId -> NatM ()
@@ -231,7 +238,7 @@ addNodeBetweenNat from between to
-- | Place `succ` after `block` and change any edges
-- block -> X to `succ` -> X
-addImmediateSuccessorNat :: BlockId -> BlockId -> NatM ()
+addImmediateSuccessorNat :: HasDebugCallStack => BlockId -> BlockId -> NatM ()
addImmediateSuccessorNat block succ
= updateCfgNat (addImmediateSuccessor block succ)
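
The BangPatterns change in updateCfgNat forces the updated CFG before it is
stored, so repeated small updates cannot accumulate as a chain of thunks in
the monad state. A hedged, simplified sketch of the pattern on a toy state
monad (not the real NatM):

    {-# LANGUAGE BangPatterns #-}

    newtype StateM s a = StateM { runStateM :: s -> (a, s) }

    -- Strict analogue of updateCfgNat: evaluate the new state eagerly.
    modifyStrict :: (s -> s) -> StateM s ()
    modifyStrict f = StateM $ \st -> let !st' = f st in ((), st')

    -- Analogue of getCfgNat: read the current state.
    getState :: StateM s s
    getState = StateM $ \st -> (st, st)
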
diff --git a/compiler/nativeGen/RegAlloc/Liveness.hs b/compiler/nativeGen/RegAlloc/Liveness.hs
index b7f8d1c871..9d68ca12c1 100644
--- a/compiler/nativeGen/RegAlloc/Liveness.hs
+++ b/compiler/nativeGen/RegAlloc/Liveness.hs
@@ -705,7 +705,7 @@ sccBlocks blocks entries mcfg = map (fmap node_payload) sccs
reachable :: LabelSet
reachable
| Just cfg <- mcfg
- -- Our CFG only contains reachable nodes by construction.
+ -- Our CFG only contains reachable nodes by construction at this point.
= getCfgNodes cfg
| otherwise
= setFromList $ [ node_key node | node <- reachablesG g1 roots ]
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 69ab7b202d..11e79acebc 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1,4 +1,5 @@
{-# LANGUAGE CPP, GADTs, NondecreasingIndentation #-}
+{-# LANGUAGE TupleSections #-}
-- The default iteration limit is a bit too low for the definitions
-- in this module.
@@ -36,6 +37,7 @@ import GhcPrelude
import X86.Instr
import X86.Cond
import X86.Regs
+import X86.Ppr ( )
import X86.RegInfo
--TODO: Remove - Just for development/debugging
@@ -130,6 +132,56 @@ cmmTopCodeGen (CmmProc info lab live graph) = do
cmmTopCodeGen (CmmData sec dat) = do
return [CmmData sec (1, dat)] -- no translation, we just use CmmStatic
+{- Note [Verifying basic blocks]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ We want to guarantee a few things about the results
+ of instruction selection.
+
+ Namely that each basic block consists of:
+ * A (potentially empty) sequence of straight line instructions
+ followed by
+ * A (potentially empty) sequence of jump like instructions.
+
+ We can verify this by going through the instructions and
+ making sure that any non-jumpish instruction can't appear
+ after a jumpish instruction.
+
+ There are gotchas however:
+ * CALLs are strictly speaking control flow, but here we do not
+ care about them. Hence we treat them as regular instructions.
+
+ It's safe for them to appear inside a basic block
+ as (ignoring side effects inside the call) they will result in
+ straight line code.
+
+ * NEWBLOCK marks the start of a new basic block, so it can
+ be followed by any instructions.
+-}
+
+-- Verifying basic blocks is cheap, but not cheap enough to enable it unconditionally.
+verifyBasicBlock :: [Instr] -> ()
+verifyBasicBlock instrs
+ | debugIsOn = go False instrs
+ | otherwise = ()
+ where
+ go _ [] = ()
+ go atEnd (i:instr)
+ = case i of
+ -- Start a new basic block
+ NEWBLOCK {} -> go False instr
+ -- Calls are not viable block terminators
+ CALL {} | atEnd -> faultyBlockWith i
+ | not atEnd -> go atEnd instr
+ -- All instructions ok, check if we reached the end and continue.
+ _ | not atEnd -> go (isJumpishInstr i) instr
+ -- Only jumps allowed at the end of basic blocks.
+ | otherwise -> if isJumpishInstr i
+ then go True instr
+ else faultyBlockWith i
+ faultyBlockWith i
+ = pprPanic "Non control flow instructions after end of basic block."
+ (ppr i <+> text "in:" $$ vcat (map ppr instrs))
basicBlockCodeGen
:: CmmBlock
@@ -148,9 +200,10 @@ basicBlockCodeGen block = do
let line = srcSpanStartLine span; col = srcSpanStartCol span
return $ unitOL $ LOCATION fileId line col name
_ -> return nilOL
- mid_instrs <- stmtsToInstrs id stmts
- tail_instrs <- stmtToInstrs id tail
+ (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
+ (tail_instrs,_) <- stmtToInstrs mid_bid tail
let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs
+ return $! verifyBasicBlock (fromOL instrs)
instrs' <- fold <$> traverse addSpUnwindings instrs
-- code generation may introduce new basic block boundaries, which
-- are indicated by the NEWBLOCK instruction. We must split up the
@@ -180,60 +233,137 @@ addSpUnwindings instr@(DELTA d) = do
else return (unitOL instr)
addSpUnwindings instr = return $ unitOL instr
-stmtsToInstrs :: BlockId -> [CmmNode e x] -> NatM InstrBlock
-stmtsToInstrs bid stmts
- = do instrss <- mapM (stmtToInstrs bid) stmts
- return (concatOL instrss)
+{- Note [Keeping track of the current block]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When generating instructions for Cmm we sometimes require
+the current block for things like retry loops.
+
+We also sometimes change the current block, if a MachOP
+results in branching control flow.
+
+Issues arise if we have two statements in the same block,
+which both depend on the current block id *and* change the
+basic block after them. This happens for atomic primops
+in the X86 backend where we want to update the CFG data structure
+when introducing new basic blocks.
+
+For example in #17334 we got this Cmm code:
+
+ c3Bf: // global
+ (_s3t1::I64) = call MO_AtomicRMW W64 AMO_And(_s3sQ::P64 + 88, 18);
+ (_s3t4::I64) = call MO_AtomicRMW W64 AMO_Or(_s3sQ::P64 + 88, 0);
+ _s3sT::I64 = _s3sV::I64;
+ goto c3B1;
+
+This resulted in two new basic blocks being inserted:
+
+ c3Bf:
+ movl $18,%vI_n3Bo
+ movq 88(%vI_s3sQ),%rax
+ jmp _n3Bp
+ n3Bp:
+ ...
+ cmpxchgq %vI_n3Bq,88(%vI_s3sQ)
+ jne _n3Bp
+ ...
+ jmp _n3Bs
+ n3Bs:
+ ...
+ cmpxchgq %vI_n3Bt,88(%vI_s3sQ)
+ jne _n3Bs
+ ...
+ jmp _c3B1
+ ...
+
+When we called stmtToInstrs on this Cmm, we translated both atomic operations
+under the assumption that they would be placed into their Cmm basic block `c3Bf`.
+However for the retry loop we introduce new labels, so this is not the case
+for the second statement.
+This resulted in a desync between the explicit control flow graph
+we construct as a separate data type and the actual control flow graph in the code.
+
+Instead we now return the new basic block if a statement causes a change
+in the current block and use the block for all following statements.
+
+For this reason genCCall is also split into two parts:
+one for calls which *won't* change the basic block in
+which successive instructions will be placed, and
+a different one for calls which *are* known to change the
+basic block.
+
+-}
+
+-- See Note [Keeping track of the current block] for why
+-- we pass the BlockId.
+stmtsToInstrs :: BlockId -- ^ Basic block these statements will start to be placed in.
+ -> [CmmNode O O] -- ^ Cmm statements
+ -> NatM (InstrBlock, BlockId) -- ^ Resulting instructions and the block they end up in
+stmtsToInstrs bid stmts =
+ go bid stmts nilOL
+ where
+ go bid [] instrs = return (instrs,bid)
+ go bid (s:stmts) instrs = do
+ (instrs',bid') <- stmtToInstrs bid s
+ -- If the statement introduced a new block, we use that one
+ let newBid = fromMaybe bid bid'
+ go newBid stmts (instrs `appOL` instrs')
-- | `bid` refers to the current block and is used to update the CFG
-- if new blocks are inserted in the control flow.
-stmtToInstrs :: BlockId -> CmmNode e x -> NatM InstrBlock
+-- See Note [Keeping track of the current block] for more details.
+stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in.
+ -> CmmNode e x
+ -> NatM (InstrBlock, Maybe BlockId)
+ -- ^ Instructions, and bid of new block if successive
+ -- statements are placed in a different basic block.
stmtToInstrs bid stmt = do
dflags <- getDynFlags
is32Bit <- is32BitPlatform
case stmt of
- CmmComment s -> return (unitOL (COMMENT s))
- CmmTick {} -> return nilOL
-
- CmmUnwind regs -> do
- let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
- to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr)
- case foldMap to_unwind_entry regs of
- tbl | M.null tbl -> return nilOL
- | otherwise -> do
- lbl <- mkAsmTempLabel <$> getUniqueM
- return $ unitOL $ UNWIND lbl tbl
-
- CmmAssign reg src
- | isFloatType ty -> assignReg_FltCode format reg src
- | is32Bit && isWord64 ty -> assignReg_I64Code reg src
- | otherwise -> assignReg_IntCode format reg src
- where ty = cmmRegType dflags reg
- format = cmmTypeFormat ty
-
- CmmStore addr src
- | isFloatType ty -> assignMem_FltCode format addr src
- | is32Bit && isWord64 ty -> assignMem_I64Code addr src
- | otherwise -> assignMem_IntCode format addr src
- where ty = cmmExprType dflags src
- format = cmmTypeFormat ty
-
CmmUnsafeForeignCall target result_regs args
-> genCCall dflags is32Bit target result_regs args bid
- CmmBranch id -> return $ genBranch id
-
- --We try to arrange blocks such that the likely branch is the fallthrough
- --in CmmContFlowOpt. So we can assume the condition is likely false here.
- CmmCondBranch arg true false _ -> genCondBranch bid true false arg
- CmmSwitch arg ids -> do dflags <- getDynFlags
- genSwitch dflags arg ids
- CmmCall { cml_target = arg
- , cml_args_regs = gregs } -> do
- dflags <- getDynFlags
- genJump arg (jumpRegs dflags gregs)
- _ ->
- panic "stmtToInstrs: statement should have been cps'd away"
+ _ -> (,Nothing) <$> case stmt of
+ CmmComment s -> return (unitOL (COMMENT s))
+ CmmTick {} -> return nilOL
+
+ CmmUnwind regs -> do
+ let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
+ to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr)
+ case foldMap to_unwind_entry regs of
+ tbl | M.null tbl -> return nilOL
+ | otherwise -> do
+ lbl <- mkAsmTempLabel <$> getUniqueM
+ return $ unitOL $ UNWIND lbl tbl
+
+ CmmAssign reg src
+ | isFloatType ty -> assignReg_FltCode format reg src
+ | is32Bit && isWord64 ty -> assignReg_I64Code reg src
+ | otherwise -> assignReg_IntCode format reg src
+ where ty = cmmRegType dflags reg
+ format = cmmTypeFormat ty
+
+ CmmStore addr src
+ | isFloatType ty -> assignMem_FltCode format addr src
+ | is32Bit && isWord64 ty -> assignMem_I64Code addr src
+ | otherwise -> assignMem_IntCode format addr src
+ where ty = cmmExprType dflags src
+ format = cmmTypeFormat ty
+
+ CmmBranch id -> return $ genBranch id
+
+ --We try to arrange blocks such that the likely branch is the fallthrough
+ --in CmmContFlowOpt. So we can assume the condition is likely false here.
+ CmmCondBranch arg true false _ -> genCondBranch bid true false arg
+ CmmSwitch arg ids -> do dflags <- getDynFlags
+ genSwitch dflags arg ids
+ CmmCall { cml_target = arg
+ , cml_args_regs = gregs } -> do
+ dflags <- getDynFlags
+ genJump arg (jumpRegs dflags gregs)
+ _ ->
+ panic "stmtToInstrs: statement should have been cps'd away"
jumpRegs :: DynFlags -> [GlobalReg] -> [Reg]
@@ -1772,6 +1902,109 @@ genCondBranch' _ bid id false bool = do
updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false)
return (cond_code `appOL` code)
+{- Note [Introducing cfg edges inside basic blocks]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ During instruction selection a statement `s`
+ in a block B with control of the sort: B -> C
+ will sometimes result in control
+ flow of the sort:
+
+ ┌ < ┐
+ v ^
+ B -> B1 ┴ -> C
+
+ as is the case for some atomic operations.
+
+ Now to keep the CFG in sync when introducing B1 we clearly
+ want to insert it between B and C. However there is
+ a catch when we have to deal with self loops.
+
+ We might start with code and a CFG of these forms:
+
+ loop:
+ stmt1 ┌ < ┐
+ .... v ^
+ stmtX loop ┘
+ stmtY
+ ....
+ goto loop:
+
+ Now we introduce B1:
+ ┌ ─ ─ ─ ─ ─┐
+ loop: │ ┌ < ┐ │
+ instrs v │ │ ^
+ .... loop ┴ B1 ┴ ┘
+ instrsFromX
+ stmtY
+ goto loop:
+
+ This is simple, all outgoing edges from loop now simply
+ start from B1 instead and the code generator knows which
+ new edges it introduced for the self loop of B1.
+
+ Disaster strikes if the statement Y follows the same pattern.
+ If we apply the same rule that all outgoing edges change then
+ we end up with:
+
+ loop ─> B1 ─> B2 ┬─┐
+ │ │ └─<┤ │
+ │ └───<───┘ │
+ └───────<────────┘
+
+ This is problematic. The edge B1->B1 is modified as expected.
+ However the modification is wrong!
+
+ The assembly in this case looked like this:
+
+ _loop:
+ <instrs>
+ _B1:
+ ...
+ cmpxchgq ...
+ jne _B1
+ <instrs>
+ <end _B1>
+ _B2:
+ ...
+ cmpxchgq ...
+ jne _B2
+ <instrs>
+ jmp loop
+
+ There is no edge _B2 -> _B1 here. It's still a self loop onto _B1.
+
+ The problem here is that really B1 should be two basic blocks.
+ Otherwise we have control flow in the *middle* of a basic block.
+ A contradiction!
+
+ So to account for this we add yet another basic block marker:
+
+ _B:
+ <instrs>
+ _B1:
+ ...
+ cmpxchgq ...
+ jne _B1
+ jmp _B1'
+ _B1':
+ <instrs>
+ <end _B1>
+ _B2:
+ ...
+
+ Now when inserting B2 we will only look at the outgoing edges of B1' and
+ everything will work out nicely.
+
+ You might also wonder why we don't insert jumps at the end of _B1'. There is
+ no way another block ends up jumping to the labels _B1 or _B2 since they are
+ essentially invisible to other blocks. View them as control flow labels local
+ to the basic block if you'd like.
+
+ Not doing this ultimately caused (part 2 of) #17334.
+-}
+
+
-- -----------------------------------------------------------------------------
-- Generating C calls
@@ -1789,14 +2022,168 @@ genCCall
-> [CmmFormal] -- where to put the result
-> [CmmActual] -- arguments (of mixed type)
-> BlockId -- The block we are in
- -> NatM InstrBlock
+ -> NatM (InstrBlock, Maybe BlockId)
+
+-- First we deal with cases which might introduce new blocks in the stream.
+
+genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
+ [dst] [addr, n] bid = do
+ use_sse2 <- sse2Enabled
+ Amode amode addr_code <-
+ if amop `elem` [AMO_Add, AMO_Sub]
+ then getAmode addr
+ else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg
+ arg <- getNewRegNat format
+ arg_code <- getAnyReg n
+ let platform = targetPlatform dflags
+ dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
+ (code, lbl) <- op_code dst_r arg amode
+ return (addr_code `appOL` arg_code arg `appOL` code, Just lbl)
+ where
+ -- Code for the operation
+ op_code :: Reg -- Destination reg
+ -> Reg -- Register containing argument
+ -> AddrMode -- Address of location to mutate
+ -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId
+ op_code dst_r arg amode = case amop of
+ -- In the common case where dst_r is a virtual register the
+ -- final move should go away, because it's the last use of arg
+ -- and the first use of dst_r.
+ AMO_Add -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
+ , MOV format (OpReg arg) (OpReg dst_r)
+ ], bid)
+ AMO_Sub -> return $ (toOL [ NEGI format (OpReg arg)
+ , LOCK (XADD format (OpReg arg) (OpAddr amode))
+ , MOV format (OpReg arg) (OpReg dst_r)
+ ], bid)
+ -- In these cases we need a new block id, and have to return it so
+ -- that later instruction selection can reference it.
+ AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst)
+ AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst
+ , NOT format dst
+ ])
+ AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst)
+ AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst)
+ where
+ -- Simulate operation that lacks a dedicated instruction using
+ -- cmpxchg.
+ cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
+ -> NatM (OrdList Instr, BlockId)
+ cmpxchg_code instrs = do
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ tmp <- getNewRegNat format
+
+ --Record inserted blocks
+ -- We turn A -> B into A -> A' -> A'' -> B
+ -- with a self loop on A'.
+ addImmediateSuccessorNat bid lbl1
+ addImmediateSuccessorNat lbl1 lbl2
+ updateCfgNat (addWeightEdge lbl1 lbl1 0)
+
+ return $ (toOL
+ [ MOV format (OpAddr amode) (OpReg eax)
+ , JXX ALWAYS lbl1
+ , NEWBLOCK lbl1
+ -- Keep old value so we can return it:
+ , MOV format (OpReg eax) (OpReg dst_r)
+ , MOV format (OpReg eax) (OpReg tmp)
+ ]
+ `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
+ [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
+ , JXX NE lbl1
+ -- See Note [Introducing cfg edges inside basic blocks]
+ -- for why this basic block is required.
+ , JXX ALWAYS lbl2
+ , NEWBLOCK lbl2
+ ],
+ lbl2)
+ format = intFormat width
+
+genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
+ | is32Bit, width == W64 = do
+ ChildCode64 vcode rlo <- iselExpr64 src
+ use_sse2 <- sse2Enabled
+ let rhi = getHiVRegFromLo rlo
+ dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ let format = if width == W8 then II16 else intFormat width
+ tmp_r <- getNewRegNat format
+
+ -- New CFG Edges:
+ -- bid -> lbl2
+ -- bid -> lbl1 -> lbl2
+ -- We also change edges originating at bid to start at lbl2 instead.
+ updateCfgNat (addWeightEdge bid lbl1 110 .
+ addWeightEdge lbl1 lbl2 110 .
+ addImmediateSuccessor bid lbl2)
+
+ -- The following instruction sequence corresponds to the pseudo-code
+ --
+ -- if (src) {
+ -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
+ -- } else {
+ -- dst = 64;
+ -- }
+ let instrs = vcode `appOL` toOL
+ ([ MOV II32 (OpReg rhi) (OpReg tmp_r)
+ , OR II32 (OpReg rlo) (OpReg tmp_r)
+ , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
+ , JXX EQQ lbl2
+ , JXX ALWAYS lbl1
+
+ , NEWBLOCK lbl1
+ , BSF II32 (OpReg rhi) dst_r
+ , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r)
+ , BSF II32 (OpReg rlo) tmp_r
+ , CMOV NE II32 (OpReg tmp_r) dst_r
+ , JXX ALWAYS lbl2
+
+ , NEWBLOCK lbl2
+ ])
+ return (instrs, Just lbl2)
+
+ | otherwise = do
+ code_src <- getAnyReg src
+ use_sse2 <- sse2Enabled
+ let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
+
+ -- The following insn sequence makes sure 'ctz 0' has a defined value.
+ -- starting with Haswell, one could use the TZCNT insn instead.
+ let format = if width == W8 then II16 else intFormat width
+ src_r <- getNewRegNat format
+ tmp_r <- getNewRegNat format
+ let instrs = code_src src_r `appOL` toOL
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSF format (OpReg src_r) tmp_r
+ , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
+ , CMOV NE format (OpReg tmp_r) dst_r
+ ]) -- NB: We don't need to zero-extend the result for the
+ -- W8/W16 cases because the 'MOV' insn already
+ -- took care of implicitly clearing the upper bits
+ return (instrs, Nothing)
+ where
+ bw = widthInBits width
+ platform = targetPlatform dflags
+
+genCCall dflags bits mop dst args bid = do
+ instr <- genCCall' dflags bits mop dst args bid
+ return (instr, Nothing)
--- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+-- genCCall' handles cases not introducing new code blocks.
+genCCall'
+ :: DynFlags
+ -> Bool -- 32 bit platform?
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> BlockId -- The block we are in
+ -> NatM InstrBlock
--- Unroll memcpy calls if the source and destination pointers are at
--- least DWORD aligned and the number of bytes to copy isn't too
+-- Unroll memcpy calls if the number of bytes to copy isn't too
-- large. Otherwise, call C's memcpy.
-genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _
+genCCall' dflags is32Bit (PrimTarget (MO_Memcpy align)) _
[dst, src, CmmLit (CmmInt n _)] _
| fromInteger insns <= maxInlineMemcpyInsns dflags && align .&. 3 == 0 = do
code_dst <- getAnyReg dst
@@ -1843,7 +2230,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _
dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
(ImmInteger (n - i))
-genCCall dflags _ (PrimTarget (MO_Memset align)) _
+genCCall' dflags _ (PrimTarget (MO_Memset align)) _
[dst,
CmmLit (CmmInt c _),
CmmLit (CmmInt n _)]
@@ -1888,14 +2275,14 @@ genCCall dflags _ (PrimTarget (MO_Memset align)) _
dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
(ImmInteger (n - i))
-genCCall _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL
-genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
+genCCall' _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL
+genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
-- barriers compile to no code on x86/x86-64;
-- we keep it this long in order to prevent earlier optimisations.
-genCCall _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL
+genCCall' _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL
-genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ =
+genCCall' _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ =
case n of
0 -> genPrefetch src $ PREFETCH NTA format
1 -> genPrefetch src $ PREFETCH Lvl2 format
@@ -1916,9 +2303,10 @@ genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ =
((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) ))
-- prefetch always takes an address
-genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
+genCCall' dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
let platform = targetPlatform dflags
- let dst_r = getRegisterReg platform False (CmmLocal dst)
+ use_sse2 <- sse2Enabled
+ let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
case width of
W64 | is32Bit -> do
ChildCode64 vcode rlo <- iselExpr64 src
@@ -1938,7 +2326,7 @@ genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
where
format = intFormat width
-genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
+genCCall' dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
args@[src] bid = do
sse4_2 <- sse4_2Enabled
let platform = targetPlatform dflags
@@ -1964,20 +2352,21 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
- genCCall dflags is32Bit target dest_regs args bid
+ genCCall' dflags is32Bit target dest_regs args bid
where
format = intFormat width
lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width))
-genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
+genCCall' dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
args@[src, mask] bid = do
let platform = targetPlatform dflags
+ use_sse2 <- sse2Enabled
if isBmi2Enabled dflags
then do code_src <- getAnyReg src
code_mask <- getAnyReg mask
src_r <- getNewRegNat format
mask_r <- getNewRegNat format
- let dst_r = getRegisterReg platform False (CmmLocal dst)
+ let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
return $ code_src src_r `appOL` code_mask mask_r `appOL`
(if width == W8 then
-- The PDEP instruction doesn't take a r/m8
@@ -1997,20 +2386,21 @@ genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
- genCCall dflags is32Bit target dest_regs args bid
+ genCCall' dflags is32Bit target dest_regs args bid
where
format = intFormat width
lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width))
-genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
+genCCall' dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
args@[src, mask] bid = do
let platform = targetPlatform dflags
+ use_sse2 <- sse2Enabled
if isBmi2Enabled dflags
then do code_src <- getAnyReg src
code_mask <- getAnyReg mask
src_r <- getNewRegNat format
mask_r <- getNewRegNat format
- let dst_r = getRegisterReg platform False (CmmLocal dst)
+ let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
return $ code_src src_r `appOL` code_mask mask_r `appOL`
(if width == W8 then
-- The PEXT instruction doesn't take a r/m8
@@ -2030,19 +2420,19 @@ genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
- genCCall dflags is32Bit target dest_regs args bid
+ genCCall' dflags is32Bit target dest_regs args bid
where
format = intFormat width
lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width))
-genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid
+genCCall' dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid
| is32Bit && width == W64 = do
-- Fallback to `hs_clz64` on i386
targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
- genCCall dflags is32Bit target dest_regs args bid
+ genCCall' dflags is32Bit target dest_regs args bid
| otherwise = do
code_src <- getAnyReg src
@@ -2067,162 +2457,37 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] b
format = if width == W8 then II16 else intFormat width
lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width))
-genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
- | is32Bit, width == W64 = do
- ChildCode64 vcode rlo <- iselExpr64 src
- let rhi = getHiVRegFromLo rlo
- dst_r = getRegisterReg platform False (CmmLocal dst)
- lbl1 <- getBlockIdNat
- lbl2 <- getBlockIdNat
- tmp_r <- getNewRegNat format
-
- -- New CFG Edges:
- -- bid -> lbl2
- -- bid -> lbl1 -> lbl2
- -- We also changes edges originating at bid to start at lbl2 instead.
- updateCfgNat (addWeightEdge bid lbl1 110 .
- addWeightEdge lbl1 lbl2 110 .
- addImmediateSuccessor bid lbl2)
-
- -- The following instruction sequence corresponds to the pseudo-code
- --
- -- if (src) {
- -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
- -- } else {
- -- dst = 64;
- -- }
- return $ vcode `appOL` toOL
- ([ MOV II32 (OpReg rhi) (OpReg tmp_r)
- , OR II32 (OpReg rlo) (OpReg tmp_r)
- , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
- , JXX EQQ lbl2
- , JXX ALWAYS lbl1
-
- , NEWBLOCK lbl1
- , BSF II32 (OpReg rhi) dst_r
- , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r)
- , BSF II32 (OpReg rlo) tmp_r
- , CMOV NE II32 (OpReg tmp_r) dst_r
- , JXX ALWAYS lbl2
-
- , NEWBLOCK lbl2
- ])
-
- | otherwise = do
- code_src <- getAnyReg src
- src_r <- getNewRegNat format
- tmp_r <- getNewRegNat format
- let dst_r = getRegisterReg platform False (CmmLocal dst)
-
- -- The following insn sequence makes sure 'ctz 0' has a defined value.
- -- starting with Haswell, one could use the TZCNT insn instead.
- return $ code_src src_r `appOL` toOL
- ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
- [ BSF format (OpReg src_r) tmp_r
- , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
- , CMOV NE format (OpReg tmp_r) dst_r
- ]) -- NB: We don't need to zero-extend the result for the
- -- W8/W16 cases because the 'MOV' insn already
- -- took care of implicitly clearing the upper bits
- where
- bw = widthInBits width
- platform = targetPlatform dflags
- format = if width == W8 then II16 else intFormat width
-
-genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
+genCCall' dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
targetExpr <- cmmMakeDynamicReference dflags
CallReference lbl
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
- genCCall dflags is32Bit target dest_regs args bid
+ genCCall' dflags is32Bit target dest_regs args bid
where
lbl = mkCmmCodeLabel primUnitId (fsLit (word2FloatLabel width))
-genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
- [dst] [addr, n] bid = do
- Amode amode addr_code <-
- if amop `elem` [AMO_Add, AMO_Sub]
- then getAmode addr
- else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg
- arg <- getNewRegNat format
- arg_code <- getAnyReg n
- use_sse2 <- sse2Enabled
- let platform = targetPlatform dflags
- dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
- code <- op_code dst_r arg amode
- return $ addr_code `appOL` arg_code arg `appOL` code
- where
- -- Code for the operation
- op_code :: Reg -- Destination reg
- -> Reg -- Register containing argument
- -> AddrMode -- Address of location to mutate
- -> NatM (OrdList Instr)
- op_code dst_r arg amode = case amop of
- -- In the common case where dst_r is a virtual register the
- -- final move should go away, because it's the last use of arg
- -- and the first use of dst_r.
- AMO_Add -> return $ toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
- , MOV format (OpReg arg) (OpReg dst_r)
- ]
- AMO_Sub -> return $ toOL [ NEGI format (OpReg arg)
- , LOCK (XADD format (OpReg arg) (OpAddr amode))
- , MOV format (OpReg arg) (OpReg dst_r)
- ]
- AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst)
- AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst
- , NOT format dst
- ])
- AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst)
- AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst)
- where
- -- Simulate operation that lacks a dedicated instruction using
- -- cmpxchg.
- cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
- -> NatM (OrdList Instr)
- cmpxchg_code instrs = do
- lbl <- getBlockIdNat
- tmp <- getNewRegNat format
-
- --Record inserted blocks
- addImmediateSuccessorNat bid lbl
- updateCfgNat (addWeightEdge lbl lbl 0)
-
- return $ toOL
- [ MOV format (OpAddr amode) (OpReg eax)
- , JXX ALWAYS lbl
- , NEWBLOCK lbl
- -- Keep old value so we can return it:
- , MOV format (OpReg eax) (OpReg dst_r)
- , MOV format (OpReg eax) (OpReg tmp)
- ]
- `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
- [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
- , JXX NE lbl
- ]
-
- format = intFormat width
-
-genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
+genCCall' dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
load_code <- intLoadCode (MOV (intFormat width)) addr
let platform = targetPlatform dflags
use_sse2 <- sse2Enabled
+
return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst)))
-genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
+genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
code <- assignMem_IntCode (intFormat width) addr val
return $ code `snocOL` MFENCE
-genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do
+genCCall' dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do
-- On x86 we don't have enough registers to use cmpxchg with a
-- complicated addressing mode, so on that architecture we
-- pre-compute the address first.
+ use_sse2 <- sse2Enabled
Amode amode addr_code <- getSimpleAmode dflags is32Bit addr
newval <- getNewRegNat format
newval_code <- getAnyReg new
oldval <- getNewRegNat format
oldval_code <- getAnyReg old
- use_sse2 <- sse2Enabled
let platform = targetPlatform dflags
dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
code = toOL
@@ -2235,7 +2500,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _
where
format = intFormat width
-genCCall _ is32Bit target dest_regs args bid = do
+genCCall' _ is32Bit target dest_regs args bid = do
dflags <- getDynFlags
let platform = targetPlatform dflags
sse2 = isSse2Enabled dflags
@@ -2853,7 +3118,8 @@ outOfLineCmmOp bid mop res args
let target = ForeignTarget targetExpr
(ForeignConvention CCallConv [] [] CmmMayReturn)
- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args)
+ (instrs, _) <- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args)
+ return instrs
where
-- Assume we can call these functions directly, and that they're not in a dynamic library.
-- TODO: Why is this ok? Under linux this code will be in libm.so
@@ -3426,10 +3692,14 @@ needLlvm =
-- | This works on the invariant that all jumps in the given blocks are required.
-- Starting from there we try to make a few more jumps redundant by reordering
-- them.
-invertCondBranches :: CFG -> LabelMap a -> [NatBasicBlock Instr]
+-- We depend on the information in the CFG to do so. Without a given CFG
+-- we do nothing.
+invertCondBranches :: Maybe CFG -- ^ CFG if present
+ -> LabelMap a -- ^ Blocks with info tables
+ -> [NatBasicBlock Instr] -- ^ List of basic blocks
-> [NatBasicBlock Instr]
-invertCondBranches cfg keep bs =
- --trace "Foo" $
+invertCondBranches Nothing _ bs = bs
+invertCondBranches (Just cfg) keep bs =
invert bs
where
invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr]
@@ -3448,7 +3718,7 @@ invertCondBranches cfg keep bs =
, Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg
-- Both jumps come from the same cmm statement
, transitionSource edgeInfo1 == transitionSource edgeInfo2
- , (CmmSource cmmCondBranch) <- transitionSource edgeInfo1
+ , CmmSource cmmCondBranch <- transitionSource edgeInfo1
--Int comparisons are invertable
, CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch
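
The core of the fix in this file is the BlockId threading described in
Note [Keeping track of the current block]: every statement may report the
block id in which later statements end up. A hedged, standalone sketch of that
accumulation follows; the types are toys of my own and translateStmt is a
hypothetical placeholder for per-statement instruction selection, not the real
stmtToInstrs:

    type BlockId = Int
    type Instrs  = [String]

    -- Hypothetical per-statement translation: instructions, plus the id of a
    -- new basic block if the statement introduced one (as atomic primops do).
    translateStmt :: BlockId -> String -> (Instrs, Maybe BlockId)
    translateStmt bid stmt = ([stmt ++ " @ block " ++ show bid], Nothing)

    -- Analogue of the new stmtsToInstrs loop: thread the current block id.
    translateStmts :: BlockId -> [String] -> (Instrs, BlockId)
    translateStmts = go []
      where
        go acc bid []       = (acc, bid)
        go acc bid (s:rest) =
          let (is, mbid) = translateStmt bid s
              bid'       = maybe bid id mbid   -- switch to the new block if one was started
          in go (acc ++ is) bid' rest
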
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 9f4a25b23e..bddd4fdc74 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -325,7 +325,9 @@ data Instr
[Maybe JumpDest] -- Targets of the jump table
Section -- Data section jump table should be put in
CLabel -- Label of jump table
- | CALL (Either Imm Reg) [Reg]
+ -- | X86 call instruction
+ | CALL (Either Imm Reg) -- ^ Jump target
+ [Reg] -- ^ Arguments (required for register allocation)
-- Other things.
| CLTD Format -- sign extend %eax into %edx:%eax
diff --git a/testsuite/tests/codeGen/should_compile/T17334.hs b/testsuite/tests/codeGen/should_compile/T17334.hs
new file mode 100644
index 0000000000..6ad6d347e8
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T17334.hs
@@ -0,0 +1,240 @@
+-- Reproducer for T17334
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE UnboxedTuples #-}
+
+--Reproducer uses 64bit literals in reverseWord.
+--It's ok to truncate those in x86
+{-# OPTIONS_GHC -Wno-overflowed-literals #-}
+
+module Bug (reverseInPlace) where
+
+import Control.Monad.ST
+import Data.Bits
+import GHC.Exts
+import GHC.ST (ST(..))
+import Data.Kind
+
+reverseInPlace :: PrimMonad m => UMVector (PrimState m) Bit -> m ()
+reverseInPlace xs | len == 0 = pure ()
+ | otherwise = loop 0
+ where
+ len = ulength xs
+
+ loop !i
+ | i' <= j' = do
+ x <- readWord xs i
+ y <- readWord xs j'
+
+ writeWord xs i (reverseWord y)
+ writeWord xs j' (reverseWord x)
+
+ loop i'
+ | i' < j = do
+ let w = (j - i) `shiftR` 1
+ k = j - w
+ x <- readWord xs i
+ y <- readWord xs k
+
+ writeWord xs i (meld w (reversePartialWord w y) x)
+ writeWord xs k (meld w (reversePartialWord w x) y)
+
+ loop i'
+ | otherwise = do
+ let w = j - i
+ x <- readWord xs i
+ writeWord xs i (meld w (reversePartialWord w x) x)
+ where
+ !j = len - i
+ !i' = i + wordSize
+ !j' = j - wordSize
+{-# SPECIALIZE reverseInPlace :: UMVector s Bit -> ST s () #-}
+
+newtype Bit = Bit { unBit :: Bool }
+
+instance Unbox Bit
+
+data instance UMVector s Bit = BitMVec !Int !Int !(MutableByteArray s)
+data instance UVector Bit = BitVec !Int !Int !ByteArray
+
+readWord :: PrimMonad m => UMVector (PrimState m) Bit -> Int -> m Word
+readWord !(BitMVec _ 0 _) _ = pure 0
+readWord !(BitMVec off len' arr) !i' = do
+ let len = off + len'
+ i = off + i'
+ nMod = modWordSize i
+ loIx = divWordSize i
+ loWord <- readByteArray arr loIx
+
+ if nMod == 0
+ then pure loWord
+ else if loIx == divWordSize (len - 1)
+ then pure (loWord `unsafeShiftR` nMod)
+ else do
+ hiWord <- readByteArray arr (loIx + 1)
+ pure
+ $ (loWord `unsafeShiftR` nMod)
+ .|. (hiWord `unsafeShiftL` (wordSize - nMod))
+{-# SPECIALIZE readWord :: UMVector s Bit -> Int -> ST s Word #-}
+{-# INLINE readWord #-}
+
+writeWord :: PrimMonad m => UMVector (PrimState m) Bit -> Int -> Word -> m ()
+writeWord !(BitMVec _ 0 _) _ _ = pure ()
+writeWord !(BitMVec off len' arr@(MutableByteArray mba)) !i' !x@(W# x#) = do
+ let len = off + len'
+ lenMod = modWordSize len
+ i = off + i'
+ nMod = modWordSize i
+ loIx@(I# loIx#) = divWordSize i
+
+ if nMod == 0
+ then if len >= i + wordSize
+ then primitive $ \state ->
+ (# atomicWriteIntArray# mba loIx# (word2Int# x#) state, () #)
+ else do
+ let W# andMask# = hiMask lenMod
+ W# orMask# = x .&. loMask lenMod
+ primitive $ \state ->
+ let !(# state', _ #) = fetchAndIntArray# mba loIx# (word2Int# andMask#) state in
+ let !(# state'', _ #) = fetchOrIntArray# mba loIx# (word2Int# orMask#) state' in
+ (# state'', () #)
+ else if loIx == divWordSize (len - 1)
+ then do
+ loWord <- readByteArray arr loIx
+ if lenMod == 0
+ then
+ writeByteArray arr loIx
+ $ (loWord .&. loMask nMod)
+ .|. (x `unsafeShiftL` nMod)
+ else
+ writeByteArray arr loIx
+ $ (loWord .&. (loMask nMod .|. hiMask lenMod))
+ .|. ((x `unsafeShiftL` nMod) .&. loMask lenMod)
+ else do
+ loWord <- readByteArray arr loIx
+ writeByteArray arr loIx
+ $ (loWord .&. loMask nMod)
+ .|. (x `unsafeShiftL` nMod)
+ hiWord <- readByteArray arr (loIx + 1)
+ writeByteArray arr (loIx + 1)
+ $ (hiWord .&. hiMask nMod)
+ .|. (x `unsafeShiftR` (wordSize - nMod))
+{-# SPECIALIZE writeWord :: UMVector s Bit -> Int -> Word -> ST s () #-}
+{-# INLINE writeWord #-}
+
+instance GMVector UMVector Bit where
+ {-# INLINE basicLength #-}
+ basicLength (BitMVec _ n _) = n
+
+instance GVector UVector Bit where
+
+wordSize :: Int
+wordSize = finiteBitSize (0 :: Word)
+
+lgWordSize :: Int
+lgWordSize = case wordSize of
+ 32 -> 5
+ 64 -> 6
+ _ -> error "wordsToBytes: unknown architecture"
+
+divWordSize :: Bits a => a -> a
+divWordSize x = unsafeShiftR x lgWordSize
+{-# INLINE divWordSize #-}
+
+modWordSize :: Int -> Int
+modWordSize x = x .&. (wordSize - 1)
+{-# INLINE modWordSize #-}
+
+mask :: Int -> Word
+mask b = m
+ where
+ m | b >= finiteBitSize m = complement 0
+ | b < 0 = 0
+ | otherwise = bit b - 1
+
+meld :: Int -> Word -> Word -> Word
+meld b lo hi = (lo .&. m) .|. (hi .&. complement m) where m = mask b
+{-# INLINE meld #-}
+
+reverseWord :: Word -> Word
+reverseWord x0 = x6
+ where
+ x1 = ((x0 .&. 0x5555555555555555) `shiftL` 1) .|. ((x0 .&. 0xAAAAAAAAAAAAAAAA) `shiftR` 1)
+ x2 = ((x1 .&. 0x3333333333333333) `shiftL` 2) .|. ((x1 .&. 0xCCCCCCCCCCCCCCCC) `shiftR` 2)
+ x3 = ((x2 .&. 0x0F0F0F0F0F0F0F0F) `shiftL` 4) .|. ((x2 .&. 0xF0F0F0F0F0F0F0F0) `shiftR` 4)
+ x4 = ((x3 .&. 0x00FF00FF00FF00FF) `shiftL` 8) .|. ((x3 .&. 0xFF00FF00FF00FF00) `shiftR` 8)
+ x5 = ((x4 .&. 0x0000FFFF0000FFFF) `shiftL` 16) .|. ((x4 .&. 0xFFFF0000FFFF0000) `shiftR` 16)
+ x6 = ((x5 .&. 0x00000000FFFFFFFF) `shiftL` 32) .|. ((x5 .&. 0xFFFFFFFF00000000) `shiftR` 32)
+
+reversePartialWord :: Int -> Word -> Word
+reversePartialWord n w | n >= wordSize = reverseWord w
+ | otherwise = reverseWord w `shiftR` (wordSize - n)
+
+loMask :: Int -> Word
+loMask n = 1 `unsafeShiftL` n - 1
+{-# INLINE loMask #-}
+
+hiMask :: Int -> Word
+hiMask n = complement (1 `unsafeShiftL` n - 1)
+{-# INLINE hiMask #-}
+
+class GMVector v a where
+ basicLength :: v s a -> Int
+
+glength :: GMVector v a => v s a -> Int
+{-# INLINE glength #-}
+glength = basicLength
+
+type family GMutable (v :: Type -> Type) :: Type -> Type -> Type
+class GMVector (GMutable v) a => GVector v a
+data family UMVector s a
+data family UVector a
+class (GVector UVector a, GMVector UMVector a) => Unbox a
+type instance GMutable UVector = UMVector
+
+ulength :: Unbox a => UMVector s a -> Int
+{-# INLINE ulength #-}
+ulength = glength
+
+data ByteArray = ByteArray ByteArray#
+data MutableByteArray s = MutableByteArray (MutableByteArray# s)
+
+readByteArray
+ :: (Prim a, PrimMonad m) => MutableByteArray (PrimState m) -> Int -> m a
+{-# INLINE readByteArray #-}
+readByteArray (MutableByteArray arr#) (I# i#)
+ = primitive (readByteArray# arr# i#)
+
+writeByteArray
+ :: (Prim a, PrimMonad m) => MutableByteArray (PrimState m) -> Int -> a -> m ()
+{-# INLINE writeByteArray #-}
+writeByteArray (MutableByteArray arr#) (I# i#) x
+ = primitive_ (writeByteArray# arr# i# x)
+
+class Prim a where
+ readByteArray# :: MutableByteArray# s -> Int# -> State# s -> (# State# s, a #)
+ writeByteArray# :: MutableByteArray# s -> Int# -> a -> State# s -> State# s
+
+instance Prim Word where
+ readByteArray# arr# i# s# = case readWordArray# arr# i# s# of
+ (# s1#, x# #) -> (# s1#, W# x# #)
+ writeByteArray# arr# i# (W# x#) s# = writeWordArray# arr# i# x# s#
+
+class Monad m => PrimMonad m where
+ type PrimState m
+ primitive :: (State# (PrimState m) -> (# State# (PrimState m), a #)) -> m a
+
+instance PrimMonad (ST s) where
+ type PrimState (ST s) = s
+ primitive = ST
+ {-# INLINE primitive #-}
+
+primitive_ :: PrimMonad m
+ => (State# (PrimState m) -> State# (PrimState m)) -> m ()
+{-# INLINE primitive_ #-}
+primitive_ f = primitive (\s# ->
+ case f s# of
+ s'# -> (# s'#, () #))