summaryrefslogtreecommitdiff
path: root/compiler
diff options
context:
space:
mode:
authorJan Stolarek <jan.stolarek@p.lodz.pl>2013-09-12 14:29:37 +0100
committerJan Stolarek <jan.stolarek@p.lodz.pl>2013-09-12 17:04:41 +0100
commitad15c2b4bd37082ce989268b3d2f86a2cd34386a (patch)
tree7e72bd4792691f93f27ae8bae6589963755c4cf7 /compiler
parent66aa489fcbfca30dc3c3b553fce4f1e4debfb7c1 (diff)
downloadhaskell-ad15c2b4bd37082ce989268b3d2f86a2cd34386a.tar.gz
Improve sinking pass
This commit does two things: * Allows duplicating of global registers and literals by inlining them. Previously we would only inline global register or literal if it was used only once. * Changes method of determining conflicts between a node and an assignment. New method has two advantages. It relies on DefinerOfRegs and UserOfRegs typeclasses, so if a set of registers defined or used by a node should ever change, `conflicts` function will use the changed definition. This definition also catches more cases than the previous one (namely CmmCall and CmmForeignCall) which is a step towards making it possible to run sinking pass before stack layout (currently this doesn't work). This patch also adds a lot of comments that are result of about two-week long investigation of how sinking pass works and why it does what it does.
Diffstat (limited to 'compiler')
-rw-r--r--compiler/cmm/CmmLayoutStack.hs21
-rw-r--r--compiler/cmm/CmmPipeline.hs140
-rw-r--r--compiler/cmm/CmmSink.hs97
3 files changed, 216 insertions, 42 deletions
diff --git a/compiler/cmm/CmmLayoutStack.hs b/compiler/cmm/CmmLayoutStack.hs
index 2b2dccdaed..17d111d46f 100644
--- a/compiler/cmm/CmmLayoutStack.hs
+++ b/compiler/cmm/CmmLayoutStack.hs
@@ -164,24 +164,24 @@ layout dflags procpoints liveness entry entry_args final_stackmaps final_hwm blo
go (b0 : bs) acc_stackmaps acc_hwm acc_blocks
= do
let (entry0@(CmmEntry entry_lbl), middle0, last0) = blockSplit b0
-
+
let stack0@StackMap { sm_sp = sp0 }
= mapFindWithDefault
(pprPanic "no stack map for" (ppr entry_lbl))
entry_lbl acc_stackmaps
-
+
-- pprTrace "layout" (ppr entry_lbl <+> ppr stack0) $ return ()
-
+
-- (a) Update the stack map to include the effects of
-- assignments in this block
let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0
-
+
-- (b) Insert assignments to reload all the live variables if this
-- block is a proc point
let middle1 = if entry_lbl `setMember` procpoints
then foldr blockCons middle0 (insertReloads stack0)
else middle0
-
+
-- (c) Look at the last node and if we are making a call or
-- jumping to a proc point, we must save the live
-- variables, adjust Sp, and construct the StackMaps for
@@ -190,7 +190,7 @@ layout dflags procpoints liveness entry entry_args final_stackmaps final_hwm blo
(middle2, sp_off, last1, fixup_blocks, out)
<- handleLastNode dflags procpoints liveness cont_info
acc_stackmaps stack1 middle0 last0
-
+
-- pprTrace "layout(out)" (ppr out) $ return ()
-- (d) Manifest Sp: run over the nodes in the block and replace
@@ -566,9 +566,9 @@ setupStackFrame dflags lbl liveness updfr_off ret_args stack0
-- So to fix this we want to set up the stack frame before the
-- conditional jump. How do we know when to do this, and when it is
-- safe? The basic idea is, when we see the assignment
---
+--
-- Sp[young(L)] = L
---
+--
-- we know that
-- * we are definitely heading for L
-- * there can be no more reads from another stack area, because young(L)
@@ -877,7 +877,7 @@ stackMapToLiveness dflags StackMap{..} =
-- Lowering safe foreign calls
{-
-Note [lower safe foreign calls]
+Note [Lower safe foreign calls]
We start with
@@ -907,7 +907,8 @@ live across the call. Our job now is to expand the call so we get
...
Note the copyOut, which saves the results in the places that L1 is
-expecting them (see Note {safe foreign call convention]).
+expecting them (see Note [safe foreign call convention]). Note also
+that the safe foreign call is replaced by an unsafe one in the Cmm graph.
-}
lowerSafeForeignCall :: DynFlags -> CmmBlock -> UniqSM CmmBlock
diff --git a/compiler/cmm/CmmPipeline.hs b/compiler/cmm/CmmPipeline.hs
index dfcdce0dfb..86504662c4 100644
--- a/compiler/cmm/CmmPipeline.hs
+++ b/compiler/cmm/CmmPipeline.hs
@@ -88,13 +88,6 @@ cpsTop hsc_env proc =
when (not (setNull noncall_pps) && dopt Opt_D_dump_cmm dflags) $
pprTrace "Non-call proc points: " (ppr noncall_pps) $ return ()
- ----------- Sink and inline assignments *before* stack layout -----------
- {- Maybe enable this later
- g <- {-# SCC "sink1" #-}
- condPass Opt_CmmSink (cmmSink dflags) g
- Opt_D_dump_cmm_rewrite "Sink assignments (1)"
- -}
-
----------- Layout the stack and manifest Sp ----------------------------
(g, stackmaps) <-
{-# SCC "layoutStack" #-}
@@ -103,10 +96,10 @@ cpsTop hsc_env proc =
else return (g, mapEmpty)
dump Opt_D_dump_cmm_sp "Layout Stack" g
- ----------- Sink and inline assignments *after* stack layout ------------
- g <- {-# SCC "sink2" #-}
+ ----------- Sink and inline assignments --------------------------------
+ g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
condPass Opt_CmmSink (cmmSink dflags) g
- Opt_D_dump_cmm_rewrite "Sink assignments (2)"
+ Opt_D_dump_cmm_rewrite "Sink assignments"
------------- CAF analysis ----------------------------------------------
let cafEnv = {-# SCC "cafAnal" #-} cafAnal g
@@ -190,6 +183,133 @@ cpsTop hsc_env proc =
(ArchPPC, OSDarwin, pic) -> pic
_ -> False
+-- Note [Sinking after stack layout]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- In the past we considered running sinking pass also before stack
+-- layout, but after making some measurements we realized that:
+--
+-- a) running sinking only before stack layout produces slower
+-- code than running sinking only after stack layout
+--
+-- b) running sinking both before and after stack layout produces
+-- code that has the same performance as when running sinking
+-- only after stack layout.
+--
+-- In other words sinking before stack layout doesn't buy us anything.
+--
+-- An interesting question is "why is it better to run sinking after
+-- stack layout"? It seems that the major reason are stores and loads
+-- generated by stack layout. Consider this code before stack layout:
+--
+-- c1E:
+-- _c1C::P64 = R3;
+-- _c1B::P64 = R2;
+-- _c1A::P64 = R1;
+-- I64[(young<c1D> + 8)] = c1D;
+-- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+-- c1D:
+-- R3 = _c1C::P64;
+-- R2 = _c1B::P64;
+-- R1 = _c1A::P64;
+-- call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- Stack layout pass will save all local variables live across a call
+-- (_c1C, _c1B and _c1A in this example) on the stack just before
+-- making a call and reload them from the stack after returning from a
+-- call:
+--
+-- c1E:
+-- _c1C::P64 = R3;
+-- _c1B::P64 = R2;
+-- _c1A::P64 = R1;
+-- I64[Sp - 32] = c1D;
+-- P64[Sp - 24] = _c1A::P64;
+-- P64[Sp - 16] = _c1B::P64;
+-- P64[Sp - 8] = _c1C::P64;
+-- Sp = Sp - 32;
+-- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+-- c1D:
+-- _c1A::P64 = P64[Sp + 8];
+-- _c1B::P64 = P64[Sp + 16];
+-- _c1C::P64 = P64[Sp + 24];
+-- R3 = _c1C::P64;
+-- R2 = _c1B::P64;
+-- R1 = _c1A::P64;
+-- Sp = Sp + 32;
+-- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- If we don't run sinking pass after stack layout we are basically
+-- left with such code. However, running sinking on this code can lead
+-- to significant improvements:
+--
+-- c1E:
+-- I64[Sp - 32] = c1D;
+-- P64[Sp - 24] = R1;
+-- P64[Sp - 16] = R2;
+-- P64[Sp - 8] = R3;
+-- Sp = Sp - 32;
+-- call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+-- c1D:
+-- R3 = P64[Sp + 24];
+-- R2 = P64[Sp + 16];
+-- R1 = P64[Sp + 8];
+-- Sp = Sp + 32;
+-- call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- Now we only have 9 assignments instead of 15.
+--
+-- There is one case when running sinking before stack layout could
+-- be beneficial. Consider this:
+--
+-- L1:
+-- x = y
+-- call f() returns L2
+-- L2: ...x...y...
+--
+-- Since both x and y are live across a call to f, they will be stored
+-- on the stack during stack layout and restored after the call:
+--
+-- L1:
+-- x = y
+-- P64[Sp - 24] = L2
+-- P64[Sp - 16] = x
+-- P64[Sp - 8] = y
+-- Sp = Sp - 24
+-- call f() returns L2
+-- L2:
+-- y = P64[Sp + 16]
+-- x = P64[Sp + 8]
+-- Sp = Sp + 24
+-- ...x...y...
+--
+-- However, if we run sinking before stack layout we would propagate x
+-- to its usage place (both x and y must be local registers for this to
+-- be possible - global registers cannot be floated past a call):
+--
+-- L1:
+-- x = y
+-- call f() returns L2
+-- L2: ...y...y...
+--
+-- Thus making x dead at the call to f(). If we ran stack layout now
+-- we would generate fewer stores and loads:
+--
+-- L1:
+-- x = y
+-- P64[Sp - 16] = L2
+-- P64[Sp - 8] = y
+-- Sp = Sp - 16
+-- call f() returns L2
+-- L2:
+-- y = P64[Sp + 8]
+-- Sp = Sp + 16
+-- ...y...y...
+--
+-- But since we don't see any benefits from running sinking before stack
+-- layout, this situation probably doesn't arise too often in practice.
+--
+
{- Note [inconsistent-pic-reg]
On x86/Darwin, PIC is implemented by inserting a sequence like
diff --git a/compiler/cmm/CmmSink.hs b/compiler/cmm/CmmSink.hs
index 41323ecad3..7b5aaa6aff 100644
--- a/compiler/cmm/CmmSink.hs
+++ b/compiler/cmm/CmmSink.hs
@@ -3,8 +3,6 @@ module CmmSink (
cmmSink
) where
-import CodeGen.Platform (callerSaves)
-
import Cmm
import CmmOpt
import BlockId
@@ -220,18 +218,26 @@ cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
-isSmall (CmmReg (CmmLocal _)) = True -- not globals, we want to coalesce them instead
+isSmall (CmmReg (CmmLocal _)) = True -- not globals, we want to coalesce them instead* See below
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
+
+Coalesce global registers? What does that mean? We observed no decrease
+in performance coming from inlining of global registers, hence we do it now
+(see isTrivial function). Ideally we'd like to measure performance using
+some tool like perf or VTune and make decisions what to inline based on that.
-}
+--
+-- We allow duplication of trivial expressions: registers (both local and
+-- global) and literals.
+--
isTrivial :: CmmExpr -> Bool
-isTrivial (CmmReg (CmmLocal _)) = True
--- isTrivial (CmmLit _) = True -- Disabled because it used to make thing worse.
- -- Needs further investigation
-isTrivial _ = False
+isTrivial (CmmReg _) = True
+isTrivial (CmmLit _) = True
+isTrivial _ = False
--
-- annotate each node with the set of registers live *after* the node
@@ -430,6 +436,7 @@ tryToInline dflags live node assigs = go usages node [] assigs
inline other = other
-- Note [dependent assignments]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
@@ -474,7 +481,8 @@ regsUsedIn ls e = wrapRecExpf f e False
-- nor the NCG can do it. See Note [Register parameter passing]
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
-okToInline dflags expr CmmUnsafeForeignCall{} = not (anyCallerSavesRegs dflags expr)
+okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
+ not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True
-- -----------------------------------------------------------------------------
@@ -487,23 +495,23 @@ okToInline _ _ _ = True
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node
- -- (1) an assignment to a register conflicts with a use of the register
- | CmmAssign reg _ <- node, reg `regUsedIn` rhs = True
+ -- (1) node defines registers used by rhs of assignment. This catches
+ -- assignments and all three kinds of calls. See Note [Sinking and calls]
+ | globalRegistersConflict dflags rhs node = True
+ | localRegistersConflict dflags rhs node = True
+
+ -- (2) node uses register defined by assignment
| foldRegsUsed dflags (\b r' -> r == r' || b) False node = True
- -- (2) a store to an address conflicts with a read of the same memory
+ -- (3) a store to an address conflicts with a read of the same memory
| CmmStore addr' e <- node
, memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True
- -- (3) an assignment to Hp/Sp conflicts with a heap/stack read respectively
+ -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
| HeapMem <- addr, CmmAssign (CmmGlobal Hp) _ <- node = True
| StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
| SpMem{} <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True
- -- (4) assignments that read caller-saves GlobalRegs conflict with a
- -- foreign call. See Note [Unsafe foreign calls clobber caller-save registers]
- | CmmUnsafeForeignCall{} <- node, anyCallerSavesRegs dflags rhs = True
-
-- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
| CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem = True
@@ -513,12 +521,57 @@ conflicts dflags (r, rhs, addr) node
-- (7) otherwise, no conflict
| otherwise = False
-
-anyCallerSavesRegs :: DynFlags -> CmmExpr -> Bool
-anyCallerSavesRegs dflags e = wrapRecExpf f e False
- where f (CmmReg (CmmGlobal r)) _
- | callerSaves (targetPlatform dflags) r = True
- f _ z = z
+-- Returns True if node defines any global registers that are used in the
+-- Cmm expression
+globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
+globalRegistersConflict dflags expr node =
+ foldRegsDefd dflags (\b r -> b || (CmmGlobal r) `regUsedIn` expr) False node
+
+-- Returns True if node defines any local registers that are used in the
+-- Cmm expression
+localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
+localRegistersConflict dflags expr node =
+ foldRegsDefd dflags (\b r -> b || (CmmLocal r) `regUsedIn` expr) False node
+
+-- Note [Sinking and calls]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
+-- and unsafe foreign (CmmUnsafeForeignCall). We perform sinking pass after
+-- stack layout (see Note [Sinking after stack layout]) which leads to two
+-- invariants related to calls:
+--
+-- a) during stack layout phase all safe foreign calls are turned into
+-- unsafe foreign calls (see Note [Lower safe foreign calls]). This
+-- means that we will never encounter CmmForeignCall node when running
+-- sinking after stack layout
+--
+-- b) stack layout saves all variables live across a call on the stack
+-- just before making a call (remember we are not sinking assignments to
+-- stack):
+--
+-- L1:
+-- x = R1
+-- P64[Sp - 16] = L2
+-- P64[Sp - 8] = x
+-- Sp = Sp - 16
+-- call f() returns L2
+-- L2:
+--
+-- We will attempt to sink { x = R1 } but we will detect conflict with
+-- { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
+-- checking whether it conflicts with { call f() }. In this way we will
+-- never need to check any assignment conflicts with CmmCall. Remember
+-- that we still need to check for potential memory conflicts.
+--
+-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
+-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
+-- This assumption holds only when we do sinking after stack layout. If we run
+-- it before stack layout we need to check for possible conflicts with all three
+-- kinds of calls. Our `conflicts` function does that by using a generic
+-- foldRegsDefd and foldRegsUsed functions defined in DefinerOfRegs and
+-- UserOfRegs typeclasses.
+--
-- An abstraction of memory read or written.
data AbsMem