diff options
author | Jan Stolarek <jan.stolarek@p.lodz.pl> | 2013-08-29 10:57:04 +0100 |
---|---|---|
committer | Jan Stolarek <jan.stolarek@p.lodz.pl> | 2013-08-29 12:56:09 +0100 |
commit | d61c3ac186c94021c851f7a2a6d20631e35fc1ba (patch) | |
tree | ff43791dfcd729fb9951feb6568be5306292fc9d | |
parent | 1d1ab12d084c07bd6aee03177ef6008c7ab08127 (diff) | |
download | haskell-d61c3ac186c94021c851f7a2a6d20631e35fc1ba.tar.gz |
Optimize self-recursive tail calls
This patch implements loopification optimization. It was described
in "Low-level code optimisations in the Glasgow Haskell Compiler" by
Krzysztof Woś, but we use a different approach here. Krzysztof's
approach was to perform optimization as a Cmm-to-Cmm pass. Our
approach is to generate properly optimized tail calls in the code
generator, which saves us the trouble of processing Cmm. This idea
was proposed by Simon Marlow. Implementation details are explained
in Note [Self-recursive tail calls].
Performance of most nofib benchmarks is not affected. There are
some benchmarks that show 5-7% improvement, with an average improvement
of 2.6%. It would require some further investigation to check if this
is related to benchamrking noise or does this optimization really
help make some class of programs faster.
As a minor cleanup, this patch renames forkProc to forkLneBody.
It also moves some data declarations from StgCmmMonad to
StgCmmClosure, because they are needed there and it seems that
StgCmmClosure is on top of the whole StgCmm* hierarchy.
-rw-r--r-- | compiler/codeGen/StgCmmBind.hs | 16 | ||||
-rw-r--r-- | compiler/codeGen/StgCmmClosure.hs | 110 | ||||
-rw-r--r-- | compiler/codeGen/StgCmmExpr.hs | 104 | ||||
-rw-r--r-- | compiler/codeGen/StgCmmMonad.hs | 86 |
4 files changed, 222 insertions, 94 deletions
diff --git a/compiler/codeGen/StgCmmBind.hs b/compiler/codeGen/StgCmmBind.hs index ce5491dc10..dccefd0fb0 100644 --- a/compiler/codeGen/StgCmmBind.hs +++ b/compiler/codeGen/StgCmmBind.hs @@ -30,6 +30,7 @@ import StgCmmForeign (emitPrimCall) import MkGraph import CoreSyn ( AltCon(..) ) import SMRep +import BlockId import Cmm import CmmInfo import CmmUtils @@ -476,7 +477,17 @@ closureCodeBody top_lvl bndr cl_info cc args arity body fv_details ; let node_points = nodeMustPointToIt dflags lf_info node' = if node_points then Just node else Nothing ; when node_points (ldvEnterClosure cl_info) - + -- Emit new label that might potentially be a header + -- of a self-recursive tail call. See Note + -- [Self-recursive tail calls] in StgCmmExpr + ; u <- newUnique + ; let loop_header_id = mkBlockId u + ; emitLabel loop_header_id + -- Extend reader monad with information that + -- self-recursive tail calls can be optimized into local + -- jumps + ; withSelfLoop (bndr, loop_header_id, arg_regs) $ do + { -- Main payload ; entryHeapCheck cl_info node' arity arg_regs $ do { -- ticky after heap check to avoid double counting @@ -490,7 +501,8 @@ closureCodeBody top_lvl bndr cl_info cc args arity body fv_details -- heap check, to reduce live vars over check ; when node_points $ load_fvs node lf_info fv_bindings ; void $ cgExpr body - }} + }}} + } -- A function closure pointer may be tagged, so we diff --git a/compiler/codeGen/StgCmmClosure.hs b/compiler/codeGen/StgCmmClosure.hs index 3b9f38d36b..04297b4258 100644 --- a/compiler/codeGen/StgCmmClosure.hs +++ b/compiler/codeGen/StgCmmClosure.hs @@ -27,10 +27,9 @@ module StgCmmClosure ( lfDynTag, maybeIsLFCon, isLFThunk, isLFReEntrant, lfUpdatable, - nodeMustPointToIt, - CallMethod(..), getCallMethod, - - isKnownFun, funTag, tagForArity, + -- * Used by other modules + CgLoc(..), SelfLoopInfo, CallMethod(..), + nodeMustPointToIt, isKnownFun, funTag, tagForArity, getCallMethod, -- * ClosureInfo ClosureInfo, @@ -69,11 +68,14 @@ module StgCmmClosure ( import StgSyn import SMRep import Cmm +import PprCmmExpr() +import BlockId import CLabel import Id import IdInfo import DataCon +import FastString import Name import Type import TypeRep @@ -85,6 +87,37 @@ import DynFlags import Util ----------------------------------------------------------------------------- +-- Data types and synonyms +----------------------------------------------------------------------------- + +-- These data types are mostly used by other modules, especially StgCmmMonad, +-- but we define them here because some functions in this module need to +-- have access to them as well + +data CgLoc + = CmmLoc CmmExpr -- A stable CmmExpr; that is, one not mentioning + -- Hp, so that it remains valid across calls + + | LneLoc BlockId [LocalReg] -- A join point + -- A join point (= let-no-escape) should only + -- be tail-called, and in a saturated way. + -- To tail-call it, assign to these locals, + -- and branch to the block id + +instance Outputable CgLoc where + ppr (CmmLoc e) = ptext (sLit "cmm") <+> ppr e + ppr (LneLoc b rs) = ptext (sLit "lne") <+> ppr b <+> ppr rs + +type SelfLoopInfo = (Id, BlockId, [LocalReg]) + +-- used by ticky profiling +isKnownFun :: LambdaFormInfo -> Bool +isKnownFun (LFReEntrant _ _ _ _) = True +isKnownFun LFLetNoEscape = True +isKnownFun _ = False + + +----------------------------------------------------------------------------- -- Representations ----------------------------------------------------------------------------- @@ -465,49 +498,65 @@ When black-holing, single-entry closures could also be entered via node (rather than directly) to catch double-entry. -} data CallMethod - = EnterIt -- No args, not a function + = EnterIt -- No args, not a function - | JumpToIt -- A join point + | JumpToIt BlockId [LocalReg] -- A join point or a header of a local loop - | ReturnIt -- It's a value (function, unboxed value, + | ReturnIt -- It's a value (function, unboxed value, -- or constructor), so just return it. | SlowCall -- Unknown fun, or known fun with -- too few args. | DirectEntry -- Jump directly, with args in regs - CLabel -- The code label - RepArity -- Its arity + CLabel -- The code label + RepArity -- Its arity getCallMethod :: DynFlags -> Name -- Function being applied - -> CafInfo -- Can it refer to CAF's? - -> LambdaFormInfo -- Its info - -> RepArity -- Number of available arguments + -> Id -- Function Id used to chech if it can refer to + -- CAF's and whether the function is tail-calling + -- itself + -> LambdaFormInfo -- Its info + -> RepArity -- Number of available arguments + -> CgLoc -- Passed in from cgIdApp so that we can + -- handle let-no-escape bindings and self-recursive + -- tail calls using the same data constructor, + -- JumpToIt. This saves us one case branch in + -- cgIdApp + -> Maybe SelfLoopInfo -- can we perform a self-recursive tail call? -> CallMethod -getCallMethod dflags _name _ lf_info _n_args +getCallMethod _ _ id _ n_args _cg_loc (Just (self_loop_id, block_id, args)) + | id == self_loop_id, n_args == length args + -- If these patterns match then we know that: + -- * function is performing a self-recursive call in a tail position + -- * number of parameters of the function matches functions arity. + -- See Note [Self-recursive tail calls] in StgCmmExpr for more details + = JumpToIt block_id args + +getCallMethod dflags _name _ lf_info _n_args _cg_loc _self_loop_info | nodeMustPointToIt dflags lf_info && gopt Opt_Parallel dflags - = -- If we're parallel, then we must always enter via node. + = -- If we're parallel, then we must always enter via node. -- The reason is that the closure may have been -- fetched since we allocated it. EnterIt -getCallMethod dflags name caf (LFReEntrant _ arity _ _) n_args +getCallMethod dflags name id (LFReEntrant _ arity _ _) n_args _cg_loc _self_loop_info | n_args == 0 = ASSERT( arity /= 0 ) ReturnIt -- No args at all | n_args < arity = SlowCall -- Not enough args - | otherwise = DirectEntry (enterIdLabel dflags name caf) arity + | otherwise = DirectEntry (enterIdLabel dflags name (idCafInfo id)) arity -getCallMethod _ _name _ LFUnLifted n_args +getCallMethod _ _name _ LFUnLifted n_args _cg_loc _self_loop_info = ASSERT( n_args == 0 ) ReturnIt -getCallMethod _ _name _ (LFCon _) n_args +getCallMethod _ _name _ (LFCon _) n_args _cg_loc _self_loop_info = ASSERT( n_args == 0 ) ReturnIt -getCallMethod dflags name caf (LFThunk _ _ updatable std_form_info is_fun) n_args - | is_fun -- it *might* be a function, so we must "call" it (which is always safe) - = SlowCall -- We cannot just enter it [in eval/apply, the entry code +getCallMethod dflags name id (LFThunk _ _ updatable std_form_info is_fun) n_args _cg_loc _self_loop_info + | is_fun -- it *might* be a function, so we must "call" it (which is always safe) + = SlowCall -- We cannot just enter it [in eval/apply, the entry code -- is the fast-entry code] -- Since is_fun is False, we are *definitely* looking at a data value @@ -527,27 +576,24 @@ getCallMethod dflags name caf (LFThunk _ _ updatable std_form_info is_fun) n_arg | otherwise -- Jump direct to code for single-entry thunks = ASSERT( n_args == 0 ) - DirectEntry (thunkEntryLabel dflags name caf std_form_info updatable) 0 + DirectEntry (thunkEntryLabel dflags name (idCafInfo id) std_form_info updatable) 0 -getCallMethod _ _name _ (LFUnknown True) _n_args +getCallMethod _ _name _ (LFUnknown True) _n_arg _cg_locs _self_loop_info = SlowCall -- might be a function -getCallMethod _ name _ (LFUnknown False) n_args +getCallMethod _ name _ (LFUnknown False) n_args _cg_loc _self_loop_info = ASSERT2( n_args == 0, ppr name <+> ppr n_args ) EnterIt -- Not a function -getCallMethod _ _name _ LFBlackHole _n_args - = SlowCall -- Presumably the black hole has by now +getCallMethod _ _name _ LFBlackHole _n_args _cg_loc _self_loop_info + = SlowCall -- Presumably the black hole has by now -- been updated, but we don't know with -- what, so we slow call it -getCallMethod _ _name _ LFLetNoEscape _n_args - = JumpToIt +getCallMethod _ _name _ LFLetNoEscape _n_args (LneLoc blk_id lne_regs) _self_loop_info + = JumpToIt blk_id lne_regs -isKnownFun :: LambdaFormInfo -> Bool -isKnownFun (LFReEntrant _ _ _ _) = True -isKnownFun LFLetNoEscape = True -isKnownFun _ = False +getCallMethod _ _ _ _ _ _ _ = panic "Unknown call method" ----------------------------------------------------------------------------- -- staticClosureRequired diff --git a/compiler/codeGen/StgCmmExpr.hs b/compiler/codeGen/StgCmmExpr.hs index c0623690b0..331e65819f 100644 --- a/compiler/codeGen/StgCmmExpr.hs +++ b/compiler/codeGen/StgCmmExpr.hs @@ -160,7 +160,7 @@ cgLetNoEscapeClosure bndr cc_slot _unused_cc args body return ( lneIdInfo dflags bndr args , code ) where - code = forkProc $ do { + code = forkLneBody $ do { ; withNewTickyCounterLNE (idName bndr) args $ do ; restoreCurrentCostCentre cc_slot ; arg_regs <- bindArgsToRegs args @@ -632,19 +632,20 @@ cgConApp con stg_args cgIdApp :: Id -> [StgArg] -> FCode ReturnKind cgIdApp fun_id [] | isVoidId fun_id = emitReturn [] cgIdApp fun_id args = do - dflags <- getDynFlags - fun_info <- getCgIdInfo fun_id + dflags <- getDynFlags + fun_info <- getCgIdInfo fun_id + self_loop_info <- getSelfLoop let cg_fun_id = cg_id fun_info -- NB: use (cg_id fun_info) instead of fun_id, because -- the former may be externalised for -split-objs. -- See Note [Externalise when splitting] in StgCmmMonad fun_arg = StgVarArg cg_fun_id - fun_name = idName cg_fun_id - fun = idInfoToAmode fun_info - lf_info = cg_lf fun_info + fun_name = idName cg_fun_id + fun = idInfoToAmode fun_info + lf_info = cg_lf fun_info node_points dflags = nodeMustPointToIt dflags lf_info - case (getCallMethod dflags fun_name (idCafInfo cg_fun_id) lf_info (length args)) of + case (getCallMethod dflags fun_name cg_fun_id lf_info (length args) (cg_loc fun_info) self_loop_info) of -- A value in WHNF, so we can just return it. ReturnIt -> emitReturn [fun] -- ToDo: does ReturnIt guarantee tagged? @@ -664,14 +665,87 @@ cgIdApp fun_id args = do then directCall NativeNodeCall lbl arity (fun_arg:args) else directCall NativeDirectCall lbl arity args } - -- Let-no-escape call - JumpToIt -> let (LneLoc blk_id lne_regs) = cg_loc fun_info - in do - { adjustHpBackwards -- always do this before a tail-call - ; cmm_args <- getNonVoidArgAmodes args - ; emitMultiAssign lne_regs cmm_args - ; emit (mkBranch blk_id) - ; return AssignedDirectly } + -- Let-no-escape call or self-recursive tail-call + JumpToIt blk_id lne_regs -> do + { adjustHpBackwards -- always do this before a tail-call + ; cmm_args <- getNonVoidArgAmodes args + ; emitMultiAssign lne_regs cmm_args + ; emit (mkBranch blk_id) + ; return AssignedDirectly } + +-- Note [Self-recursive tail calls] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- Self-recursive tail calls can be optimized into a local jump in the same +-- way as let-no-escape bindings (see Note [What is a non-escaping let] in +-- stgSyn/CoreToStg.lhs). Consider this: +-- +-- foo.info: +-- a = R1 // calling convention +-- b = R2 +-- goto L1 +-- L1: ... +-- ... +-- ... +-- L2: R1 = x +-- R2 = y +-- call foo(R1,R2) +-- +-- Instead of putting x and y into registers (or other locations required by the +-- calling convention) and performing a call we can put them into local +-- variables a and b and perform jump to L1: +-- +-- foo.info: +-- a = R1 +-- b = R2 +-- goto L1 +-- L1: ... +-- ... +-- ... +-- L2: a = x +-- b = y +-- goto L1 +-- +-- This can be done only when function is calling itself in a tail position +-- and only if the call passes number of parameters equal to function's arity. +-- Note that this cannot be performed if a function calls itself with a +-- continuation. +-- +-- This in fact implements optimization known as "loopification". It was +-- described in "Low-level code optimizations in the Glasgow Haskell Compiler" +-- by Krzysztof Woś, though we use different approach. Krzysztof performed his +-- optimization at the Cmm level, whereas we perform ours during code generation +-- (Stg-to-Cmm pass) essentially making sure that optimized Cmm code is +-- generated in the first place. +-- +-- Implementation is spread across a couple of places in the code: +-- +-- * FCode monad stores additional information in its reader environment +-- (cgd_self_loop field). This information tells us which function can +-- tail call itself in an optimized way (it is the function currently +-- being compiled), what is the label of a loop header (L1 in example above) +-- and information about local registers in which we should arguments +-- before making a call (this would be a and b in example above). +-- +-- * Whenever we are compiling a function, we set that information to reflect +-- the fact that function currently being compiled can be jumped to, instead +-- of called. We also have to emit a label to which we will be jumping. Both +-- things are done in closureCodyBody in StgCmmBind. +-- +-- * When we began compilation of another closure we remove the additional +-- information from the environment. This is done by forkClosureBody +-- in StgCmmMonad. Other functions that duplicate the environment - +-- forkLneBody, forkAlts, codeOnly - duplicate that information. In other +-- words, we only need to clean the environment of the self-loop information +-- when compiling right hand side of a closure (binding). +-- +-- * When compiling a call (cgIdApp) we use getCallMethod to decide what kind +-- of call will be generated. getCallMethod decides to generate a self +-- recursive tail call when (a) environment stores information about +-- possible self tail-call; (b) that tail call is to a function currently +-- being compiled; (c) number of passed arguments is equal to function's +-- arity. + emitEnter :: CmmExpr -> FCode ReturnKind emitEnter fun = do diff --git a/compiler/codeGen/StgCmmMonad.hs b/compiler/codeGen/StgCmmMonad.hs index ddb677a49e..27d4fd6386 100644 --- a/compiler/codeGen/StgCmmMonad.hs +++ b/compiler/codeGen/StgCmmMonad.hs @@ -26,7 +26,7 @@ module StgCmmMonad ( mkCmmIfThenElse, mkCmmIfThen, mkCmmIfGoto, mkCall, mkCmmCall, - forkClosureBody, forkAlts, forkProc, codeOnly, + forkClosureBody, forkLneBody, forkAlts, codeOnly, ConTagZ, @@ -44,10 +44,10 @@ module StgCmmMonad ( getModuleName, -- ideally we wouldn't export these, but some other modules access internal state - getState, setState, getInfoDown, getDynFlags, getThisPackage, + getState, setState, getSelfLoop, withSelfLoop, getInfoDown, getDynFlags, getThisPackage, -- more localised access to monad state - CgIdInfo(..), CgLoc(..), + CgIdInfo(..), getBinds, setBinds, -- out of general friendliness, we also export ... @@ -60,6 +60,7 @@ import Cmm import StgCmmClosure import DynFlags import Hoopl +import Maybes import MkGraph import BlockId import CLabel @@ -100,11 +101,10 @@ infixr 9 `thenFC` -- - A reader monad, for CgInfoDownwards, containing -- - DynFlags, -- - the current Module --- - the static top-level environmnet -- - the update-frame offset -- - the ticky counter label -- - the Sequel (the continuation to return to) - +-- - the self-recursive tail call information -------------------------------------------------------- @@ -169,11 +169,15 @@ fixC fcode = FCode ( data CgInfoDownwards -- information only passed *downwards* by the monad = MkCgInfoDown { - cgd_dflags :: DynFlags, - cgd_mod :: Module, -- Module being compiled - cgd_updfr_off :: UpdFrameOffset, -- Size of current update frame - cgd_ticky :: CLabel, -- Current destination for ticky counts - cgd_sequel :: Sequel -- What to do at end of basic block + cgd_dflags :: DynFlags, + cgd_mod :: Module, -- Module being compiled + cgd_updfr_off :: UpdFrameOffset, -- Size of current update frame + cgd_ticky :: CLabel, -- Current destination for ticky counts + cgd_sequel :: Sequel, -- What to do at end of basic block + cgd_self_loop :: Maybe SelfLoopInfo -- Which tail calls can be compiled + -- as local jumps? See Note + -- [Self-recursive tail calls] in + -- StgCmmExpr } type CgBindings = IdEnv CgIdInfo @@ -195,25 +199,10 @@ data CgIdInfo -- use the externalised one in any C label we use which refers to this -- name. -data CgLoc - = CmmLoc CmmExpr -- A stable CmmExpr; that is, one not mentioning - -- Hp, so that it remains valid across calls - - | LneLoc BlockId [LocalReg] -- A join point - -- A join point (= let-no-escape) should only - -- be tail-called, and in a saturated way. - -- To tail-call it, assign to these locals, - -- and branch to the block id - instance Outputable CgIdInfo where ppr (CgIdInfo { cg_id = id, cg_loc = loc }) = ppr id <+> ptext (sLit "-->") <+> ppr loc -instance Outputable CgLoc where - ppr (CmmLoc e) = ptext (sLit "cmm") <+> ppr e - ppr (LneLoc b rs) = ptext (sLit "lne") <+> ppr b <+> ppr rs - - -- Sequel tells what to do with the result of this expression data Sequel = Return Bool -- Return result(s) to continuation found on the stack. @@ -308,7 +297,8 @@ initCgInfoDown dflags mod , cgd_mod = mod , cgd_updfr_off = initUpdFrameOff dflags , cgd_ticky = mkTopTickyCtrLabel - , cgd_sequel = initSequel } + , cgd_sequel = initSequel + , cgd_self_loop = Nothing } initSequel :: Sequel initSequel = Return False @@ -455,6 +445,16 @@ newUnique = do getInfoDown :: FCode CgInfoDownwards getInfoDown = FCode $ \info_down state -> (# info_down,state #) +getSelfLoop :: FCode (Maybe SelfLoopInfo) +getSelfLoop = do + info_down <- getInfoDown + return $ cgd_self_loop info_down + +withSelfLoop :: SelfLoopInfo -> FCode a -> FCode a +withSelfLoop self_loop code = do + info_down <- getInfoDown + withInfoDown code (info_down {cgd_self_loop = Just self_loop}) + instance HasDynFlags FCode where getDynFlags = liftM cgd_dflags getInfoDown @@ -481,7 +481,7 @@ getModuleName = do { info <- getInfoDown; return (cgd_mod info) } withSequel :: Sequel -> FCode a -> FCode a withSequel sequel code = do { info <- getInfoDown - ; withInfoDown code (info {cgd_sequel = sequel }) } + ; withInfoDown code (info {cgd_sequel = sequel, cgd_self_loop = Nothing }) } getSequel :: FCode Sequel getSequel = do { info <- getInfoDown @@ -526,15 +526,12 @@ setTickyCtrLabel ticky code = do -------------------------------------------------------- forkClosureBody :: FCode () -> FCode () --- forkClosureBody takes a code, $c$, and compiles it in a --- fresh environment, except that: --- - compilation info and statics are passed in unchanged. --- - local bindings are passed in unchanged --- (it's up to the enclosed code to re-bind the --- free variables to a field of the closure) --- --- The current state is passed on completely unaltered, except that --- C-- from the fork is incorporated. +-- forkClosureBody compiles body_code in environment where: +-- - sequel, update stack frame and self loop info are +-- set to fresh values +-- - state is set to a fresh value, except for local bindings +-- that are passed in unchanged. It's up to the enclosed code to +-- re-bind the free variables to a field of the closure. forkClosureBody body_code = do { dflags <- getDynFlags @@ -542,26 +539,25 @@ forkClosureBody body_code ; us <- newUniqSupply ; state <- getState ; let body_info_down = info { cgd_sequel = initSequel - , cgd_updfr_off = initUpdFrameOff dflags } + , cgd_updfr_off = initUpdFrameOff dflags + , cgd_self_loop = Nothing } fork_state_in = (initCgState us) { cgs_binds = cgs_binds state } ((),fork_state_out) = doFCode body_code body_info_down fork_state_in ; setState $ state `addCodeBlocksFrom` fork_state_out } -forkProc :: FCode a -> FCode a --- 'forkProc' takes a code and compiles it in the *current* environment, --- returning the graph thus constructed. +forkLneBody :: FCode a -> FCode a +-- 'forkLneBody' takes a body of let-no-escape binding and compiles +-- it in the *current* environment, returning the graph thus constructed. -- -- The current environment is passed on completely unchanged to -- the successor. In particular, any heap usage from the enclosed -- code is discarded; it should deal with its own heap consumption. --- forkProc is used to compile let-no-escape bindings. -forkProc body_code +forkLneBody body_code = do { info_down <- getInfoDown ; us <- newUniqSupply ; state <- getState - ; let info_down' = info_down -- { cgd_sequel = initSequel } - fork_state_in = (initCgState us) { cgs_binds = cgs_binds state } - (result, fork_state_out) = doFCode body_code info_down' fork_state_in + ; let fork_state_in = (initCgState us) { cgs_binds = cgs_binds state } + (result, fork_state_out) = doFCode body_code info_down fork_state_in ; setState $ state `addCodeBlocksFrom` fork_state_out ; return result } |