diff options
author | Sylvain Henry <sylvain@haskus.fr> | 2019-08-13 17:26:32 +0200 |
---|---|---|
committer | Sylvain Henry <sylvain@haskus.fr> | 2019-09-10 00:04:50 +0200 |
commit | 447864a94a1679b5b079e08bb7208a0005381cef (patch) | |
tree | baa469c52620ce7ae02def3e5e6a6f109cc89f40 /compiler/GHC | |
parent | 270fbe8512f04b6107755fa22bdec62205c0a567 (diff) | |
download | haskell-447864a94a1679b5b079e08bb7208a0005381cef.tar.gz |
Module hierarchy: StgToCmm (#13009)
Add StgToCmm module hierarchy. Platform modules that are used in several
other places (NCG, LLVM codegen, Cmm transformations) are put into
GHC.Platform.
Diffstat (limited to 'compiler/GHC')
27 files changed, 11330 insertions, 0 deletions
diff --git a/compiler/GHC/Platform/ARM.hs b/compiler/GHC/Platform/ARM.hs new file mode 100644 index 0000000000..d0c7e5811a --- /dev/null +++ b/compiler/GHC/Platform/ARM.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.ARM where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_arm 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/ARM64.hs b/compiler/GHC/Platform/ARM64.hs new file mode 100644 index 0000000000..ebd66b92c5 --- /dev/null +++ b/compiler/GHC/Platform/ARM64.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.ARM64 where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_aarch64 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/NoRegs.hs b/compiler/GHC/Platform/NoRegs.hs new file mode 100644 index 0000000000..e8abf44253 --- /dev/null +++ b/compiler/GHC/Platform/NoRegs.hs @@ -0,0 +1,9 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.NoRegs where + +import GhcPrelude + +#define MACHREGS_NO_REGS 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/PPC.hs b/compiler/GHC/Platform/PPC.hs new file mode 100644 index 0000000000..f405f95438 --- /dev/null +++ b/compiler/GHC/Platform/PPC.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.PPC where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_powerpc 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/Regs.hs b/compiler/GHC/Platform/Regs.hs new file mode 100644 index 0000000000..e7887fbe72 --- /dev/null +++ b/compiler/GHC/Platform/Regs.hs @@ -0,0 +1,107 @@ + +module GHC.Platform.Regs + (callerSaves, activeStgRegs, haveRegBase, globalRegMaybe, freeReg) + where + +import GhcPrelude + +import CmmExpr +import GHC.Platform +import Reg + +import qualified GHC.Platform.ARM as ARM +import qualified GHC.Platform.ARM64 as ARM64 +import qualified GHC.Platform.PPC as PPC +import 
qualified GHC.Platform.SPARC as SPARC +import qualified GHC.Platform.X86 as X86 +import qualified GHC.Platform.X86_64 as X86_64 +import qualified GHC.Platform.NoRegs as NoRegs + +-- | Returns 'True' if this global register is stored in a caller-saves +-- machine register. + +callerSaves :: Platform -> GlobalReg -> Bool +callerSaves platform + | platformUnregisterised platform = NoRegs.callerSaves + | otherwise + = case platformArch platform of + ArchX86 -> X86.callerSaves + ArchX86_64 -> X86_64.callerSaves + ArchSPARC -> SPARC.callerSaves + ArchARM {} -> ARM.callerSaves + ArchARM64 -> ARM64.callerSaves + arch + | arch `elem` [ArchPPC, ArchPPC_64 ELF_V1, ArchPPC_64 ELF_V2] -> + PPC.callerSaves + + | otherwise -> NoRegs.callerSaves + +-- | Here is where the STG register map is defined for each target arch. +-- The order matters (for the llvm backend anyway)! We must make sure to +-- maintain the order here with the order used in the LLVM calling conventions. +-- Note that also, this isn't all registers, just the ones that are currently +-- possbily mapped to real registers. 
+activeStgRegs :: Platform -> [GlobalReg] +activeStgRegs platform + | platformUnregisterised platform = NoRegs.activeStgRegs + | otherwise + = case platformArch platform of + ArchX86 -> X86.activeStgRegs + ArchX86_64 -> X86_64.activeStgRegs + ArchSPARC -> SPARC.activeStgRegs + ArchARM {} -> ARM.activeStgRegs + ArchARM64 -> ARM64.activeStgRegs + arch + | arch `elem` [ArchPPC, ArchPPC_64 ELF_V1, ArchPPC_64 ELF_V2] -> + PPC.activeStgRegs + + | otherwise -> NoRegs.activeStgRegs + +haveRegBase :: Platform -> Bool +haveRegBase platform + | platformUnregisterised platform = NoRegs.haveRegBase + | otherwise + = case platformArch platform of + ArchX86 -> X86.haveRegBase + ArchX86_64 -> X86_64.haveRegBase + ArchSPARC -> SPARC.haveRegBase + ArchARM {} -> ARM.haveRegBase + ArchARM64 -> ARM64.haveRegBase + arch + | arch `elem` [ArchPPC, ArchPPC_64 ELF_V1, ArchPPC_64 ELF_V2] -> + PPC.haveRegBase + + | otherwise -> NoRegs.haveRegBase + +globalRegMaybe :: Platform -> GlobalReg -> Maybe RealReg +globalRegMaybe platform + | platformUnregisterised platform = NoRegs.globalRegMaybe + | otherwise + = case platformArch platform of + ArchX86 -> X86.globalRegMaybe + ArchX86_64 -> X86_64.globalRegMaybe + ArchSPARC -> SPARC.globalRegMaybe + ArchARM {} -> ARM.globalRegMaybe + ArchARM64 -> ARM64.globalRegMaybe + arch + | arch `elem` [ArchPPC, ArchPPC_64 ELF_V1, ArchPPC_64 ELF_V2] -> + PPC.globalRegMaybe + + | otherwise -> NoRegs.globalRegMaybe + +freeReg :: Platform -> RegNo -> Bool +freeReg platform + | platformUnregisterised platform = NoRegs.freeReg + | otherwise + = case platformArch platform of + ArchX86 -> X86.freeReg + ArchX86_64 -> X86_64.freeReg + ArchSPARC -> SPARC.freeReg + ArchARM {} -> ARM.freeReg + ArchARM64 -> ARM64.freeReg + arch + | arch `elem` [ArchPPC, ArchPPC_64 ELF_V1, ArchPPC_64 ELF_V2] -> + PPC.freeReg + + | otherwise -> NoRegs.freeReg + diff --git a/compiler/GHC/Platform/SPARC.hs b/compiler/GHC/Platform/SPARC.hs new file mode 100644 index 0000000000..b0cdb27f44 --- 
/dev/null +++ b/compiler/GHC/Platform/SPARC.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.SPARC where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_sparc 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/X86.hs b/compiler/GHC/Platform/X86.hs new file mode 100644 index 0000000000..1570ba9fa0 --- /dev/null +++ b/compiler/GHC/Platform/X86.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.X86 where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_i386 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/X86_64.hs b/compiler/GHC/Platform/X86_64.hs new file mode 100644 index 0000000000..d2d1b15c71 --- /dev/null +++ b/compiler/GHC/Platform/X86_64.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.X86_64 where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_x86_64 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/StgToCmm.hs b/compiler/GHC/StgToCmm.hs new file mode 100644 index 0000000000..c7ee604692 --- /dev/null +++ b/compiler/GHC/StgToCmm.hs @@ -0,0 +1,223 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE DataKinds #-} + +----------------------------------------------------------------------------- +-- +-- Stg to C-- code generation +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm ( codeGen ) where + +#include "HsVersions.h" + +import GhcPrelude as Prelude + +import GHC.StgToCmm.Prof (initCostCentres, ldvEnter) +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Env +import GHC.StgToCmm.Bind +import GHC.StgToCmm.Con +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Hpc +import GHC.StgToCmm.Ticky + +import Cmm +import CmmUtils +import CLabel + +import StgSyn +import DynFlags +import ErrUtils + +import HscTypes 
+import CostCentre +import Id +import IdInfo +import RepType +import DataCon +import TyCon +import Module +import Outputable +import Stream +import BasicTypes +import VarSet ( isEmptyDVarSet ) + +import OrdList +import MkGraph + +import Data.IORef +import Control.Monad (when,void) +import Util + +codeGen :: DynFlags + -> Module + -> [TyCon] + -> CollectedCCs -- (Local/global) cost-centres needing declaring/registering. + -> [CgStgTopBinding] -- Bindings to convert + -> HpcInfo + -> Stream IO CmmGroup () -- Output as a stream, so codegen can + -- be interleaved with output + +codeGen dflags this_mod data_tycons + cost_centre_info stg_binds hpc_info + = do { -- cg: run the code generator, and yield the resulting CmmGroup + -- Using an IORef to store the state is a bit crude, but otherwise + -- we would need to add a state monad layer. + ; cgref <- liftIO $ newIORef =<< initC + ; let cg :: FCode () -> Stream IO CmmGroup () + cg fcode = do + cmm <- liftIO . withTiming (return dflags) (text "STG -> Cmm") (`seq` ()) $ do + st <- readIORef cgref + let (a,st') = runC dflags this_mod st (getCmm fcode) + + -- NB. stub-out cgs_tops and cgs_stmts. This fixes + -- a big space leak. DO NOT REMOVE! + writeIORef cgref $! st'{ cgs_tops = nilOL, + cgs_stmts = mkNop } + return a + yield cmm + + -- Note [codegen-split-init] the cmm_init block must come + -- FIRST. This is because when -split-objs is on we need to + -- combine this block with its initialisation routines; see + -- Note [pipeline-split-init]. + ; cg (mkModuleInit cost_centre_info this_mod hpc_info) + + ; mapM_ (cg . cgTopBinding dflags) stg_binds + + -- Put datatype_stuff after code_stuff, because the + -- datatype closure table (for enumeration types) to + -- (say) PrelBase_True_closure, which is defined in + -- code_stuff + ; let do_tycon tycon = do + -- Generate a table of static closures for an + -- enumeration type Note that the closure pointers are + -- tagged. 
+ when (isEnumerationTyCon tycon) $ cg (cgEnumerationTyCon tycon) + mapM_ (cg . cgDataCon) (tyConDataCons tycon) + + ; mapM_ do_tycon data_tycons + } + +--------------------------------------------------------------- +-- Top-level bindings +--------------------------------------------------------------- + +{- 'cgTopBinding' is only used for top-level bindings, since they need +to be allocated statically (not in the heap) and need to be labelled. +No unboxed bindings can happen at top level. + +In the code below, the static bindings are accumulated in the +@MkCgState@, and transferred into the ``statics'' slot by @forkStatics@. +This is so that we can write the top level processing in a compositional +style, with the increasing static environment being plumbed as a state +variable. -} + +cgTopBinding :: DynFlags -> CgStgTopBinding -> FCode () +cgTopBinding dflags (StgTopLifted (StgNonRec id rhs)) + = do { let (info, fcode) = cgTopRhs dflags NonRecursive id rhs + ; fcode + ; addBindC info + } + +cgTopBinding dflags (StgTopLifted (StgRec pairs)) + = do { let (bndrs, rhss) = unzip pairs + ; let pairs' = zip bndrs rhss + r = unzipWith (cgTopRhs dflags Recursive) pairs' + (infos, fcodes) = unzip r + ; addBindsC infos + ; sequence_ fcodes + } + +cgTopBinding dflags (StgTopStringLit id str) + = do { let label = mkBytesLabel (idName id) + ; let (lit, decl) = mkByteStringCLit label str + ; emitDecl decl + ; addBindC (litIdInfo dflags id mkLFStringLit lit) + } + +cgTopRhs :: DynFlags -> RecFlag -> Id -> CgStgRhs -> (CgIdInfo, FCode ()) + -- The Id is passed along for setting up a binding... 
+ +cgTopRhs dflags _rec bndr (StgRhsCon _cc con args) + = cgTopRhsCon dflags bndr con (assertNonVoidStgArgs args) + -- con args are always non-void, + -- see Note [Post-unarisation invariants] in UnariseStg + +cgTopRhs dflags rec bndr (StgRhsClosure fvs cc upd_flag args body) + = ASSERT(isEmptyDVarSet fvs) -- There should be no free variables + cgTopRhsClosure dflags rec bndr cc upd_flag args body + + +--------------------------------------------------------------- +-- Module initialisation code +--------------------------------------------------------------- + +mkModuleInit + :: CollectedCCs -- cost centre info + -> Module + -> HpcInfo + -> FCode () + +mkModuleInit cost_centre_info this_mod hpc_info + = do { initHpc this_mod hpc_info + ; initCostCentres cost_centre_info + } + + +--------------------------------------------------------------- +-- Generating static stuff for algebraic data types +--------------------------------------------------------------- + + +cgEnumerationTyCon :: TyCon -> FCode () +cgEnumerationTyCon tycon + = do dflags <- getDynFlags + emitRODataLits (mkLocalClosureTableLabel (tyConName tycon) NoCafRefs) + [ CmmLabelOff (mkLocalClosureLabel (dataConName con) NoCafRefs) + (tagForCon dflags con) + | con <- tyConDataCons tycon] + + +cgDataCon :: DataCon -> FCode () +-- Generate the entry code, info tables, and (for niladic constructor) +-- the static closure, for a constructor. +cgDataCon data_con + = do { dflags <- getDynFlags + ; let + (tot_wds, -- #ptr_wds + #nonptr_wds + ptr_wds) -- #ptr_wds + = mkVirtConstrSizes dflags arg_reps + + nonptr_wds = tot_wds - ptr_wds + + dyn_info_tbl = + mkDataConInfoTable dflags data_con False ptr_wds nonptr_wds + + -- We're generating info tables, so we don't know and care about + -- what the actual arguments are. Using () here as the place holder. 
+ arg_reps :: [NonVoid PrimRep] + arg_reps = [ NonVoid rep_ty + | ty <- dataConRepArgTys data_con + , rep_ty <- typePrimRep ty + , not (isVoidRep rep_ty) ] + + ; emitClosureAndInfoTable dyn_info_tbl NativeDirectCall [] $ + -- NB: the closure pointer is assumed *untagged* on + -- entry to a constructor. If the pointer is tagged, + -- then we should not be entering it. This assumption + -- is used in ldvEnter and when tagging the pointer to + -- return it. + -- NB 2: We don't set CC when entering data (WDP 94/06) + do { tickyEnterDynCon + ; ldvEnter (CmmReg nodeReg) + ; tickyReturnOldCon (length arg_reps) + ; void $ emitReturn [cmmOffsetB dflags (CmmReg nodeReg) (tagForCon dflags data_con)] + } + -- The case continuation code expects a tagged pointer + } diff --git a/compiler/GHC/StgToCmm/ArgRep.hs b/compiler/GHC/StgToCmm/ArgRep.hs new file mode 100644 index 0000000000..cc2fe8306a --- /dev/null +++ b/compiler/GHC/StgToCmm/ArgRep.hs @@ -0,0 +1,160 @@ +----------------------------------------------------------------------------- +-- +-- Argument representations used in GHC.StgToCmm.Layout. +-- +-- (c) The University of Glasgow 2013 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.ArgRep ( + ArgRep(..), toArgRep, argRepSizeW, + + argRepString, isNonV, idArgRep, + + slowCallPattern, + + ) where + +import GhcPrelude + +import GHC.StgToCmm.Closure ( idPrimRep ) + +import SMRep ( WordOff ) +import Id ( Id ) +import TyCon ( PrimRep(..), primElemRepSizeB ) +import BasicTypes ( RepArity ) +import Constants ( wORD64_SIZE ) +import DynFlags + +import Outputable +import FastString + +-- I extricated this code as this new module in order to avoid a +-- cyclic dependency between GHC.StgToCmm.Layout and GHC.StgToCmm.Ticky. 
+-- +-- NSF 18 Feb 2013 + +------------------------------------------------------------------------- +-- Classifying arguments: ArgRep +------------------------------------------------------------------------- + +-- ArgRep is re-exported by GHC.StgToCmm.Layout, but only for use in the +-- byte-code generator which also needs to know about the +-- classification of arguments. + +data ArgRep = P -- GC Ptr + | N -- Word-sized non-ptr + | L -- 64-bit non-ptr (long) + | V -- Void + | F -- Float + | D -- Double + | V16 -- 16-byte (128-bit) vectors of Float/Double/Int8/Word32/etc. + | V32 -- 32-byte (256-bit) vectors of Float/Double/Int8/Word32/etc. + | V64 -- 64-byte (512-bit) vectors of Float/Double/Int8/Word32/etc. +instance Outputable ArgRep where ppr = text . argRepString + +argRepString :: ArgRep -> String +argRepString P = "P" +argRepString N = "N" +argRepString L = "L" +argRepString V = "V" +argRepString F = "F" +argRepString D = "D" +argRepString V16 = "V16" +argRepString V32 = "V32" +argRepString V64 = "V64" + +toArgRep :: PrimRep -> ArgRep +toArgRep VoidRep = V +toArgRep LiftedRep = P +toArgRep UnliftedRep = P +toArgRep IntRep = N +toArgRep WordRep = N +toArgRep Int8Rep = N -- Gets widened to native word width for calls +toArgRep Word8Rep = N -- Gets widened to native word width for calls +toArgRep Int16Rep = N -- Gets widened to native word width for calls +toArgRep Word16Rep = N -- Gets widened to native word width for calls +toArgRep Int32Rep = N -- Gets widened to native word width for calls +toArgRep Word32Rep = N -- Gets widened to native word width for calls +toArgRep AddrRep = N +toArgRep Int64Rep = L +toArgRep Word64Rep = L +toArgRep FloatRep = F +toArgRep DoubleRep = D +toArgRep (VecRep len elem) = case len*primElemRepSizeB elem of + 16 -> V16 + 32 -> V32 + 64 -> V64 + _ -> error "toArgRep: bad vector primrep" + +isNonV :: ArgRep -> Bool +isNonV V = False +isNonV _ = True + +argRepSizeW :: DynFlags -> ArgRep -> WordOff -- Size in words +argRepSizeW _ 
N = 1 +argRepSizeW _ P = 1 +argRepSizeW _ F = 1 +argRepSizeW dflags L = wORD64_SIZE `quot` wORD_SIZE dflags +argRepSizeW dflags D = dOUBLE_SIZE dflags `quot` wORD_SIZE dflags +argRepSizeW _ V = 0 +argRepSizeW dflags V16 = 16 `quot` wORD_SIZE dflags +argRepSizeW dflags V32 = 32 `quot` wORD_SIZE dflags +argRepSizeW dflags V64 = 64 `quot` wORD_SIZE dflags + +idArgRep :: Id -> ArgRep +idArgRep = toArgRep . idPrimRep + +-- This list of argument patterns should be kept in sync with at least +-- the following: +-- +-- * GHC.StgToCmm.Layout.stdPattern maybe to some degree? +-- +-- * the RTS_RET(stg_ap_*) and RTS_FUN_DECL(stg_ap_*_fast) +-- declarations in includes/stg/MiscClosures.h +-- +-- * the SLOW_CALL_*_ctr declarations in includes/stg/Ticky.h, +-- +-- * the TICK_SLOW_CALL_*() #defines in includes/Cmm.h, +-- +-- * the PR_CTR(SLOW_CALL_*_ctr) calls in rts/Ticky.c, +-- +-- * and the SymI_HasProto(stg_ap_*_{ret,info,fast}) calls and +-- SymI_HasProto(SLOW_CALL_*_ctr) calls in rts/Linker.c +-- +-- There may be more places that I haven't found; I merely igrep'd for +-- pppppp and excluded things that seemed ghci-specific. +-- +-- Also, it seems at the moment that ticky counters with void +-- arguments will never be bumped, but I'm still declaring those +-- counters, defensively. 
+-- +-- NSF 6 Mar 2013 + +slowCallPattern :: [ArgRep] -> (FastString, RepArity) +-- Returns the generic apply function and arity +-- +-- The first batch of cases match (some) specialised entries +-- The last group deals exhaustively with the cases for the first argument +-- (and the zero-argument case) +-- +-- In 99% of cases this function will match *all* the arguments in one batch + +slowCallPattern (P: P: P: P: P: P: _) = (fsLit "stg_ap_pppppp", 6) +slowCallPattern (P: P: P: P: P: _) = (fsLit "stg_ap_ppppp", 5) +slowCallPattern (P: P: P: P: _) = (fsLit "stg_ap_pppp", 4) +slowCallPattern (P: P: P: V: _) = (fsLit "stg_ap_pppv", 4) +slowCallPattern (P: P: P: _) = (fsLit "stg_ap_ppp", 3) +slowCallPattern (P: P: V: _) = (fsLit "stg_ap_ppv", 3) +slowCallPattern (P: P: _) = (fsLit "stg_ap_pp", 2) +slowCallPattern (P: V: _) = (fsLit "stg_ap_pv", 2) +slowCallPattern (P: _) = (fsLit "stg_ap_p", 1) +slowCallPattern (V: _) = (fsLit "stg_ap_v", 1) +slowCallPattern (N: _) = (fsLit "stg_ap_n", 1) +slowCallPattern (F: _) = (fsLit "stg_ap_f", 1) +slowCallPattern (D: _) = (fsLit "stg_ap_d", 1) +slowCallPattern (L: _) = (fsLit "stg_ap_l", 1) +slowCallPattern (V16: _) = (fsLit "stg_ap_v16", 1) +slowCallPattern (V32: _) = (fsLit "stg_ap_v32", 1) +slowCallPattern (V64: _) = (fsLit "stg_ap_v64", 1) +slowCallPattern [] = (fsLit "stg_ap_0", 0) diff --git a/compiler/GHC/StgToCmm/Bind.hs b/compiler/GHC/StgToCmm/Bind.hs new file mode 100644 index 0000000000..bfe9255783 --- /dev/null +++ b/compiler/GHC/StgToCmm/Bind.hs @@ -0,0 +1,753 @@ +----------------------------------------------------------------------------- +-- +-- Stg to C-- code generation: bindings +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Bind ( + cgTopRhsClosure, + cgBind, + emitBlackHoleCode, + pushUpdateFrame, emitUpdateFrame + ) where + +import GhcPrelude hiding ((<*>)) + +import GHC.StgToCmm.Expr +import 
GHC.StgToCmm.Monad +import GHC.StgToCmm.Env +import GHC.StgToCmm.Con +import GHC.StgToCmm.Heap +import GHC.StgToCmm.Prof (ldvEnterClosure, enterCostCentreFun, enterCostCentreThunk, + initUpdFrameProf) +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Foreign (emitPrimCall) + +import MkGraph +import CoreSyn ( AltCon(..), tickishIsCode ) +import BlockId +import SMRep +import Cmm +import CmmInfo +import CmmUtils +import CLabel +import StgSyn +import CostCentre +import Id +import IdInfo +import Name +import Module +import ListSetOps +import Util +import VarSet +import BasicTypes +import Outputable +import FastString +import DynFlags + +import Control.Monad + +------------------------------------------------------------------------ +-- Top-level bindings +------------------------------------------------------------------------ + +-- For closures bound at top level, allocate in static space. +-- They should have no free variables. + +cgTopRhsClosure :: DynFlags + -> RecFlag -- member of a recursive group? + -> Id + -> CostCentreStack -- Optional cost centre annotation + -> UpdateFlag + -> [Id] -- Args + -> CgStgExpr + -> (CgIdInfo, FCode ()) + +cgTopRhsClosure dflags rec id ccs upd_flag args body = + let closure_label = mkLocalClosureLabel (idName id) (idCafInfo id) + cg_id_info = litIdInfo dflags id lf_info (CmmLabel closure_label) + lf_info = mkClosureLFInfo dflags id TopLevel [] upd_flag args + in (cg_id_info, gen_code dflags lf_info closure_label) + where + -- special case for a indirection (f = g). We create an IND_STATIC + -- closure pointing directly to the indirectee. This is exactly + -- what the CAF will eventually evaluate to anyway, we're just + -- shortcutting the whole process, and generating a lot less code + -- (#7308). Eventually the IND_STATIC closure will be eliminated + -- by assembly '.equiv' directives, where possible (#15155). 
+ -- See note [emit-time elimination of static indirections] in CLabel. + -- + -- Note: we omit the optimisation when this binding is part of a + -- recursive group, because the optimisation would inhibit the black + -- hole detection from working in that case. Test + -- concurrent/should_run/4030 fails, for instance. + -- + gen_code dflags _ closure_label + | StgApp f [] <- body, null args, isNonRec rec + = do + cg_info <- getCgIdInfo f + let closure_rep = mkStaticClosureFields dflags + indStaticInfoTable ccs MayHaveCafRefs + [unLit (idInfoToAmode cg_info)] + emitDataLits closure_label closure_rep + return () + + gen_code dflags lf_info _closure_label + = do { let name = idName id + ; mod_name <- getModuleName + ; let descr = closureDescription dflags mod_name name + closure_info = mkClosureInfo dflags True id lf_info 0 0 descr + + -- We don't generate the static closure here, because we might + -- want to add references to static closures to it later. The + -- static closure is generated by CmmBuildInfoTables.updInfoSRTs, + -- See Note [SRTs], specifically the [FUN] optimisation. 
+ + ; let fv_details :: [(NonVoid Id, ByteOff)] + header = if isLFThunk lf_info then ThunkHeader else StdHeader + (_, _, fv_details) = mkVirtHeapOffsets dflags header [] + -- Don't drop the non-void args until the closure info has been made + ; forkClosureBody (closureCodeBody True id closure_info ccs + (nonVoidIds args) (length args) body fv_details) + + ; return () } + + unLit (CmmLit l) = l + unLit _ = panic "unLit" + +------------------------------------------------------------------------ +-- Non-top-level bindings +------------------------------------------------------------------------ + +cgBind :: CgStgBinding -> FCode () +cgBind (StgNonRec name rhs) + = do { (info, fcode) <- cgRhs name rhs + ; addBindC info + ; init <- fcode + ; emit init } + -- init cannot be used in body, so slightly better to sink it eagerly + +cgBind (StgRec pairs) + = do { r <- sequence $ unzipWith cgRhs pairs + ; let (id_infos, fcodes) = unzip r + ; addBindsC id_infos + ; (inits, body) <- getCodeR $ sequence fcodes + ; emit (catAGraphs inits <*> body) } + +{- Note [cgBind rec] + + Recursive let-bindings are tricky. + Consider the following pseudocode: + + let x = \_ -> ... y ... + y = \_ -> ... z ... + z = \_ -> ... x ... + in ... + + For each binding, we need to allocate a closure, and each closure must + capture the address of the other closures. + We want to generate the following C-- code: + // Initialization Code + x = hp - 24; // heap address of x's closure + y = hp - 40; // heap address of x's closure + z = hp - 64; // heap address of x's closure + // allocate and initialize x + m[hp-8] = ... + m[hp-16] = y // the closure for x captures y + m[hp-24] = x_info; + // allocate and initialize y + m[hp-32] = z; // the closure for y captures z + m[hp-40] = y_info; + // allocate and initialize z + ... 
+ + For each closure, we must generate not only the code to allocate and + initialize the closure itself, but also some initialization Code that + sets a variable holding the closure pointer. + + We could generate a pair of the (init code, body code), but since + the bindings are recursive we also have to initialise the + environment with the CgIdInfo for all the bindings before compiling + anything. So we do this in 3 stages: + + 1. collect all the CgIdInfos and initialise the environment + 2. compile each binding into (init, body) code + 3. emit all the inits, and then all the bodies + + We'd rather not have separate functions to do steps 1 and 2 for + each binding, since in pratice they share a lot of code. So we + have just one function, cgRhs, that returns a pair of the CgIdInfo + for step 1, and a monadic computation to generate the code in step + 2. + + The alternative to separating things in this way is to use a + fixpoint. That's what we used to do, but it introduces a + maintenance nightmare because there is a subtle dependency on not + being too strict everywhere. Doing things this way means that the + FCode monad can be strict, for example. 
+ -} + +cgRhs :: Id + -> CgStgRhs + -> FCode ( + CgIdInfo -- The info for this binding + , FCode CmmAGraph -- A computation which will generate the + -- code for the binding, and return an + -- assignent of the form "x = Hp - n" + -- (see above) + ) + +cgRhs id (StgRhsCon cc con args) + = withNewTickyCounterCon (idName id) $ + buildDynCon id True cc con (assertNonVoidStgArgs args) + -- con args are always non-void, + -- see Note [Post-unarisation invariants] in UnariseStg + +{- See Note [GC recovery] in compiler/GHC.StgToCmm/Closure.hs -} +cgRhs id (StgRhsClosure fvs cc upd_flag args body) + = do dflags <- getDynFlags + mkRhsClosure dflags id cc (nonVoidIds (dVarSetElems fvs)) upd_flag args body + +------------------------------------------------------------------------ +-- Non-constructor right hand sides +------------------------------------------------------------------------ + +mkRhsClosure :: DynFlags -> Id -> CostCentreStack + -> [NonVoid Id] -- Free vars + -> UpdateFlag + -> [Id] -- Args + -> CgStgExpr + -> FCode (CgIdInfo, FCode CmmAGraph) + +{- mkRhsClosure looks for two special forms of the right-hand side: + a) selector thunks + b) AP thunks + +If neither happens, it just calls mkClosureLFInfo. You might think +that mkClosureLFInfo should do all this, but it seems wrong for the +latter to look at the structure of an expression + +Note [Selectors] +~~~~~~~~~~~~~~~~ +We look at the body of the closure to see if it's a selector---turgid, +but nothing deep. We are looking for a closure of {\em exactly} the +form: + +... = [the_fv] \ u [] -> + case the_fv of + con a_1 ... a_n -> a_i + +Note [Ap thunks] +~~~~~~~~~~~~~~~~ +A more generic AP thunk of the form + + x = [ x_1...x_n ] \.. [] -> x_1 ... x_n + +A set of these is compiled statically into the RTS, so we just use +those. We could extend the idea to thunks where some of the x_i are +global ids (and hence not free variables), but this would entail +generating a larger thunk. 
It might be an option for non-optimising +compilation, though. + +We only generate an Ap thunk if all the free variables are pointers, +for semi-obvious reasons. + +-} + +---------- Note [Selectors] ------------------ +mkRhsClosure dflags bndr _cc + [NonVoid the_fv] -- Just one free var + upd_flag -- Updatable thunk + [] -- A thunk + expr + | let strip = stripStgTicksTopE (not . tickishIsCode) + , StgCase (StgApp scrutinee [{-no args-}]) + _ -- ignore bndr + (AlgAlt _) + [(DataAlt _, params, sel_expr)] <- strip expr + , StgApp selectee [{-no args-}] <- strip sel_expr + , the_fv == scrutinee -- Scrutinee is the only free variable + + , let (_, _, params_w_offsets) = mkVirtConstrOffsets dflags (addIdReps (assertNonVoidIds params)) + -- pattern binders are always non-void, + -- see Note [Post-unarisation invariants] in UnariseStg + , Just the_offset <- assocMaybe params_w_offsets (NonVoid selectee) + + , let offset_into_int = bytesToWordsRoundUp dflags the_offset + - fixedHdrSizeW dflags + , offset_into_int <= mAX_SPEC_SELECTEE_SIZE dflags -- Offset is small enough + = -- NOT TRUE: ASSERT(is_single_constructor) + -- The simplifier may have statically determined that the single alternative + -- is the only possible case and eliminated the others, even if there are + -- other constructors in the datatype. It's still ok to make a selector + -- thunk in this case, because we *know* which constructor the scrutinee + -- will evaluate to. + -- + -- srt is discarded; it must be empty + let lf_info = mkSelectorLFInfo bndr offset_into_int (isUpdatable upd_flag) + in cgRhsStdThunk bndr lf_info [StgVarArg the_fv] + +---------- Note [Ap thunks] ------------------ +mkRhsClosure dflags bndr _cc + fvs + upd_flag + [] -- No args; a thunk + (StgApp fun_id args) + + -- We are looking for an "ApThunk"; see data con ApThunk in GHC.StgToCmm.Closure + -- of form (x1 x2 .... 
xn), where all the xi are locals (not top-level) + -- So the xi will all be free variables + | args `lengthIs` (n_fvs-1) -- This happens only if the fun_id and + -- args are all distinct local variables + -- The "-1" is for fun_id + -- Missed opportunity: (f x x) is not detected + , all (isGcPtrRep . idPrimRep . fromNonVoid) fvs + , isUpdatable upd_flag + , n_fvs <= mAX_SPEC_AP_SIZE dflags + , not (gopt Opt_SccProfilingOn dflags) + -- not when profiling: we don't want to + -- lose information about this particular + -- thunk (e.g. its type) (#949) + , idArity fun_id == unknownArity -- don't spoil a known call + + -- Ha! an Ap thunk + = cgRhsStdThunk bndr lf_info payload + + where + n_fvs = length fvs + lf_info = mkApLFInfo bndr upd_flag n_fvs + -- the payload has to be in the correct order, hence we can't + -- just use the fvs. + payload = StgVarArg fun_id : args + +---------- Default case ------------------ +mkRhsClosure dflags bndr cc fvs upd_flag args body + = do { let lf_info = mkClosureLFInfo dflags bndr NotTopLevel fvs upd_flag args + ; (id_info, reg) <- rhsIdInfo bndr lf_info + ; return (id_info, gen_code lf_info reg) } + where + gen_code lf_info reg + = do { -- LAY OUT THE OBJECT + -- If the binder is itself a free variable, then don't store + -- it in the closure. Instead, just bind it to Node on entry. + -- NB we can be sure that Node will point to it, because we + -- haven't told mkClosureLFInfo about this; so if the binder + -- _was_ a free var of its RHS, mkClosureLFInfo thinks it *is* + -- stored in the closure itself, so it will make sure that + -- Node points to it... 
+ ; let reduced_fvs = filter (NonVoid bndr /=) fvs + + -- MAKE CLOSURE INFO FOR THIS CLOSURE + ; mod_name <- getModuleName + ; dflags <- getDynFlags + ; let name = idName bndr + descr = closureDescription dflags mod_name name + fv_details :: [(NonVoid Id, ByteOff)] + header = if isLFThunk lf_info then ThunkHeader else StdHeader + (tot_wds, ptr_wds, fv_details) + = mkVirtHeapOffsets dflags header (addIdReps reduced_fvs) + closure_info = mkClosureInfo dflags False -- Not static + bndr lf_info tot_wds ptr_wds + descr + + -- BUILD ITS INFO TABLE AND CODE + ; forkClosureBody $ + -- forkClosureBody: (a) ensure that bindings in here are not seen elsewhere + -- (b) ignore Sequel from context; use empty Sequel + -- And compile the body + closureCodeBody False bndr closure_info cc (nonVoidIds args) + (length args) body fv_details + + -- BUILD THE OBJECT +-- ; (use_cc, blame_cc) <- chooseDynCostCentres cc args body + ; let use_cc = cccsExpr; blame_cc = cccsExpr + ; emit (mkComment $ mkFastString "calling allocDynClosure") + ; let toVarArg (NonVoid a, off) = (NonVoid (StgVarArg a), off) + ; let info_tbl = mkCmmInfo closure_info bndr currentCCS + ; hp_plus_n <- allocDynClosure (Just bndr) info_tbl lf_info use_cc blame_cc + (map toVarArg fv_details) + + -- RETURN + ; return (mkRhsInit dflags reg lf_info hp_plus_n) } + +------------------------- +cgRhsStdThunk + :: Id + -> LambdaFormInfo + -> [StgArg] -- payload + -> FCode (CgIdInfo, FCode CmmAGraph) + +cgRhsStdThunk bndr lf_info payload + = do { (id_info, reg) <- rhsIdInfo bndr lf_info + ; return (id_info, gen_code reg) + } + where + gen_code reg -- AHA! 
A STANDARD-FORM THUNK + = withNewTickyCounterStdThunk (lfUpdatable lf_info) (idName bndr) $ + do + { -- LAY OUT THE OBJECT + mod_name <- getModuleName + ; dflags <- getDynFlags + ; let header = if isLFThunk lf_info then ThunkHeader else StdHeader + (tot_wds, ptr_wds, payload_w_offsets) + = mkVirtHeapOffsets dflags header + (addArgReps (nonVoidStgArgs payload)) + + descr = closureDescription dflags mod_name (idName bndr) + closure_info = mkClosureInfo dflags False -- Not static + bndr lf_info tot_wds ptr_wds + descr + +-- ; (use_cc, blame_cc) <- chooseDynCostCentres cc [{- no args-}] body + ; let use_cc = cccsExpr; blame_cc = cccsExpr + + + -- BUILD THE OBJECT + ; let info_tbl = mkCmmInfo closure_info bndr currentCCS + ; hp_plus_n <- allocDynClosure (Just bndr) info_tbl lf_info + use_cc blame_cc payload_w_offsets + + -- RETURN + ; return (mkRhsInit dflags reg lf_info hp_plus_n) } + + +mkClosureLFInfo :: DynFlags + -> Id -- The binder + -> TopLevelFlag -- True of top level + -> [NonVoid Id] -- Free vars + -> UpdateFlag -- Update flag + -> [Id] -- Args + -> LambdaFormInfo +mkClosureLFInfo dflags bndr top fvs upd_flag args + | null args = + mkLFThunk (idType bndr) top (map fromNonVoid fvs) upd_flag + | otherwise = + mkLFReEntrant top (map fromNonVoid fvs) args (mkArgDescr dflags args) + + +------------------------------------------------------------------------ +-- The code for closures +------------------------------------------------------------------------ + +closureCodeBody :: Bool -- whether this is a top-level binding + -> Id -- the closure's name + -> ClosureInfo -- Lots of information about this closure + -> CostCentreStack -- Optional cost centre attached to closure + -> [NonVoid Id] -- incoming args to the closure + -> Int -- arity, including void args + -> CgStgExpr + -> [(NonVoid Id, ByteOff)] -- the closure's free vars + -> FCode () + +{- There are two main cases for the code for closures. 
+ +* If there are *no arguments*, then the closure is a thunk, and not in + normal form. So it should set up an update frame (if it is + shared). NB: Thunks cannot have a primitive type! + +* If there is *at least one* argument, then this closure is in + normal form, so there is no need to set up an update frame. +-} + +closureCodeBody top_lvl bndr cl_info cc _args arity body fv_details + | arity == 0 -- No args i.e. thunk + = withNewTickyCounterThunk + (isStaticClosure cl_info) + (closureUpdReqd cl_info) + (closureName cl_info) $ + emitClosureProcAndInfoTable top_lvl bndr lf_info info_tbl [] $ + \(_, node, _) -> thunkCode cl_info fv_details cc node arity body + where + lf_info = closureLFInfo cl_info + info_tbl = mkCmmInfo cl_info bndr cc + +closureCodeBody top_lvl bndr cl_info cc args arity body fv_details + = -- Note: args may be [], if all args are Void + withNewTickyCounterFun + (closureSingleEntry cl_info) + (closureName cl_info) + args $ do { + + ; let + lf_info = closureLFInfo cl_info + info_tbl = mkCmmInfo cl_info bndr cc + + -- Emit the main entry code + ; emitClosureProcAndInfoTable top_lvl bndr lf_info info_tbl args $ + \(_offset, node, arg_regs) -> do + -- Emit slow-entry code (for entering a closure through a PAP) + { mkSlowEntryCode bndr cl_info arg_regs + ; dflags <- getDynFlags + ; let node_points = nodeMustPointToIt dflags lf_info + node' = if node_points then Just node else Nothing + ; loop_header_id <- newBlockId + -- Extend reader monad with information that + -- self-recursive tail calls can be optimized into local + -- jumps. See Note [Self-recursive tail calls] in GHC.StgToCmm.Expr. 
+ ; withSelfLoop (bndr, loop_header_id, arg_regs) $ do + { + -- Main payload + ; entryHeapCheck cl_info node' arity arg_regs $ do + { -- emit LDV code when profiling + when node_points (ldvEnterClosure cl_info (CmmLocal node)) + -- ticky after heap check to avoid double counting + ; tickyEnterFun cl_info + ; enterCostCentreFun cc + (CmmMachOp (mo_wordSub dflags) + [ CmmReg (CmmLocal node) -- See [NodeReg clobbered with loopification] + , mkIntExpr dflags (funTag dflags cl_info) ]) + ; fv_bindings <- mapM bind_fv fv_details + -- Load free vars out of closure *after* + -- heap check, to reduce live vars over check + ; when node_points $ load_fvs node lf_info fv_bindings + ; void $ cgExpr body + }}} + + } + +-- Note [NodeReg clobbered with loopification] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- Previously we used to pass nodeReg (aka R1) here. With profiling, upon +-- entering a closure, enterFunCCS was called with R1 passed to it. But since R1 +-- may get clobbered inside the body of a closure, and since a self-recursive +-- tail call does not restore R1, a subsequent call to enterFunCCS received a +-- possibly bogus value from R1. The solution is to not pass nodeReg (aka R1) to +-- enterFunCCS. Instead, we pass node, the callee-saved temporary that stores +-- the original value of R1. This way R1 may get modified but loopification will +-- not care. + +-- A function closure pointer may be tagged, so we +-- must take it into account when accessing the free variables. +bind_fv :: (NonVoid Id, ByteOff) -> FCode (LocalReg, ByteOff) +bind_fv (id, off) = do { reg <- rebindToReg id; return (reg, off) } + +load_fvs :: LocalReg -> LambdaFormInfo -> [(LocalReg, ByteOff)] -> FCode () +load_fvs node lf_info = mapM_ (\ (reg, off) -> + do dflags <- getDynFlags + let tag = lfDynTag dflags lf_info + emit $ mkTaggedObjectLoad dflags reg node off tag) + +----------------------------------------- +-- The "slow entry" code for a function. 
This entry point takes its +-- arguments on the stack. It loads the arguments into registers +-- according to the calling convention, and jumps to the function's +-- normal entry point. The function's closure is assumed to be in +-- R1/node. +-- +-- The slow entry point is used for unknown calls: eg. stg_PAP_entry + +mkSlowEntryCode :: Id -> ClosureInfo -> [LocalReg] -> FCode () +-- If this function doesn't have a specialised ArgDescr, we need +-- to generate the function's arg bitmap and slow-entry code. +-- Here, we emit the slow-entry code. +mkSlowEntryCode bndr cl_info arg_regs -- function closure is already in `Node' + | Just (_, ArgGen _) <- closureFunInfo cl_info + = do dflags <- getDynFlags + let node = idToReg dflags (NonVoid bndr) + slow_lbl = closureSlowEntryLabel cl_info + fast_lbl = closureLocalEntryLabel dflags cl_info + -- mkDirectJump does not clobber `Node' containing function closure + jump = mkJump dflags NativeNodeCall + (mkLblExpr fast_lbl) + (map (CmmReg . CmmLocal) (node : arg_regs)) + (initUpdFrameOff dflags) + tscope <- getTickScope + emitProcWithConvention Slow Nothing slow_lbl + (node : arg_regs) (jump, tscope) + | otherwise = return () + +----------------------------------------- +thunkCode :: ClosureInfo -> [(NonVoid Id, ByteOff)] -> CostCentreStack + -> LocalReg -> Int -> CgStgExpr -> FCode () +thunkCode cl_info fv_details _cc node arity body + = do { dflags <- getDynFlags + ; let node_points = nodeMustPointToIt dflags (closureLFInfo cl_info) + node' = if node_points then Just node else Nothing + ; ldvEnterClosure cl_info (CmmLocal node) -- NB: Node always points when profiling + + -- Heap overflow check + ; entryHeapCheck cl_info node' arity [] $ do + { -- Overwrite with black hole if necessary + -- but *after* the heap-overflow check + ; tickyEnterThunk cl_info + ; when (blackHoleOnEntry cl_info && node_points) + (blackHoleIt node) + + -- Push update frame + ; setupUpdate cl_info node $ + -- We only enter cc after setting up update 
so + -- that cc of enclosing scope will be recorded + -- in update frame CAF/DICT functions will be + -- subsumed by this enclosing cc + do { enterCostCentreThunk (CmmReg nodeReg) + ; let lf_info = closureLFInfo cl_info + ; fv_bindings <- mapM bind_fv fv_details + ; load_fvs node lf_info fv_bindings + ; void $ cgExpr body }}} + + +------------------------------------------------------------------------ +-- Update and black-hole wrappers +------------------------------------------------------------------------ + +blackHoleIt :: LocalReg -> FCode () +-- Only called for closures with no args +-- Node points to the closure +blackHoleIt node_reg + = emitBlackHoleCode (CmmReg (CmmLocal node_reg)) + +emitBlackHoleCode :: CmmExpr -> FCode () +emitBlackHoleCode node = do + dflags <- getDynFlags + + -- Eager blackholing is normally disabled, but can be turned on with + -- -feager-blackholing. When it is on, we replace the info pointer + -- of the thunk with stg_EAGER_BLACKHOLE_info on entry. + + -- If we wanted to do eager blackholing with slop filling, we'd need + -- to do it at the *end* of a basic block, otherwise we overwrite + -- the free variables in the thunk that we still need. We have a + -- patch for this from Andy Cheadle, but not incorporated yet. --SDM + -- [6/2004] + -- + -- Previously, eager blackholing was enabled when ticky-ticky was + -- on. But it didn't work, and it wasn't strictly necessary to bring + -- back minimal ticky-ticky, so now EAGER_BLACKHOLING is + -- unconditionally disabled. -- krc 1/2007 + + -- Note the eager-blackholing check is here rather than in blackHoleOnEntry, + -- because emitBlackHoleCode is called from CmmParse. + + let eager_blackholing = not (gopt Opt_SccProfilingOn dflags) + && gopt Opt_EagerBlackHoling dflags + -- Profiling needs slop filling (to support LDV + -- profiling), so currently eager blackholing doesn't + -- work with profiling. 
+ + when eager_blackholing $ do + emitStore (cmmOffsetW dflags node (fixedHdrSizeW dflags)) currentTSOExpr + -- See Note [Heap memory barriers] in SMP.h. + emitPrimCall [] MO_WriteBarrier [] + emitStore node (CmmReg (CmmGlobal EagerBlackholeInfo)) + +setupUpdate :: ClosureInfo -> LocalReg -> FCode () -> FCode () + -- Nota Bene: this function does not change Node (even if it's a CAF), + -- so that the cost centre in the original closure can still be + -- extracted by a subsequent enterCostCentre +setupUpdate closure_info node body + | not (lfUpdatable (closureLFInfo closure_info)) + = body + + | not (isStaticClosure closure_info) + = if not (closureUpdReqd closure_info) + then do tickyUpdateFrameOmitted; body + else do + tickyPushUpdateFrame + dflags <- getDynFlags + let + bh = blackHoleOnEntry closure_info && + not (gopt Opt_SccProfilingOn dflags) && + gopt Opt_EagerBlackHoling dflags + + lbl | bh = mkBHUpdInfoLabel + | otherwise = mkUpdInfoLabel + + pushUpdateFrame lbl (CmmReg (CmmLocal node)) body + + | otherwise -- A static closure + = do { tickyUpdateBhCaf closure_info + + ; if closureUpdReqd closure_info + then do -- Blackhole the (updatable) CAF: + { upd_closure <- link_caf node + ; pushUpdateFrame mkBHUpdInfoLabel upd_closure body } + else do {tickyUpdateFrameOmitted; body} + } + +----------------------------------------------------------------------------- +-- Setting up update frames + +-- Push the update frame on the stack in the Entry area, +-- leaving room for the return address that is already +-- at the old end of the area. 
+-- +pushUpdateFrame :: CLabel -> CmmExpr -> FCode () -> FCode () +pushUpdateFrame lbl updatee body + = do + updfr <- getUpdFrameOff + dflags <- getDynFlags + let + hdr = fixedHdrSize dflags + frame = updfr + hdr + sIZEOF_StgUpdateFrame_NoHdr dflags + -- + emitUpdateFrame dflags (CmmStackSlot Old frame) lbl updatee + withUpdFrameOff frame body + +emitUpdateFrame :: DynFlags -> CmmExpr -> CLabel -> CmmExpr -> FCode () +emitUpdateFrame dflags frame lbl updatee = do + let + hdr = fixedHdrSize dflags + off_updatee = hdr + oFFSET_StgUpdateFrame_updatee dflags + -- + emitStore frame (mkLblExpr lbl) + emitStore (cmmOffset dflags frame off_updatee) updatee + initUpdFrameProf frame + +----------------------------------------------------------------------------- +-- Entering a CAF +-- +-- See Note [CAF management] in rts/sm/Storage.c + +link_caf :: LocalReg -- pointer to the closure + -> FCode CmmExpr -- Returns amode for closure to be updated +-- This function returns the address of the black hole, so it can be +-- updated with the new value when available. 
+link_caf node = do + { dflags <- getDynFlags + -- Call the RTS function newCAF, returning the newly-allocated + -- blackhole indirection closure + ; let newCAF_lbl = mkForeignLabel (fsLit "newCAF") Nothing + ForeignLabelInExternalPackage IsFunction + ; bh <- newTemp (bWord dflags) + ; emitRtsCallGen [(bh,AddrHint)] newCAF_lbl + [ (baseExpr, AddrHint), + (CmmReg (CmmLocal node), AddrHint) ] + False + + -- see Note [atomic CAF entry] in rts/sm/Storage.c + ; updfr <- getUpdFrameOff + ; let target = entryCode dflags (closureInfoPtr dflags (CmmReg (CmmLocal node))) + ; emit =<< mkCmmIfThen + (cmmEqWord dflags (CmmReg (CmmLocal bh)) (zeroExpr dflags)) + -- re-enter the CAF + (mkJump dflags NativeNodeCall target [] updfr) + + ; return (CmmReg (CmmLocal bh)) } + +------------------------------------------------------------------------ +-- Profiling +------------------------------------------------------------------------ + +-- For "global" data constructors the description is simply occurrence +-- name of the data constructor itself. Otherwise it is determined by +-- @closureDescription@ from the let binding information. + +closureDescription :: DynFlags + -> Module -- Module + -> Name -- Id of closure binding + -> String + -- Not called for StgRhsCon which have global info tables built in + -- CgConTbls.hs with a description generated from the data constructor +closureDescription dflags mod_name name + = showSDocDump dflags (char '<' <> + (if isExternalName name + then ppr name -- ppr will include the module name prefix + else pprModule mod_name <> char '.' <> ppr name) <> + char '>') + -- showSDocDump, because we want to see the unique on the Name. 
diff --git a/compiler/GHC/StgToCmm/Bind.hs-boot b/compiler/GHC/StgToCmm/Bind.hs-boot new file mode 100644 index 0000000000..d16c34ebd3 --- /dev/null +++ b/compiler/GHC/StgToCmm/Bind.hs-boot @@ -0,0 +1,6 @@ +module GHC.StgToCmm.Bind where + +import GHC.StgToCmm.Monad( FCode ) +import StgSyn( CgStgBinding ) + +cgBind :: CgStgBinding -> FCode () diff --git a/compiler/GHC/StgToCmm/CgUtils.hs b/compiler/GHC/StgToCmm/CgUtils.hs new file mode 100644 index 0000000000..f3dccd9745 --- /dev/null +++ b/compiler/GHC/StgToCmm/CgUtils.hs @@ -0,0 +1,186 @@ +{-# LANGUAGE GADTs #-} + +----------------------------------------------------------------------------- +-- +-- Code generator utilities; mostly monadic +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.CgUtils ( + fixStgRegisters, + baseRegOffset, + get_Regtable_addr_from_offset, + regTableOffset, + get_GlobalReg_addr, + ) where + +import GhcPrelude + +import GHC.Platform.Regs +import Cmm +import Hoopl.Block +import Hoopl.Graph +import CmmUtils +import CLabel +import DynFlags +import Outputable + +-- ----------------------------------------------------------------------------- +-- Information about global registers + +baseRegOffset :: DynFlags -> GlobalReg -> Int + +baseRegOffset dflags (VanillaReg 1 _) = oFFSET_StgRegTable_rR1 dflags +baseRegOffset dflags (VanillaReg 2 _) = oFFSET_StgRegTable_rR2 dflags +baseRegOffset dflags (VanillaReg 3 _) = oFFSET_StgRegTable_rR3 dflags +baseRegOffset dflags (VanillaReg 4 _) = oFFSET_StgRegTable_rR4 dflags +baseRegOffset dflags (VanillaReg 5 _) = oFFSET_StgRegTable_rR5 dflags +baseRegOffset dflags (VanillaReg 6 _) = oFFSET_StgRegTable_rR6 dflags +baseRegOffset dflags (VanillaReg 7 _) = oFFSET_StgRegTable_rR7 dflags +baseRegOffset dflags (VanillaReg 8 _) = oFFSET_StgRegTable_rR8 dflags +baseRegOffset dflags (VanillaReg 9 _) = oFFSET_StgRegTable_rR9 dflags +baseRegOffset dflags (VanillaReg 
10 _) = oFFSET_StgRegTable_rR10 dflags +baseRegOffset _ (VanillaReg n _) = panic ("Registers above R10 are not supported (tried to use R" ++ show n ++ ")") +baseRegOffset dflags (FloatReg 1) = oFFSET_StgRegTable_rF1 dflags +baseRegOffset dflags (FloatReg 2) = oFFSET_StgRegTable_rF2 dflags +baseRegOffset dflags (FloatReg 3) = oFFSET_StgRegTable_rF3 dflags +baseRegOffset dflags (FloatReg 4) = oFFSET_StgRegTable_rF4 dflags +baseRegOffset dflags (FloatReg 5) = oFFSET_StgRegTable_rF5 dflags +baseRegOffset dflags (FloatReg 6) = oFFSET_StgRegTable_rF6 dflags +baseRegOffset _ (FloatReg n) = panic ("Registers above F6 are not supported (tried to use F" ++ show n ++ ")") +baseRegOffset dflags (DoubleReg 1) = oFFSET_StgRegTable_rD1 dflags +baseRegOffset dflags (DoubleReg 2) = oFFSET_StgRegTable_rD2 dflags +baseRegOffset dflags (DoubleReg 3) = oFFSET_StgRegTable_rD3 dflags +baseRegOffset dflags (DoubleReg 4) = oFFSET_StgRegTable_rD4 dflags +baseRegOffset dflags (DoubleReg 5) = oFFSET_StgRegTable_rD5 dflags +baseRegOffset dflags (DoubleReg 6) = oFFSET_StgRegTable_rD6 dflags +baseRegOffset _ (DoubleReg n) = panic ("Registers above D6 are not supported (tried to use D" ++ show n ++ ")") +baseRegOffset dflags (XmmReg 1) = oFFSET_StgRegTable_rXMM1 dflags +baseRegOffset dflags (XmmReg 2) = oFFSET_StgRegTable_rXMM2 dflags +baseRegOffset dflags (XmmReg 3) = oFFSET_StgRegTable_rXMM3 dflags +baseRegOffset dflags (XmmReg 4) = oFFSET_StgRegTable_rXMM4 dflags +baseRegOffset dflags (XmmReg 5) = oFFSET_StgRegTable_rXMM5 dflags +baseRegOffset dflags (XmmReg 6) = oFFSET_StgRegTable_rXMM6 dflags +baseRegOffset _ (XmmReg n) = panic ("Registers above XMM6 are not supported (tried to use XMM" ++ show n ++ ")") +baseRegOffset dflags (YmmReg 1) = oFFSET_StgRegTable_rYMM1 dflags +baseRegOffset dflags (YmmReg 2) = oFFSET_StgRegTable_rYMM2 dflags +baseRegOffset dflags (YmmReg 3) = oFFSET_StgRegTable_rYMM3 dflags +baseRegOffset dflags (YmmReg 4) = oFFSET_StgRegTable_rYMM4 dflags +baseRegOffset dflags 
(YmmReg 5) = oFFSET_StgRegTable_rYMM5 dflags +baseRegOffset dflags (YmmReg 6) = oFFSET_StgRegTable_rYMM6 dflags +baseRegOffset _ (YmmReg n) = panic ("Registers above YMM6 are not supported (tried to use YMM" ++ show n ++ ")") +baseRegOffset dflags (ZmmReg 1) = oFFSET_StgRegTable_rZMM1 dflags +baseRegOffset dflags (ZmmReg 2) = oFFSET_StgRegTable_rZMM2 dflags +baseRegOffset dflags (ZmmReg 3) = oFFSET_StgRegTable_rZMM3 dflags +baseRegOffset dflags (ZmmReg 4) = oFFSET_StgRegTable_rZMM4 dflags +baseRegOffset dflags (ZmmReg 5) = oFFSET_StgRegTable_rZMM5 dflags +baseRegOffset dflags (ZmmReg 6) = oFFSET_StgRegTable_rZMM6 dflags +baseRegOffset _ (ZmmReg n) = panic ("Registers above ZMM6 are not supported (tried to use ZMM" ++ show n ++ ")") +baseRegOffset dflags Sp = oFFSET_StgRegTable_rSp dflags +baseRegOffset dflags SpLim = oFFSET_StgRegTable_rSpLim dflags +baseRegOffset dflags (LongReg 1) = oFFSET_StgRegTable_rL1 dflags +baseRegOffset _ (LongReg n) = panic ("Registers above L1 are not supported (tried to use L" ++ show n ++ ")") +baseRegOffset dflags Hp = oFFSET_StgRegTable_rHp dflags +baseRegOffset dflags HpLim = oFFSET_StgRegTable_rHpLim dflags +baseRegOffset dflags CCCS = oFFSET_StgRegTable_rCCCS dflags +baseRegOffset dflags CurrentTSO = oFFSET_StgRegTable_rCurrentTSO dflags +baseRegOffset dflags CurrentNursery = oFFSET_StgRegTable_rCurrentNursery dflags +baseRegOffset dflags HpAlloc = oFFSET_StgRegTable_rHpAlloc dflags +baseRegOffset dflags EagerBlackholeInfo = oFFSET_stgEagerBlackholeInfo dflags +baseRegOffset dflags GCEnter1 = oFFSET_stgGCEnter1 dflags +baseRegOffset dflags GCFun = oFFSET_stgGCFun dflags +baseRegOffset _ BaseReg = panic "CgUtils.baseRegOffset:BaseReg" +baseRegOffset _ PicBaseReg = panic "CgUtils.baseRegOffset:PicBaseReg" +baseRegOffset _ MachSp = panic "CgUtils.baseRegOffset:MachSp" +baseRegOffset _ UnwindReturnReg = panic "CgUtils.baseRegOffset:UnwindReturnReg" + + +-- ----------------------------------------------------------------------------- 
+-- +-- STG/Cmm GlobalReg +-- +-- ----------------------------------------------------------------------------- + +-- | We map STG registers onto appropriate CmmExprs. Either they map +-- to real machine registers or stored as offsets from BaseReg. Given +-- a GlobalReg, get_GlobalReg_addr always produces the +-- register table address for it. +get_GlobalReg_addr :: DynFlags -> GlobalReg -> CmmExpr +get_GlobalReg_addr dflags BaseReg = regTableOffset dflags 0 +get_GlobalReg_addr dflags mid + = get_Regtable_addr_from_offset dflags (baseRegOffset dflags mid) + +-- Calculate a literal representing an offset into the register table. +-- Used when we don't have an actual BaseReg to offset from. +regTableOffset :: DynFlags -> Int -> CmmExpr +regTableOffset dflags n = + CmmLit (CmmLabelOff mkMainCapabilityLabel (oFFSET_Capability_r dflags + n)) + +get_Regtable_addr_from_offset :: DynFlags -> Int -> CmmExpr +get_Regtable_addr_from_offset dflags offset = + if haveRegBase (targetPlatform dflags) + then CmmRegOff baseReg offset + else regTableOffset dflags offset + +-- | Fixup global registers so that they assign to locations within the +-- RegTable if they aren't pinned for the current target. 
+fixStgRegisters :: DynFlags -> RawCmmDecl -> RawCmmDecl +fixStgRegisters _ top@(CmmData _ _) = top + +fixStgRegisters dflags (CmmProc info lbl live graph) = + let graph' = modifyGraph (mapGraphBlocks (fixStgRegBlock dflags)) graph + in CmmProc info lbl live graph' + +fixStgRegBlock :: DynFlags -> Block CmmNode e x -> Block CmmNode e x +fixStgRegBlock dflags block = mapBlock (fixStgRegStmt dflags) block + +fixStgRegStmt :: DynFlags -> CmmNode e x -> CmmNode e x +fixStgRegStmt dflags stmt = fixAssign $ mapExpDeep fixExpr stmt + where + platform = targetPlatform dflags + + fixAssign stmt = + case stmt of + CmmAssign (CmmGlobal reg) src + -- MachSp isn't an STG register; it's merely here for tracking unwind + -- information + | reg == MachSp -> stmt + | otherwise -> + let baseAddr = get_GlobalReg_addr dflags reg + in case reg `elem` activeStgRegs (targetPlatform dflags) of + True -> CmmAssign (CmmGlobal reg) src + False -> CmmStore baseAddr src + other_stmt -> other_stmt + + fixExpr expr = case expr of + -- MachSp isn't an STG register; it's merely here for tracking unwind information + CmmReg (CmmGlobal MachSp) -> expr + CmmReg (CmmGlobal reg) -> + -- Replace register leaves with appropriate StixTrees for + -- the given target. MagicIds which map to a reg on this + -- arch are left unchanged. For the rest, BaseReg is taken + -- to mean the address of the reg table in MainCapability, + -- and for all others we generate an indirection to its + -- location in the register table. + case reg `elem` activeStgRegs platform of + True -> expr + False -> + let baseAddr = get_GlobalReg_addr dflags reg + in case reg of + BaseReg -> baseAddr + _other -> CmmLoad baseAddr (globalRegType dflags reg) + + CmmRegOff (CmmGlobal reg) offset -> + -- RegOff leaves are just a shorthand form. If the reg maps + -- to a real reg, we keep the shorthand, otherwise, we just + -- expand it and defer to the above code. 
+ case reg `elem` activeStgRegs platform of + True -> expr + False -> CmmMachOp (MO_Add (wordWidth dflags)) [ + fixExpr (CmmReg (CmmGlobal reg)), + CmmLit (CmmInt (fromIntegral offset) + (wordWidth dflags))] + + other_expr -> other_expr diff --git a/compiler/GHC/StgToCmm/Closure.hs b/compiler/GHC/StgToCmm/Closure.hs new file mode 100644 index 0000000000..b56b06f399 --- /dev/null +++ b/compiler/GHC/StgToCmm/Closure.hs @@ -0,0 +1,1008 @@ +{-# LANGUAGE CPP, RecordWildCards #-} + +----------------------------------------------------------------------------- +-- +-- Stg to C-- code generation: +-- +-- The types LambdaFormInfo +-- ClosureInfo +-- +-- Nothing monadic in here! +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Closure ( + DynTag, tagForCon, isSmallFamily, + + idPrimRep, isVoidRep, isGcPtrRep, addIdReps, addArgReps, + argPrimRep, + + NonVoid(..), fromNonVoid, nonVoidIds, nonVoidStgArgs, + assertNonVoidIds, assertNonVoidStgArgs, + + -- * LambdaFormInfo + LambdaFormInfo, -- Abstract + StandardFormInfo, -- ...ditto... 
+ mkLFThunk, mkLFReEntrant, mkConLFInfo, mkSelectorLFInfo, + mkApLFInfo, mkLFImported, mkLFArgument, mkLFLetNoEscape, + mkLFStringLit, + lfDynTag, + isLFThunk, isLFReEntrant, lfUpdatable, + + -- * Used by other modules + CgLoc(..), SelfLoopInfo, CallMethod(..), + nodeMustPointToIt, isKnownFun, funTag, tagForArity, getCallMethod, + + -- * ClosureInfo + ClosureInfo, + mkClosureInfo, + mkCmmInfo, + + -- ** Inspection + closureLFInfo, closureName, + + -- ** Labels + -- These just need the info table label + closureInfoLabel, staticClosureLabel, + closureSlowEntryLabel, closureLocalEntryLabel, + + -- ** Predicates + -- These are really just functions on LambdaFormInfo + closureUpdReqd, closureSingleEntry, + closureReEntrant, closureFunInfo, + isToplevClosure, + + blackHoleOnEntry, -- Needs LambdaFormInfo and SMRep + isStaticClosure, -- Needs SMRep + + -- * InfoTables + mkDataConInfoTable, + cafBlackHoleInfoTable, + indStaticInfoTable, + staticClosureNeedsLink, + ) where + +#include "../includes/MachDeps.h" + +#include "HsVersions.h" + +import GhcPrelude + +import StgSyn +import SMRep +import Cmm +import PprCmmExpr() -- For Outputable instances + +import CostCentre +import BlockId +import CLabel +import Id +import IdInfo +import DataCon +import Name +import Type +import TyCoRep +import TcType +import TyCon +import RepType +import BasicTypes +import Outputable +import DynFlags +import Util + +import Data.Coerce (coerce) +import qualified Data.ByteString.Char8 as BS8 + +----------------------------------------------------------------------------- +-- Data types and synonyms +----------------------------------------------------------------------------- + +-- These data types are mostly used by other modules, especially +-- GHC.StgToCmm.Monad, but we define them here because some functions in this +-- module need to have access to them as well + +data CgLoc + = CmmLoc CmmExpr -- A stable CmmExpr; that is, one not mentioning + -- Hp, so that it remains valid across calls + + 
| LneLoc BlockId [LocalReg] -- A join point + -- A join point (= let-no-escape) should only + -- be tail-called, and in a saturated way. + -- To tail-call it, assign to these locals, + -- and branch to the block id + +instance Outputable CgLoc where + ppr (CmmLoc e) = text "cmm" <+> ppr e + ppr (LneLoc b rs) = text "lne" <+> ppr b <+> ppr rs + +type SelfLoopInfo = (Id, BlockId, [LocalReg]) + +-- used by ticky profiling +isKnownFun :: LambdaFormInfo -> Bool +isKnownFun LFReEntrant{} = True +isKnownFun LFLetNoEscape = True +isKnownFun _ = False + + +------------------------------------- +-- Non-void types +------------------------------------- +-- We frequently need the invariant that an Id or an argument +-- is of a non-void type. This type is a witness to the invariant. + +newtype NonVoid a = NonVoid a + deriving (Eq, Show) + +fromNonVoid :: NonVoid a -> a +fromNonVoid (NonVoid a) = a + +instance (Outputable a) => Outputable (NonVoid a) where + ppr (NonVoid a) = ppr a + +nonVoidIds :: [Id] -> [NonVoid Id] +nonVoidIds ids = [NonVoid id | id <- ids, not (isVoidTy (idType id))] + +-- | Used in places where some invariant ensures that all these Ids are +-- non-void; e.g. constructor field binders in case expressions. +-- See Note [Post-unarisation invariants] in UnariseStg. +assertNonVoidIds :: [Id] -> [NonVoid Id] +assertNonVoidIds ids = ASSERT(not (any (isVoidTy . idType) ids)) + coerce ids + +nonVoidStgArgs :: [StgArg] -> [NonVoid StgArg] +nonVoidStgArgs args = [NonVoid arg | arg <- args, not (isVoidTy (stgArgType arg))] + +-- | Used in places where some invariant ensures that all these arguments are +-- non-void; e.g. constructor arguments. +-- See Note [Post-unarisation invariants] in UnariseStg. +assertNonVoidStgArgs :: [StgArg] -> [NonVoid StgArg] +assertNonVoidStgArgs args = ASSERT(not (any (isVoidTy . 
stgArgType) args)) + coerce args + + +----------------------------------------------------------------------------- +-- Representations +----------------------------------------------------------------------------- + +-- Why are these here? + +-- | Assumes that there is precisely one 'PrimRep' of the type. This assumption +-- holds after unarise. +-- See Note [Post-unarisation invariants] +idPrimRep :: Id -> PrimRep +idPrimRep id = typePrimRep1 (idType id) + -- See also Note [VoidRep] in RepType + +-- | Assumes that Ids have one PrimRep, which holds after unarisation. +-- See Note [Post-unarisation invariants] +addIdReps :: [NonVoid Id] -> [NonVoid (PrimRep, Id)] +addIdReps = map (\id -> let id' = fromNonVoid id + in NonVoid (idPrimRep id', id')) + +-- | Assumes that arguments have one PrimRep, which holds after unarisation. +-- See Note [Post-unarisation invariants] +addArgReps :: [NonVoid StgArg] -> [NonVoid (PrimRep, StgArg)] +addArgReps = map (\arg -> let arg' = fromNonVoid arg + in NonVoid (argPrimRep arg', arg')) + +-- | Assumes that the argument has one PrimRep, which holds after unarisation. +-- See Note [Post-unarisation invariants] +argPrimRep :: StgArg -> PrimRep +argPrimRep arg = typePrimRep1 (stgArgType arg) + + +----------------------------------------------------------------------------- +-- LambdaFormInfo +----------------------------------------------------------------------------- + +-- Information about an identifier, from the code generator's point of +-- view. Every identifier is bound to a LambdaFormInfo in the +-- environment, which gives the code generator enough info to be able to +-- tail call or return that identifier. + +data LambdaFormInfo + = LFReEntrant -- Reentrant closure (a function) + TopLevelFlag -- True if top level + OneShotInfo + !RepArity -- Arity. 
Invariant: always > 0 + !Bool -- True <=> no fvs + ArgDescr -- Argument descriptor (should really be in ClosureInfo) + + | LFThunk -- Thunk (zero arity) + TopLevelFlag + !Bool -- True <=> no free vars + !Bool -- True <=> updatable (i.e., *not* single-entry) + StandardFormInfo + !Bool -- True <=> *might* be a function type + + | LFCon -- A saturated constructor application + DataCon -- The constructor + + | LFUnknown -- Used for function arguments and imported things. + -- We know nothing about this closure. + -- Treat like updatable "LFThunk"... + -- Imported things which we *do* know something about use + -- one of the other LF constructors (eg LFReEntrant for + -- known functions) + !Bool -- True <=> *might* be a function type + -- The False case is good when we want to enter it, + -- because then we know the entry code will do + -- For a function, the entry code is the fast entry point + + | LFUnlifted -- A value of unboxed type; + -- always a value, needs evaluation + + | LFLetNoEscape -- See LetNoEscape module for precise description + + +------------------------- +-- StandardFormInfo tells whether this thunk has one of +-- a small number of standard forms + +data StandardFormInfo + = NonStandardThunk + -- The usual case: not of the standard forms + + | SelectorThunk + -- A SelectorThunk is of form + -- case x of + -- con a1,..,an -> ak + -- and the constructor is from a single-constr type. + WordOff -- 0-origin offset of ak within the "goods" of + -- constructor (Recall that the a1,...,an may be laid + -- out in the heap in a non-obvious order.) + + | ApThunk + -- An ApThunk is of form + -- x1 ... xn + -- The code for the thunk just pushes x2..xn on the stack and enters x1. + -- There are a few of these (for 1 <= n <= MAX_SPEC_AP_SIZE) pre-compiled + -- in the RTS to save space. 
+ RepArity -- Arity, n + + +------------------------------------------------------ +-- Building LambdaFormInfo +------------------------------------------------------ + +mkLFArgument :: Id -> LambdaFormInfo +mkLFArgument id + | isUnliftedType ty = LFUnlifted + | might_be_a_function ty = LFUnknown True + | otherwise = LFUnknown False + where + ty = idType id + +------------- +mkLFLetNoEscape :: LambdaFormInfo +mkLFLetNoEscape = LFLetNoEscape + +------------- +mkLFReEntrant :: TopLevelFlag -- True of top level + -> [Id] -- Free vars + -> [Id] -- Args + -> ArgDescr -- Argument descriptor + -> LambdaFormInfo + +mkLFReEntrant _ _ [] _ + = pprPanic "mkLFReEntrant" empty +mkLFReEntrant top fvs args arg_descr + = LFReEntrant top os_info (length args) (null fvs) arg_descr + where os_info = idOneShotInfo (head args) + +------------- +mkLFThunk :: Type -> TopLevelFlag -> [Id] -> UpdateFlag -> LambdaFormInfo +mkLFThunk thunk_ty top fvs upd_flag + = ASSERT( not (isUpdatable upd_flag) || not (isUnliftedType thunk_ty) ) + LFThunk top (null fvs) + (isUpdatable upd_flag) + NonStandardThunk + (might_be_a_function thunk_ty) + +-------------- +might_be_a_function :: Type -> Bool +-- Return False only if we are *sure* it's a data type +-- Look through newtypes etc as much as poss +might_be_a_function ty + | [LiftedRep] <- typePrimRep ty + , Just tc <- tyConAppTyCon_maybe (unwrapType ty) + , isDataTyCon tc + = False + | otherwise + = True + +------------- +mkConLFInfo :: DataCon -> LambdaFormInfo +mkConLFInfo con = LFCon con + +------------- +mkSelectorLFInfo :: Id -> Int -> Bool -> LambdaFormInfo +mkSelectorLFInfo id offset updatable + = LFThunk NotTopLevel False updatable (SelectorThunk offset) + (might_be_a_function (idType id)) + +------------- +mkApLFInfo :: Id -> UpdateFlag -> Arity -> LambdaFormInfo +mkApLFInfo id upd_flag arity + = LFThunk NotTopLevel (arity == 0) (isUpdatable upd_flag) (ApThunk arity) + (might_be_a_function (idType id)) + +------------- +mkLFImported :: Id -> 
LambdaFormInfo +mkLFImported id + | Just con <- isDataConWorkId_maybe id + , isNullaryRepDataCon con + = LFCon con -- An imported nullary constructor + -- We assume that the constructor is evaluated so that + -- the id really does point directly to the constructor + + | arity > 0 + = LFReEntrant TopLevel noOneShotInfo arity True (panic "arg_descr") + + | otherwise + = mkLFArgument id -- Not sure of exact arity + where + arity = idFunRepArity id + +------------- +mkLFStringLit :: LambdaFormInfo +mkLFStringLit = LFUnlifted + +----------------------------------------------------- +-- Dynamic pointer tagging +----------------------------------------------------- + +type DynTag = Int -- The tag on a *pointer* + -- (from the dynamic-tagging paper) + +-- Note [Data constructor dynamic tags] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- The family size of a data type (the number of constructors +-- or the arity of a function) can be either: +-- * small, if the family size < 2**tag_bits +-- * big, otherwise. +-- +-- Small families can have the constructor tag in the tag bits. +-- Big families only use the tag value 1 to represent evaluatedness. +-- We don't have very many tag bits: for example, we have 2 bits on +-- x86-32 and 3 bits on x86-64. 
+ +isSmallFamily :: DynFlags -> Int -> Bool +isSmallFamily dflags fam_size = fam_size <= mAX_PTR_TAG dflags + +tagForCon :: DynFlags -> DataCon -> DynTag +tagForCon dflags con + | isSmallFamily dflags fam_size = con_tag + | otherwise = 1 + where + con_tag = dataConTag con -- NB: 1-indexed + fam_size = tyConFamilySize (dataConTyCon con) + +tagForArity :: DynFlags -> RepArity -> DynTag +tagForArity dflags arity + | isSmallFamily dflags arity = arity + | otherwise = 0 + +lfDynTag :: DynFlags -> LambdaFormInfo -> DynTag +-- Return the tag in the low order bits of a variable bound +-- to this LambdaForm +lfDynTag dflags (LFCon con) = tagForCon dflags con +lfDynTag dflags (LFReEntrant _ _ arity _ _) = tagForArity dflags arity +lfDynTag _ _other = 0 + + +----------------------------------------------------------------------------- +-- Observing LambdaFormInfo +----------------------------------------------------------------------------- + +------------ +isLFThunk :: LambdaFormInfo -> Bool +isLFThunk (LFThunk {}) = True +isLFThunk _ = False + +isLFReEntrant :: LambdaFormInfo -> Bool +isLFReEntrant (LFReEntrant {}) = True +isLFReEntrant _ = False + +----------------------------------------------------------------------------- +-- Choosing SM reps +----------------------------------------------------------------------------- + +lfClosureType :: LambdaFormInfo -> ClosureTypeInfo +lfClosureType (LFReEntrant _ _ arity _ argd) = Fun arity argd +lfClosureType (LFCon con) = Constr (dataConTagZ con) + (dataConIdentity con) +lfClosureType (LFThunk _ _ _ is_sel _) = thunkClosureType is_sel +lfClosureType _ = panic "lfClosureType" + +thunkClosureType :: StandardFormInfo -> ClosureTypeInfo +thunkClosureType (SelectorThunk off) = ThunkSelector off +thunkClosureType _ = Thunk + +-- We *do* get non-updatable top-level thunks sometimes. eg. f = g +-- gets compiled to a jump to g (if g has non-zero arity), instead of +-- messing around with update frames and PAPs. 
We set the closure type +-- to FUN_STATIC in this case. + +----------------------------------------------------------------------------- +-- nodeMustPointToIt +----------------------------------------------------------------------------- + +nodeMustPointToIt :: DynFlags -> LambdaFormInfo -> Bool +-- If nodeMustPointToIt is true, then the entry convention for +-- this closure has R1 (the "Node" register) pointing to the +-- closure itself --- the "self" argument + +nodeMustPointToIt _ (LFReEntrant top _ _ no_fvs _) + = not no_fvs -- Certainly if it has fvs we need to point to it + || isNotTopLevel top -- See Note [GC recovery] + -- For lex_profiling we also access the cost centre for a + -- non-inherited (i.e. non-top-level) function. + -- The isNotTopLevel test above ensures this is ok. + +nodeMustPointToIt dflags (LFThunk top no_fvs updatable NonStandardThunk _) + = not no_fvs -- Self parameter + || isNotTopLevel top -- Note [GC recovery] + || updatable -- Need to push update frame + || gopt Opt_SccProfilingOn dflags + -- For the non-updatable (single-entry case): + -- + -- True if has fvs (in which case we need access to them, and we + -- should black-hole it) + -- or profiling (in which case we need to recover the cost centre + -- from inside it) ToDo: do we need this even for + -- top-level thunks? If not, + -- isNotTopLevel subsumes this + +nodeMustPointToIt _ (LFThunk {}) -- Node must point to a standard-form thunk + = True + +nodeMustPointToIt _ (LFCon _) = True + + -- Strictly speaking, the above two don't need Node to point + -- to it if the arity = 0. But this is a *really* unlikely + -- situation. If we know it's nil (say) and we are entering + -- it. Eg: let x = [] in x then we will certainly have inlined + -- x, since nil is a simple atom. So we gain little by not + -- having Node point to known zero-arity things. 
On the other + -- hand, we do lose something; Patrick's code for figuring out + -- when something has been updated but not entered relies on + -- having Node point to the result of an update. SLPJ + -- 27/11/92. + +nodeMustPointToIt _ (LFUnknown _) = True +nodeMustPointToIt _ LFUnlifted = False +nodeMustPointToIt _ LFLetNoEscape = False + +{- Note [GC recovery] +~~~~~~~~~~~~~~~~~~~~~ +If we a have a local let-binding (function or thunk) + let f = <body> in ... +AND <body> allocates, then the heap-overflow check needs to know how +to re-start the evaluation. It uses the "self" pointer to do this. +So even if there are no free variables in <body>, we still make +nodeMustPointToIt be True for non-top-level bindings. + +Why do any such bindings exist? After all, let-floating should have +floated them out. Well, a clever optimiser might leave one there to +avoid a space leak, deliberately recomputing a thunk. Also (and this +really does happen occasionally) let-floating may make a function f smaller +so it can be inlined, so now (f True) may generate a local no-fv closure. +This actually happened during bootstrapping GHC itself, with f=mkRdrFunBind +in TcGenDeriv.) -} + +----------------------------------------------------------------------------- +-- getCallMethod +----------------------------------------------------------------------------- + +{- The entry conventions depend on the type of closure being entered, +whether or not it has free variables, and whether we're running +sequentially or in parallel. 
+ +Closure Node Argument Enter +Characteristics Par Req'd Passing Via +--------------------------------------------------------------------------- +Unknown & no & yes & stack & node +Known fun (>1 arg), no fvs & no & no & registers & fast entry (enough args) + & slow entry (otherwise) +Known fun (>1 arg), fvs & no & yes & registers & fast entry (enough args) +0 arg, no fvs \r,\s & no & no & n/a & direct entry +0 arg, no fvs \u & no & yes & n/a & node +0 arg, fvs \r,\s,selector & no & yes & n/a & node +0 arg, fvs \r,\s & no & yes & n/a & direct entry +0 arg, fvs \u & no & yes & n/a & node +Unknown & yes & yes & stack & node +Known fun (>1 arg), no fvs & yes & no & registers & fast entry (enough args) + & slow entry (otherwise) +Known fun (>1 arg), fvs & yes & yes & registers & node +0 arg, fvs \r,\s,selector & yes & yes & n/a & node +0 arg, no fvs \r,\s & yes & no & n/a & direct entry +0 arg, no fvs \u & yes & yes & n/a & node +0 arg, fvs \r,\s & yes & yes & n/a & node +0 arg, fvs \u & yes & yes & n/a & node + +When black-holing, single-entry closures could also be entered via node +(rather than directly) to catch double-entry. -} + +data CallMethod + = EnterIt -- No args, not a function + + | JumpToIt BlockId [LocalReg] -- A join point or a header of a local loop + + | ReturnIt -- It's a value (function, unboxed value, + -- or constructor), so just return it. + + | SlowCall -- Unknown fun, or known fun with + -- too few args. 
+ + | DirectEntry -- Jump directly, with args in regs + CLabel -- The code label + RepArity -- Its arity + +getCallMethod :: DynFlags + -> Name -- Function being applied + -> Id -- Function Id used to chech if it can refer to + -- CAF's and whether the function is tail-calling + -- itself + -> LambdaFormInfo -- Its info + -> RepArity -- Number of available arguments + -> RepArity -- Number of them being void arguments + -> CgLoc -- Passed in from cgIdApp so that we can + -- handle let-no-escape bindings and self-recursive + -- tail calls using the same data constructor, + -- JumpToIt. This saves us one case branch in + -- cgIdApp + -> Maybe SelfLoopInfo -- can we perform a self-recursive tail call? + -> CallMethod + +getCallMethod dflags _ id _ n_args v_args _cg_loc + (Just (self_loop_id, block_id, args)) + | gopt Opt_Loopification dflags + , id == self_loop_id + , args `lengthIs` (n_args - v_args) + -- If these patterns match then we know that: + -- * loopification optimisation is turned on + -- * function is performing a self-recursive call in a tail position + -- * number of non-void parameters of the function matches functions arity. 
+ -- See Note [Self-recursive tail calls] and Note [Void arguments in + -- self-recursive tail calls] in GHC.StgToCmm.Expr for more details + = JumpToIt block_id args + +getCallMethod dflags name id (LFReEntrant _ _ arity _ _) n_args _v_args _cg_loc + _self_loop_info + | n_args == 0 -- No args at all + && not (gopt Opt_SccProfilingOn dflags) + -- See Note [Evaluating functions with profiling] in rts/Apply.cmm + = ASSERT( arity /= 0 ) ReturnIt + | n_args < arity = SlowCall -- Not enough args + | otherwise = DirectEntry (enterIdLabel dflags name (idCafInfo id)) arity + +getCallMethod _ _name _ LFUnlifted n_args _v_args _cg_loc _self_loop_info + = ASSERT( n_args == 0 ) ReturnIt + +getCallMethod _ _name _ (LFCon _) n_args _v_args _cg_loc _self_loop_info + = ASSERT( n_args == 0 ) ReturnIt + -- n_args=0 because it'd be ill-typed to apply a saturated + -- constructor application to anything + +getCallMethod dflags name id (LFThunk _ _ updatable std_form_info is_fun) + n_args _v_args _cg_loc _self_loop_info + | is_fun -- it *might* be a function, so we must "call" it (which is always safe) + = SlowCall -- We cannot just enter it [in eval/apply, the entry code + -- is the fast-entry code] + + -- Since is_fun is False, we are *definitely* looking at a data value + | updatable || gopt Opt_Ticky dflags -- to catch double entry + {- OLD: || opt_SMP + I decided to remove this, because in SMP mode it doesn't matter + if we enter the same thunk multiple times, so the optimisation + of jumping directly to the entry code is still valid. --SDM + -} + = EnterIt + + -- even a non-updatable selector thunk can be updated by the garbage + -- collector, so we must enter it. 
(#8817) + | SelectorThunk{} <- std_form_info + = EnterIt + + -- We used to have ASSERT( n_args == 0 ), but actually it is + -- possible for the optimiser to generate + -- let bot :: Int = error Int "urk" + -- in (bot `cast` unsafeCoerce Int (Int -> Int)) 3 + -- This happens as a result of the case-of-error transformation + -- So the right thing to do is just to enter the thing + + | otherwise -- Jump direct to code for single-entry thunks + = ASSERT( n_args == 0 ) + DirectEntry (thunkEntryLabel dflags name (idCafInfo id) std_form_info + updatable) 0 + +getCallMethod _ _name _ (LFUnknown True) _n_arg _v_args _cg_locs _self_loop_info + = SlowCall -- might be a function + +getCallMethod _ name _ (LFUnknown False) n_args _v_args _cg_loc _self_loop_info + = ASSERT2( n_args == 0, ppr name <+> ppr n_args ) + EnterIt -- Not a function + +getCallMethod _ _name _ LFLetNoEscape _n_args _v_args (LneLoc blk_id lne_regs) + _self_loop_info + = JumpToIt blk_id lne_regs + +getCallMethod _ _ _ _ _ _ _ _ = panic "Unknown call method" + +----------------------------------------------------------------------------- +-- Data types for closure information +----------------------------------------------------------------------------- + + +{- ClosureInfo: information about a binding + + We make a ClosureInfo for each let binding (both top level and not), + but not bindings for data constructors: for those we build a CmmInfoTable + directly (see mkDataConInfoTable). + + To a first approximation: + ClosureInfo = (LambdaFormInfo, CmmInfoTable) + + A ClosureInfo has enough information + a) to construct the info table itself, and build other things + related to the binding (e.g. slow entry points for a function) + b) to allocate a closure containing that info pointer (i.e. 
+ it knows the info table label) +-} + +data ClosureInfo + = ClosureInfo { + closureName :: !Name, -- The thing bound to this closure + -- we don't really need this field: it's only used in generating + -- code for ticky and profiling, and we could pass the information + -- around separately, but it doesn't do much harm to keep it here. + + closureLFInfo :: !LambdaFormInfo, -- NOTE: not an LFCon + -- this tells us about what the closure contains: it's right-hand-side. + + -- the rest is just an unpacked CmmInfoTable. + closureInfoLabel :: !CLabel, + closureSMRep :: !SMRep, -- representation used by storage mgr + closureProf :: !ProfilingInfo + } + +-- | Convert from 'ClosureInfo' to 'CmmInfoTable'. +mkCmmInfo :: ClosureInfo -> Id -> CostCentreStack -> CmmInfoTable +mkCmmInfo ClosureInfo {..} id ccs + = CmmInfoTable { cit_lbl = closureInfoLabel + , cit_rep = closureSMRep + , cit_prof = closureProf + , cit_srt = Nothing + , cit_clo = if isStaticRep closureSMRep + then Just (id,ccs) + else Nothing } + +-------------------------------------- +-- Building ClosureInfos +-------------------------------------- + +mkClosureInfo :: DynFlags + -> Bool -- Is static + -> Id + -> LambdaFormInfo + -> Int -> Int -- Total and pointer words + -> String -- String descriptor + -> ClosureInfo +mkClosureInfo dflags is_static id lf_info tot_wds ptr_wds val_descr + = ClosureInfo { closureName = name + , closureLFInfo = lf_info + , closureInfoLabel = info_lbl -- These three fields are + , closureSMRep = sm_rep -- (almost) an info table + , closureProf = prof } -- (we don't have an SRT yet) + where + name = idName id + sm_rep = mkHeapRep dflags is_static ptr_wds nonptr_wds (lfClosureType lf_info) + prof = mkProfilingInfo dflags id val_descr + nonptr_wds = tot_wds - ptr_wds + + info_lbl = mkClosureInfoTableLabel id lf_info + +-------------------------------------- +-- Other functions over ClosureInfo +-------------------------------------- + +-- Eager blackholing is normally disabled, but 
can be turned on with +-- -feager-blackholing. When it is on, we replace the info pointer of +-- the thunk with stg_EAGER_BLACKHOLE_info on entry. + +-- If we wanted to do eager blackholing with slop filling, +-- we'd need to do it at the *end* of a basic block, otherwise +-- we overwrite the free variables in the thunk that we still +-- need. We have a patch for this from Andy Cheadle, but not +-- incorporated yet. --SDM [6/2004] +-- +-- Previously, eager blackholing was enabled when ticky-ticky +-- was on. But it didn't work, and it wasn't strictly necessary +-- to bring back minimal ticky-ticky, so now EAGER_BLACKHOLING +-- is unconditionally disabled. -- krc 1/2007 +-- +-- Static closures are never themselves black-holed. + +blackHoleOnEntry :: ClosureInfo -> Bool +blackHoleOnEntry cl_info + | isStaticRep (closureSMRep cl_info) + = False -- Never black-hole a static closure + + | otherwise + = case closureLFInfo cl_info of + LFReEntrant {} -> False + LFLetNoEscape -> False + LFThunk _ _no_fvs upd _ _ -> upd -- See Note [Black-holing non-updatable thunks] + _other -> panic "blackHoleOnEntry" + +{- Note [Black-holing non-updatable thunks] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We must not black-hole non-updatable (single-entry) thunks otherwise +we run into issues like #10414. Specifically: + + * There is no reason to black-hole a non-updatable thunk: it should + not be competed for by multiple threads + + * It could, conceivably, cause a space leak if we don't black-hole + it, if there was a live but never-followed pointer pointing to it. + Let's hope that doesn't happen. + + * It is dangerous to black-hole a non-updatable thunk because + - is not updated (of course) + - hence, if it is black-holed and another thread tries to evaluate + it, that thread will block forever + This actually happened in #10414. So we do not black-hole + non-updatable thunks. + + * How could two threads evaluate the same non-updatable (single-entry) + thunk? 
See Reid Barton's example below. + + * Only eager blackholing could possibly black-hole a non-updatable + thunk, because lazy black-holing only affects thunks with an + update frame on the stack. + +Here is and example due to Reid Barton (#10414): + x = \u [] concat [[1], []] +with the following definitions, + + concat x = case x of + [] -> [] + (:) x xs -> (++) x (concat xs) + + (++) xs ys = case xs of + [] -> ys + (:) x rest -> (:) x ((++) rest ys) + +Where we use the syntax @\u []@ to denote an updatable thunk and @\s []@ to +denote a single-entry (i.e. non-updatable) thunk. After a thread evaluates @x@ +to WHNF and calls @(++)@ the heap will contain the following thunks, + + x = 1 : y + y = \u [] (++) [] z + z = \s [] concat [] + +Now that the stage is set, consider the follow evaluations by two racing threads +A and B, + + 1. Both threads enter @y@ before either is able to replace it with an + indirection + + 2. Thread A does the case analysis in @(++)@ and consequently enters @z@, + replacing it with a black-hole + + 3. At some later point thread B does the same case analysis and also attempts + to enter @z@. However, it finds that it has been replaced with a black-hole + so it blocks. + + 4. Thread A eventually finishes evaluating @z@ (to @[]@) and updates @y@ + accordingly. It does *not* update @z@, however, as it is single-entry. This + leaves Thread B blocked forever on a black-hole which will never be + updated. + +To avoid this sort of condition we never black-hole non-updatable thunks. 
+-} + +isStaticClosure :: ClosureInfo -> Bool +isStaticClosure cl_info = isStaticRep (closureSMRep cl_info) + +closureUpdReqd :: ClosureInfo -> Bool +closureUpdReqd ClosureInfo{ closureLFInfo = lf_info } = lfUpdatable lf_info + +lfUpdatable :: LambdaFormInfo -> Bool +lfUpdatable (LFThunk _ _ upd _ _) = upd +lfUpdatable _ = False + +closureSingleEntry :: ClosureInfo -> Bool +closureSingleEntry (ClosureInfo { closureLFInfo = LFThunk _ _ upd _ _}) = not upd +closureSingleEntry (ClosureInfo { closureLFInfo = LFReEntrant _ OneShotLam _ _ _}) = True +closureSingleEntry _ = False + +closureReEntrant :: ClosureInfo -> Bool +closureReEntrant (ClosureInfo { closureLFInfo = LFReEntrant {} }) = True +closureReEntrant _ = False + +closureFunInfo :: ClosureInfo -> Maybe (RepArity, ArgDescr) +closureFunInfo (ClosureInfo { closureLFInfo = lf_info }) = lfFunInfo lf_info + +lfFunInfo :: LambdaFormInfo -> Maybe (RepArity, ArgDescr) +lfFunInfo (LFReEntrant _ _ arity _ arg_desc) = Just (arity, arg_desc) +lfFunInfo _ = Nothing + +funTag :: DynFlags -> ClosureInfo -> DynTag +funTag dflags (ClosureInfo { closureLFInfo = lf_info }) + = lfDynTag dflags lf_info + +isToplevClosure :: ClosureInfo -> Bool +isToplevClosure (ClosureInfo { closureLFInfo = lf_info }) + = case lf_info of + LFReEntrant TopLevel _ _ _ _ -> True + LFThunk TopLevel _ _ _ _ -> True + _other -> False + +-------------------------------------- +-- Label generation +-------------------------------------- + +staticClosureLabel :: ClosureInfo -> CLabel +staticClosureLabel = toClosureLbl . closureInfoLabel + +closureSlowEntryLabel :: ClosureInfo -> CLabel +closureSlowEntryLabel = toSlowEntryLbl . closureInfoLabel + +closureLocalEntryLabel :: DynFlags -> ClosureInfo -> CLabel +closureLocalEntryLabel dflags + | tablesNextToCode dflags = toInfoLbl . closureInfoLabel + | otherwise = toEntryLbl . 
closureInfoLabel + +mkClosureInfoTableLabel :: Id -> LambdaFormInfo -> CLabel +mkClosureInfoTableLabel id lf_info + = case lf_info of + LFThunk _ _ upd_flag (SelectorThunk offset) _ + -> mkSelectorInfoLabel upd_flag offset + + LFThunk _ _ upd_flag (ApThunk arity) _ + -> mkApInfoTableLabel upd_flag arity + + LFThunk{} -> std_mk_lbl name cafs + LFReEntrant{} -> std_mk_lbl name cafs + _other -> panic "closureInfoTableLabel" + + where + name = idName id + + std_mk_lbl | is_local = mkLocalInfoTableLabel + | otherwise = mkInfoTableLabel + + cafs = idCafInfo id + is_local = isDataConWorkId id + -- Make the _info pointer for the implicit datacon worker + -- binding local. The reason we can do this is that importing + -- code always either uses the _closure or _con_info. By the + -- invariants in CorePrep anything else gets eta expanded. + + +thunkEntryLabel :: DynFlags -> Name -> CafInfo -> StandardFormInfo -> Bool -> CLabel +-- thunkEntryLabel is a local help function, not exported. It's used from +-- getCallMethod. 
+thunkEntryLabel dflags _thunk_id _ (ApThunk arity) upd_flag + = enterApLabel dflags upd_flag arity +thunkEntryLabel dflags _thunk_id _ (SelectorThunk offset) upd_flag + = enterSelectorLabel dflags upd_flag offset +thunkEntryLabel dflags thunk_id c _ _ + = enterIdLabel dflags thunk_id c + +enterApLabel :: DynFlags -> Bool -> Arity -> CLabel +enterApLabel dflags is_updatable arity + | tablesNextToCode dflags = mkApInfoTableLabel is_updatable arity + | otherwise = mkApEntryLabel is_updatable arity + +enterSelectorLabel :: DynFlags -> Bool -> WordOff -> CLabel +enterSelectorLabel dflags upd_flag offset + | tablesNextToCode dflags = mkSelectorInfoLabel upd_flag offset + | otherwise = mkSelectorEntryLabel upd_flag offset + +enterIdLabel :: DynFlags -> Name -> CafInfo -> CLabel +enterIdLabel dflags id c + | tablesNextToCode dflags = mkInfoTableLabel id c + | otherwise = mkEntryLabel id c + + +-------------------------------------- +-- Profiling +-------------------------------------- + +-- Profiling requires two pieces of information to be determined for +-- each closure's info table --- description and type. + +-- The description is stored directly in the @CClosureInfoTable@ when the +-- info table is built. + +-- The type is determined from the type information stored with the @Id@ +-- in the closure info using @closureTypeDescr@. 
+ +mkProfilingInfo :: DynFlags -> Id -> String -> ProfilingInfo +mkProfilingInfo dflags id val_descr + | not (gopt Opt_SccProfilingOn dflags) = NoProfilingInfo + | otherwise = ProfilingInfo ty_descr_w8 (BS8.pack val_descr) + where + ty_descr_w8 = BS8.pack (getTyDescription (idType id)) + +getTyDescription :: Type -> String +getTyDescription ty + = case (tcSplitSigmaTy ty) of { (_, _, tau_ty) -> + case tau_ty of + TyVarTy _ -> "*" + AppTy fun _ -> getTyDescription fun + TyConApp tycon _ -> getOccString tycon + FunTy {} -> '-' : fun_result tau_ty + ForAllTy _ ty -> getTyDescription ty + LitTy n -> getTyLitDescription n + CastTy ty _ -> getTyDescription ty + CoercionTy co -> pprPanic "getTyDescription" (ppr co) + } + where + fun_result (FunTy { ft_res = res }) = '>' : fun_result res + fun_result other = getTyDescription other + +getTyLitDescription :: TyLit -> String +getTyLitDescription l = + case l of + NumTyLit n -> show n + StrTyLit n -> show n + +-------------------------------------- +-- CmmInfoTable-related things +-------------------------------------- + +mkDataConInfoTable :: DynFlags -> DataCon -> Bool -> Int -> Int -> CmmInfoTable +mkDataConInfoTable dflags data_con is_static ptr_wds nonptr_wds + = CmmInfoTable { cit_lbl = info_lbl + , cit_rep = sm_rep + , cit_prof = prof + , cit_srt = Nothing + , cit_clo = Nothing } + where + name = dataConName data_con + info_lbl = mkConInfoTableLabel name NoCafRefs + sm_rep = mkHeapRep dflags is_static ptr_wds nonptr_wds cl_type + cl_type = Constr (dataConTagZ data_con) (dataConIdentity data_con) + -- We keep the *zero-indexed* tag in the srt_len field + -- of the info table of a data constructor. 
+ + prof | not (gopt Opt_SccProfilingOn dflags) = NoProfilingInfo + | otherwise = ProfilingInfo ty_descr val_descr + + ty_descr = BS8.pack $ occNameString $ getOccName $ dataConTyCon data_con + val_descr = BS8.pack $ occNameString $ getOccName data_con + +-- We need a black-hole closure info to pass to @allocDynClosure@ when we +-- want to allocate the black hole on entry to a CAF. + +cafBlackHoleInfoTable :: CmmInfoTable +cafBlackHoleInfoTable + = CmmInfoTable { cit_lbl = mkCAFBlackHoleInfoTableLabel + , cit_rep = blackHoleRep + , cit_prof = NoProfilingInfo + , cit_srt = Nothing + , cit_clo = Nothing } + +indStaticInfoTable :: CmmInfoTable +indStaticInfoTable + = CmmInfoTable { cit_lbl = mkIndStaticInfoLabel + , cit_rep = indStaticRep + , cit_prof = NoProfilingInfo + , cit_srt = Nothing + , cit_clo = Nothing } + +staticClosureNeedsLink :: Bool -> CmmInfoTable -> Bool +-- A static closure needs a link field to aid the GC when traversing +-- the static closure graph. But it only needs such a field if either +-- a) it has an SRT +-- b) it's a constructor with one or more pointer fields +-- In case (b), the constructor's fields themselves play the role +-- of the SRT. +staticClosureNeedsLink has_srt CmmInfoTable{ cit_rep = smrep } + | isConRep smrep = not (isStaticNoCafCon smrep) + | otherwise = has_srt diff --git a/compiler/GHC/StgToCmm/Con.hs b/compiler/GHC/StgToCmm/Con.hs new file mode 100644 index 0000000000..08508fbecc --- /dev/null +++ b/compiler/GHC/StgToCmm/Con.hs @@ -0,0 +1,285 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Stg to C--: code generation for constructors +-- +-- This module provides the support code for StgToCmm to deal with with +-- constructors on the RHSs of let(rec)s. 
+-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Con ( + cgTopRhsCon, buildDynCon, bindConArgs + ) where + +#include "HsVersions.h" + +import GhcPrelude + +import StgSyn +import CoreSyn ( AltCon(..) ) + +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Env +import GHC.StgToCmm.Heap +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Closure + +import CmmExpr +import CmmUtils +import CLabel +import MkGraph +import SMRep +import CostCentre +import Module +import DataCon +import DynFlags +import FastString +import Id +import RepType (countConRepArgs) +import Literal +import PrelInfo +import Outputable +import GHC.Platform +import Util +import MonadUtils (mapMaybeM) + +import Control.Monad +import Data.Char + + + +--------------------------------------------------------------- +-- Top-level constructors +--------------------------------------------------------------- + +cgTopRhsCon :: DynFlags + -> Id -- Name of thing bound to this RHS + -> DataCon -- Id + -> [NonVoid StgArg] -- Args + -> (CgIdInfo, FCode ()) +cgTopRhsCon dflags id con args = + let id_info = litIdInfo dflags id (mkConLFInfo con) (CmmLabel closure_label) + in (id_info, gen_code) + where + name = idName id + caffy = idCafInfo id -- any stgArgHasCafRefs args + closure_label = mkClosureLabel name caffy + + gen_code = + do { this_mod <- getModuleName + ; when (platformOS (targetPlatform dflags) == OSMinGW32) $ + -- Windows DLLs have a problem with static cross-DLL refs. 
+ MASSERT( not (isDllConApp dflags this_mod con (map fromNonVoid args)) ) + ; ASSERT( args `lengthIs` countConRepArgs con ) return () + + -- LAY IT OUT + ; let + (tot_wds, -- #ptr_wds + #nonptr_wds + ptr_wds, -- #ptr_wds + nv_args_w_offsets) = + mkVirtHeapOffsetsWithPadding dflags StdHeader (addArgReps args) + + mk_payload (Padding len _) = return (CmmInt 0 (widthFromBytes len)) + mk_payload (FieldOff arg _) = do + amode <- getArgAmode arg + case amode of + CmmLit lit -> return lit + _ -> panic "GHC.StgToCmm.Con.cgTopRhsCon" + + nonptr_wds = tot_wds - ptr_wds + + -- we're not really going to emit an info table, so having + -- to make a CmmInfoTable is a bit overkill, but mkStaticClosureFields + -- needs to poke around inside it. + info_tbl = mkDataConInfoTable dflags con True ptr_wds nonptr_wds + + + ; payload <- mapM mk_payload nv_args_w_offsets + -- NB1: nv_args_w_offsets is sorted into ptrs then non-ptrs + -- NB2: all the amodes should be Lits! + -- TODO (osa): Why? + + ; let closure_rep = mkStaticClosureFields + dflags + info_tbl + dontCareCCS -- Because it's static data + caffy -- Has CAF refs + payload + + -- BUILD THE OBJECT + ; emitDataLits closure_label closure_rep + + ; return () } + + +--------------------------------------------------------------- +-- Lay out and allocate non-top-level constructors +--------------------------------------------------------------- + +buildDynCon :: Id -- Name of the thing to which this constr will + -- be bound + -> Bool -- is it genuinely bound to that name, or just + -- for profiling? 
+ -> CostCentreStack -- Where to grab cost centre from; + -- current CCS if currentOrSubsumedCCS + -> DataCon -- The data constructor + -> [NonVoid StgArg] -- Its args + -> FCode (CgIdInfo, FCode CmmAGraph) + -- Return details about how to find it and initialization code +buildDynCon binder actually_bound cc con args + = do dflags <- getDynFlags + buildDynCon' dflags (targetPlatform dflags) binder actually_bound cc con args + + +buildDynCon' :: DynFlags + -> Platform + -> Id -> Bool + -> CostCentreStack + -> DataCon + -> [NonVoid StgArg] + -> FCode (CgIdInfo, FCode CmmAGraph) + +{- We used to pass a boolean indicating whether all the +args were of size zero, so we could use a static +constructor; but I concluded that it just isn't worth it. +Now I/O uses unboxed tuples there just aren't any constructors +with all size-zero args. + +The reason for having a separate argument, rather than looking at +the addr modes of the args is that we may be in a "knot", and +premature looking at the args will cause the compiler to black-hole! +-} + + +-------- buildDynCon': Nullary constructors -------------- +-- First we deal with the case of zero-arity constructors. They +-- will probably be unfolded, so we don't expect to see this case much, +-- if at all, but it does no harm, and sets the scene for characters. +-- +-- In the case of zero-arity constructors, or, more accurately, those +-- which have exclusively size-zero (VoidRep) args, we generate no code +-- at all. + +buildDynCon' dflags _ binder _ _cc con [] + | isNullaryRepDataCon con + = return (litIdInfo dflags binder (mkConLFInfo con) + (CmmLabel (mkClosureLabel (dataConName con) (idCafInfo binder))), + return mkNop) + +-------- buildDynCon': Charlike and Intlike constructors ----------- +{- The following three paragraphs about @Char@-like and @Int@-like +closures are obsolete, but I don't understand the details well enough +to properly word them, sorry. 
I've changed the treatment of @Char@s to +be analogous to @Int@s: only a subset is preallocated, because @Char@ +has now 31 bits. Only literals are handled here. -- Qrczak + +Now for @Char@-like closures. We generate an assignment of the +address of the closure to a temporary. It would be possible simply to +generate no code, and record the addressing mode in the environment, +but we'd have to be careful if the argument wasn't a constant --- so +for simplicity we just always assign to a temporary. + +Last special case: @Int@-like closures. We only special-case the +situation in which the argument is a literal in the range +@mIN_INTLIKE@..@mAX_INTLILKE@. NB: for @Char@-like closures we can +work with any old argument, but for @Int@-like ones the argument has +to be a literal. Reason: @Char@ like closures have an argument type +which is guaranteed in range. + +Because of this, we use can safely return an addressing mode. + +We don't support this optimisation when compiling into Windows DLLs yet +because they don't support cross package data references well. +-} + +buildDynCon' dflags platform binder _ _cc con [arg] + | maybeIntLikeCon con + , platformOS platform /= OSMinGW32 || not (positionIndependent dflags) + , NonVoid (StgLitArg (LitNumber LitNumInt val _)) <- arg + , val <= fromIntegral (mAX_INTLIKE dflags) -- Comparisons at type Integer! + , val >= fromIntegral (mIN_INTLIKE dflags) -- ...ditto... 
+ = do { let intlike_lbl = mkCmmClosureLabel rtsUnitId (fsLit "stg_INTLIKE") + val_int = fromIntegral val :: Int + offsetW = (val_int - mIN_INTLIKE dflags) * (fixedHdrSizeW dflags + 1) + -- INTLIKE closures consist of a header and one word payload + intlike_amode = cmmLabelOffW dflags intlike_lbl offsetW + ; return ( litIdInfo dflags binder (mkConLFInfo con) intlike_amode + , return mkNop) } + +buildDynCon' dflags platform binder _ _cc con [arg] + | maybeCharLikeCon con + , platformOS platform /= OSMinGW32 || not (positionIndependent dflags) + , NonVoid (StgLitArg (LitChar val)) <- arg + , let val_int = ord val :: Int + , val_int <= mAX_CHARLIKE dflags + , val_int >= mIN_CHARLIKE dflags + = do { let charlike_lbl = mkCmmClosureLabel rtsUnitId (fsLit "stg_CHARLIKE") + offsetW = (val_int - mIN_CHARLIKE dflags) * (fixedHdrSizeW dflags + 1) + -- CHARLIKE closures consist of a header and one word payload + charlike_amode = cmmLabelOffW dflags charlike_lbl offsetW + ; return ( litIdInfo dflags binder (mkConLFInfo con) charlike_amode + , return mkNop) } + +-------- buildDynCon': the general case ----------- +buildDynCon' dflags _ binder actually_bound ccs con args + = do { (id_info, reg) <- rhsIdInfo binder lf_info + ; return (id_info, gen_code reg) + } + where + lf_info = mkConLFInfo con + + gen_code reg + = do { let (tot_wds, ptr_wds, args_w_offsets) + = mkVirtConstrOffsets dflags (addArgReps args) + nonptr_wds = tot_wds - ptr_wds + info_tbl = mkDataConInfoTable dflags con False + ptr_wds nonptr_wds + ; let ticky_name | actually_bound = Just binder + | otherwise = Nothing + + ; hp_plus_n <- allocDynClosure ticky_name info_tbl lf_info + use_cc blame_cc args_w_offsets + ; return (mkRhsInit dflags reg lf_info hp_plus_n) } + where + use_cc -- cost-centre to stick in the object + | isCurrentCCS ccs = cccsExpr + | otherwise = panic "buildDynCon: non-current CCS not implemented" + + blame_cc = use_cc -- cost-centre on which to blame the alloc (same) + + 
+--------------------------------------------------------------- +-- Binding constructor arguments +--------------------------------------------------------------- + +bindConArgs :: AltCon -> LocalReg -> [NonVoid Id] -> FCode [LocalReg] +-- bindConArgs is called from cgAlt of a case +-- (bindConArgs con args) augments the environment with bindings for the +-- binders args, assuming that we have just returned from a 'case' which +-- found a con +bindConArgs (DataAlt con) base args + = ASSERT(not (isUnboxedTupleCon con)) + do dflags <- getDynFlags + let (_, _, args_w_offsets) = mkVirtConstrOffsets dflags (addIdReps args) + tag = tagForCon dflags con + + -- The binding below forces the masking out of the tag bits + -- when accessing the constructor field. + bind_arg :: (NonVoid Id, ByteOff) -> FCode (Maybe LocalReg) + bind_arg (arg@(NonVoid b), offset) + | isDeadBinder b -- See Note [Dead-binder optimisation] in GHC.StgToCmm.Expr + = return Nothing + | otherwise + = do { emit $ mkTaggedObjectLoad dflags (idToReg dflags arg) + base offset tag + ; Just <$> bindArgToReg arg } + + mapMaybeM bind_arg args_w_offsets + +bindConArgs _other_con _base args + = ASSERT( null args ) return [] diff --git a/compiler/GHC/StgToCmm/Env.hs b/compiler/GHC/StgToCmm/Env.hs new file mode 100644 index 0000000000..e32c6a1ecb --- /dev/null +++ b/compiler/GHC/StgToCmm/Env.hs @@ -0,0 +1,208 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Stg to C-- code generation: the binding environment +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- +module GHC.StgToCmm.Env ( + CgIdInfo, + + litIdInfo, lneIdInfo, rhsIdInfo, mkRhsInit, + idInfoToAmode, + + addBindC, addBindsC, + + bindArgsToRegs, bindToReg, rebindToReg, + bindArgToReg, idToReg, + getArgAmode, getNonVoidArgAmodes, + getCgIdInfo, + maybeLetNoEscape, + ) where + +#include "HsVersions.h" + +import 
GhcPrelude + +import TyCon +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Closure + +import CLabel + +import BlockId +import CmmExpr +import CmmUtils +import DynFlags +import Id +import MkGraph +import Name +import Outputable +import StgSyn +import Type +import TysPrim +import UniqFM +import Util +import VarEnv + +------------------------------------- +-- Manipulating CgIdInfo +------------------------------------- + +mkCgIdInfo :: Id -> LambdaFormInfo -> CmmExpr -> CgIdInfo +mkCgIdInfo id lf expr + = CgIdInfo { cg_id = id, cg_lf = lf + , cg_loc = CmmLoc expr } + +litIdInfo :: DynFlags -> Id -> LambdaFormInfo -> CmmLit -> CgIdInfo +litIdInfo dflags id lf lit + = CgIdInfo { cg_id = id, cg_lf = lf + , cg_loc = CmmLoc (addDynTag dflags (CmmLit lit) tag) } + where + tag = lfDynTag dflags lf + +lneIdInfo :: DynFlags -> Id -> [NonVoid Id] -> CgIdInfo +lneIdInfo dflags id regs + = CgIdInfo { cg_id = id, cg_lf = lf + , cg_loc = LneLoc blk_id (map (idToReg dflags) regs) } + where + lf = mkLFLetNoEscape + blk_id = mkBlockId (idUnique id) + + +rhsIdInfo :: Id -> LambdaFormInfo -> FCode (CgIdInfo, LocalReg) +rhsIdInfo id lf_info + = do dflags <- getDynFlags + reg <- newTemp (gcWord dflags) + return (mkCgIdInfo id lf_info (CmmReg (CmmLocal reg)), reg) + +mkRhsInit :: DynFlags -> LocalReg -> LambdaFormInfo -> CmmExpr -> CmmAGraph +mkRhsInit dflags reg lf_info expr + = mkAssign (CmmLocal reg) (addDynTag dflags expr (lfDynTag dflags lf_info)) + +idInfoToAmode :: CgIdInfo -> CmmExpr +-- Returns a CmmExpr for the *tagged* pointer +idInfoToAmode (CgIdInfo { cg_loc = CmmLoc e }) = e +idInfoToAmode cg_info + = pprPanic "idInfoToAmode" (ppr (cg_id cg_info)) -- LneLoc + +addDynTag :: DynFlags -> CmmExpr -> DynTag -> CmmExpr +-- A tag adds a byte offset to the pointer +addDynTag dflags expr tag = cmmOffsetB dflags expr tag + +maybeLetNoEscape :: CgIdInfo -> Maybe (BlockId, [LocalReg]) +maybeLetNoEscape (CgIdInfo { cg_loc = LneLoc blk_id args}) = Just (blk_id, 
args) +maybeLetNoEscape _other = Nothing + + + +--------------------------------------------------------- +-- The binding environment +-- +-- There are three basic routines, for adding (addBindC), +-- modifying(modifyBindC) and looking up (getCgIdInfo) bindings. +--------------------------------------------------------- + +addBindC :: CgIdInfo -> FCode () +addBindC stuff_to_bind = do + binds <- getBinds + setBinds $ extendVarEnv binds (cg_id stuff_to_bind) stuff_to_bind + +addBindsC :: [CgIdInfo] -> FCode () +addBindsC new_bindings = do + binds <- getBinds + let new_binds = foldl' (\ binds info -> extendVarEnv binds (cg_id info) info) + binds + new_bindings + setBinds new_binds + +getCgIdInfo :: Id -> FCode CgIdInfo +getCgIdInfo id + = do { dflags <- getDynFlags + ; local_binds <- getBinds -- Try local bindings first + ; case lookupVarEnv local_binds id of { + Just info -> return info ; + Nothing -> do { + + -- Should be imported; make up a CgIdInfo for it + let name = idName id + ; if isExternalName name then + let ext_lbl + | isUnliftedType (idType id) = + -- An unlifted external Id must refer to a top-level + -- string literal. See Note [Bytes label] in CLabel. 
+ ASSERT( idType id `eqType` addrPrimTy ) + mkBytesLabel name + | otherwise = mkClosureLabel name $ idCafInfo id + in return $ + litIdInfo dflags id (mkLFImported id) (CmmLabel ext_lbl) + else + cgLookupPanic id -- Bug + }}} + +cgLookupPanic :: Id -> FCode a +cgLookupPanic id + = do local_binds <- getBinds + pprPanic "GHC.StgToCmm.Env: variable not found" + (vcat [ppr id, + text "local binds for:", + pprUFM local_binds $ \infos -> + vcat [ ppr (cg_id info) | info <- infos ] + ]) + + +-------------------- +getArgAmode :: NonVoid StgArg -> FCode CmmExpr +getArgAmode (NonVoid (StgVarArg var)) = idInfoToAmode <$> getCgIdInfo var +getArgAmode (NonVoid (StgLitArg lit)) = CmmLit <$> cgLit lit + +getNonVoidArgAmodes :: [StgArg] -> FCode [CmmExpr] +-- NB: Filters out void args, +-- so the result list may be shorter than the argument list +getNonVoidArgAmodes [] = return [] +getNonVoidArgAmodes (arg:args) + | isVoidRep (argPrimRep arg) = getNonVoidArgAmodes args + | otherwise = do { amode <- getArgAmode (NonVoid arg) + ; amodes <- getNonVoidArgAmodes args + ; return ( amode : amodes ) } + + +------------------------------------------------------------------------ +-- Interface functions for binding and re-binding names +------------------------------------------------------------------------ + +bindToReg :: NonVoid Id -> LambdaFormInfo -> FCode LocalReg +-- Bind an Id to a fresh LocalReg +bindToReg nvid@(NonVoid id) lf_info + = do dflags <- getDynFlags + let reg = idToReg dflags nvid + addBindC (mkCgIdInfo id lf_info (CmmReg (CmmLocal reg))) + return reg + +rebindToReg :: NonVoid Id -> FCode LocalReg +-- Like bindToReg, but the Id is already in scope, so +-- get its LF info from the envt +rebindToReg nvid@(NonVoid id) + = do { info <- getCgIdInfo id + ; bindToReg nvid (cg_lf info) } + +bindArgToReg :: NonVoid Id -> FCode LocalReg +bindArgToReg nvid@(NonVoid id) = bindToReg nvid (mkLFArgument id) + +bindArgsToRegs :: [NonVoid Id] -> FCode [LocalReg] +bindArgsToRegs args = 
mapM bindArgToReg args + +idToReg :: DynFlags -> NonVoid Id -> LocalReg +-- Make a register from an Id, typically a function argument, +-- free variable, or case binder +-- +-- We re-use the Unique from the Id to make it easier to see what is going on +-- +-- By now the Ids should be uniquely named; else one would worry +-- about accidental collision +idToReg dflags (NonVoid id) + = LocalReg (idUnique id) + (primRepCmmType dflags (idPrimRep id)) diff --git a/compiler/GHC/StgToCmm/Expr.hs b/compiler/GHC/StgToCmm/Expr.hs new file mode 100644 index 0000000000..59cd246441 --- /dev/null +++ b/compiler/GHC/StgToCmm/Expr.hs @@ -0,0 +1,992 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Stg to C-- code generation: expressions +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Expr ( cgExpr ) where + +#include "HsVersions.h" + +import GhcPrelude hiding ((<*>)) + +import {-# SOURCE #-} GHC.StgToCmm.Bind ( cgBind ) + +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Heap +import GHC.StgToCmm.Env +import GHC.StgToCmm.Con +import GHC.StgToCmm.Prof (saveCurrentCostCentre, restoreCurrentCostCentre, emitSetCCC) +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Prim +import GHC.StgToCmm.Hpc +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Closure + +import StgSyn + +import MkGraph +import BlockId +import Cmm +import CmmInfo +import CoreSyn +import DataCon +import ForeignCall +import Id +import PrimOp +import TyCon +import Type ( isUnliftedType ) +import RepType ( isVoidTy, countConRepArgs, primRepSlot ) +import CostCentre ( CostCentreStack, currentCCS ) +import Maybes +import Util +import FastString +import Outputable + +import Control.Monad (unless,void) +import Control.Arrow (first) +import Data.Function ( on ) + +------------------------------------------------------------------------ +-- 
cgExpr: the main function +------------------------------------------------------------------------ + +cgExpr :: CgStgExpr -> FCode ReturnKind + +cgExpr (StgApp fun args) = cgIdApp fun args + +-- seq# a s ==> a +-- See Note [seq# magic] in PrelRules +cgExpr (StgOpApp (StgPrimOp SeqOp) [StgVarArg a, _] _res_ty) = + cgIdApp a [] + +-- dataToTag# :: a -> Int# +-- See Note [dataToTag#] in primops.txt.pp +cgExpr (StgOpApp (StgPrimOp DataToTagOp) [StgVarArg a] _res_ty) = do + dflags <- getDynFlags + emitComment (mkFastString "dataToTag#") + tmp <- newTemp (bWord dflags) + _ <- withSequel (AssignTo [tmp] False) (cgIdApp a []) + -- TODO: For small types look at the tag bits instead of reading info table + emitReturn [getConstrTag dflags (cmmUntag dflags (CmmReg (CmmLocal tmp)))] + +cgExpr (StgOpApp op args ty) = cgOpApp op args ty +cgExpr (StgConApp con args _)= cgConApp con args +cgExpr (StgTick t e) = cgTick t >> cgExpr e +cgExpr (StgLit lit) = do cmm_lit <- cgLit lit + emitReturn [CmmLit cmm_lit] + +cgExpr (StgLet _ binds expr) = do { cgBind binds; cgExpr expr } +cgExpr (StgLetNoEscape _ binds expr) = + do { u <- newUnique + ; let join_id = mkBlockId u + ; cgLneBinds join_id binds + ; r <- cgExpr expr + ; emitLabel join_id + ; return r } + +cgExpr (StgCase expr bndr alt_type alts) = + cgCase expr bndr alt_type alts + +cgExpr (StgLam {}) = panic "cgExpr: StgLam" + +------------------------------------------------------------------------ +-- Let no escape +------------------------------------------------------------------------ + +{- Generating code for a let-no-escape binding, aka join point is very +very similar to what we do for a case expression. The duality is +between + let-no-escape x = b + in e +and + case e of ... 
-> b + +That is, the RHS of 'x' (ie 'b') will execute *later*, just like +the alternative of the case; it needs to be compiled in an environment +in which all volatile bindings are forgotten, and the free vars are +bound only to stable things like stack locations.. The 'e' part will +execute *next*, just like the scrutinee of a case. -} + +------------------------- +cgLneBinds :: BlockId -> CgStgBinding -> FCode () +cgLneBinds join_id (StgNonRec bndr rhs) + = do { local_cc <- saveCurrentCostCentre + -- See Note [Saving the current cost centre] + ; (info, fcode) <- cgLetNoEscapeRhs join_id local_cc bndr rhs + ; fcode + ; addBindC info } + +cgLneBinds join_id (StgRec pairs) + = do { local_cc <- saveCurrentCostCentre + ; r <- sequence $ unzipWith (cgLetNoEscapeRhs join_id local_cc) pairs + ; let (infos, fcodes) = unzip r + ; addBindsC infos + ; sequence_ fcodes + } + +------------------------- +cgLetNoEscapeRhs + :: BlockId -- join point for successor of let-no-escape + -> Maybe LocalReg -- Saved cost centre + -> Id + -> CgStgRhs + -> FCode (CgIdInfo, FCode ()) + +cgLetNoEscapeRhs join_id local_cc bndr rhs = + do { (info, rhs_code) <- cgLetNoEscapeRhsBody local_cc bndr rhs + ; let (bid, _) = expectJust "cgLetNoEscapeRhs" $ maybeLetNoEscape info + ; let code = do { (_, body) <- getCodeScoped rhs_code + ; emitOutOfLine bid (first (<*> mkBranch join_id) body) } + ; return (info, code) + } + +cgLetNoEscapeRhsBody + :: Maybe LocalReg -- Saved cost centre + -> Id + -> CgStgRhs + -> FCode (CgIdInfo, FCode ()) +cgLetNoEscapeRhsBody local_cc bndr (StgRhsClosure _ cc _upd args body) + = cgLetNoEscapeClosure bndr local_cc cc (nonVoidIds args) body +cgLetNoEscapeRhsBody local_cc bndr (StgRhsCon cc con args) + = cgLetNoEscapeClosure bndr local_cc cc [] + (StgConApp con args (pprPanic "cgLetNoEscapeRhsBody" $ + text "StgRhsCon doesn't have type args")) + -- For a constructor RHS we want to generate a single chunk of + -- code which can be jumped to from many places, which will + -- 
return the constructor. It's easy; just behave as if it + -- was an StgRhsClosure with a ConApp inside! + +------------------------- +cgLetNoEscapeClosure + :: Id -- binder + -> Maybe LocalReg -- Slot for saved current cost centre + -> CostCentreStack -- XXX: *** NOT USED *** why not? + -> [NonVoid Id] -- Args (as in \ args -> body) + -> CgStgExpr -- Body (as in above) + -> FCode (CgIdInfo, FCode ()) + +cgLetNoEscapeClosure bndr cc_slot _unused_cc args body + = do dflags <- getDynFlags + return ( lneIdInfo dflags bndr args + , code ) + where + code = forkLneBody $ do { + ; withNewTickyCounterLNE (idName bndr) args $ do + ; restoreCurrentCostCentre cc_slot + ; arg_regs <- bindArgsToRegs args + ; void $ noEscapeHeapCheck arg_regs (tickyEnterLNE >> cgExpr body) } + + +------------------------------------------------------------------------ +-- Case expressions +------------------------------------------------------------------------ + +{- Note [Compiling case expressions] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +It is quite interesting to decide whether to put a heap-check at the +start of each alternative. Of course we certainly have to do so if +the case forces an evaluation, or if there is a primitive op which can +trigger GC. + +A more interesting situation is this (a Plan-B situation) + + !P!; + ...P... + case x# of + 0# -> !Q!; ...Q... + default -> !R!; ...R... + +where !x! indicates a possible heap-check point. The heap checks +in the alternatives *can* be omitted, in which case the topmost +heapcheck will take their worst case into account. + +In favour of omitting !Q!, !R!: + + - *May* save a heap overflow test, + if ...P... allocates anything. + + - We can use relative addressing from a single Hp to + get at all the closures so allocated. + + - No need to save volatile vars etc across heap checks + in !Q!, !R! + +Against omitting !Q!, !R! + + - May put a heap-check into the inner loop. Suppose + the main loop is P -> R -> P -> R... 
+ Q is the loop exit, and only it does allocation. + This only hurts us if P does no allocation. If P allocates, + then there is a heap check in the inner loop anyway. + + - May do more allocation than reqd. This sometimes bites us + badly. For example, nfib (ha!) allocates about 30\% more space if the + worst-casing is done, because many many calls to nfib are leaf calls + which don't need to allocate anything. + + We can un-allocate, but that costs an instruction + +Neither problem hurts us if there is only one alternative. + +Suppose the inner loop is P->R->P->R etc. Then here is +how many heap checks we get in the *inner loop* under various +conditions + + Alloc Heap check in branches (!Q!, !R!)? + P Q R yes no (absorb to !P!) +-------------------------------------- + n n n 0 0 + n y n 0 1 + n . y 1 1 + y . y 2 1 + y . n 1 1 + +Best choices: absorb heap checks from Q and R into !P! iff + a) P itself does some allocation +or + b) P does allocation, or there is exactly one alternative + +We adopt (b) because that is more likely to put the heap check at the +entry to a function, when not many things are live. After a bunch of +single-branch cases, we may have lots of things live + +Hence: two basic plans for + + case e of r { alts } + +------ Plan A: the general case --------- + + ...save current cost centre... + + ...code for e, + with sequel (SetLocals r) + + ...restore current cost centre... + ...code for alts... + ...alts do their own heap checks + +------ Plan B: special case when --------- + (i) e does not allocate or call GC + (ii) either upstream code performs allocation + or there is just one alternative + + Then heap allocation in the (single) case branch + is absorbed by the upstream check. + Very common example: primops on unboxed values + + ...code for e, + with sequel (SetLocals r)... + + ...code for alts... + ...no heap check... 
+-} + + + +------------------------------------- +data GcPlan + = GcInAlts -- Put a GC check at the start the case alternatives, + [LocalReg] -- which binds these registers + | NoGcInAlts -- The scrutinee is a primitive value, or a call to a + -- primitive op which does no GC. Absorb the allocation + -- of the case alternative(s) into the upstream check + +------------------------------------- +cgCase :: CgStgExpr -> Id -> AltType -> [CgStgAlt] -> FCode ReturnKind + +cgCase (StgOpApp (StgPrimOp op) args _) bndr (AlgAlt tycon) alts + | isEnumerationTyCon tycon -- Note [case on bool] + = do { tag_expr <- do_enum_primop op args + + -- If the binder is not dead, convert the tag to a constructor + -- and assign it. See Note [Dead-binder optimisation] + ; unless (isDeadBinder bndr) $ do + { dflags <- getDynFlags + ; tmp_reg <- bindArgToReg (NonVoid bndr) + ; emitAssign (CmmLocal tmp_reg) + (tagToClosure dflags tycon tag_expr) } + + ; (mb_deflt, branches) <- cgAlgAltRhss (NoGcInAlts,AssignedDirectly) + (NonVoid bndr) alts + -- See Note [GC for conditionals] + ; emitSwitch tag_expr branches mb_deflt 0 (tyConFamilySize tycon - 1) + ; return AssignedDirectly + } + where + do_enum_primop :: PrimOp -> [StgArg] -> FCode CmmExpr + do_enum_primop TagToEnumOp [arg] -- No code! + = getArgAmode (NonVoid arg) + do_enum_primop primop args + = do dflags <- getDynFlags + tmp <- newTemp (bWord dflags) + cgPrimOp [tmp] primop args + return (CmmReg (CmmLocal tmp)) + +{- +Note [case on bool] +~~~~~~~~~~~~~~~~~~~ +This special case handles code like + + case a <# b of + True -> + False -> + +--> case tagToEnum# (a <$# b) of + True -> .. ; False -> ... + +--> case (a <$# b) of r -> + case tagToEnum# r of + True -> .. ; False -> ... + +If we let the ordinary case code handle it, we'll get something like + + tmp1 = a < b + tmp2 = Bool_closure_tbl[tmp1] + if (tmp2 & 7 != 0) then ... // normal tagged case + +but this junk won't optimise away. 
What we really want is just an +inline comparison: + + if (a < b) then ... + +So we add a special case to generate + + tmp1 = a < b + if (tmp1 == 0) then ... + +and later optimisations will further improve this. + +Now that #6135 has been resolved it should be possible to remove that +special case. The idea behind this special case and pre-6135 implementation +of Bool-returning primops was that tagToEnum# was added implicitly in the +codegen and then optimized away. Now the call to tagToEnum# is explicit +in the source code, which allows to optimize it away at the earlier stages +of compilation (i.e. at the Core level). + +Note [Scrutinising VoidRep] +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Suppose we have this STG code: + f = \[s : State# RealWorld] -> + case s of _ -> blah +This is very odd. Why are we scrutinising a state token? But it +can arise with bizarre NOINLINE pragmas (#9964) + crash :: IO () + crash = IO (\s -> let {-# NOINLINE s' #-} + s' = s + in (# s', () #)) + +Now the trouble is that 's' has VoidRep, and we do not bind void +arguments in the environment; they don't live anywhere. See the +calls to nonVoidIds in various places. So we must not look up +'s' in the environment. Instead, just evaluate the RHS! Simple. + +Note [Dead-binder optimisation] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A case-binder, or data-constructor argument, may be marked as dead, +because we preserve occurrence-info on binders in CoreTidy (see +CoreTidy.tidyIdBndr). + +If the binder is dead, we can sometimes eliminate a load. While +CmmSink will eliminate that load, it's very easy to kill it at source +(giving CmmSink less work to do), and in any case CmmSink only runs +with -O. Since the majority of case binders are dead, this +optimisation probably still has a great benefit-cost ratio and we want +to keep it for -O0. See also Phab:D5358. 
+ +This probably also was the reason for occurrence hack in Phab:D5339 to +exist, perhaps because the occurrence information preserved by +'CoreTidy.tidyIdBndr' was insufficient. But now that CmmSink does the +job we deleted the hacks. +-} + +cgCase (StgApp v []) _ (PrimAlt _) alts + | isVoidRep (idPrimRep v) -- See Note [Scrutinising VoidRep] + , [(DEFAULT, _, rhs)] <- alts + = cgExpr rhs + +{- Note [Dodgy unsafeCoerce 1] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Consider + case (x :: HValue) |> co of (y :: MutVar# Int) + DEFAULT -> ... +We want to gnerate an assignment + y := x +We want to allow this assignment to be generated in the case when the +types are compatible, because this allows some slightly-dodgy but +occasionally-useful casts to be used, such as in RtClosureInspect +where we cast an HValue to a MutVar# so we can print out the contents +of the MutVar#. If instead we generate code that enters the HValue, +then we'll get a runtime panic, because the HValue really is a +MutVar#. The types are compatible though, so we can just generate an +assignment. +-} +cgCase (StgApp v []) bndr alt_type@(PrimAlt _) alts + | isUnliftedType (idType v) -- Note [Dodgy unsafeCoerce 1] + || reps_compatible + = -- assignment suffices for unlifted types + do { dflags <- getDynFlags + ; unless reps_compatible $ + pprPanic "cgCase: reps do not match, perhaps a dodgy unsafeCoerce?" + (pp_bndr v $$ pp_bndr bndr) + ; v_info <- getCgIdInfo v + ; emitAssign (CmmLocal (idToReg dflags (NonVoid bndr))) + (idInfoToAmode v_info) + -- Add bndr to the environment + ; _ <- bindArgToReg (NonVoid bndr) + ; cgAlts (NoGcInAlts,AssignedDirectly) (NonVoid bndr) alt_type alts } + where + reps_compatible = ((==) `on` (primRepSlot . idPrimRep)) v bndr + -- Must compare SlotTys, not proper PrimReps, because with unboxed sums, + -- the types of the binders are generated from slotPrimRep and might not + -- match. 
Test case: + -- swap :: (# Int | Int #) -> (# Int | Int #) + -- swap (# x | #) = (# | x #) + -- swap (# | y #) = (# y | #) + + pp_bndr id = ppr id <+> dcolon <+> ppr (idType id) <+> parens (ppr (idPrimRep id)) + +{- Note [Dodgy unsafeCoerce 2, #3132] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In all other cases of a lifted Id being cast to an unlifted type, the +Id should be bound to bottom, otherwise this is an unsafe use of +unsafeCoerce. We can generate code to enter the Id and assume that +it will never return. Hence, we emit the usual enter/return code, and +because bottom must be untagged, it will be entered. The Sequel is a +type-correct assignment, albeit bogus. The (dead) continuation loops; +it would be better to invoke some kind of panic function here. +-} +cgCase scrut@(StgApp v []) _ (PrimAlt _) _ + = do { dflags <- getDynFlags + ; mb_cc <- maybeSaveCostCentre True + ; _ <- withSequel + (AssignTo [idToReg dflags (NonVoid v)] False) (cgExpr scrut) + ; restoreCurrentCostCentre mb_cc + ; emitComment $ mkFastString "should be unreachable code" + ; l <- newBlockId + ; emitLabel l + ; emit (mkBranch l) -- an infinite loop + ; return AssignedDirectly + } + +{- Note [Handle seq#] +~~~~~~~~~~~~~~~~~~~~~ +See Note [seq# magic] in PrelRules. +The special case for seq# in cgCase does this: + + case seq# a s of v + (# s', a' #) -> e +==> + case a of v + (# s', a' #) -> e + +(taking advantage of the fact that the return convention for (# State#, a #) +is the same as the return convention for just 'a') +-} + +cgCase (StgOpApp (StgPrimOp SeqOp) [StgVarArg a, _] _) bndr alt_type alts + = -- Note [Handle seq#] + -- And see Note [seq# magic] in PrelRules + -- Use the same return convention as vanilla 'a'. 
+ cgCase (StgApp a []) bndr alt_type alts + +cgCase scrut bndr alt_type alts + = -- the general case + do { dflags <- getDynFlags + ; up_hp_usg <- getVirtHp -- Upstream heap usage + ; let ret_bndrs = chooseReturnBndrs bndr alt_type alts + alt_regs = map (idToReg dflags) ret_bndrs + ; simple_scrut <- isSimpleScrut scrut alt_type + ; let do_gc | is_cmp_op scrut = False -- See Note [GC for conditionals] + | not simple_scrut = True + | isSingleton alts = False + | up_hp_usg > 0 = False + | otherwise = True + -- cf Note [Compiling case expressions] + gc_plan = if do_gc then GcInAlts alt_regs else NoGcInAlts + + ; mb_cc <- maybeSaveCostCentre simple_scrut + + ; let sequel = AssignTo alt_regs do_gc{- Note [scrut sequel] -} + ; ret_kind <- withSequel sequel (cgExpr scrut) + ; restoreCurrentCostCentre mb_cc + ; _ <- bindArgsToRegs ret_bndrs + ; cgAlts (gc_plan,ret_kind) (NonVoid bndr) alt_type alts + } + where + is_cmp_op (StgOpApp (StgPrimOp op) _ _) = isComparisonPrimOp op + is_cmp_op _ = False + +{- Note [GC for conditionals] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For boolean conditionals it seems that we have always done NoGcInAlts. +That is, we have always done the GC check before the conditional. +This is enshrined in the special case for + case tagToEnum# (a>b) of ... +See Note [case on bool] + +It's odd, and it's flagrantly inconsistent with the rules described +Note [Compiling case expressions]. However, after eliminating the +tagToEnum# (#13397) we will have: + case (a>b) of ... +Rather than make it behave quite differently, I am testing for a +comparison operator here in in the general case as well. + +ToDo: figure out what the Right Rule should be. + +Note [scrut sequel] +~~~~~~~~~~~~~~~~~~~ +The job of the scrutinee is to assign its value(s) to alt_regs. +Additionally, if we plan to do a heap-check in the alternatives (see +Note [Compiling case expressions]), then we *must* retreat Hp to +recover any unused heap before passing control to the sequel. 
If we +don't do this, then any unused heap will become slop because the heap +check will reset the heap usage. Slop in the heap breaks LDV profiling +(+RTS -hb) which needs to do a linear sweep through the nursery. + + +Note [Inlining out-of-line primops and heap checks] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If shouldInlinePrimOp returns True when called from GHC.StgToCmm.Expr for the +purpose of heap check placement, we *must* inline the primop later in +GHC.StgToCmm.Prim. If we don't things will go wrong. +-} + +----------------- +maybeSaveCostCentre :: Bool -> FCode (Maybe LocalReg) +maybeSaveCostCentre simple_scrut + | simple_scrut = return Nothing + | otherwise = saveCurrentCostCentre + + +----------------- +isSimpleScrut :: CgStgExpr -> AltType -> FCode Bool +-- Simple scrutinee, does not block or allocate; hence safe to amalgamate +-- heap usage from alternatives into the stuff before the case +-- NB: if you get this wrong, and claim that the expression doesn't allocate +-- when it does, you'll deeply mess up allocation +isSimpleScrut (StgOpApp op args _) _ = isSimpleOp op args +isSimpleScrut (StgLit _) _ = return True -- case 1# of { 0# -> ..; ... } +isSimpleScrut (StgApp _ []) (PrimAlt _) = return True -- case x# of { 0# -> ..; ... } +isSimpleScrut _ _ = return False + +isSimpleOp :: StgOp -> [StgArg] -> FCode Bool +-- True iff the op cannot block or allocate +isSimpleOp (StgFCallOp (CCall (CCallSpec _ _ safe)) _) _ = return $! not (playSafe safe) +-- dataToTag# evalautes its argument, see Note [dataToTag#] in primops.txt.pp +isSimpleOp (StgPrimOp DataToTagOp) _ = return False +isSimpleOp (StgPrimOp op) stg_args = do + arg_exprs <- getNonVoidArgAmodes stg_args + dflags <- getDynFlags + -- See Note [Inlining out-of-line primops and heap checks] + return $! 
isJust $ shouldInlinePrimOp dflags op arg_exprs +isSimpleOp (StgPrimCallOp _) _ = return False + +----------------- +chooseReturnBndrs :: Id -> AltType -> [CgStgAlt] -> [NonVoid Id] +-- These are the binders of a case that are assigned by the evaluation of the +-- scrutinee. +-- They're non-void, see Note [Post-unarisation invariants] in UnariseStg. +chooseReturnBndrs bndr (PrimAlt _) _alts + = assertNonVoidIds [bndr] + +chooseReturnBndrs _bndr (MultiValAlt n) [(_, ids, _)] + = ASSERT2(ids `lengthIs` n, ppr n $$ ppr ids $$ ppr _bndr) + assertNonVoidIds ids -- 'bndr' is not assigned! + +chooseReturnBndrs bndr (AlgAlt _) _alts + = assertNonVoidIds [bndr] -- Only 'bndr' is assigned + +chooseReturnBndrs bndr PolyAlt _alts + = assertNonVoidIds [bndr] -- Only 'bndr' is assigned + +chooseReturnBndrs _ _ _ = panic "chooseReturnBndrs" + -- MultiValAlt has only one alternative + +------------------------------------- +cgAlts :: (GcPlan,ReturnKind) -> NonVoid Id -> AltType -> [CgStgAlt] + -> FCode ReturnKind +-- At this point the result of the case are in the binders +cgAlts gc_plan _bndr PolyAlt [(_, _, rhs)] + = maybeAltHeapCheck gc_plan (cgExpr rhs) + +cgAlts gc_plan _bndr (MultiValAlt _) [(_, _, rhs)] + = maybeAltHeapCheck gc_plan (cgExpr rhs) + -- Here bndrs are *already* in scope, so don't rebind them + +cgAlts gc_plan bndr (PrimAlt _) alts + = do { dflags <- getDynFlags + + ; tagged_cmms <- cgAltRhss gc_plan bndr alts + + ; let bndr_reg = CmmLocal (idToReg dflags bndr) + (DEFAULT,deflt) = head tagged_cmms + -- PrimAlts always have a DEFAULT case + -- and it always comes first + + tagged_cmms' = [(lit,code) + | (LitAlt lit, code) <- tagged_cmms] + ; emitCmmLitSwitch (CmmReg bndr_reg) tagged_cmms' deflt + ; return AssignedDirectly } + +cgAlts gc_plan bndr (AlgAlt tycon) alts + = do { dflags <- getDynFlags + + ; (mb_deflt, branches) <- cgAlgAltRhss gc_plan bndr alts + + ; let fam_sz = tyConFamilySize tycon + bndr_reg = CmmLocal (idToReg dflags bndr) + + -- Is the 
constructor tag in the node reg? + ; if isSmallFamily dflags fam_sz + then do + let -- Yes, bndr_reg has constr. tag in ls bits + tag_expr = cmmConstrTag1 dflags (CmmReg bndr_reg) + branches' = [(tag+1,branch) | (tag,branch) <- branches] + emitSwitch tag_expr branches' mb_deflt 1 fam_sz + + else -- No, get tag from info table + let -- Note that ptr _always_ has tag 1 + -- when the family size is big enough + untagged_ptr = cmmRegOffB bndr_reg (-1) + tag_expr = getConstrTag dflags (untagged_ptr) + in emitSwitch tag_expr branches mb_deflt 0 (fam_sz - 1) + + ; return AssignedDirectly } + +cgAlts _ _ _ _ = panic "cgAlts" + -- UbxTupAlt and PolyAlt have only one alternative + + +-- Note [alg-alt heap check] +-- +-- In an algebraic case with more than one alternative, we will have +-- code like +-- +-- L0: +-- x = R1 +-- goto L1 +-- L1: +-- if (x & 7 >= 2) then goto L2 else goto L3 +-- L2: +-- Hp = Hp + 16 +-- if (Hp > HpLim) then goto L4 +-- ... +-- L4: +-- call gc() returns to L5 +-- L5: +-- x = R1 +-- goto L1 + +------------------- +cgAlgAltRhss :: (GcPlan,ReturnKind) -> NonVoid Id -> [CgStgAlt] + -> FCode ( Maybe CmmAGraphScoped + , [(ConTagZ, CmmAGraphScoped)] ) +cgAlgAltRhss gc_plan bndr alts + = do { tagged_cmms <- cgAltRhss gc_plan bndr alts + + ; let { mb_deflt = case tagged_cmms of + ((DEFAULT,rhs) : _) -> Just rhs + _other -> Nothing + -- DEFAULT is always first, if present + + ; branches = [ (dataConTagZ con, cmm) + | (DataAlt con, cmm) <- tagged_cmms ] + } + + ; return (mb_deflt, branches) + } + + +------------------- +cgAltRhss :: (GcPlan,ReturnKind) -> NonVoid Id -> [CgStgAlt] + -> FCode [(AltCon, CmmAGraphScoped)] +cgAltRhss gc_plan bndr alts = do + dflags <- getDynFlags + let + base_reg = idToReg dflags bndr + cg_alt :: CgStgAlt -> FCode (AltCon, CmmAGraphScoped) + cg_alt (con, bndrs, rhs) + = getCodeScoped $ + maybeAltHeapCheck gc_plan $ + do { _ <- bindConArgs con base_reg (assertNonVoidIds bndrs) + -- alt binders are always non-void, + -- see Note 
-- (continuation of cgAltRhss.cg_alt from the previous chunk)
               -- [Post-unarisation invariants] in UnariseStg
           ; _ <- cgExpr rhs
           ; return con }
  forkAlts (map cg_alt alts)

-- | Wrap alternative code in whatever heap check the GcPlan demands.
maybeAltHeapCheck :: (GcPlan,ReturnKind) -> FCode a -> FCode a
maybeAltHeapCheck (NoGcInAlts,_)  code = code
maybeAltHeapCheck (GcInAlts regs, AssignedDirectly) code =
  altHeapCheck regs code
maybeAltHeapCheck (GcInAlts regs, ReturnedTo lret off) code =
  altHeapCheckReturnsTo regs lret off code

-----------------------------------------------------------------------------
--              Tail calls
-----------------------------------------------------------------------------

-- | Compile a saturated constructor application in a tail position.
cgConApp :: DataCon -> [StgArg] -> FCode ReturnKind
cgConApp con stg_args
  | isUnboxedTupleCon con       -- Unboxed tuple: assign and return
  = do { arg_exprs <- getNonVoidArgAmodes stg_args
       ; tickyUnboxedTupleReturn (length arg_exprs)
       ; emitReturn arg_exprs }

  | otherwise                   -- Boxed constructors; allocate and return
  = ASSERT2( stg_args `lengthIs` countConRepArgs con, ppr con <> parens (ppr (countConRepArgs con)) <+> ppr stg_args )
    do  { (idinfo, fcode_init) <- buildDynCon (dataConWorkId con) False
                                     currentCCS con (assertNonVoidStgArgs stg_args)
            -- con args are always non-void,
            -- see Note [Post-unarisation invariants] in UnariseStg
            -- The first "con" says that the name bound to this
            -- closure is "con", which is a bit of a fudge, but
            -- it only affects profiling (hence the False)

        ; emit =<< fcode_init
        ; tickyReturnNewCon (length stg_args)
        ; emitReturn [idInfoToAmode idinfo] }

-- | Compile an application of an identifier, dispatching on the call
-- method chosen by 'getCallMethod' (return, enter, slow call, direct
-- call, or a local jump for let-no-escape / self-recursive tail calls).
cgIdApp :: Id -> [StgArg] -> FCode ReturnKind
cgIdApp fun_id args = do
    dflags         <- getDynFlags
    fun_info       <- getCgIdInfo fun_id
    self_loop_info <- getSelfLoop
    let fun_arg  = StgVarArg fun_id
        fun_name = idName fun_id
        fun      = idInfoToAmode fun_info
        lf_info  = cg_lf fun_info
        n_args   = length args
        v_args   = length $ filter (isVoidTy . stgArgType) args
        node_points dflags = nodeMustPointToIt dflags lf_info
    case getCallMethod dflags fun_name fun_id lf_info n_args v_args (cg_loc fun_info) self_loop_info of
        -- A value in WHNF, so we can just return it.
        ReturnIt
          | isVoidTy (idType fun_id) -> emitReturn []
          | otherwise                -> emitReturn [fun]
          -- ToDo: does ReturnIt guarantee tagged?

        EnterIt -> ASSERT( null args )  -- Discarding arguments
                   emitEnter fun

        SlowCall -> do  -- A slow function call via the RTS apply routines
            { tickySlowCall lf_info args
            ; emitComment $ mkFastString "slowCall"
            ; slowCall fun args }

        -- A direct function call (possibly with some left-over arguments)
        DirectEntry lbl arity -> do
            { tickyDirectCall arity args
            ; if node_points dflags
                 then directCall NativeNodeCall   lbl arity (fun_arg:args)
                 else directCall NativeDirectCall lbl arity args }

        -- Let-no-escape call or self-recursive tail-call
        JumpToIt blk_id lne_regs -> do
            { adjustHpBackwards -- always do this before a tail-call
            ; cmm_args <- getNonVoidArgAmodes args
            ; emitMultiAssign lne_regs cmm_args
            ; emit (mkBranch blk_id)
            ; return AssignedDirectly }

-- Note [Self-recursive tail calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- Self-recursive tail calls can be optimized into a local jump in the same
-- way as let-no-escape bindings (see Note [What is a non-escaping let] in
-- stgSyn/CoreToStg.hs). Consider this:
--
-- foo.info:
--     a = R1  // calling convention
--     b = R2
--     goto L1
-- L1: ...
--     ...
--     ...
-- L2: R1 = x
--     R2 = y
--     call foo(R1,R2)
--
-- Instead of putting x and y into registers (or other locations required by
-- the calling convention) and performing a call we can put them into local
-- variables a and b and perform jump to L1:
--
-- foo.info:
--     a = R1
--     b = R2
--     goto L1
-- L1: ...
--     ...
--     ...
+-- L2: a = x +-- b = y +-- goto L1 +-- +-- This can be done only when function is calling itself in a tail position +-- and only if the call passes number of parameters equal to function's arity. +-- Note that this cannot be performed if a function calls itself with a +-- continuation. +-- +-- This in fact implements optimization known as "loopification". It was +-- described in "Low-level code optimizations in the Glasgow Haskell Compiler" +-- by Krzysztof WoÅ›, though we use different approach. Krzysztof performed his +-- optimization at the Cmm level, whereas we perform ours during code generation +-- (Stg-to-Cmm pass) essentially making sure that optimized Cmm code is +-- generated in the first place. +-- +-- Implementation is spread across a couple of places in the code: +-- +-- * FCode monad stores additional information in its reader environment +-- (cgd_self_loop field). This information tells us which function can +-- tail call itself in an optimized way (it is the function currently +-- being compiled), what is the label of a loop header (L1 in example above) +-- and information about local registers in which we should arguments +-- before making a call (this would be a and b in example above). +-- +-- * Whenever we are compiling a function, we set that information to reflect +-- the fact that function currently being compiled can be jumped to, instead +-- of called. This is done in closureCodyBody in GHC.StgToCmm.Bind. +-- +-- * We also have to emit a label to which we will be jumping. We make sure +-- that the label is placed after a stack check but before the heap +-- check. The reason is that making a recursive tail-call does not increase +-- the stack so we only need to check once. But it may grow the heap, so we +-- have to repeat the heap check in every self-call. This is done in +-- do_checks in GHC.StgToCmm.Heap. +-- +-- * When we begin compilation of another closure we remove the additional +-- information from the environment. 
This is done by forkClosureBody +-- in GHC.StgToCmm.Monad. Other functions that duplicate the environment - +-- forkLneBody, forkAlts, codeOnly - duplicate that information. In other +-- words, we only need to clean the environment of the self-loop information +-- when compiling right hand side of a closure (binding). +-- +-- * When compiling a call (cgIdApp) we use getCallMethod to decide what kind +-- of call will be generated. getCallMethod decides to generate a self +-- recursive tail call when (a) environment stores information about +-- possible self tail-call; (b) that tail call is to a function currently +-- being compiled; (c) number of passed non-void arguments is equal to +-- function's arity. (d) loopification is turned on via -floopification +-- command-line option. +-- +-- * Command line option to turn loopification on and off is implemented in +-- DynFlags. +-- +-- +-- Note [Void arguments in self-recursive tail calls] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- State# tokens can get in the way of the loopification optimization as seen in +-- #11372. Consider this: +-- +-- foo :: [a] +-- -> (a -> State# s -> (# State s, Bool #)) +-- -> State# s +-- -> (# State# s, Maybe a #) +-- foo [] f s = (# s, Nothing #) +-- foo (x:xs) f s = case f x s of +-- (# s', b #) -> case b of +-- True -> (# s', Just x #) +-- False -> foo xs f s' +-- +-- We would like to compile the call to foo as a local jump instead of a call +-- (see Note [Self-recursive tail calls]). However, the generated function has +-- an arity of 2 while we apply it to 3 arguments, one of them being of void +-- type. Thus, we mustn't count arguments of void type when checking whether +-- we can turn a call into a self-recursive jump. 
--

-- | Enter a closure.  When the sequel is a plain 'Return' we emit an
-- unconditional enter; when the result will be scrutinised (AssignTo)
-- we emit a tag test so that already-evaluated closures are not
-- entered, sharing a single proc-point between both paths.
emitEnter :: CmmExpr -> FCode ReturnKind
emitEnter fun = do
  { dflags    <- getDynFlags
  ; adjustHpBackwards
  ; sequel    <- getSequel
  ; updfr_off <- getUpdFrameOff
  ; case sequel of
      -- For a return, we have the option of generating a tag-test or
      -- not.  If the value is tagged, we can return directly, which
      -- is quicker than entering the value.  This is a code
      -- size/speed trade-off: when optimising for speed rather than
      -- size we could generate the tag test.
      --
      -- Right now, we do what the old codegen did, and omit the tag
      -- test, just generating an enter.
      Return -> do
        { let entry = entryCode dflags $ closureInfoPtr dflags $ CmmReg nodeReg
        ; emit $ mkJump dflags NativeNodeCall entry
                        [cmmUntag dflags fun] updfr_off
        ; return AssignedDirectly
        }

      -- The result will be scrutinised in the sequel.  This is where
      -- we generate a tag-test to avoid entering the closure if
      -- possible.
      --
      -- The generated code will be something like this:
      --
      --    R1 = fun  -- copyout
      --    if (fun & 7 != 0) goto Lret else goto Lcall
      --  Lcall:
      --    call [fun] returns to Lret
      --  Lret:
      --    fun' = R1  -- copyin
      --    ...
      --
      -- Note in particular that the label Lret is used as a
      -- destination by both the tag-test and the call.  This is
      -- because Lret will necessarily be a proc-point, and we want to
      -- ensure that we generate only one proc-point for this
      -- sequence.
      --
      -- Furthermore, we tell the caller that we generated a native
      -- return continuation by returning (ReturnedTo Lret off), so
      -- that the continuation can be reused by the heap-check failure
      -- code in the enclosing case expression.
      --
      AssignTo res_regs _ -> do
        { lret <- newBlockId
        ; let (off, _, copyin) = copyInOflow dflags NativeReturn (Young lret) res_regs []
        ; lcall <- newBlockId
        ; updfr_off <- getUpdFrameOff
        ; let area = Young lret
        ; let (outArgs, regs, copyout) = copyOutOflow dflags NativeNodeCall Call area
                                                      [fun] updfr_off []
          -- refer to fun via nodeReg after the copyout, to avoid having
          -- both live simultaneously; this sometimes enables fun to be
          -- inlined in the RHS of the R1 assignment.
        ; let entry    = entryCode dflags (closureInfoPtr dflags (CmmReg nodeReg))
              the_call = toCall entry (Just lret) updfr_off off outArgs regs
        ; tscope <- getTickScope
        ; emit $
            copyout <*>
            mkCbranch (cmmIsTagged dflags (CmmReg nodeReg))
                      lret lcall Nothing <*>
            outOfLine lcall (the_call,tscope) <*>
            mkLabel lret tscope <*>
            copyin
        ; return (ReturnedTo lret off)
        }
  }

------------------------------------------------------------------------
--              Ticks
------------------------------------------------------------------------

-- | Generate Cmm code for a tick.  Depending on the type of Tickish,
-- this will either generate actual Cmm instrumentation code, or
-- simply pass on the annotation as a @CmmTickish@.
cgTick :: Tickish Id -> FCode ()
cgTick tick
  = do { dflags <- getDynFlags
       ; case tick of
           ProfNote cc t p -> emitSetCCC cc t p
           HpcTick m n     -> emit (mkTickBox dflags m n)
           SourceNote s n  -> emitTick $ SourceNote s n
           _other          -> return () -- ignore
       }
{-# LANGUAGE DeriveFunctor #-}
-- | Our extended FCode monad.

-- We add a mapping from names to CmmExpr, to support local variable names in
-- the concrete C-- code.  The unique supply of the underlying FCode monad
-- is used to grab a new unique for each local variable.
-- In C--, a local variable can be declared anywhere within a proc,
-- and it scopes from the beginning of the proc to the end.  Hence, we have
-- to collect declarations as we parse the proc, and feed the environment
-- back in circularly (to avoid a two-pass algorithm).

module GHC.StgToCmm.ExtCode (
        CmmParse, unEC,
        Named(..), Env,

        loopDecls,
        getEnv,

        withName,
        getName,

        newLocal,
        newLabel,
        newBlockId,
        newFunctionName,
        newImport,
        lookupLabel,
        lookupName,

        code,
        emit, emitLabel, emitAssign, emitStore,
        getCode, getCodeR, getCodeScoped,
        emitOutOfLine,
        withUpdFrameOff, getUpdFrameOff
)

where

import GhcPrelude

import qualified GHC.StgToCmm.Monad as F
import GHC.StgToCmm.Monad (FCode, newUnique)

import Cmm
import CLabel
import MkGraph

import BlockId
import DynFlags
import FastString
import Module
import UniqFM
import Unique
import UniqSupply

import Control.Monad (ap)

-- | The environment contains variable definitions or blockids.
data Named
        = VarN CmmExpr          -- ^ Holds CmmLit(CmmLabel ..) which gives the label type,
                                --      eg, RtsLabel, ForeignLabel, CmmLabel etc.

        | FunN UnitId           -- ^ A function name from this package
        | LabelN BlockId        -- ^ A blockid of some code or data.

-- | An environment of named things.
type Env        = UniqFM Named

-- | Local declarations that are in scope during code generation.
type Decls      = [(FastString,Named)]

-- | Does a computation in the FCode monad, with a current environment
-- and a list of local declarations.  Returns the resulting list of
-- declarations.
newtype CmmParse a
        = EC { unEC :: String -> Env -> Decls -> FCode (Decls, a) }
        deriving (Functor)

type ExtCode = CmmParse ()

returnExtFC :: a -> CmmParse a
returnExtFC a   = EC $ \_ _ s -> return (s, a)

thenExtFC :: CmmParse a -> (a -> CmmParse b) -> CmmParse b
thenExtFC (EC m) k = EC $ \c e s -> do (s',r) <- m c e s; unEC (k r) c e s'

instance Applicative CmmParse where
      pure  = returnExtFC
      (<*>) = ap

instance Monad CmmParse where
  (>>=) = thenExtFC

instance MonadUnique CmmParse where
  getUniqueSupplyM = code getUniqueSupplyM
  getUniqueM = EC $ \_ _ decls -> do
    u <- getUniqueM
    return (decls, u)

instance HasDynFlags CmmParse where
    getDynFlags = EC (\_ _ d -> do dflags <- getDynFlags
                                   return (d, dflags))


-- | Takes the variable declarations and imports from the monad
-- and makes an environment, which is looped back into the computation.
-- In this way, we can have embedded declarations that scope over the whole
-- procedure, and imports that scope over the entire module.
-- Discards the local declaration contained within decl'
--
loopDecls :: CmmParse a -> CmmParse a
loopDecls (EC fcode) =
      EC $ \c e globalDecls -> do
        (_, a) <- F.fixC $ \ ~(decls, _) ->
          fcode c (addListToUFM e decls) globalDecls
        return (globalDecls, a)


-- | Get the current environment from the monad.
getEnv :: CmmParse Env
getEnv  = EC $ \_ e s -> return (s, e)

-- | Get the current context name from the monad
getName :: CmmParse String
getName  = EC $ \c _ s -> return (s, c)

-- | Set context name for a sub-parse
withName :: String -> CmmParse a -> CmmParse a
withName c' (EC fcode) = EC $ \_ e s -> fcode c' e s

-- | Add a named thing to the local declaration list.
addDecl :: FastString -> Named -> ExtCode
addDecl name named = EC $ \_ _ s -> return ((name, named) : s, ())


-- | Add a new variable to the list of local declarations.
--      The CmmExpr says where the value is stored.
addVarDecl :: FastString -> CmmExpr -> ExtCode
addVarDecl var expr = addDecl var (VarN expr)

-- | Add a new label to the list of local declarations.
addLabel :: FastString -> BlockId -> ExtCode
addLabel name block_id = addDecl name (LabelN block_id)


-- | Create a fresh local variable of a given type.
newLocal
        :: CmmType              -- ^ data type
        -> FastString           -- ^ name of variable
        -> CmmParse LocalReg    -- ^ register holding the value

newLocal ty name = do
   u <- code newUnique
   let reg = LocalReg u ty
   addVarDecl name (CmmReg (CmmLocal reg))
   return reg


-- | Allocate a fresh label.
newLabel :: FastString -> CmmParse BlockId
newLabel name = do
   u <- code newUnique
   let bid = mkBlockId u
   addLabel name bid
   return bid

-- | Add a local function to the environment.
newFunctionName
        :: FastString   -- ^ name of the function
        -> UnitId       -- ^ package of the current module
        -> ExtCode

newFunctionName name pkg = addDecl name (FunN pkg)


-- | Add an imported foreign label to the list of local declarations.
--      If this is done at the start of the module the declaration will scope
--      over the whole module.
newImport
        :: (FastString, CLabel)
        -> CmmParse ()

newImport (name, cmmLabel)
   = addVarDecl name (CmmLit (CmmLabel cmmLabel))


-- | Lookup the BlockId bound to the label with this name.
--      If one hasn't been bound yet, create a fresh one based on the
--      Unique of the name.
lookupLabel :: FastString -> CmmParse BlockId
lookupLabel name = do
  env <- getEnv
  return $
     case lookupUFM env name of
        Just (LabelN l) -> l
        _other          -> mkBlockId (newTagUnique (getUnique name) 'L')


-- | Lookup the location of a named variable.
--      Unknown names are treated as if they had been 'import'ed from the
--      runtime system.  This saves us a lot of bother in the RTS sources,
--      at the expense of deferring some errors to link time.
lookupName :: FastString -> CmmParse CmmExpr
lookupName name = do
  env <- getEnv
  return $
     case lookupUFM env name of
        Just (VarN e)   -> e
        Just (FunN pkg) -> CmmLit (CmmLabel (mkCmmCodeLabel pkg        name))
        _other          -> CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId name))


-- | Lift an FCode computation into the CmmParse monad
code :: FCode a -> CmmParse a
code fc = EC $ \_ _ s -> do
                r <- fc
                return (s, r)

emit :: CmmAGraph -> CmmParse ()
emit = code . F.emit

emitLabel :: BlockId -> CmmParse ()
emitLabel = code . F.emitLabel

emitAssign :: CmmReg -> CmmExpr -> CmmParse ()
emitAssign l r = code (F.emitAssign l r)

emitStore :: CmmExpr -> CmmExpr -> CmmParse ()
emitStore l r = code (F.emitStore l r)

getCode :: CmmParse a -> CmmParse CmmAGraph
getCode (EC ec) = EC $ \c e s -> do
                    ((s',_), gr) <- F.getCodeR (ec c e s)
                    return (s', gr)

getCodeR :: CmmParse a -> CmmParse (a, CmmAGraph)
getCodeR (EC ec) = EC $ \c e s -> do
                     ((s', r), gr) <- F.getCodeR (ec c e s)
                     return (s', (r,gr))

getCodeScoped :: CmmParse a -> CmmParse (a, CmmAGraphScoped)
getCodeScoped (EC ec) = EC $ \c e s -> do
                          ((s', r), gr) <- F.getCodeScoped (ec c e s)
                          return (s', (r,gr))

emitOutOfLine :: BlockId -> CmmAGraphScoped -> CmmParse ()
emitOutOfLine l g = code (F.emitOutOfLine l g)

withUpdFrameOff :: UpdFrameOffset -> CmmParse () -> CmmParse ()
withUpdFrameOff size inner
  = EC $ \c e s -> F.withUpdFrameOff size $ (unEC inner) c e s

getUpdFrameOff :: CmmParse UpdFrameOffset
getUpdFrameOff = code $ F.getUpdFrameOff
-----------------------------------------------------------------------------
--
-- Code generation for foreign calls.
--
-- (c) The University of Glasgow 2004-2006
--
-----------------------------------------------------------------------------

module GHC.StgToCmm.Foreign (
  cgForeignCall,
  emitPrimCall, emitCCall,
  emitForeignCall,      -- For CmmParse
  emitSaveThreadState,
  saveThreadState,
  emitLoadThreadState,
  loadThreadState,
  emitOpenNursery,
  emitCloseNursery,
 ) where

import GhcPrelude hiding( succ, (<*>) )

import StgSyn
import GHC.StgToCmm.Prof (storeCurCCS, ccsType)
import GHC.StgToCmm.Env
import GHC.StgToCmm.Monad
import GHC.StgToCmm.Utils
import GHC.StgToCmm.Closure
import GHC.StgToCmm.Layout

import BlockId (newBlockId)
import Cmm
import CmmUtils
import MkGraph
import Type
import RepType
import CLabel
import SMRep
import ForeignCall
import DynFlags
import Maybes
import Outputable
import UniqSupply
import BasicTypes

import TyCoRep
import TysPrim
import Util (zipEqual)

import Control.Monad

-----------------------------------------------------------------------------
-- Code generation for Foreign Calls
-----------------------------------------------------------------------------

-- | Emit code for a foreign call, and return the results to the sequel.
-- Precondition: the length of the arguments list is the same as the
-- arity of the foreign function.
cgForeignCall :: ForeignCall            -- the op
              -> Type                   -- type of foreign function
              -> [StgArg]               -- x,y arguments
              -> Type                   -- result type
              -> FCode ReturnKind

cgForeignCall (CCall (CCallSpec target cconv safety)) typ stg_args res_ty
  = do  { dflags <- getDynFlags
        ; let -- in the stdcall calling convention, the symbol needs @size
              -- appended to it, where size is the total number of bytes of
              -- arguments.  We attach this info to the CLabel here, and the
              -- CLabel pretty printer will generate the suffix when the
              -- label is printed.
              call_size args
                | StdCallConv <- cconv = Just (sum (map arg_size args))
                | otherwise            = Nothing

              -- ToDo: this might not be correct for 64-bit API
              arg_size (arg, _) = max (widthInBytes $ typeWidth $ cmmExprType dflags arg)
                                      (wORD_SIZE dflags)
        ; cmm_args <- getFCallArgs stg_args typ
        ; (res_regs, res_hints) <- newUnboxedTupleRegs res_ty
        ; let ((call_args, arg_hints), cmm_target)
                = case target of
                    StaticTarget _ _ _ False ->
                      panic "cgForeignCall: unexpected FFI value import"
                    StaticTarget _ lbl mPkgId True
                      -> let labelSource
                               = case mPkgId of
                                   Nothing    -> ForeignLabelInThisPackage
                                   Just pkgId -> ForeignLabelInPackage pkgId
                             size = call_size cmm_args
                         in ( unzip cmm_args
                            , CmmLit (CmmLabel
                                        (mkForeignLabel lbl size labelSource IsFunction)))

                    DynamicTarget -> case cmm_args of
                                       (fn,_):rest -> (unzip rest, fn)
                                       []          -> panic "cgForeignCall []"
              fc = ForeignConvention cconv arg_hints res_hints CmmMayReturn
              call_target = ForeignTarget cmm_target fc

        -- we want to emit code for the call, and then emitReturn.
        -- However, if the sequel is AssignTo, we shortcut a little
        -- and generate a foreign call that assigns the results
        -- directly.  Otherwise we end up generating a bunch of
        -- useless "r = r" assignments, which are not merely annoying:
        -- they prevent the common block elimination from working correctly
        -- in the case of a safe foreign call.
        -- See Note [safe foreign call convention]
        --
        ; sequel <- getSequel
        ; case sequel of
            AssignTo assign_to_these _ ->
              emitForeignCall safety assign_to_these call_target call_args

            _something_else ->
              do { _ <- emitForeignCall safety res_regs call_target call_args
                 ; emitReturn (map (CmmReg . CmmLocal) res_regs)
                 }
        }

{- Note [safe foreign call convention]

The simple thing to do for a safe foreign call would be the same as an
unsafe one: just

   emitForeignCall ...
   emitReturn ...

but consider what happens in this case

   case foo x y z of
     (# s, r #) -> ...

The sequel is AssignTo [r].  The call to newUnboxedTupleRegs picks [r]
as the result reg, and we generate

  r = foo(x,y,z) returns to L1  -- emitForeignCall
 L1:
  r = r  -- emitReturn
  goto L2
L2:
  ...

Now L1 is a proc point (by definition, it is the continuation of the
safe foreign call).  If L2 does a heap check, then L2 will also be a
proc point.

Furthermore, the stack layout algorithm has to arrange to save r
somewhere between the call and the jump to L1, which is annoying: we
would have to treat r differently from the other live variables, which
have to be saved *before* the call.

So we adopt a special convention for safe foreign calls: the results
are copied out according to the NativeReturn convention by the call,
and the continuation of the call should copyIn the results.  (The
copyOut code is actually inserted when the safe foreign call is
lowered later).  The result regs attached to the safe foreign call are
only used temporarily to hold the results before they are copied out.

We will now generate this:

  r = foo(x,y,z) returns to L1
 L1:
  r = R1  -- copyIn, inserted by mkSafeCall
  goto L2
 L2:
  ... r ...

And when the safe foreign call is lowered later (see Note [lower safe
foreign calls]) we get this:

  suspendThread()
  r = foo(x,y,z)
  resumeThread()
  R1 = r  -- copyOut, inserted by lowerSafeForeignCall
  jump L1
 L1:
  r = R1  -- copyIn, inserted by mkSafeCall
  goto L2
 L2:
  ... r ...

Now consider what happens if L2 does a heap check: the Adams
optimisation kicks in and commons up L1 with the heap-check
continuation, resulting in just one proc point instead of two. Yay!
-}


-- | Emit an unsafe C call with hinted arguments and results.
emitCCall :: [(CmmFormal,ForeignHint)]
          -> CmmExpr
          -> [(CmmActual,ForeignHint)]
          -> FCode ()
emitCCall hinted_results fn hinted_args
  = void $ emitForeignCall PlayRisky results target args
  where
    (args, arg_hints)       = unzip hinted_args
    (results, result_hints) = unzip hinted_results
    target = ForeignTarget fn fc
    fc = ForeignConvention CCallConv arg_hints result_hints CmmMayReturn


-- | Emit a call to a callish machine op.
emitPrimCall :: [CmmFormal] -> CallishMachOp -> [CmmActual] -> FCode ()
emitPrimCall res op args
  = void $ emitForeignCall PlayRisky res (PrimTarget op) args

-- alternative entry point, used by CmmParse
emitForeignCall
        :: Safety
        -> [CmmFormal]          -- where to put the results
        -> ForeignTarget        -- the op
        -> [CmmActual]          -- arguments
        -> FCode ReturnKind
emitForeignCall safety results target args
  | not (playSafe safety) = do
    dflags <- getDynFlags
    let (caller_save, caller_load) = callerSaveVolatileRegs dflags
    emit caller_save
    target' <- load_target_into_temp target
    args'   <- mapM maybe_assign_temp args
    emit $ mkUnsafeCall target' results args'
    emit caller_load
    return AssignedDirectly

  | otherwise = do
    dflags    <- getDynFlags
    updfr_off <- getUpdFrameOff
    target' <- load_target_into_temp target
    args'   <- mapM maybe_assign_temp args
    k <- newBlockId
    let (off, _, copyout) = copyInOflow dflags NativeReturn (Young k) results []
       -- see Note [safe foreign call convention]
    tscope <- getTickScope
    emit $
           (    mkStore (CmmStackSlot (Young k) (widthInBytes (wordWidth dflags)))
                        (CmmLit (CmmBlock k))
            <*> mkLast (CmmForeignCall { tgt      = target'
                                       , res      = results
                                       , args     = args'
                                       , succ     = k
                                       , ret_args = off
                                       , ret_off  = updfr_off
                                       , intrbl   = playInterruptible safety })
            <*> mkLabel k tscope
            <*> copyout
           )
    return (ReturnedTo k off)

-- | Lift the target expression of a foreign call into a temporary.
load_target_into_temp :: ForeignTarget -> FCode ForeignTarget
load_target_into_temp (ForeignTarget expr conv) = do
  tmp <- maybe_assign_temp expr
  return (ForeignTarget tmp conv)
load_target_into_temp other_target@(PrimTarget _) =
  return other_target

-- What we want to do here is create a new temporary for the foreign
-- call argument if it is not safe to use the expression directly,
-- because the expression mentions caller-saves GlobalRegs (see
-- Note [Register Parameter Passing]).
--
-- However, we can't pattern-match on the expression here, because
-- this is used in a loop by CmmParse, and testing the expression
-- results in a black hole.  So we always create a temporary, and rely
-- on CmmSink to clean it up later.  (Yuck, ToDo).  The generated code
-- ends up being the same, at least for the RTS .cmm code.
--
maybe_assign_temp :: CmmExpr -> FCode CmmExpr
maybe_assign_temp e = do
  dflags <- getDynFlags
  reg <- newTemp (cmmExprType dflags e)
  emitAssign (CmmLocal reg) e
  return (CmmReg (CmmLocal reg))

-- -----------------------------------------------------------------------------
-- Save/restore the thread state in the TSO

-- This stuff can't be done in suspendThread/resumeThread, because it
-- refers to global registers which aren't available in the C world.
-- | Emit code to save the current thread state.
emitSaveThreadState :: FCode ()
emitSaveThreadState = do
  dflags <- getDynFlags
  code <- saveThreadState dflags
  emit code

-- | Produce code to save the current thread state to @CurrentTSO@
saveThreadState :: MonadUnique m => DynFlags -> m CmmAGraph
saveThreadState dflags = do
  tso <- newTemp (gcWord dflags)
  close_nursery <- closeNursery dflags tso
  pure $ catAGraphs [
    -- tso = CurrentTSO;
    mkAssign (CmmLocal tso) currentTSOExpr,
    -- tso->stackobj->sp = Sp;
    mkStore (cmmOffset dflags
                       (CmmLoad (cmmOffset dflags
                                           (CmmReg (CmmLocal tso))
                                           (tso_stackobj dflags))
                                (bWord dflags))
                       (stack_SP dflags))
            spExpr,
    close_nursery,
    -- and save the current cost centre stack in the TSO when profiling:
    if gopt Opt_SccProfilingOn dflags then
        mkStore (cmmOffset dflags (CmmReg (CmmLocal tso)) (tso_CCCS dflags)) cccsExpr
      else mkNop
    ]

-- | Emit code to close the nursery, materialising @CurrentTSO@ into a
-- fresh temporary first.
emitCloseNursery :: FCode ()
emitCloseNursery = do
  dflags <- getDynFlags
  tso <- newTemp (bWord dflags)
  code <- closeNursery dflags tso
  emit $ mkAssign (CmmLocal tso) currentTSOExpr <*> code

{- |
@closeNursery dflags tso@ produces code to close the nursery.
A local register holding the value of @CurrentTSO@ is expected for
efficiency.

Closing the nursery corresponds to the following code:

@
  tso = CurrentTSO;
  cn = CurrentNursery;

  // Update the allocation limit for the current thread.  We don't
  // check to see whether it has overflowed at this point, that check is
  // made when we run out of space in the current heap block (stg_gc_noregs)
  // and in the scheduler when context switching (schedulePostRunThread).
  tso->alloc_limit -= Hp + WDS(1) - cn->start;

  // Set cn->free to the next unoccupied word in the block
  cn->free = Hp + WDS(1);
@
-}
closeNursery :: MonadUnique m => DynFlags -> LocalReg -> m CmmAGraph
closeNursery df tso = do
  let tsoreg = CmmLocal tso
  cnreg      <- CmmLocal <$> newTemp (bWord df)
  pure $ catAGraphs [
    mkAssign cnreg currentNurseryExpr,

    -- CurrentNursery->free = Hp+1;
    mkStore (nursery_bdescr_free df cnreg) (cmmOffsetW df hpExpr 1),

    let alloc =
           CmmMachOp (mo_wordSub df)
              [ cmmOffsetW df hpExpr 1
              , CmmLoad (nursery_bdescr_start df cnreg) (bWord df)
              ]

        alloc_limit = cmmOffset df (CmmReg tsoreg) (tso_alloc_limit df)
    in

    -- tso->alloc_limit -= alloc
    -- NB: the generated op really is a subtraction (MO_Sub); the
    -- occupied portion of the nursery is subtracted here and added
    -- back in openNursery.
    mkStore alloc_limit (CmmMachOp (MO_Sub W64)
                           [ CmmLoad alloc_limit b64
                           , CmmMachOp (mo_WordTo64 df) [alloc] ])
   ]

-- | Emit code to load the thread state, materialising @CurrentTSO@.
emitLoadThreadState :: FCode ()
emitLoadThreadState = do
  dflags <- getDynFlags
  code <- loadThreadState dflags
  emit code

-- | Produce code to load the current thread state from @CurrentTSO@
loadThreadState :: MonadUnique m => DynFlags -> m CmmAGraph
loadThreadState dflags = do
  tso <- newTemp (gcWord dflags)
  stack <- newTemp (gcWord dflags)
  open_nursery <- openNursery dflags tso
  pure $ catAGraphs [
    -- tso = CurrentTSO;
    mkAssign (CmmLocal tso) currentTSOExpr,
    -- stack = tso->stackobj;
    mkAssign (CmmLocal stack) (CmmLoad (cmmOffset dflags (CmmReg (CmmLocal tso)) (tso_stackobj dflags)) (bWord dflags)),
    -- Sp = stack->sp;
    mkAssign spReg (CmmLoad (cmmOffset dflags (CmmReg (CmmLocal stack)) (stack_SP dflags)) (bWord dflags)),
    -- SpLim = stack->stack + RESERVED_STACK_WORDS;
    mkAssign spLimReg (cmmOffsetW dflags (cmmOffset dflags (CmmReg (CmmLocal stack)) (stack_STACK dflags))
                                (rESERVED_STACK_WORDS dflags)),
    -- HpAlloc = 0;
    --   HpAlloc is assumed to be set to non-zero only by a failed
    --   a heap check, see HeapStackCheck.cmm:GC_GENERIC
    mkAssign hpAllocReg (zeroExpr dflags),
    open_nursery,
    -- and load the current cost centre stack from the TSO when profiling:
    if gopt Opt_SccProfilingOn dflags
      then storeCurCCS
             (CmmLoad (cmmOffset dflags (CmmReg (CmmLocal tso))
                                 (tso_CCCS dflags)) (ccsType dflags))
      else mkNop
    ]


-- | Emit code to open the nursery, materialising @CurrentTSO@.
emitOpenNursery :: FCode ()
emitOpenNursery = do
  dflags <- getDynFlags
  tso <- newTemp (bWord dflags)
  code <- openNursery dflags tso
  emit $ mkAssign (CmmLocal tso) currentTSOExpr <*> code

{- |
@openNursery dflags tso@ produces code to open the nursery. A local register
holding the value of @CurrentTSO@ is expected for efficiency.

Opening the nursery corresponds to the following code:

@
   tso = CurrentTSO;
   cn = CurrentNursery;
   bdfree = CurrentNursery->free;
   bdstart = CurrentNursery->start;

   // We *add* the currently occupied portion of the nursery block to
   // the allocation limit, because we will subtract it again in
   // closeNursery.
   tso->alloc_limit += bdfree - bdstart;

   // Set Hp to the last occupied word of the heap block.  Why not the
   // next unoccupied word?  Doing it this way means that we get to use
   // an offset of zero more often, which might lead to slightly smaller
   // code on some architectures.
   Hp = bdfree - WDS(1);

   // Set HpLim to the end of the current nursery block (note that this block
   // might be a block group, consisting of several adjacent blocks.
   HpLim = bdstart + CurrentNursery->blocks*BLOCK_SIZE_W - 1;
@
-}
openNursery :: MonadUnique m => DynFlags -> LocalReg -> m CmmAGraph
openNursery df tso = do
  let tsoreg = CmmLocal tso
  cnreg      <- CmmLocal <$> newTemp (bWord df)
  bdfreereg  <- CmmLocal <$> newTemp (bWord df)
  bdstartreg <- CmmLocal <$> newTemp (bWord df)

  -- These assignments are carefully ordered to reduce register
  -- pressure and generate not completely awful code on x86.  To see
  -- what code we generate, look at the assembly for
  -- stg_returnToStackTop in rts/StgStartup.cmm.
  pure $ catAGraphs [
     mkAssign cnreg currentNurseryExpr,
     mkAssign bdfreereg (CmmLoad (nursery_bdescr_free df cnreg) (bWord df)),

     -- Hp = CurrentNursery->free - 1;
     mkAssign hpReg (cmmOffsetW df (CmmReg bdfreereg) (-1)),

     mkAssign bdstartreg (CmmLoad (nursery_bdescr_start df cnreg) (bWord df)),

     -- HpLim = CurrentNursery->start +
     --          CurrentNursery->blocks*BLOCK_SIZE_W - 1;
     mkAssign hpLimReg
         (cmmOffsetExpr df
             (CmmReg bdstartreg)
             (cmmOffset df
               (CmmMachOp (mo_wordMul df) [
                 CmmMachOp (MO_SS_Conv W32 (wordWidth df))
                   [CmmLoad (nursery_bdescr_blocks df cnreg) b32],
                 mkIntExpr df (bLOCK_SIZE df)
                ])
               (-1)
             )
         ),

     -- alloc = bd->free - bd->start
     let alloc =
           CmmMachOp (mo_wordSub df) [CmmReg bdfreereg, CmmReg bdstartreg]

         alloc_limit = cmmOffset df (CmmReg tsoreg) (tso_alloc_limit df)
     in

     -- tso->alloc_limit += alloc
     mkStore alloc_limit (CmmMachOp (MO_Add W64)
                            [ CmmLoad alloc_limit b64
                            , CmmMachOp (mo_WordTo64 df) [alloc] ])

   ]

-- Accessors for the fields of the nursery's block descriptor.
nursery_bdescr_free, nursery_bdescr_start, nursery_bdescr_blocks
  :: DynFlags -> CmmReg -> CmmExpr
nursery_bdescr_free dflags cn =
  cmmOffset dflags (CmmReg cn) (oFFSET_bdescr_free dflags)
nursery_bdescr_start dflags cn =
  cmmOffset dflags (CmmReg cn) (oFFSET_bdescr_start dflags)
nursery_bdescr_blocks dflags cn =
  cmmOffset dflags (CmmReg cn) (oFFSET_bdescr_blocks dflags)

-- Byte offsets of TSO / stack fields, adjusted for the closure header.
tso_stackobj, tso_CCCS, tso_alloc_limit, stack_STACK, stack_SP :: DynFlags -> ByteOff
tso_stackobj dflags    = closureField dflags (oFFSET_StgTSO_stackobj dflags)
tso_alloc_limit dflags = closureField dflags (oFFSET_StgTSO_alloc_limit dflags)
tso_CCCS dflags        = closureField dflags (oFFSET_StgTSO_cccs dflags)
stack_STACK dflags     = closureField dflags (oFFSET_StgStack_stack dflags)
stack_SP dflags        = closureField dflags (oFFSET_StgStack_sp dflags)


-- | Add the fixed closure-header size to a field offset.
closureField :: DynFlags -> ByteOff -> ByteOff
closureField dflags off = off + fixedHdrSize dflags

-- Note [Unlifted boxed arguments to foreign calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- For certain types passed to foreign calls, we adjust the actual
-- value passed to the call.  For ByteArray#, Array#, SmallArray#,
-- and ArrayArray#, we pass the address of the array's payload, not
-- the address of the heap object.  For example, consider
--   foreign import "c_foo" foo :: ByteArray# -> Int# -> IO ()
-- At a Haskell call like `foo x y`, we'll generate a C call that
-- is more like
--   c_foo( x+8, y )
-- where the "+8" takes the heap pointer (x :: ByteArray#) and moves
-- it past the header words of the ByteArray object to point directly
-- to the data inside the ByteArray#.  (The exact offset depends
-- on the target architecture and on profiling)  By contrast, (y :: Int#)
-- requires no such adjustment.
--
-- This adjustment is performed by 'add_shim'.  The size of the
-- adjustment depends on the type of heap object.  But
-- how can we determine that type?  There are two available options.
-- We could use the types of the actual values that the foreign call
-- has been applied to, or we could use the types present in the
-- foreign function's type.  Prior to GHC 8.10, we used the former
-- strategy since it's a little more simple.  However, in issue #16650
-- and more compellingly in the comments of
-- https://gitlab.haskell.org/ghc/ghc/merge_requests/939, it was
-- demonstrated that this leads to bad behavior in the presence
-- of unsafeCoerce#.  Returning to the above example, suppose the
-- Haskell call looked like
--   foo (unsafeCoerce# p)
-- where the types of expressions comprising the arguments are
--   p :: (Any :: TYPE 'UnliftedRep)
--   i :: Int#
-- so that the unsafe-coerce is between Any and ByteArray#.
-- These two types have the same kind (they are both represented by
-- a heap pointer) so no GC errors will occur if we do this unsafe coerce.
+-- By the time this gets to the code generator the cast has been +-- discarded so we have +-- foo p y +-- But we *must* adjust the pointer to p by a ByteArray# shim, +-- *not* by an Any shim (the Any shim involves no offset at all). +-- +-- To avoid this bad behavior, we adopt the second strategy: use +-- the types present in the foreign function's type. +-- In collectStgFArgTypes, we convert the foreign function's +-- type to a list of StgFArgType. Then, in add_shim, we interpret +-- these as numeric offsets. + +getFCallArgs :: + [StgArg] + -> Type -- the type of the foreign function + -> FCode [(CmmExpr, ForeignHint)] +-- (a) Drop void args +-- (b) Add foreign-call shim code +-- It's (b) that makes this differ from getNonVoidArgAmodes +-- Precondition: args and typs have the same length +-- See Note [Unlifted boxed arguments to foreign calls] +getFCallArgs args typ + = do { mb_cmms <- mapM get (zipEqual "getFCallArgs" args (collectStgFArgTypes typ)) + ; return (catMaybes mb_cmms) } + where + get (arg,typ) + | null arg_reps + = return Nothing + | otherwise + = do { cmm <- getArgAmode (NonVoid arg) + ; dflags <- getDynFlags + ; return (Just (add_shim dflags typ cmm, hint)) } + where + arg_ty = stgArgType arg + arg_reps = typePrimRep arg_ty + hint = typeForeignHint arg_ty + +-- The minimum amount of information needed to determine +-- the offset to apply to an argument to a foreign call. 
+-- See Note [Unlifted boxed arguments to foreign calls] +data StgFArgType + = StgPlainType + | StgArrayType + | StgSmallArrayType + | StgByteArrayType + +-- See Note [Unlifted boxed arguments to foreign calls] +add_shim :: DynFlags -> StgFArgType -> CmmExpr -> CmmExpr +add_shim dflags ty expr = case ty of + StgPlainType -> expr + StgArrayType -> cmmOffsetB dflags expr (arrPtrsHdrSize dflags) + StgSmallArrayType -> cmmOffsetB dflags expr (smallArrPtrsHdrSize dflags) + StgByteArrayType -> cmmOffsetB dflags expr (arrWordsHdrSize dflags) + +-- From a function, extract information needed to determine +-- the offset of each argument when used as a C FFI argument. +-- See Note [Unlifted boxed arguments to foreign calls] +collectStgFArgTypes :: Type -> [StgFArgType] +collectStgFArgTypes = go [] + where + -- Skip foralls + go bs (ForAllTy _ res) = go bs res + go bs (AppTy{}) = reverse bs + go bs (TyConApp{}) = reverse bs + go bs (LitTy{}) = reverse bs + go bs (TyVarTy{}) = reverse bs + go _ (CastTy{}) = panic "myCollectTypeArgs: CastTy" + go _ (CoercionTy{}) = panic "myCollectTypeArgs: CoercionTy" + go bs (FunTy {ft_arg = arg, ft_res=res}) = + go (typeToStgFArgType arg:bs) res + +-- Choose the offset based on the type. For anything other +-- than an unlifted boxed type, there is no offset. +-- See Note [Unlifted boxed arguments to foreign calls] +typeToStgFArgType :: Type -> StgFArgType +typeToStgFArgType typ + | tycon == arrayPrimTyCon = StgArrayType + | tycon == mutableArrayPrimTyCon = StgArrayType + | tycon == arrayArrayPrimTyCon = StgArrayType + | tycon == mutableArrayArrayPrimTyCon = StgArrayType + | tycon == smallArrayPrimTyCon = StgSmallArrayType + | tycon == smallMutableArrayPrimTyCon = StgSmallArrayType + | tycon == byteArrayPrimTyCon = StgByteArrayType + | tycon == mutableByteArrayPrimTyCon = StgByteArrayType + | otherwise = StgPlainType + where + -- Should be a tycon app, since this is a foreign call. 
We look + -- through newtypes so the offset does not change if a user replaces + -- a type in a foreign function signature with a representationally + -- equivalent newtype. + tycon = tyConAppTyCon (unwrapType typ) + diff --git a/compiler/GHC/StgToCmm/Heap.hs b/compiler/GHC/StgToCmm/Heap.hs new file mode 100644 index 0000000000..a1f016c13c --- /dev/null +++ b/compiler/GHC/StgToCmm/Heap.hs @@ -0,0 +1,680 @@ +----------------------------------------------------------------------------- +-- +-- Stg to C--: heap management functions +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Heap ( + getVirtHp, setVirtHp, setRealHp, + getHpRelOffset, + + entryHeapCheck, altHeapCheck, noEscapeHeapCheck, altHeapCheckReturnsTo, + heapStackCheckGen, + entryHeapCheck', + + mkStaticClosureFields, mkStaticClosure, + + allocDynClosure, allocDynClosureCmm, allocHeapClosure, + emitSetDynHdr + ) where + +import GhcPrelude hiding ((<*>)) + +import StgSyn +import CLabel +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Prof (profDynAlloc, dynProfHdr, staticProfHdr) +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Env + +import MkGraph + +import Hoopl.Label +import SMRep +import BlockId +import Cmm +import CmmUtils +import CostCentre +import IdInfo( CafInfo(..), mayHaveCafRefs ) +import Id ( Id ) +import Module +import DynFlags +import FastString( mkFastString, fsLit ) +import Panic( sorry ) + +import Control.Monad (when) +import Data.Maybe (isJust) + +----------------------------------------------------------- +-- Initialise dynamic heap objects +----------------------------------------------------------- + +allocDynClosure + :: Maybe Id + -> CmmInfoTable + -> LambdaFormInfo + -> CmmExpr -- Cost Centre to stick in the object + -> CmmExpr -- Cost Centre to blame for this alloc + -- (usually the same; sometimes 
"OVERHEAD") + + -> [(NonVoid StgArg, VirtualHpOffset)] -- Offsets from start of object + -- ie Info ptr has offset zero. + -- No void args in here + -> FCode CmmExpr -- returns Hp+n + +allocDynClosureCmm + :: Maybe Id -> CmmInfoTable -> LambdaFormInfo -> CmmExpr -> CmmExpr + -> [(CmmExpr, ByteOff)] + -> FCode CmmExpr -- returns Hp+n + +-- allocDynClosure allocates the thing in the heap, +-- and modifies the virtual Hp to account for this. +-- The second return value is the graph that sets the value of the +-- returned LocalReg, which should point to the closure after executing +-- the graph. + +-- allocDynClosure returns an (Hp+8) CmmExpr, and hence the result is +-- only valid until Hp is changed. The caller should assign the +-- result to a LocalReg if it is required to remain live. +-- +-- The reason we don't assign it to a LocalReg here is that the caller +-- is often about to call regIdInfo, which immediately assigns the +-- result of allocDynClosure to a new temp in order to add the tag. +-- So by not generating a LocalReg here we avoid a common source of +-- new temporaries and save some compile time. This can be quite +-- significant - see test T4801. + + +allocDynClosure mb_id info_tbl lf_info use_cc _blame_cc args_w_offsets = do + let (args, offsets) = unzip args_w_offsets + cmm_args <- mapM getArgAmode args -- No void args + allocDynClosureCmm mb_id info_tbl lf_info + use_cc _blame_cc (zip cmm_args offsets) + + +allocDynClosureCmm mb_id info_tbl lf_info use_cc _blame_cc amodes_w_offsets = do + -- SAY WHAT WE ARE ABOUT TO DO + let rep = cit_rep info_tbl + tickyDynAlloc mb_id rep lf_info + let info_ptr = CmmLit (CmmLabel (cit_lbl info_tbl)) + allocHeapClosure rep info_ptr use_cc amodes_w_offsets + + +-- | Low-level heap object allocation. 
+allocHeapClosure + :: SMRep -- ^ representation of the object + -> CmmExpr -- ^ info pointer + -> CmmExpr -- ^ cost centre + -> [(CmmExpr,ByteOff)] -- ^ payload + -> FCode CmmExpr -- ^ returns the address of the object +allocHeapClosure rep info_ptr use_cc payload = do + profDynAlloc rep use_cc + + virt_hp <- getVirtHp + + -- Find the offset of the info-ptr word + let info_offset = virt_hp + 1 + -- info_offset is the VirtualHpOffset of the first + -- word of the new object + -- Remember, virtHp points to last allocated word, + -- ie 1 *before* the info-ptr word of new object. + + base <- getHpRelOffset info_offset + emitComment $ mkFastString "allocHeapClosure" + emitSetDynHdr base info_ptr use_cc + + -- Fill in the fields + hpStore base payload + + -- Bump the virtual heap pointer + dflags <- getDynFlags + setVirtHp (virt_hp + heapClosureSizeW dflags rep) + + return base + + +emitSetDynHdr :: CmmExpr -> CmmExpr -> CmmExpr -> FCode () +emitSetDynHdr base info_ptr ccs + = do dflags <- getDynFlags + hpStore base (zip (header dflags) [0, wORD_SIZE dflags ..]) + where + header :: DynFlags -> [CmmExpr] + header dflags = [info_ptr] ++ dynProfHdr dflags ccs + -- ToDo: Parallel stuff + -- No ticky header + +-- Store the item (expr,off) in base[off] +hpStore :: CmmExpr -> [(CmmExpr, ByteOff)] -> FCode () +hpStore base vals = do + dflags <- getDynFlags + sequence_ $ + [ emitStore (cmmOffsetB dflags base off) val | (val,off) <- vals ] + +----------------------------------------------------------- +-- Layout of static closures +----------------------------------------------------------- + +-- Make a static closure, adding on any extra padding needed for CAFs, +-- and adding a static link field if necessary. 
+ +mkStaticClosureFields + :: DynFlags + -> CmmInfoTable + -> CostCentreStack + -> CafInfo + -> [CmmLit] -- Payload + -> [CmmLit] -- The full closure +mkStaticClosureFields dflags info_tbl ccs caf_refs payload + = mkStaticClosure dflags info_lbl ccs payload padding + static_link_field saved_info_field + where + info_lbl = cit_lbl info_tbl + + -- CAFs must have consistent layout, regardless of whether they + -- are actually updatable or not. The layout of a CAF is: + -- + -- 3 saved_info + -- 2 static_link + -- 1 indirectee + -- 0 info ptr + -- + -- the static_link and saved_info fields must always be in the + -- same place. So we use isThunkRep rather than closureUpdReqd + -- here: + + is_caf = isThunkRep (cit_rep info_tbl) + + padding + | is_caf && null payload = [mkIntCLit dflags 0] + | otherwise = [] + + static_link_field + | is_caf || staticClosureNeedsLink (mayHaveCafRefs caf_refs) info_tbl + = [static_link_value] + | otherwise + = [] + + saved_info_field + | is_caf = [mkIntCLit dflags 0] + | otherwise = [] + + -- For a static constructor which has NoCafRefs, we set the + -- static link field to a non-zero value so the garbage + -- collector will ignore it. + static_link_value + | mayHaveCafRefs caf_refs = mkIntCLit dflags 0 + | otherwise = mkIntCLit dflags 3 -- No CAF refs + -- See Note [STATIC_LINK fields] + -- in rts/sm/Storage.h + +mkStaticClosure :: DynFlags -> CLabel -> CostCentreStack -> [CmmLit] + -> [CmmLit] -> [CmmLit] -> [CmmLit] -> [CmmLit] +mkStaticClosure dflags info_lbl ccs payload padding static_link_field saved_info_field + = [CmmLabel info_lbl] + ++ staticProfHdr dflags ccs + ++ payload + ++ padding + ++ static_link_field + ++ saved_info_field + +----------------------------------------------------------- +-- Heap overflow checking +----------------------------------------------------------- + +{- Note [Heap checks] + ~~~~~~~~~~~~~~~~~~ +Heap checks come in various forms. 
We provide the following entry +points to the runtime system, all of which use the native C-- entry +convention. + + * gc() performs garbage collection and returns + nothing to its caller + + * A series of canned entry points like + r = gc_1p( r ) + where r is a pointer. This performs gc, and + then returns its argument r to its caller. + + * A series of canned entry points like + gcfun_2p( f, x, y ) + where f is a function closure of arity 2 + This performs garbage collection, keeping alive the + three argument ptrs, and then tail-calls f(x,y) + +These are used in the following circumstances + +* entryHeapCheck: Function entry + (a) With a canned GC entry sequence + f( f_clo, x:ptr, y:ptr ) { + Hp = Hp+8 + if Hp > HpLim goto L + ... + L: HpAlloc = 8 + jump gcfun_2p( f_clo, x, y ) } + Note the tail call to the garbage collector; + it should do no register shuffling + + (b) No canned sequence + f( f_clo, x:ptr, y:ptr, ...etc... ) { + T: Hp = Hp+8 + if Hp > HpLim goto L + ... + L: HpAlloc = 8 + call gc() -- Needs an info table + goto T } + +* altHeapCheck: Immediately following an eval + Started as + case f x y of r { (p,q) -> rhs } + (a) With a canned sequence for the results of f + (which is the very common case since + all boxed cases return just one pointer + ... + r = f( x, y ) + K: -- K needs an info table + Hp = Hp+8 + if Hp > HpLim goto L + ...code for rhs... + + L: r = gc_1p( r ) + goto K } + + Here, the info table needed by the call + to gc_1p should be the *same* as the + one for the call to f; the C-- optimiser + spots this sharing opportunity) + + (b) No canned sequence for results of f + Note second info table + ... + (r1,r2,r3) = call f( x, y ) + K: + Hp = Hp+8 + if Hp > HpLim goto L + ...code for rhs... + + L: call gc() -- Extra info table here + goto K + +* generalHeapCheck: Anywhere else + e.g. 
entry to thunk + case branch *not* following eval, + or let-no-escape + Exactly the same as the previous case: + + K: -- K needs an info table + Hp = Hp+8 + if Hp > HpLim goto L + ... + + L: call gc() + goto K +-} + +-------------------------------------------------------------- +-- A heap/stack check at a function or thunk entry point. + +entryHeapCheck :: ClosureInfo + -> Maybe LocalReg -- Function (closure environment) + -> Int -- Arity -- not same as len args b/c of voids + -> [LocalReg] -- Non-void args (empty for thunk) + -> FCode () + -> FCode () + +entryHeapCheck cl_info nodeSet arity args code + = entryHeapCheck' is_fastf node arity args code + where + node = case nodeSet of + Just r -> CmmReg (CmmLocal r) + Nothing -> CmmLit (CmmLabel $ staticClosureLabel cl_info) + + is_fastf = case closureFunInfo cl_info of + Just (_, ArgGen _) -> False + _otherwise -> True + +-- | lower-level version for CmmParse +entryHeapCheck' :: Bool -- is a known function pattern + -> CmmExpr -- expression for the closure pointer + -> Int -- Arity -- not same as len args b/c of voids + -> [LocalReg] -- Non-void args (empty for thunk) + -> FCode () + -> FCode () +entryHeapCheck' is_fastf node arity args code + = do dflags <- getDynFlags + let is_thunk = arity == 0 + + args' = map (CmmReg . 
CmmLocal) args + stg_gc_fun = CmmReg (CmmGlobal GCFun) + stg_gc_enter1 = CmmReg (CmmGlobal GCEnter1) + + {- Thunks: jump stg_gc_enter_1 + + Function (fast): call (NativeNode) stg_gc_fun(fun, args) + + Function (slow): call (slow) stg_gc_fun(fun, args) + -} + gc_call upd + | is_thunk + = mkJump dflags NativeNodeCall stg_gc_enter1 [node] upd + + | is_fastf + = mkJump dflags NativeNodeCall stg_gc_fun (node : args') upd + + | otherwise + = mkJump dflags Slow stg_gc_fun (node : args') upd + + updfr_sz <- getUpdFrameOff + + loop_id <- newBlockId + emitLabel loop_id + heapCheck True True (gc_call updfr_sz <*> mkBranch loop_id) code + +-- ------------------------------------------------------------ +-- A heap/stack check in a case alternative + + +-- If there are multiple alts and we need to GC, but don't have a +-- continuation already (the scrut was simple), then we should +-- pre-generate the continuation. (if there are multiple alts it is +-- always a canned GC point). + +-- altHeapCheck: +-- If we have a return continuation, +-- then if it is a canned GC pattern, +-- then we do mkJumpReturnsTo +-- else we do a normal call to stg_gc_noregs +-- else if it is a canned GC pattern, +-- then generate the continuation and do mkCallReturnsTo +-- else we do a normal call to stg_gc_noregs + +altHeapCheck :: [LocalReg] -> FCode a -> FCode a +altHeapCheck regs code = altOrNoEscapeHeapCheck False regs code + +altOrNoEscapeHeapCheck :: Bool -> [LocalReg] -> FCode a -> FCode a +altOrNoEscapeHeapCheck checkYield regs code = do + dflags <- getDynFlags + case cannedGCEntryPoint dflags regs of + Nothing -> genericGC checkYield code + Just gc -> do + lret <- newBlockId + let (off, _, copyin) = copyInOflow dflags NativeReturn (Young lret) regs [] + lcont <- newBlockId + tscope <- getTickScope + emitOutOfLine lret (copyin <*> mkBranch lcont, tscope) + emitLabel lcont + cannedGCReturnsTo checkYield False gc regs lret off code + +altHeapCheckReturnsTo :: [LocalReg] -> Label -> ByteOff -> 
FCode a -> FCode a +altHeapCheckReturnsTo regs lret off code + = do dflags <- getDynFlags + case cannedGCEntryPoint dflags regs of + Nothing -> genericGC False code + Just gc -> cannedGCReturnsTo False True gc regs lret off code + +-- noEscapeHeapCheck is implemented identically to altHeapCheck (which +-- is more efficient), but cannot be optimized away in the non-allocating +-- case because it may occur in a loop +noEscapeHeapCheck :: [LocalReg] -> FCode a -> FCode a +noEscapeHeapCheck regs code = altOrNoEscapeHeapCheck True regs code + +cannedGCReturnsTo :: Bool -> Bool -> CmmExpr -> [LocalReg] -> Label -> ByteOff + -> FCode a + -> FCode a +cannedGCReturnsTo checkYield cont_on_stack gc regs lret off code + = do dflags <- getDynFlags + updfr_sz <- getUpdFrameOff + heapCheck False checkYield (gc_call dflags gc updfr_sz) code + where + reg_exprs = map (CmmReg . CmmLocal) regs + -- Note [stg_gc arguments] + + -- NB. we use the NativeReturn convention for passing arguments + -- to the canned heap-check routines, because we are in a case + -- alternative and hence the [LocalReg] was passed to us in the + -- NativeReturn convention. 
+ gc_call dflags label sp + | cont_on_stack + = mkJumpReturnsTo dflags label NativeReturn reg_exprs lret off sp + | otherwise + = mkCallReturnsTo dflags label NativeReturn reg_exprs lret off sp [] + +genericGC :: Bool -> FCode a -> FCode a +genericGC checkYield code + = do updfr_sz <- getUpdFrameOff + lretry <- newBlockId + emitLabel lretry + call <- mkCall generic_gc (GC, GC) [] [] updfr_sz [] + heapCheck False checkYield (call <*> mkBranch lretry) code + +cannedGCEntryPoint :: DynFlags -> [LocalReg] -> Maybe CmmExpr +cannedGCEntryPoint dflags regs + = case map localRegType regs of + [] -> Just (mkGcLabel "stg_gc_noregs") + [ty] + | isGcPtrType ty -> Just (mkGcLabel "stg_gc_unpt_r1") + | isFloatType ty -> case width of + W32 -> Just (mkGcLabel "stg_gc_f1") + W64 -> Just (mkGcLabel "stg_gc_d1") + _ -> Nothing + + | width == wordWidth dflags -> Just (mkGcLabel "stg_gc_unbx_r1") + | width == W64 -> Just (mkGcLabel "stg_gc_l1") + | otherwise -> Nothing + where + width = typeWidth ty + [ty1,ty2] + | isGcPtrType ty1 + && isGcPtrType ty2 -> Just (mkGcLabel "stg_gc_pp") + [ty1,ty2,ty3] + | isGcPtrType ty1 + && isGcPtrType ty2 + && isGcPtrType ty3 -> Just (mkGcLabel "stg_gc_ppp") + [ty1,ty2,ty3,ty4] + | isGcPtrType ty1 + && isGcPtrType ty2 + && isGcPtrType ty3 + && isGcPtrType ty4 -> Just (mkGcLabel "stg_gc_pppp") + _otherwise -> Nothing + +-- Note [stg_gc arguments] +-- It might seem that we could avoid passing the arguments to the +-- stg_gc function, because they are already in the right registers. +-- While this is usually the case, it isn't always. Sometimes the +-- code generator has cleverly avoided the eval in a case, e.g. in +-- ffi/should_run/4221.hs we found +-- +-- case a_r1mb of z +-- FunPtr x y -> ... +-- +-- where a_r1mb is bound a top-level constructor, and is known to be +-- evaluated. The codegen just assigns x, y and z, and continues; +-- R1 is never assigned. 
+--
+-- So we'll have to rely on optimisations to eliminate these
+-- assignments where possible.
+
+
+-- | The generic GC procedure; no params, no results
+generic_gc :: CmmExpr
+generic_gc = mkGcLabel "stg_gc_noregs"
+
+-- | Create a CLabel for calling a garbage collector entry point
+mkGcLabel :: String -> CmmExpr
+mkGcLabel s = CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit s)))
+
+-------------------------------
+heapCheck :: Bool -> Bool -> CmmAGraph -> FCode a -> FCode a
+heapCheck checkStack checkYield do_gc code
+  = getHeapUsage $ \ hpHw ->
+    -- Emit heap checks, but be sure to do it lazily so
+    -- that the conditionals on hpHw don't cause a black hole
+    do  { dflags <- getDynFlags
+        ; let mb_alloc_bytes
+                 | hpHw > mBLOCK_SIZE = sorry $ unlines
+                    [" Trying to allocate more than "++show mBLOCK_SIZE++" bytes.",
+                     "",
+                     "This is currently not possible due to a limitation of GHC's code generator.",
+                     "See https://gitlab.haskell.org/ghc/ghc/issues/4505 for details.",
+                     "Suggestion: read data from a file instead of having large static data",
+                     "structures in code."]
+                 | hpHw > 0  = Just (mkIntExpr dflags (hpHw * (wORD_SIZE dflags)))
+                 | otherwise = Nothing
+                 where mBLOCK_SIZE = bLOCKS_PER_MBLOCK dflags * bLOCK_SIZE_W dflags
+              stk_hwm | checkStack = Just (CmmLit CmmHighStackMark)
+                      | otherwise  = Nothing
+        ; codeOnly $ do_checks stk_hwm checkYield mb_alloc_bytes do_gc
+        ; tickyAllocHeap True hpHw
+        ; setRealHp hpHw
+        ; code }
+
+heapStackCheckGen :: Maybe CmmExpr -> Maybe CmmExpr -> FCode ()
+heapStackCheckGen stk_hwm mb_bytes
+  = do updfr_sz <- getUpdFrameOff
+       lretry <- newBlockId
+       emitLabel lretry
+       call <- mkCall generic_gc (GC, GC) [] [] updfr_sz []
+       do_checks stk_hwm False mb_bytes (call <*> mkBranch lretry)
+
+-- Note [Single stack check]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~
+-- When compiling a function we can determine how much stack space it
+-- will use. 
We therefore need to perform only a single stack check at +-- the beginning of a function to see if we have enough stack space. +-- +-- The check boils down to comparing Sp-N with SpLim, where N is the +-- amount of stack space needed (see Note [Stack usage] below). *BUT* +-- at this stage of the pipeline we are not supposed to refer to Sp +-- itself, because the stack is not yet manifest, so we don't quite +-- know where Sp pointing. + +-- So instead of referring directly to Sp - as we used to do in the +-- past - the code generator uses (old + 0) in the stack check. That +-- is the address of the first word of the old area, so if we add N +-- we'll get the address of highest used word. +-- +-- This makes the check robust. For example, while we need to perform +-- only one stack check for each function, we could in theory place +-- more stack checks later in the function. They would be redundant, +-- but not incorrect (in a sense that they should not change program +-- behaviour). We need to make sure however that a stack check +-- inserted after incrementing the stack pointer checks for a +-- respectively smaller stack space. This would not be the case if the +-- code generator produced direct references to Sp. By referencing +-- (old + 0) we make sure that we always check for a correct amount of +-- stack: when converting (old + 0) to Sp the stack layout phase takes +-- into account changes already made to stack pointer. The idea for +-- this change came from observations made while debugging #8275. + +-- Note [Stack usage] +-- ~~~~~~~~~~~~~~~~~~ +-- At the moment we convert from STG to Cmm we don't know N, the +-- number of bytes of stack that the function will use, so we use a +-- special late-bound CmmLit, namely +-- CmmHighStackMark +-- to stand for the number of bytes needed. 
When the stack is made
+-- manifest, the number of bytes needed is calculated, and used to
+-- replace occurrences of CmmHighStackMark
+--
+-- The (Maybe CmmExpr) passed to do_checks is usually
+--     Just (CmmLit CmmHighStackMark)
+-- but can also (in certain hand-written RTS functions)
+--     Just (CmmLit 8)  or some other fixed value
+-- If it is Nothing, we don't generate a stack check at all.
+
+do_checks :: Maybe CmmExpr    -- Should we check the stack?
+                              -- See Note [Stack usage]
+          -> Bool             -- Should we check for preemption?
+          -> Maybe CmmExpr    -- Heap headroom (bytes)
+          -> CmmAGraph        -- What to do on failure
+          -> FCode ()
+do_checks mb_stk_hwm checkYield mb_alloc_lit do_gc = do
+  dflags <- getDynFlags
+  gc_id <- newBlockId
+
+  let
+    Just alloc_lit = mb_alloc_lit
+
+    bump_hp   = cmmOffsetExprB dflags hpExpr alloc_lit
+
+    -- Sp overflow if ((old + 0) - CmmHighStack < SpLim)
+    -- At the beginning of a function old + 0 = Sp
+    -- See Note [Single stack check]
+    sp_oflo sp_hwm =
+         CmmMachOp (mo_wordULt dflags)
+                  [CmmMachOp (MO_Sub (typeWidth (cmmRegType dflags spReg)))
+                             [CmmStackSlot Old 0, sp_hwm],
+                   CmmReg spLimReg]
+
+    -- Hp overflow if (Hp > HpLim)
+    -- (Hp has been incremented by now)
+    -- HpLim points to the LAST WORD of valid allocation space.
+    hp_oflo = CmmMachOp (mo_wordUGt dflags) [hpExpr, hpLimExpr]
+
+    alloc_n = mkAssign hpAllocReg alloc_lit
+
+  case mb_stk_hwm of
+    Nothing -> return ()
+    Just stk_hwm -> tickyStackCheck
+      >> (emit =<< mkCmmIfGoto' (sp_oflo stk_hwm) gc_id (Just False) )
+
+  -- Emit new label that might potentially be a header
+  -- of a self-recursive tail call.
+  -- See Note [Self-recursive loop header].
+ self_loop_info <- getSelfLoop + case self_loop_info of + Just (_, loop_header_id, _) + | checkYield && isJust mb_stk_hwm -> emitLabel loop_header_id + _otherwise -> return () + + if (isJust mb_alloc_lit) + then do + tickyHeapCheck + emitAssign hpReg bump_hp + emit =<< mkCmmIfThen' hp_oflo (alloc_n <*> mkBranch gc_id) (Just False) + else do + when (checkYield && not (gopt Opt_OmitYields dflags)) $ do + -- Yielding if HpLim == 0 + let yielding = CmmMachOp (mo_wordEq dflags) + [CmmReg hpLimReg, + CmmLit (zeroCLit dflags)] + emit =<< mkCmmIfGoto' yielding gc_id (Just False) + + tscope <- getTickScope + emitOutOfLine gc_id + (do_gc, tscope) -- this is expected to jump back somewhere + + -- Test for stack pointer exhaustion, then + -- bump heap pointer, and test for heap exhaustion + -- Note that we don't move the heap pointer unless the + -- stack check succeeds. Otherwise we might end up + -- with slop at the end of the current block, which can + -- confuse the LDV profiler. + +-- Note [Self-recursive loop header] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- Self-recursive loop header is required by loopification optimization (See +-- Note [Self-recursive tail calls] in GHC.StgToCmm.Expr). We emit it if: +-- +-- 1. There is information about self-loop in the FCode environment. We don't +-- check the binder (first component of the self_loop_info) because we are +-- certain that if the self-loop info is present then we are compiling the +-- binder body. Reason: the only possible way to get here with the +-- self_loop_info present is from closureCodeBody. +-- +-- 2. checkYield && isJust mb_stk_hwm. checkYield tells us that it is possible +-- to preempt the heap check (see #367 for motivation behind this check). It +-- is True for heap checks placed at the entry to a function and +-- let-no-escape heap checks but false for other heap checks (eg. in case +-- alternatives or created from hand-written high-level Cmm). 
The second +-- check (isJust mb_stk_hwm) is true for heap checks at the entry to a +-- function and some heap checks created in hand-written Cmm. Otherwise it +-- is Nothing. In other words the only situation when both conditions are +-- true is when compiling stack and heap checks at the entry to a +-- function. This is the only situation when we want to emit a self-loop +-- label. diff --git a/compiler/GHC/StgToCmm/Hpc.hs b/compiler/GHC/StgToCmm/Hpc.hs new file mode 100644 index 0000000000..e33d39245c --- /dev/null +++ b/compiler/GHC/StgToCmm/Hpc.hs @@ -0,0 +1,48 @@ +----------------------------------------------------------------------------- +-- +-- Code generation for coverage +-- +-- (c) Galois Connections, Inc. 2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Hpc ( initHpc, mkTickBox ) where + +import GhcPrelude + +import GHC.StgToCmm.Monad + +import MkGraph +import CmmExpr +import CLabel +import Module +import CmmUtils +import GHC.StgToCmm.Utils +import HscTypes +import DynFlags + +import Control.Monad + +mkTickBox :: DynFlags -> Module -> Int -> CmmAGraph +mkTickBox dflags mod n + = mkStore tick_box (CmmMachOp (MO_Add W64) + [ CmmLoad tick_box b64 + , CmmLit (CmmInt 1 W64) + ]) + where + tick_box = cmmIndex dflags W64 + (CmmLit $ CmmLabel $ mkHpcTicksLabel $ mod) + n + +initHpc :: Module -> HpcInfo -> FCode () +-- Emit top-level tables for HPC and return code to initialise +initHpc _ (NoHpcInfo {}) + = return () +initHpc this_mod (HpcInfo tickCount _hashNo) + = do dflags <- getDynFlags + when (gopt Opt_Hpc dflags) $ + do emitDataLits (mkHpcTicksLabel this_mod) + [ (CmmInt 0 W64) + | _ <- take tickCount [0 :: Int ..] 
+ ] + diff --git a/compiler/GHC/StgToCmm/Layout.hs b/compiler/GHC/StgToCmm/Layout.hs new file mode 100644 index 0000000000..f4834376ed --- /dev/null +++ b/compiler/GHC/StgToCmm/Layout.hs @@ -0,0 +1,623 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Building info tables. +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Layout ( + mkArgDescr, + emitCall, emitReturn, adjustHpBackwards, + + emitClosureProcAndInfoTable, + emitClosureAndInfoTable, + + slowCall, directCall, + + FieldOffOrPadding(..), + ClosureHeader(..), + mkVirtHeapOffsets, + mkVirtHeapOffsetsWithPadding, + mkVirtConstrOffsets, + mkVirtConstrSizes, + getHpRelOffset, + + ArgRep(..), toArgRep, argRepSizeW -- re-exported from GHC.StgToCmm.ArgRep + ) where + + +#include "HsVersions.h" + +import GhcPrelude hiding ((<*>)) + +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Env +import GHC.StgToCmm.ArgRep -- notably: ( slowCallPattern ) +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Utils + +import MkGraph +import SMRep +import BlockId +import Cmm +import CmmUtils +import CmmInfo +import CLabel +import StgSyn +import Id +import TyCon ( PrimRep(..), primRepSizeB ) +import BasicTypes ( RepArity ) +import DynFlags +import Module + +import Util +import Data.List +import Outputable +import FastString +import Control.Monad + +------------------------------------------------------------------------ +-- Call and return sequences +------------------------------------------------------------------------ + +-- | Return multiple values to the sequel +-- +-- If the sequel is @Return@ +-- +-- > return (x,y) +-- +-- If the sequel is @AssignTo [p,q]@ +-- +-- > p=x; q=y; +-- +emitReturn :: [CmmExpr] -> FCode ReturnKind +emitReturn results + = do { dflags <- getDynFlags + ; sequel <- getSequel + ; 
updfr_off <- getUpdFrameOff + ; case sequel of + Return -> + do { adjustHpBackwards + ; let e = CmmLoad (CmmStackSlot Old updfr_off) (gcWord dflags) + ; emit (mkReturn dflags (entryCode dflags e) results updfr_off) + } + AssignTo regs adjust -> + do { when adjust adjustHpBackwards + ; emitMultiAssign regs results } + ; return AssignedDirectly + } + + +-- | @emitCall conv fun args@ makes a call to the entry-code of @fun@, +-- using the call/return convention @conv@, passing @args@, and +-- returning the results to the current sequel. +-- +emitCall :: (Convention, Convention) -> CmmExpr -> [CmmExpr] -> FCode ReturnKind +emitCall convs fun args + = emitCallWithExtraStack convs fun args noExtraStack + + +-- | @emitCallWithExtraStack conv fun args stack@ makes a call to the +-- entry-code of @fun@, using the call/return convention @conv@, +-- passing @args@, pushing some extra stack frames described by +-- @stack@, and returning the results to the current sequel. +-- +emitCallWithExtraStack + :: (Convention, Convention) -> CmmExpr -> [CmmExpr] + -> [CmmExpr] -> FCode ReturnKind +emitCallWithExtraStack (callConv, retConv) fun args extra_stack + = do { dflags <- getDynFlags + ; adjustHpBackwards + ; sequel <- getSequel + ; updfr_off <- getUpdFrameOff + ; case sequel of + Return -> do + emit $ mkJumpExtra dflags callConv fun args updfr_off extra_stack + return AssignedDirectly + AssignTo res_regs _ -> do + k <- newBlockId + let area = Young k + (off, _, copyin) = copyInOflow dflags retConv area res_regs [] + copyout = mkCallReturnsTo dflags fun callConv args k off updfr_off + extra_stack + tscope <- getTickScope + emit (copyout <*> mkLabel k tscope <*> copyin) + return (ReturnedTo k off) + } + + +adjustHpBackwards :: FCode () +-- This function adjusts the heap pointer just before a tail call or +-- return. 
At a call or return, the virtual heap pointer may be less +-- than the real Hp, because the latter was advanced to deal with +-- the worst-case branch of the code, and we may be in a better-case +-- branch. In that case, move the real Hp *back* and retract some +-- ticky allocation count. +-- +-- It *does not* deal with high-water-mark adjustment. That's done by +-- functions which allocate heap. +adjustHpBackwards + = do { hp_usg <- getHpUsage + ; let rHp = realHp hp_usg + vHp = virtHp hp_usg + adjust_words = vHp -rHp + ; new_hp <- getHpRelOffset vHp + + ; emit (if adjust_words == 0 + then mkNop + else mkAssign hpReg new_hp) -- Generates nothing when vHp==rHp + + ; tickyAllocHeap False adjust_words -- ...ditto + + ; setRealHp vHp + } + + +------------------------------------------------------------------------- +-- Making calls: directCall and slowCall +------------------------------------------------------------------------- + +-- General plan is: +-- - we'll make *one* fast call, either to the function itself +-- (directCall) or to stg_ap_<pat>_fast (slowCall) +-- Any left-over arguments will be pushed on the stack, +-- +-- e.g. 
Sp[old+8] = arg1 +-- Sp[old+16] = arg2 +-- Sp[old+32] = stg_ap_pp_info +-- R2 = arg3 +-- R3 = arg4 +-- call f() return to Nothing updfr_off: 32 + + +directCall :: Convention -> CLabel -> RepArity -> [StgArg] -> FCode ReturnKind +-- (directCall f n args) +-- calls f(arg1, ..., argn), and applies the result to the remaining args +-- The function f has arity n, and there are guaranteed at least n args +-- Both arity and args include void args +directCall conv lbl arity stg_args + = do { argreps <- getArgRepsAmodes stg_args + ; direct_call "directCall" conv lbl arity argreps } + + +slowCall :: CmmExpr -> [StgArg] -> FCode ReturnKind +-- (slowCall fun args) applies fun to args, returning the results to Sequel +slowCall fun stg_args + = do dflags <- getDynFlags + argsreps <- getArgRepsAmodes stg_args + let (rts_fun, arity) = slowCallPattern (map fst argsreps) + + (r, slow_code) <- getCodeR $ do + r <- direct_call "slow_call" NativeNodeCall + (mkRtsApFastLabel rts_fun) arity ((P,Just fun):argsreps) + emitComment $ mkFastString ("slow_call for " ++ + showSDoc dflags (ppr fun) ++ + " with pat " ++ unpackFS rts_fun) + return r + + -- Note [avoid intermediate PAPs] + let n_args = length stg_args + if n_args > arity && optLevel dflags >= 2 + then do + funv <- (CmmReg . CmmLocal) `fmap` assignTemp fun + fun_iptr <- (CmmReg . CmmLocal) `fmap` + assignTemp (closureInfoPtr dflags (cmmUntag dflags funv)) + + -- ToDo: we could do slightly better here by reusing the + -- continuation from the slow call, which we have in r. + -- Also we'd like to push the continuation on the stack + -- before the branch, so that we only get one copy of the + -- code that saves all the live variables across the + -- call, but that might need some improvements to the + -- special case in the stack layout code to handle this + -- (see Note [diamond proc point]). 
+ + fast_code <- getCode $ + emitCall (NativeNodeCall, NativeReturn) + (entryCode dflags fun_iptr) + (nonVArgs ((P,Just funv):argsreps)) + + slow_lbl <- newBlockId + fast_lbl <- newBlockId + is_tagged_lbl <- newBlockId + end_lbl <- newBlockId + + let correct_arity = cmmEqWord dflags (funInfoArity dflags fun_iptr) + (mkIntExpr dflags n_args) + + tscope <- getTickScope + emit (mkCbranch (cmmIsTagged dflags funv) + is_tagged_lbl slow_lbl (Just True) + <*> mkLabel is_tagged_lbl tscope + <*> mkCbranch correct_arity fast_lbl slow_lbl (Just True) + <*> mkLabel fast_lbl tscope + <*> fast_code + <*> mkBranch end_lbl + <*> mkLabel slow_lbl tscope + <*> slow_code + <*> mkLabel end_lbl tscope) + return r + + else do + emit slow_code + return r + + +-- Note [avoid intermediate PAPs] +-- +-- A slow call which needs multiple generic apply patterns will be +-- almost guaranteed to create one or more intermediate PAPs when +-- applied to a function that takes the correct number of arguments. +-- We try to avoid this situation by generating code to test whether +-- we are calling a function with the correct number of arguments +-- first, i.e.: +-- +-- if (TAG(f) != 0) { // f is not a thunk +-- if (f->info.arity == n) { +-- ... make a fast call to f ... +-- } +-- } +-- ... otherwise make the slow call ... +-- +-- We *only* do this when the call requires multiple generic apply +-- functions, which requires pushing extra stack frames and probably +-- results in intermediate PAPs. (I say probably, because it might be +-- that we're over-applying a function, but that seems even less +-- likely). +-- +-- This very rarely applies, but if it does happen in an inner loop it +-- can have a severe impact on performance (#6084). + + +-------------- +direct_call :: String + -> Convention -- e.g. 
NativeNodeCall or NativeDirectCall + -> CLabel -> RepArity + -> [(ArgRep,Maybe CmmExpr)] -> FCode ReturnKind +direct_call caller call_conv lbl arity args + | debugIsOn && args `lengthLessThan` real_arity -- Too few args + = do -- Caller should ensure that there enough args! + pprPanic "direct_call" $ + text caller <+> ppr arity <+> + ppr lbl <+> ppr (length args) <+> + ppr (map snd args) <+> ppr (map fst args) + + | null rest_args -- Precisely the right number of arguments + = emitCall (call_conv, NativeReturn) target (nonVArgs args) + + | otherwise -- Note [over-saturated calls] + = do dflags <- getDynFlags + emitCallWithExtraStack (call_conv, NativeReturn) + target + (nonVArgs fast_args) + (nonVArgs (stack_args dflags)) + where + target = CmmLit (CmmLabel lbl) + (fast_args, rest_args) = splitAt real_arity args + stack_args dflags = slowArgs dflags rest_args + real_arity = case call_conv of + NativeNodeCall -> arity+1 + _ -> arity + + +-- When constructing calls, it is easier to keep the ArgReps and the +-- CmmExprs zipped together. However, a void argument has no +-- representation, so we need to use Maybe CmmExpr (the alternative of +-- using zeroCLit or even undefined would work, but would be ugly). +-- +getArgRepsAmodes :: [StgArg] -> FCode [(ArgRep, Maybe CmmExpr)] +getArgRepsAmodes = mapM getArgRepAmode + where getArgRepAmode arg + | V <- rep = return (V, Nothing) + | otherwise = do expr <- getArgAmode (NonVoid arg) + return (rep, Just expr) + where rep = toArgRep (argPrimRep arg) + +nonVArgs :: [(ArgRep, Maybe CmmExpr)] -> [CmmExpr] +nonVArgs [] = [] +nonVArgs ((_,Nothing) : args) = nonVArgs args +nonVArgs ((_,Just arg) : args) = arg : nonVArgs args + +{- +Note [over-saturated calls] + +The natural thing to do for an over-saturated call would be to call +the function with the correct number of arguments, and then apply the +remaining arguments to the value returned, e.g. 
+ + f a b c d (where f has arity 2) + --> + r = call f(a,b) + call r(c,d) + +but this entails + - saving c and d on the stack + - making a continuation info table + - at the continuation, loading c and d off the stack into regs + - finally, call r + +Note that since there are a fixed number of different r's +(e.g. stg_ap_pp_fast), we can also pre-compile continuations +that correspond to each of them, rather than generating a fresh +one for each over-saturated call. + +Not only does this generate much less code, it is faster too. We will +generate something like: + +Sp[old+16] = c +Sp[old+24] = d +Sp[old+32] = stg_ap_pp_info +call f(a,b) -- usual calling convention + +For the purposes of the CmmCall node, we count this extra stack as +just more arguments that we are passing on the stack (cml_args). +-} + +-- | 'slowArgs' takes a list of function arguments and prepares them for +-- pushing on the stack for "extra" arguments to a function which requires +-- fewer arguments than we currently have. 
+slowArgs :: DynFlags -> [(ArgRep, Maybe CmmExpr)] -> [(ArgRep, Maybe CmmExpr)] +slowArgs _ [] = [] +slowArgs dflags args -- careful: reps contains voids (V), but args does not + | gopt Opt_SccProfilingOn dflags + = save_cccs ++ this_pat ++ slowArgs dflags rest_args + | otherwise = this_pat ++ slowArgs dflags rest_args + where + (arg_pat, n) = slowCallPattern (map fst args) + (call_args, rest_args) = splitAt n args + + stg_ap_pat = mkCmmRetInfoLabel rtsUnitId arg_pat + this_pat = (N, Just (mkLblExpr stg_ap_pat)) : call_args + save_cccs = [(N, Just (mkLblExpr save_cccs_lbl)), (N, Just cccsExpr)] + save_cccs_lbl = mkCmmRetInfoLabel rtsUnitId (fsLit "stg_restore_cccs") + +------------------------------------------------------------------------- +---- Laying out objects on the heap and stack +------------------------------------------------------------------------- + +-- The heap always grows upwards, so hpRel is easy to compute +hpRel :: VirtualHpOffset -- virtual offset of Hp + -> VirtualHpOffset -- virtual offset of The Thing + -> WordOff -- integer word offset +hpRel hp off = off - hp + +getHpRelOffset :: VirtualHpOffset -> FCode CmmExpr +-- See Note [Virtual and real heap pointers] in GHC.StgToCmm.Monad +getHpRelOffset virtual_offset + = do dflags <- getDynFlags + hp_usg <- getHpUsage + return (cmmRegOffW dflags hpReg (hpRel (realHp hp_usg) virtual_offset)) + +data FieldOffOrPadding a + = FieldOff (NonVoid a) -- Something that needs an offset. + ByteOff -- Offset in bytes. + | Padding ByteOff -- Length of padding in bytes. + ByteOff -- Offset in bytes. + +-- | Used to tell the various @mkVirtHeapOffsets@ functions what kind +-- of header the object has. This will be accounted for in the +-- offsets of the fields returned. 
+data ClosureHeader + = NoHeader + | StdHeader + | ThunkHeader + +mkVirtHeapOffsetsWithPadding + :: DynFlags + -> ClosureHeader -- What kind of header to account for + -> [NonVoid (PrimRep, a)] -- Things to make offsets for + -> ( WordOff -- Total number of words allocated + , WordOff -- Number of words allocated for *pointers* + , [FieldOffOrPadding a] -- Either an offset or padding. + ) + +-- Things with their offsets from start of object in order of +-- increasing offset; BUT THIS MAY BE DIFFERENT TO INPUT ORDER +-- First in list gets lowest offset, which is initial offset + 1. +-- +-- mkVirtHeapOffsetsWithPadding always returns boxed things with smaller offsets +-- than the unboxed things + +mkVirtHeapOffsetsWithPadding dflags header things = + ASSERT(not (any (isVoidRep . fst . fromNonVoid) things)) + ( tot_wds + , bytesToWordsRoundUp dflags bytes_of_ptrs + , concat (ptrs_w_offsets ++ non_ptrs_w_offsets) ++ final_pad + ) + where + hdr_words = case header of + NoHeader -> 0 + StdHeader -> fixedHdrSizeW dflags + ThunkHeader -> thunkHdrSize dflags + hdr_bytes = wordsToBytes dflags hdr_words + + (ptrs, non_ptrs) = partition (isGcPtrRep . fst . fromNonVoid) things + + (bytes_of_ptrs, ptrs_w_offsets) = + mapAccumL computeOffset 0 ptrs + (tot_bytes, non_ptrs_w_offsets) = + mapAccumL computeOffset bytes_of_ptrs non_ptrs + + tot_wds = bytesToWordsRoundUp dflags tot_bytes + + final_pad_size = tot_wds * word_size - tot_bytes + final_pad + | final_pad_size > 0 = [(Padding final_pad_size + (hdr_bytes + tot_bytes))] + | otherwise = [] + + word_size = wORD_SIZE dflags + + computeOffset bytes_so_far nv_thing = + (new_bytes_so_far, with_padding field_off) + where + (rep, thing) = fromNonVoid nv_thing + + -- Size of the field in bytes. + !sizeB = primRepSizeB dflags rep + + -- Align the start offset (eg, 2-byte value should be 2-byte aligned). + -- But not more than to a word. 
+ !align = min word_size sizeB + !start = roundUpTo bytes_so_far align + !padding = start - bytes_so_far + + -- Final offset is: + -- size of header + bytes_so_far + padding + !final_offset = hdr_bytes + bytes_so_far + padding + !new_bytes_so_far = start + sizeB + field_off = FieldOff (NonVoid thing) final_offset + + with_padding field_off + | padding == 0 = [field_off] + | otherwise = [ Padding padding (hdr_bytes + bytes_so_far) + , field_off + ] + + +mkVirtHeapOffsets + :: DynFlags + -> ClosureHeader -- What kind of header to account for + -> [NonVoid (PrimRep,a)] -- Things to make offsets for + -> (WordOff, -- _Total_ number of words allocated + WordOff, -- Number of words allocated for *pointers* + [(NonVoid a, ByteOff)]) +mkVirtHeapOffsets dflags header things = + ( tot_wds + , ptr_wds + , [ (field, offset) | (FieldOff field offset) <- things_offsets ] + ) + where + (tot_wds, ptr_wds, things_offsets) = + mkVirtHeapOffsetsWithPadding dflags header things + +-- | Just like mkVirtHeapOffsets, but for constructors +mkVirtConstrOffsets + :: DynFlags -> [NonVoid (PrimRep, a)] + -> (WordOff, WordOff, [(NonVoid a, ByteOff)]) +mkVirtConstrOffsets dflags = mkVirtHeapOffsets dflags StdHeader + +-- | Just like mkVirtConstrOffsets, but used when we don't have the actual +-- arguments. Useful when e.g. generating info tables; we just need to know +-- sizes of pointer and non-pointer fields. 
+mkVirtConstrSizes :: DynFlags -> [NonVoid PrimRep] -> (WordOff, WordOff) +mkVirtConstrSizes dflags field_reps + = (tot_wds, ptr_wds) + where + (tot_wds, ptr_wds, _) = + mkVirtConstrOffsets dflags + (map (\nv_rep -> NonVoid (fromNonVoid nv_rep, ())) field_reps) + +------------------------------------------------------------------------- +-- +-- Making argument descriptors +-- +-- An argument descriptor describes the layout of args on the stack, +-- both for * GC (stack-layout) purposes, and +-- * saving/restoring registers when a heap-check fails +-- +-- Void arguments aren't important, therefore (contrast constructSlowCall) +-- +------------------------------------------------------------------------- + +-- bring in ARG_P, ARG_N, etc. +#include "../includes/rts/storage/FunTypes.h" + +mkArgDescr :: DynFlags -> [Id] -> ArgDescr +mkArgDescr dflags args + = let arg_bits = argBits dflags arg_reps + arg_reps = filter isNonV (map idArgRep args) + -- Getting rid of voids eases matching of standard patterns + in case stdPattern arg_reps of + Just spec_id -> ArgSpec spec_id + Nothing -> ArgGen arg_bits + +argBits :: DynFlags -> [ArgRep] -> [Bool] -- True for non-ptr, False for ptr +argBits _ [] = [] +argBits dflags (P : args) = False : argBits dflags args +argBits dflags (arg : args) = take (argRepSizeW dflags arg) (repeat True) + ++ argBits dflags args + +---------------------- +stdPattern :: [ArgRep] -> Maybe Int +stdPattern reps + = case reps of + [] -> Just ARG_NONE -- just void args, probably + [N] -> Just ARG_N + [P] -> Just ARG_P + [F] -> Just ARG_F + [D] -> Just ARG_D + [L] -> Just ARG_L + [V16] -> Just ARG_V16 + [V32] -> Just ARG_V32 + [V64] -> Just ARG_V64 + + [N,N] -> Just ARG_NN + [N,P] -> Just ARG_NP + [P,N] -> Just ARG_PN + [P,P] -> Just ARG_PP + + [N,N,N] -> Just ARG_NNN + [N,N,P] -> Just ARG_NNP + [N,P,N] -> Just ARG_NPN + [N,P,P] -> Just ARG_NPP + [P,N,N] -> Just ARG_PNN + [P,N,P] -> Just ARG_PNP + [P,P,N] -> Just ARG_PPN + [P,P,P] -> Just ARG_PPP + + 
[P,P,P,P] -> Just ARG_PPPP + [P,P,P,P,P] -> Just ARG_PPPPP + [P,P,P,P,P,P] -> Just ARG_PPPPPP + + _ -> Nothing + +------------------------------------------------------------------------- +-- +-- Generating the info table and code for a closure +-- +------------------------------------------------------------------------- + +-- Here we make an info table of type 'CmmInfo'. The concrete +-- representation as a list of 'CmmAddr' is handled later +-- in the pipeline by 'cmmToRawCmm'. +-- When loading the free variables, a function closure pointer may be tagged, +-- so we must take it into account. + +emitClosureProcAndInfoTable :: Bool -- top-level? + -> Id -- name of the closure + -> LambdaFormInfo + -> CmmInfoTable + -> [NonVoid Id] -- incoming arguments + -> ((Int, LocalReg, [LocalReg]) -> FCode ()) -- function body + -> FCode () +emitClosureProcAndInfoTable top_lvl bndr lf_info info_tbl args body + = do { dflags <- getDynFlags + -- Bind the binder itself, but only if it's not a top-level + -- binding. We need non-top let-bindings to refer to the + -- top-level binding, which this binding would incorrectly shadow. + ; node <- if top_lvl then return $ idToReg dflags (NonVoid bndr) + else bindToReg (NonVoid bndr) lf_info + ; let node_points = nodeMustPointToIt dflags lf_info + ; arg_regs <- bindArgsToRegs args + ; let args' = if node_points then (node : arg_regs) else arg_regs + conv = if nodeMustPointToIt dflags lf_info then NativeNodeCall + else NativeDirectCall + (offset, _, _) = mkCallEntry dflags conv args' [] + ; emitClosureAndInfoTable info_tbl conv args' $ body (offset, node, arg_regs) + } + +-- Data constructors need closures, but not with all the argument handling +-- needed for functions. The shared part goes here. 
+emitClosureAndInfoTable :: + CmmInfoTable -> Convention -> [LocalReg] -> FCode () -> FCode () +emitClosureAndInfoTable info_tbl conv args body + = do { (_, blks) <- getCodeScoped body + ; let entry_lbl = toEntryLbl (cit_lbl info_tbl) + ; emitProcWithConvention conv (Just info_tbl) entry_lbl args blks + } diff --git a/compiler/GHC/StgToCmm/Monad.hs b/compiler/GHC/StgToCmm/Monad.hs new file mode 100644 index 0000000000..716cbdab78 --- /dev/null +++ b/compiler/GHC/StgToCmm/Monad.hs @@ -0,0 +1,861 @@ +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE GADTs #-} + +----------------------------------------------------------------------------- +-- +-- Monad for Stg to C-- code generation +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Monad ( + FCode, -- type + + initC, runC, fixC, + newUnique, + + emitLabel, + + emit, emitDecl, + emitProcWithConvention, emitProcWithStackFrame, + emitOutOfLine, emitAssign, emitStore, + emitComment, emitTick, emitUnwind, + + getCmm, aGraphToGraph, + getCodeR, getCode, getCodeScoped, getHeapUsage, + + mkCmmIfThenElse, mkCmmIfThen, mkCmmIfGoto, + mkCmmIfThenElse', mkCmmIfThen', mkCmmIfGoto', + + mkCall, mkCmmCall, + + forkClosureBody, forkLneBody, forkAlts, forkAltPair, codeOnly, + + ConTagZ, + + Sequel(..), ReturnKind(..), + withSequel, getSequel, + + setTickyCtrLabel, getTickyCtrLabel, + tickScope, getTickScope, + + withUpdFrameOff, getUpdFrameOff, initUpdFrameOff, + + HeapUsage(..), VirtualHpOffset, initHpUsage, + getHpUsage, setHpUsage, heapHWM, + setVirtHp, getVirtHp, setRealHp, + + getModuleName, + + -- ideally we wouldn't export these, but some other modules access internal state + getState, setState, getSelfLoop, withSelfLoop, getInfoDown, getDynFlags, getThisPackage, + + -- more localised access to monad state + CgIdInfo(..), + getBinds, setBinds, + + -- out of general friendliness, we also export ... 
+ CgInfoDownwards(..), CgState(..) -- non-abstract + ) where + +import GhcPrelude hiding( sequence, succ ) + +import Cmm +import GHC.StgToCmm.Closure +import DynFlags +import Hoopl.Collections +import MkGraph +import BlockId +import CLabel +import SMRep +import Module +import Id +import VarEnv +import OrdList +import BasicTypes( ConTagZ ) +import Unique +import UniqSupply +import FastString +import Outputable +import Util + +import Control.Monad +import Data.List + + + +-------------------------------------------------------- +-- The FCode monad and its types +-- +-- FCode is the monad plumbed through the Stg->Cmm code generator, and +-- the Cmm parser. It contains the following things: +-- +-- - A writer monad, collecting: +-- - code for the current function, in the form of a CmmAGraph. +-- The function "emit" appends more code to this. +-- - the top-level CmmDecls accumulated so far +-- +-- - A state monad with: +-- - the local bindings in scope +-- - the current heap usage +-- - a UniqSupply +-- +-- - A reader monad, for CgInfoDownwards, containing +-- - DynFlags, +-- - the current Module +-- - the update-frame offset +-- - the ticky counter label +-- - the Sequel (the continuation to return to) +-- - the self-recursive tail call information + +-------------------------------------------------------- + +newtype FCode a = FCode { doFCode :: CgInfoDownwards -> CgState -> (a, CgState) } + deriving (Functor) + +instance Applicative FCode where + pure val = FCode (\_info_down state -> (val, state)) + {-# INLINE pure #-} + (<*>) = ap + +instance Monad FCode where + FCode m >>= k = FCode $ + \info_down state -> + case m info_down state of + (m_result, new_state) -> + case k m_result of + FCode kcode -> kcode info_down new_state + {-# INLINE (>>=) #-} + +instance MonadUnique FCode where + getUniqueSupplyM = cgs_uniqs <$> getState + getUniqueM = FCode $ \_ st -> + let (u, us') = takeUniqFromSupply (cgs_uniqs st) + in (u, st { cgs_uniqs = us' }) + +initC :: IO CgState 
+initC = do { uniqs <- mkSplitUniqSupply 'c' + ; return (initCgState uniqs) } + +runC :: DynFlags -> Module -> CgState -> FCode a -> (a,CgState) +runC dflags mod st fcode = doFCode fcode (initCgInfoDown dflags mod) st + +fixC :: (a -> FCode a) -> FCode a +fixC fcode = FCode $ + \info_down state -> let (v, s) = doFCode (fcode v) info_down state + in (v, s) + +-------------------------------------------------------- +-- The code generator environment +-------------------------------------------------------- + +-- This monadery has some information that it only passes +-- *downwards*, as well as some ``state'' which is modified +-- as we go along. + +data CgInfoDownwards -- information only passed *downwards* by the monad + = MkCgInfoDown { + cgd_dflags :: DynFlags, + cgd_mod :: Module, -- Module being compiled + cgd_updfr_off :: UpdFrameOffset, -- Size of current update frame + cgd_ticky :: CLabel, -- Current destination for ticky counts + cgd_sequel :: Sequel, -- What to do at end of basic block + cgd_self_loop :: Maybe SelfLoopInfo,-- Which tail calls can be compiled + -- as local jumps? See Note + -- [Self-recursive tail calls] in + -- GHC.StgToCmm.Expr + cgd_tick_scope:: CmmTickScope -- Tick scope for new blocks & ticks + } + +type CgBindings = IdEnv CgIdInfo + +data CgIdInfo + = CgIdInfo + { cg_id :: Id -- Id that this is the info for + , cg_lf :: LambdaFormInfo + , cg_loc :: CgLoc -- CmmExpr for the *tagged* value + } + +instance Outputable CgIdInfo where + ppr (CgIdInfo { cg_id = id, cg_loc = loc }) + = ppr id <+> text "-->" <+> ppr loc + +-- Sequel tells what to do with the result of this expression +data Sequel + = Return -- Return result(s) to continuation found on the stack. + + | AssignTo + [LocalReg] -- Put result(s) in these regs and fall through + -- NB: no void arguments here + -- + Bool -- Should we adjust the heap pointer back to + -- recover space that's unused on this path? + -- We need to do this only if the expression + -- may allocate (e.g. 
it's a foreign call or + -- allocating primOp) + +instance Outputable Sequel where + ppr Return = text "Return" + ppr (AssignTo regs b) = text "AssignTo" <+> ppr regs <+> ppr b + +-- See Note [sharing continuations] below +data ReturnKind + = AssignedDirectly + | ReturnedTo BlockId ByteOff + +-- Note [sharing continuations] +-- +-- ReturnKind says how the expression being compiled returned its +-- results: either by assigning directly to the registers specified +-- by the Sequel, or by returning to a continuation that does the +-- assignments. The point of this is we might be able to re-use the +-- continuation in a subsequent heap-check. Consider: +-- +-- case f x of z +-- True -> <True code> +-- False -> <False code> +-- +-- Naively we would generate +-- +-- R2 = x -- argument to f +-- Sp[young(L1)] = L1 +-- call f returns to L1 +-- L1: +-- z = R1 +-- if (z & 1) then Ltrue else Lfalse +-- Ltrue: +-- Hp = Hp + 24 +-- if (Hp > HpLim) then L4 else L7 +-- L4: +-- HpAlloc = 24 +-- goto L5 +-- L5: +-- R1 = z +-- Sp[young(L6)] = L6 +-- call stg_gc_unpt_r1 returns to L6 +-- L6: +-- z = R1 +-- goto L1 +-- L7: +-- <True code> +-- Lfalse: +-- <False code> +-- +-- We want the gc call in L4 to return to L1, and discard L6. Note +-- that not only can we share L1 and L6, but the assignment of the +-- return address in L4 is unnecessary because the return address for +-- L1 is already on the stack. We used to catch the sharing of L1 and +-- L6 in the common-block-eliminator, but not the unnecessary return +-- address assignment. +-- +-- Since this case is so common I decided to make it more explicit and +-- robust by programming the sharing directly, rather than relying on +-- the common-block eliminator to catch it. This makes +-- common-block-elimination an optional optimisation, and furthermore +-- generates less code in the first place that we have to subsequently +-- clean up. 
+-- +-- There are some rarer cases of common blocks that we don't catch +-- this way, but that's ok. Common-block-elimination is still available +-- to catch them when optimisation is enabled. Some examples are: +-- +-- - when both the True and False branches do a heap check, we +-- can share the heap-check failure code L4a and maybe L4 +-- +-- - in a case-of-case, there might be multiple continuations that +-- we can common up. +-- +-- It is always safe to use AssignedDirectly. Expressions that jump +-- to the continuation from multiple places (e.g. case expressions) +-- fall back to AssignedDirectly. +-- + + +initCgInfoDown :: DynFlags -> Module -> CgInfoDownwards +initCgInfoDown dflags mod + = MkCgInfoDown { cgd_dflags = dflags + , cgd_mod = mod + , cgd_updfr_off = initUpdFrameOff dflags + , cgd_ticky = mkTopTickyCtrLabel + , cgd_sequel = initSequel + , cgd_self_loop = Nothing + , cgd_tick_scope= GlobalScope } + +initSequel :: Sequel +initSequel = Return + +initUpdFrameOff :: DynFlags -> UpdFrameOffset +initUpdFrameOff dflags = widthInBytes (wordWidth dflags) -- space for the RA + + +-------------------------------------------------------- +-- The code generator state +-------------------------------------------------------- + +data CgState + = MkCgState { + cgs_stmts :: CmmAGraph, -- Current procedure + + cgs_tops :: OrdList CmmDecl, + -- Other procedures and data blocks in this compilation unit + -- Both are ordered only so that we can + -- reduce forward references, when it's easy to do so + + cgs_binds :: CgBindings, + + cgs_hp_usg :: HeapUsage, + + cgs_uniqs :: UniqSupply } + +data HeapUsage -- See Note [Virtual and real heap pointers] + = HeapUsage { + virtHp :: VirtualHpOffset, -- Virtual offset of highest-allocated word + -- Incremented whenever we allocate + realHp :: VirtualHpOffset -- realHp: Virtual offset of real heap ptr + -- Used in instruction addressing modes + } + +type VirtualHpOffset = WordOff + + +{- Note [Virtual and real heap pointers] 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The code generator can allocate one or more objects contiguously, performing +one heap check to cover allocation of all the objects at once. Let's call +this little chunk of heap space an "allocation chunk". The code generator +will emit code to + * Perform a heap-exhaustion check + * Move the heap pointer to the end of the allocation chunk + * Allocate multiple objects within the chunk + +The code generator uses VirtualHpOffsets to address words within a +single allocation chunk; these start at one and increase positively. +The first word of the chunk has VirtualHpOffset=1, the second has +VirtualHpOffset=2, and so on. + + * The field realHp tracks (the VirtualHpOffset) where the real Hp + register is pointing. Typically it'll be pointing to the end of the + allocation chunk. + + * The field virtHp gives the VirtualHpOffset of the highest-allocated + word so far. It starts at zero (meaning no word has been allocated), + and increases whenever an object is allocated. + +The difference between realHp and virtHp gives the offset from the +real Hp register of a particular word in the allocation chunk. This +is what getHpRelOffset does. Since the returned offset is relative +to the real Hp register, it is valid only until you change the real +Hp register. (Changing virtHp doesn't matter.) +-} + + +initCgState :: UniqSupply -> CgState +initCgState uniqs + = MkCgState { cgs_stmts = mkNop + , cgs_tops = nilOL + , cgs_binds = emptyVarEnv + , cgs_hp_usg = initHpUsage + , cgs_uniqs = uniqs } + +stateIncUsage :: CgState -> CgState -> CgState +-- @stateIncUsage e1 e2@ incorporates in e1 +-- the heap high water mark found in e2. 
+stateIncUsage s1 s2@(MkCgState { cgs_hp_usg = hp_usg }) + = s1 { cgs_hp_usg = cgs_hp_usg s1 `maxHpHw` virtHp hp_usg } + `addCodeBlocksFrom` s2 + +addCodeBlocksFrom :: CgState -> CgState -> CgState +-- Add code blocks from the latter to the former +-- (The cgs_stmts will often be empty, but not always; see codeOnly) +s1 `addCodeBlocksFrom` s2 + = s1 { cgs_stmts = cgs_stmts s1 MkGraph.<*> cgs_stmts s2, + cgs_tops = cgs_tops s1 `appOL` cgs_tops s2 } + + +-- The heap high water mark is the larger of virtHp and hwHp. The latter +-- only records the high water marks of forked-off branches, so to find the +-- heap high water mark you have to take the max of virtHp and hwHp. Remember, +-- virtHp never retreats! +-- +-- Note Jan 04: ok, so why do we only look at the virtual Hp?? + +heapHWM :: HeapUsage -> VirtualHpOffset +heapHWM = virtHp + +initHpUsage :: HeapUsage +initHpUsage = HeapUsage { virtHp = 0, realHp = 0 } + +maxHpHw :: HeapUsage -> VirtualHpOffset -> HeapUsage +hp_usg `maxHpHw` hw = hp_usg { virtHp = virtHp hp_usg `max` hw } + +-------------------------------------------------------- +-- Operators for getting and setting the state and "info_down". 
+-------------------------------------------------------- + +getState :: FCode CgState +getState = FCode $ \_info_down state -> (state, state) + +setState :: CgState -> FCode () +setState state = FCode $ \_info_down _ -> ((), state) + +getHpUsage :: FCode HeapUsage +getHpUsage = do + state <- getState + return $ cgs_hp_usg state + +setHpUsage :: HeapUsage -> FCode () +setHpUsage new_hp_usg = do + state <- getState + setState $ state {cgs_hp_usg = new_hp_usg} + +setVirtHp :: VirtualHpOffset -> FCode () +setVirtHp new_virtHp + = do { hp_usage <- getHpUsage + ; setHpUsage (hp_usage {virtHp = new_virtHp}) } + +getVirtHp :: FCode VirtualHpOffset +getVirtHp + = do { hp_usage <- getHpUsage + ; return (virtHp hp_usage) } + +setRealHp :: VirtualHpOffset -> FCode () +setRealHp new_realHp + = do { hp_usage <- getHpUsage + ; setHpUsage (hp_usage {realHp = new_realHp}) } + +getBinds :: FCode CgBindings +getBinds = do + state <- getState + return $ cgs_binds state + +setBinds :: CgBindings -> FCode () +setBinds new_binds = do + state <- getState + setState $ state {cgs_binds = new_binds} + +withState :: FCode a -> CgState -> FCode (a,CgState) +withState (FCode fcode) newstate = FCode $ \info_down state -> + case fcode info_down newstate of + (retval, state2) -> ((retval,state2), state) + +newUniqSupply :: FCode UniqSupply +newUniqSupply = do + state <- getState + let (us1, us2) = splitUniqSupply (cgs_uniqs state) + setState $ state { cgs_uniqs = us1 } + return us2 + +newUnique :: FCode Unique +newUnique = do + state <- getState + let (u,us') = takeUniqFromSupply (cgs_uniqs state) + setState $ state { cgs_uniqs = us' } + return u + +------------------ +getInfoDown :: FCode CgInfoDownwards +getInfoDown = FCode $ \info_down state -> (info_down,state) + +getSelfLoop :: FCode (Maybe SelfLoopInfo) +getSelfLoop = do + info_down <- getInfoDown + return $ cgd_self_loop info_down + +withSelfLoop :: SelfLoopInfo -> FCode a -> FCode a +withSelfLoop self_loop code = do + info_down <- 
getInfoDown + withInfoDown code (info_down {cgd_self_loop = Just self_loop}) + +instance HasDynFlags FCode where + getDynFlags = liftM cgd_dflags getInfoDown + +getThisPackage :: FCode UnitId +getThisPackage = liftM thisPackage getDynFlags + +withInfoDown :: FCode a -> CgInfoDownwards -> FCode a +withInfoDown (FCode fcode) info_down = FCode $ \_ state -> fcode info_down state + +-- ---------------------------------------------------------------------------- +-- Get the current module name + +getModuleName :: FCode Module +getModuleName = do { info <- getInfoDown; return (cgd_mod info) } + +-- ---------------------------------------------------------------------------- +-- Get/set the end-of-block info + +withSequel :: Sequel -> FCode a -> FCode a +withSequel sequel code + = do { info <- getInfoDown + ; withInfoDown code (info {cgd_sequel = sequel, cgd_self_loop = Nothing }) } + +getSequel :: FCode Sequel +getSequel = do { info <- getInfoDown + ; return (cgd_sequel info) } + +-- ---------------------------------------------------------------------------- +-- Get/set the size of the update frame + +-- We keep track of the size of the update frame so that we +-- can set the stack pointer to the proper address on return +-- (or tail call) from the closure. +-- There should be at most one update frame for each closure. +-- Note: I'm including the size of the original return address +-- in the size of the update frame -- hence the default case on `get'. 
+ +withUpdFrameOff :: UpdFrameOffset -> FCode a -> FCode a +withUpdFrameOff size code + = do { info <- getInfoDown + ; withInfoDown code (info {cgd_updfr_off = size }) } + +getUpdFrameOff :: FCode UpdFrameOffset +getUpdFrameOff + = do { info <- getInfoDown + ; return $ cgd_updfr_off info } + +-- ---------------------------------------------------------------------------- +-- Get/set the current ticky counter label + +getTickyCtrLabel :: FCode CLabel +getTickyCtrLabel = do + info <- getInfoDown + return (cgd_ticky info) + +setTickyCtrLabel :: CLabel -> FCode a -> FCode a +setTickyCtrLabel ticky code = do + info <- getInfoDown + withInfoDown code (info {cgd_ticky = ticky}) + +-- ---------------------------------------------------------------------------- +-- Manage tick scopes + +-- | The current tick scope. We will assign this to generated blocks. +getTickScope :: FCode CmmTickScope +getTickScope = do + info <- getInfoDown + return (cgd_tick_scope info) + +-- | Places blocks generated by the given code into a fresh +-- (sub-)scope. This will make sure that Cmm annotations in our scope +-- will apply to the Cmm blocks generated therein - but not the other +-- way around. +tickScope :: FCode a -> FCode a +tickScope code = do + info <- getInfoDown + if debugLevel (cgd_dflags info) == 0 then code else do + u <- newUnique + let scope' = SubScope u (cgd_tick_scope info) + withInfoDown code info{ cgd_tick_scope = scope' } + + +-------------------------------------------------------- +-- Forking +-------------------------------------------------------- + +forkClosureBody :: FCode () -> FCode () +-- forkClosureBody compiles body_code in environment where: +-- - sequel, update stack frame and self loop info are +-- set to fresh values +-- - state is set to a fresh value, except for local bindings +-- that are passed in unchanged. It's up to the enclosed code to +-- re-bind the free variables to a field of the closure. 
+ +forkClosureBody body_code + = do { dflags <- getDynFlags + ; info <- getInfoDown + ; us <- newUniqSupply + ; state <- getState + ; let body_info_down = info { cgd_sequel = initSequel + , cgd_updfr_off = initUpdFrameOff dflags + , cgd_self_loop = Nothing } + fork_state_in = (initCgState us) { cgs_binds = cgs_binds state } + ((),fork_state_out) = doFCode body_code body_info_down fork_state_in + ; setState $ state `addCodeBlocksFrom` fork_state_out } + +forkLneBody :: FCode a -> FCode a +-- 'forkLneBody' takes a body of let-no-escape binding and compiles +-- it in the *current* environment, returning the graph thus constructed. +-- +-- The current environment is passed on completely unchanged to +-- the successor. In particular, any heap usage from the enclosed +-- code is discarded; it should deal with its own heap consumption. +forkLneBody body_code + = do { info_down <- getInfoDown + ; us <- newUniqSupply + ; state <- getState + ; let fork_state_in = (initCgState us) { cgs_binds = cgs_binds state } + (result, fork_state_out) = doFCode body_code info_down fork_state_in + ; setState $ state `addCodeBlocksFrom` fork_state_out + ; return result } + +codeOnly :: FCode () -> FCode () +-- Emit any code from the inner thing into the outer thing +-- Do not affect anything else in the outer state +-- Used in almost-circular code to prevent false loop dependencies +codeOnly body_code + = do { info_down <- getInfoDown + ; us <- newUniqSupply + ; state <- getState + ; let fork_state_in = (initCgState us) { cgs_binds = cgs_binds state + , cgs_hp_usg = cgs_hp_usg state } + ((), fork_state_out) = doFCode body_code info_down fork_state_in + ; setState $ state `addCodeBlocksFrom` fork_state_out } + +forkAlts :: [FCode a] -> FCode [a] +-- (forkAlts' bs d) takes fcodes 'bs' for the branches of a 'case', and +-- an fcode for the default case 'd', and compiles each in the current +-- environment. 
-- Most common use of 'forkAlts'; having this helper function avoids +-- accidental use of fallible pattern-matches in @do@-notation
+-- It initialises the heap usage to zeros, and passes on an unchanged +-- heap usage. +-- +-- It is usually a prelude to performing a GC check, so everything must +-- be in a tidy and consistent state. +-- +-- Note the slightly subtle fixed point behaviour needed here + +getHeapUsage :: (VirtualHpOffset -> FCode a) -> FCode a +getHeapUsage fcode + = do { info_down <- getInfoDown + ; state <- getState + ; let fstate_in = state { cgs_hp_usg = initHpUsage } + (r, fstate_out) = doFCode (fcode hp_hw) info_down fstate_in + hp_hw = heapHWM (cgs_hp_usg fstate_out) -- Loop here! + + ; setState $ fstate_out { cgs_hp_usg = cgs_hp_usg state } + ; return r } + +-- ---------------------------------------------------------------------------- +-- Combinators for emitting code + +emitCgStmt :: CgStmt -> FCode () +emitCgStmt stmt + = do { state <- getState + ; setState $ state { cgs_stmts = cgs_stmts state `snocOL` stmt } + } + +emitLabel :: BlockId -> FCode () +emitLabel id = do tscope <- getTickScope + emitCgStmt (CgLabel id tscope) + +emitComment :: FastString -> FCode () +emitComment s + | debugIsOn = emitCgStmt (CgStmt (CmmComment s)) + | otherwise = return () + +emitTick :: CmmTickish -> FCode () +emitTick = emitCgStmt . CgStmt . 
CmmTick + +emitUnwind :: [(GlobalReg, Maybe CmmExpr)] -> FCode () +emitUnwind regs = do + dflags <- getDynFlags + when (debugLevel dflags > 0) $ do + emitCgStmt $ CgStmt $ CmmUnwind regs + +emitAssign :: CmmReg -> CmmExpr -> FCode () +emitAssign l r = emitCgStmt (CgStmt (CmmAssign l r)) + +emitStore :: CmmExpr -> CmmExpr -> FCode () +emitStore l r = emitCgStmt (CgStmt (CmmStore l r)) + +emit :: CmmAGraph -> FCode () +emit ag + = do { state <- getState + ; setState $ state { cgs_stmts = cgs_stmts state MkGraph.<*> ag } } + +emitDecl :: CmmDecl -> FCode () +emitDecl decl + = do { state <- getState + ; setState $ state { cgs_tops = cgs_tops state `snocOL` decl } } + +emitOutOfLine :: BlockId -> CmmAGraphScoped -> FCode () +emitOutOfLine l (stmts, tscope) = emitCgStmt (CgFork l stmts tscope) + +emitProcWithStackFrame + :: Convention -- entry convention + -> Maybe CmmInfoTable -- info table? + -> CLabel -- label for the proc + -> [CmmFormal] -- stack frame + -> [CmmFormal] -- arguments + -> CmmAGraphScoped -- code + -> Bool -- do stack layout? 
+ -> FCode () + +emitProcWithStackFrame _conv mb_info lbl _stk_args [] blocks False + = do { dflags <- getDynFlags + ; emitProc mb_info lbl [] blocks (widthInBytes (wordWidth dflags)) False + } +emitProcWithStackFrame conv mb_info lbl stk_args args (graph, tscope) True + -- do layout + = do { dflags <- getDynFlags + ; let (offset, live, entry) = mkCallEntry dflags conv args stk_args + graph' = entry MkGraph.<*> graph + ; emitProc mb_info lbl live (graph', tscope) offset True + } +emitProcWithStackFrame _ _ _ _ _ _ _ = panic "emitProcWithStackFrame" + +emitProcWithConvention :: Convention -> Maybe CmmInfoTable -> CLabel + -> [CmmFormal] + -> CmmAGraphScoped + -> FCode () +emitProcWithConvention conv mb_info lbl args blocks + = emitProcWithStackFrame conv mb_info lbl [] args blocks True + +emitProc :: Maybe CmmInfoTable -> CLabel -> [GlobalReg] -> CmmAGraphScoped + -> Int -> Bool -> FCode () +emitProc mb_info lbl live blocks offset do_layout + = do { dflags <- getDynFlags + ; l <- newBlockId + ; let + blks :: CmmGraph + blks = labelAGraph l blocks + + infos | Just info <- mb_info = mapSingleton (g_entry blks) info + | otherwise = mapEmpty + + sinfo = StackInfo { arg_space = offset + , updfr_space = Just (initUpdFrameOff dflags) + , do_layout = do_layout } + + tinfo = TopInfo { info_tbls = infos + , stack_info=sinfo} + + proc_block = CmmProc tinfo lbl live blks + + ; state <- getState + ; setState $ state { cgs_tops = cgs_tops state `snocOL` proc_block } } + +getCmm :: FCode () -> FCode CmmGroup +-- Get all the CmmTops (there should be no stmts) +-- Return a single Cmm which may be split from other Cmms by +-- object splitting (at a later stage) +getCmm code + = do { state1 <- getState + ; ((), state2) <- withState code (state1 { cgs_tops = nilOL }) + ; setState $ state2 { cgs_tops = cgs_tops state1 } + ; return (fromOL (cgs_tops state2)) } + + +mkCmmIfThenElse :: CmmExpr -> CmmAGraph -> CmmAGraph -> FCode CmmAGraph +mkCmmIfThenElse e tbranch fbranch = 
mkCmmIfThenElse' e tbranch fbranch Nothing + +mkCmmIfThenElse' :: CmmExpr -> CmmAGraph -> CmmAGraph + -> Maybe Bool -> FCode CmmAGraph +mkCmmIfThenElse' e tbranch fbranch likely = do + tscp <- getTickScope + endif <- newBlockId + tid <- newBlockId + fid <- newBlockId + + let + (test, then_, else_, likely') = case likely of + Just False | Just e' <- maybeInvertCmmExpr e + -- currently NCG doesn't know about likely + -- annotations. We manually switch then and + -- else branch so the likely false branch + -- becomes a fallthrough. + -> (e', fbranch, tbranch, Just True) + _ -> (e, tbranch, fbranch, likely) + + return $ catAGraphs [ mkCbranch test tid fid likely' + , mkLabel tid tscp, then_, mkBranch endif + , mkLabel fid tscp, else_, mkLabel endif tscp ] + +mkCmmIfGoto :: CmmExpr -> BlockId -> FCode CmmAGraph +mkCmmIfGoto e tid = mkCmmIfGoto' e tid Nothing + +mkCmmIfGoto' :: CmmExpr -> BlockId -> Maybe Bool -> FCode CmmAGraph +mkCmmIfGoto' e tid l = do + endif <- newBlockId + tscp <- getTickScope + return $ catAGraphs [ mkCbranch e tid endif l, mkLabel endif tscp ] + +mkCmmIfThen :: CmmExpr -> CmmAGraph -> FCode CmmAGraph +mkCmmIfThen e tbranch = mkCmmIfThen' e tbranch Nothing + +mkCmmIfThen' :: CmmExpr -> CmmAGraph -> Maybe Bool -> FCode CmmAGraph +mkCmmIfThen' e tbranch l = do + endif <- newBlockId + tid <- newBlockId + tscp <- getTickScope + return $ catAGraphs [ mkCbranch e tid endif l + , mkLabel tid tscp, tbranch, mkLabel endif tscp ] + +mkCall :: CmmExpr -> (Convention, Convention) -> [CmmFormal] -> [CmmExpr] + -> UpdFrameOffset -> [CmmExpr] -> FCode CmmAGraph +mkCall f (callConv, retConv) results actuals updfr_off extra_stack = do + dflags <- getDynFlags + k <- newBlockId + tscp <- getTickScope + let area = Young k + (off, _, copyin) = copyInOflow dflags retConv area results [] + copyout = mkCallReturnsTo dflags f callConv actuals k off updfr_off extra_stack + return $ catAGraphs [copyout, mkLabel k tscp, copyin] + +mkCmmCall :: CmmExpr -> [CmmFormal] -> 
[CmmExpr] -> UpdFrameOffset + -> FCode CmmAGraph +mkCmmCall f results actuals updfr_off + = mkCall f (NativeDirectCall, NativeReturn) results actuals updfr_off [] + + +-- ---------------------------------------------------------------------------- +-- turn CmmAGraph into CmmGraph, for making a new proc. + +aGraphToGraph :: CmmAGraphScoped -> FCode CmmGraph +aGraphToGraph stmts + = do { l <- newBlockId + ; return (labelAGraph l stmts) } diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs new file mode 100644 index 0000000000..dc69a51916 --- /dev/null +++ b/compiler/GHC/StgToCmm/Prim.hs @@ -0,0 +1,2622 @@ +{-# LANGUAGE CPP #-} +-- emitPrimOp is quite large +{-# OPTIONS_GHC -fmax-pmcheck-iterations=4000000 #-} + +---------------------------------------------------------------------------- +-- +-- Stg to C--: primitive operations +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Prim ( + cgOpApp, + cgPrimOp, -- internal(ish), used by cgCase to get code for a + -- comparison without also turning it into a Bool. 
+ shouldInlinePrimOp + ) where + +#include "HsVersions.h" + +import GhcPrelude hiding ((<*>)) + +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Foreign +import GHC.StgToCmm.Env +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Heap +import GHC.StgToCmm.Prof ( costCentreFrom ) + +import DynFlags +import GHC.Platform +import BasicTypes +import BlockId +import MkGraph +import StgSyn +import Cmm +import Type ( Type, tyConAppTyCon ) +import TyCon +import CLabel +import CmmUtils +import PrimOp +import SMRep +import FastString +import Outputable +import Util +import Data.Maybe + +import Data.Bits ((.&.), bit) +import Control.Monad (liftM, when, unless) + +------------------------------------------------------------------------ +-- Primitive operations and foreign calls +------------------------------------------------------------------------ + +{- Note [Foreign call results] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A foreign call always returns an unboxed tuple of results, one +of which is the state token. This seems to happen even for pure +calls. + +Even if we returned a single result for pure calls, it'd still be +right to wrap it in a singleton unboxed tuple, because the result +might be a Haskell closure pointer, we don't want to evaluate it. -} + +---------------------------------- +cgOpApp :: StgOp -- The op + -> [StgArg] -- Arguments + -> Type -- Result type (always an unboxed tuple) + -> FCode ReturnKind + +-- Foreign calls +cgOpApp (StgFCallOp fcall ty) stg_args res_ty + = cgForeignCall fcall ty stg_args res_ty + -- Note [Foreign call results] + +-- tagToEnum# is special: we need to pull the constructor +-- out of the table, and perform an appropriate return. 
-- | Interpret the argument as an unsigned value, assuming the value +-- is given in two's complement form in the given width.
-- inline implementation. This might happen e.g. if there's enough +-- static information, such as statically known arguments, to emit a +-- more efficient implementation inline.
+shouldInlinePrimOp :: DynFlags + -> PrimOp -- ^ The primop + -> [CmmExpr] -- ^ The primop arguments + -> Maybe ([LocalReg] -> FCode ()) + +shouldInlinePrimOp dflags NewByteArrayOp_Char [(CmmLit (CmmInt n w))] + | asUnsigned w n <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> doNewByteArrayOp res (fromInteger n) + +shouldInlinePrimOp dflags NewArrayOp [(CmmLit (CmmInt n w)), init] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> + doNewArrayOp res (arrPtrsRep dflags (fromInteger n)) mkMAP_DIRTY_infoLabel + [ (mkIntExpr dflags (fromInteger n), + fixedHdrSize dflags + oFFSET_StgMutArrPtrs_ptrs dflags) + , (mkIntExpr dflags (nonHdrSizeW (arrPtrsRep dflags (fromInteger n))), + fixedHdrSize dflags + oFFSET_StgMutArrPtrs_size dflags) + ] + (fromInteger n) init + +shouldInlinePrimOp _ CopyArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopyMutableArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyMutableArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopyArrayArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopyMutableArrayArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyMutableArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp dflags CloneArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneArray mkMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags CloneMutableArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> 
emitCloneArray mkMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags FreezeArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneArray mkMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags ThawArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneArray mkMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags NewSmallArrayOp [(CmmLit (CmmInt n w)), init] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> + doNewArrayOp res (smallArrPtrsRep (fromInteger n)) mkSMAP_DIRTY_infoLabel + [ (mkIntExpr dflags (fromInteger n), + fixedHdrSize dflags + oFFSET_StgSmallMutArrPtrs_ptrs dflags) + ] + (fromInteger n) init + +shouldInlinePrimOp _ CopySmallArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopySmallArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopySmallMutableArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopySmallMutableArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp dflags CloneSmallArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags CloneSmallMutableArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags FreezeSmallArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags 
(asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags ThawSmallArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags primop args + | primOpOutOfLine primop = Nothing + | otherwise = Just $ \ regs -> emitPrimOp dflags regs primop args + +-- TODO: Several primops, such as 'copyArray#', only have an inline +-- implementation (below) but could possibly have both an inline +-- implementation and an out-of-line implementation, just like +-- 'newArray#'. This would lower the amount of code generated, +-- hopefully without a performance impact (needs to be measured). + +--------------------------------------------------- +cgPrimOp :: [LocalReg] -- where to put the results + -> PrimOp -- the op + -> [StgArg] -- arguments + -> FCode () + +cgPrimOp results op args + = do dflags <- getDynFlags + arg_exprs <- getNonVoidArgAmodes args + emitPrimOp dflags results op arg_exprs + + +------------------------------------------------------------------------ +-- Emitting code for a primop +------------------------------------------------------------------------ + +emitPrimOp :: DynFlags + -> [LocalReg] -- where to put the results + -> PrimOp -- the op + -> [CmmExpr] -- arguments + -> FCode () + +-- First we handle various awkward cases specially. The remaining +-- easy cases are then handled by translateOp, defined below. + +emitPrimOp _ [res] ParOp [arg] + = + -- for now, just implement this in a C function + -- later, we might want to inline it. 
+ emitCCall + [(res,NoHint)] + (CmmLit (CmmLabel (mkForeignLabel (fsLit "newSpark") Nothing ForeignLabelInExternalPackage IsFunction))) + [(baseExpr, AddrHint), (arg,AddrHint)] + +emitPrimOp dflags [res] SparkOp [arg] + = do + -- returns the value of arg in res. We're going to therefore + -- refer to arg twice (once to pass to newSpark(), and once to + -- assign to res), so put it in a temporary. + tmp <- assignTemp arg + tmp2 <- newTemp (bWord dflags) + emitCCall + [(tmp2,NoHint)] + (CmmLit (CmmLabel (mkForeignLabel (fsLit "newSpark") Nothing ForeignLabelInExternalPackage IsFunction))) + [(baseExpr, AddrHint), ((CmmReg (CmmLocal tmp)), AddrHint)] + emitAssign (CmmLocal res) (CmmReg (CmmLocal tmp)) + +emitPrimOp dflags [res] GetCCSOfOp [arg] + = emitAssign (CmmLocal res) val + where + val + | gopt Opt_SccProfilingOn dflags = costCentreFrom dflags (cmmUntag dflags arg) + | otherwise = CmmLit (zeroCLit dflags) + +emitPrimOp _ [res] GetCurrentCCSOp [_dummy_arg] + = emitAssign (CmmLocal res) cccsExpr + +emitPrimOp _ [res] MyThreadIdOp [] + = emitAssign (CmmLocal res) currentTSOExpr + +emitPrimOp dflags [res] ReadMutVarOp [mutv] + = emitAssign (CmmLocal res) (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags)) + +emitPrimOp dflags res@[] WriteMutVarOp [mutv,var] + = do -- Without this write barrier, other CPUs may see this pointer before + -- the writes for the closure it points to have occurred. 
+ emitPrimCall res MO_WriteBarrier [] + emitStore (cmmOffsetW dflags mutv (fixedHdrSizeW dflags)) var + emitCCall + [{-no results-}] + (CmmLit (CmmLabel mkDirty_MUT_VAR_Label)) + [(baseExpr, AddrHint), (mutv,AddrHint)] + +-- #define sizzeofByteArrayzh(r,a) \ +-- r = ((StgArrBytes *)(a))->bytes +emitPrimOp dflags [res] SizeofByteArrayOp [arg] + = emit $ mkAssign (CmmLocal res) (cmmLoadIndexW dflags arg (fixedHdrSizeW dflags) (bWord dflags)) + +-- #define sizzeofMutableByteArrayzh(r,a) \ +-- r = ((StgArrBytes *)(a))->bytes +emitPrimOp dflags [res] SizeofMutableByteArrayOp [arg] + = emitPrimOp dflags [res] SizeofByteArrayOp [arg] + +-- #define getSizzeofMutableByteArrayzh(r,a) \ +-- r = ((StgArrBytes *)(a))->bytes +emitPrimOp dflags [res] GetSizeofMutableByteArrayOp [arg] + = emitAssign (CmmLocal res) (cmmLoadIndexW dflags arg (fixedHdrSizeW dflags) (bWord dflags)) + + +-- #define touchzh(o) /* nothing */ +emitPrimOp _ res@[] TouchOp args@[_arg] + = do emitPrimCall res MO_Touch args + +-- #define byteArrayContentszh(r,a) r = BYTE_ARR_CTS(a) +emitPrimOp dflags [res] ByteArrayContents_Char [arg] + = emitAssign (CmmLocal res) (cmmOffsetB dflags arg (arrWordsHdrSize dflags)) + +-- #define stableNameToIntzh(r,s) (r = ((StgStableName *)s)->sn) +emitPrimOp dflags [res] StableNameToIntOp [arg] + = emitAssign (CmmLocal res) (cmmLoadIndexW dflags arg (fixedHdrSizeW dflags) (bWord dflags)) + +emitPrimOp dflags [res] ReallyUnsafePtrEqualityOp [arg1,arg2] + = emitAssign (CmmLocal res) (CmmMachOp (mo_wordEq dflags) [arg1,arg2]) + +-- #define addrToHValuezh(r,a) r=(P_)a +emitPrimOp _ [res] AddrToAnyOp [arg] + = emitAssign (CmmLocal res) arg + +-- #define hvalueToAddrzh(r, a) r=(W_)a +emitPrimOp _ [res] AnyToAddrOp [arg] + = emitAssign (CmmLocal res) arg + +{- Freezing arrays-of-ptrs requires changing an info table, for the + benefit of the generational collector. It needs to scavenge mutable + objects, even if they are in old space. 
When they become immutable, + they can be removed from this scavenge list. -} + +-- #define unsafeFreezzeArrayzh(r,a) +-- { +-- SET_INFO((StgClosure *)a,&stg_MUT_ARR_PTRS_FROZEN_DIRTY_info); +-- r = a; +-- } +emitPrimOp _ [res] UnsafeFreezeArrayOp [arg] + = emit $ catAGraphs + [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN_DIRTY_infoLabel)), + mkAssign (CmmLocal res) arg ] +emitPrimOp _ [res] UnsafeFreezeArrayArrayOp [arg] + = emit $ catAGraphs + [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN_DIRTY_infoLabel)), + mkAssign (CmmLocal res) arg ] +emitPrimOp _ [res] UnsafeFreezeSmallArrayOp [arg] + = emit $ catAGraphs + [ setInfo arg (CmmLit (CmmLabel mkSMAP_FROZEN_DIRTY_infoLabel)), + mkAssign (CmmLocal res) arg ] + +-- #define unsafeFreezzeByteArrayzh(r,a) r=(a) +emitPrimOp _ [res] UnsafeFreezeByteArrayOp [arg] + = emitAssign (CmmLocal res) arg + +-- Reading/writing pointer arrays + +emitPrimOp _ [res] ReadArrayOp [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] IndexArrayOp [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [] WriteArrayOp [obj,ix,v] = doWritePtrArrayOp obj ix v + +emitPrimOp _ [res] IndexArrayArrayOp_ByteArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] IndexArrayArrayOp_ArrayArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_ByteArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_MutableByteArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_ArrayArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_MutableArrayArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [] WriteArrayArrayOp_ByteArray [obj,ix,v] = doWritePtrArrayOp obj ix v +emitPrimOp _ [] WriteArrayArrayOp_MutableByteArray [obj,ix,v] = doWritePtrArrayOp obj ix v +emitPrimOp _ [] WriteArrayArrayOp_ArrayArray [obj,ix,v] = doWritePtrArrayOp obj ix v +emitPrimOp _ [] WriteArrayArrayOp_MutableArrayArray [obj,ix,v] = doWritePtrArrayOp obj ix v + 
+emitPrimOp _ [res] ReadSmallArrayOp [obj,ix] = doReadSmallPtrArrayOp res obj ix +emitPrimOp _ [res] IndexSmallArrayOp [obj,ix] = doReadSmallPtrArrayOp res obj ix +emitPrimOp _ [] WriteSmallArrayOp [obj,ix,v] = doWriteSmallPtrArrayOp obj ix v + +-- Getting the size of pointer arrays + +emitPrimOp dflags [res] SizeofArrayOp [arg] + = emit $ mkAssign (CmmLocal res) (cmmLoadIndexW dflags arg + (fixedHdrSizeW dflags + bytesToWordsRoundUp dflags (oFFSET_StgMutArrPtrs_ptrs dflags)) + (bWord dflags)) +emitPrimOp dflags [res] SizeofMutableArrayOp [arg] + = emitPrimOp dflags [res] SizeofArrayOp [arg] +emitPrimOp dflags [res] SizeofArrayArrayOp [arg] + = emitPrimOp dflags [res] SizeofArrayOp [arg] +emitPrimOp dflags [res] SizeofMutableArrayArrayOp [arg] + = emitPrimOp dflags [res] SizeofArrayOp [arg] + +emitPrimOp dflags [res] SizeofSmallArrayOp [arg] = + emit $ mkAssign (CmmLocal res) + (cmmLoadIndexW dflags arg + (fixedHdrSizeW dflags + bytesToWordsRoundUp dflags (oFFSET_StgSmallMutArrPtrs_ptrs dflags)) + (bWord dflags)) +emitPrimOp dflags [res] SizeofSmallMutableArrayOp [arg] = + emitPrimOp dflags [res] SizeofSmallArrayOp [arg] + +-- IndexXXXoffAddr + +emitPrimOp dflags res IndexOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res IndexOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res IndexOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args +emitPrimOp _ res IndexOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args +emitPrimOp dflags res IndexOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Int8 
args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp dflags res IndexOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args + +-- ReadXXXoffAddr, which are identical, for our purposes, to IndexXXXoffAddr. + +emitPrimOp dflags res ReadOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res ReadOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res ReadOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args +emitPrimOp _ res ReadOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args +emitPrimOp dflags res ReadOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res 
ReadOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp dflags res ReadOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args + +-- IndexXXXArray + +emitPrimOp dflags res IndexByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res IndexByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res IndexByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args +emitPrimOp _ res IndexByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args +emitPrimOp dflags res IndexByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp dflags res IndexByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp 
dflags res IndexByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args + +-- ReadXXXArray, identical to IndexXXXArray. + +emitPrimOp dflags res ReadByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res ReadByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res ReadByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args +emitPrimOp _ res ReadByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args +emitPrimOp dflags res ReadByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp dflags res ReadByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args + +-- IndexWord8ArrayAsXXX + +emitPrimOp dflags res IndexByteArrayOp_Word8AsChar args 
= doIndexByteArrayOpAs (Just (mo_u_8ToWord dflags)) b8 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWideChar args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsInt args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWord args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsAddr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsFloat args = doIndexByteArrayOpAs Nothing f32 b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsDouble args = doIndexByteArrayOpAs Nothing f64 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsStablePtr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsInt16 args = doIndexByteArrayOpAs (Just (mo_s_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsInt32 args = doIndexByteArrayOpAs (Just (mo_s_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsInt64 args = doIndexByteArrayOpAs Nothing b64 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWord16 args = doIndexByteArrayOpAs (Just (mo_u_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWord32 args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsWord64 args = doIndexByteArrayOpAs Nothing b64 b8 res args + +-- ReadInt8ArrayAsXXX, identical to IndexInt8ArrayAsXXX + +emitPrimOp dflags res ReadByteArrayOp_Word8AsChar args = doIndexByteArrayOpAs (Just (mo_u_8ToWord dflags)) b8 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsWideChar args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsInt args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args 
+emitPrimOp dflags res ReadByteArrayOp_Word8AsWord args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsAddr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsFloat args = doIndexByteArrayOpAs Nothing f32 b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsDouble args = doIndexByteArrayOpAs Nothing f64 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsStablePtr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsInt16 args = doIndexByteArrayOpAs (Just (mo_s_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsInt32 args = doIndexByteArrayOpAs (Just (mo_s_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsInt64 args = doIndexByteArrayOpAs Nothing b64 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsWord16 args = doIndexByteArrayOpAs (Just (mo_u_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsWord32 args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsWord64 args = doIndexByteArrayOpAs Nothing b64 b8 res args + +-- WriteXXXoffAddr + +emitPrimOp dflags res WriteOffAddrOp_Char args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_WideChar args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp dflags res WriteOffAddrOp_Int args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Word args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Addr args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res WriteOffAddrOp_Float args = doWriteOffAddrOp Nothing f32 res args +emitPrimOp _ res WriteOffAddrOp_Double args = doWriteOffAddrOp Nothing f64 res args +emitPrimOp dflags res 
WriteOffAddrOp_StablePtr args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Int8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_Int16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteOffAddrOp_Int32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteOffAddrOp_Int64 args = doWriteOffAddrOp Nothing b64 res args +emitPrimOp dflags res WriteOffAddrOp_Word8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_Word16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteOffAddrOp_Word32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteOffAddrOp_Word64 args = doWriteOffAddrOp Nothing b64 res args + +-- WriteXXXArray + +emitPrimOp dflags res WriteByteArrayOp_Char args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_WideChar args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp dflags res WriteByteArrayOp_Int args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Word args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Addr args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res WriteByteArrayOp_Float args = doWriteByteArrayOp Nothing f32 res args +emitPrimOp _ res WriteByteArrayOp_Double args = doWriteByteArrayOp Nothing f64 res args +emitPrimOp dflags res WriteByteArrayOp_StablePtr args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Int8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Int16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res 
WriteByteArrayOp_Int32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteByteArrayOp_Int64 args = doWriteByteArrayOp Nothing b64 res args +emitPrimOp dflags res WriteByteArrayOp_Word8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteByteArrayOp_Word32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteByteArrayOp_Word64 args = doWriteByteArrayOp Nothing b64 res args + +-- WriteInt8ArrayAsXXX + +emitPrimOp dflags res WriteByteArrayOp_Word8AsChar args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsWideChar args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsInt args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsWord args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsAddr args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsFloat args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsDouble args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsStablePtr args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsInt16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsInt32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsInt64 args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsWord16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsWord32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) 
b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsWord64 args = doWriteByteArrayOp Nothing b8 res args + +-- Copying and setting byte arrays +emitPrimOp _ [] CopyByteArrayOp [src,src_off,dst,dst_off,n] = + doCopyByteArrayOp src src_off dst dst_off n +emitPrimOp _ [] CopyMutableByteArrayOp [src,src_off,dst,dst_off,n] = + doCopyMutableByteArrayOp src src_off dst dst_off n +emitPrimOp _ [] CopyByteArrayToAddrOp [src,src_off,dst,n] = + doCopyByteArrayToAddrOp src src_off dst n +emitPrimOp _ [] CopyMutableByteArrayToAddrOp [src,src_off,dst,n] = + doCopyMutableByteArrayToAddrOp src src_off dst n +emitPrimOp _ [] CopyAddrToByteArrayOp [src,dst,dst_off,n] = + doCopyAddrToByteArrayOp src dst dst_off n +emitPrimOp _ [] SetByteArrayOp [ba,off,len,c] = + doSetByteArrayOp ba off len c + +-- Comparing byte arrays +emitPrimOp _ [res] CompareByteArraysOp [ba1,ba1_off,ba2,ba2_off,n] = + doCompareByteArraysOp res ba1 ba1_off ba2 ba2_off n + +emitPrimOp _ [res] BSwap16Op [w] = emitBSwapCall res w W16 +emitPrimOp _ [res] BSwap32Op [w] = emitBSwapCall res w W32 +emitPrimOp _ [res] BSwap64Op [w] = emitBSwapCall res w W64 +emitPrimOp dflags [res] BSwapOp [w] = emitBSwapCall res w (wordWidth dflags) + +emitPrimOp _ [res] BRev8Op [w] = emitBRevCall res w W8 +emitPrimOp _ [res] BRev16Op [w] = emitBRevCall res w W16 +emitPrimOp _ [res] BRev32Op [w] = emitBRevCall res w W32 +emitPrimOp _ [res] BRev64Op [w] = emitBRevCall res w W64 +emitPrimOp dflags [res] BRevOp [w] = emitBRevCall res w (wordWidth dflags) + +-- Population count +emitPrimOp _ [res] PopCnt8Op [w] = emitPopCntCall res w W8 +emitPrimOp _ [res] PopCnt16Op [w] = emitPopCntCall res w W16 +emitPrimOp _ [res] PopCnt32Op [w] = emitPopCntCall res w W32 +emitPrimOp _ [res] PopCnt64Op [w] = emitPopCntCall res w W64 +emitPrimOp dflags [res] PopCntOp [w] = emitPopCntCall res w (wordWidth dflags) + +-- Parallel bit deposit +emitPrimOp _ [res] Pdep8Op [src, mask] = emitPdepCall res src mask W8 +emitPrimOp _ [res] Pdep16Op [src, mask] = 
emitPdepCall res src mask W16 +emitPrimOp _ [res] Pdep32Op [src, mask] = emitPdepCall res src mask W32 +emitPrimOp _ [res] Pdep64Op [src, mask] = emitPdepCall res src mask W64 +emitPrimOp dflags [res] PdepOp [src, mask] = emitPdepCall res src mask (wordWidth dflags) + +-- Parallel bit extract +emitPrimOp _ [res] Pext8Op [src, mask] = emitPextCall res src mask W8 +emitPrimOp _ [res] Pext16Op [src, mask] = emitPextCall res src mask W16 +emitPrimOp _ [res] Pext32Op [src, mask] = emitPextCall res src mask W32 +emitPrimOp _ [res] Pext64Op [src, mask] = emitPextCall res src mask W64 +emitPrimOp dflags [res] PextOp [src, mask] = emitPextCall res src mask (wordWidth dflags) + +-- count leading zeros +emitPrimOp _ [res] Clz8Op [w] = emitClzCall res w W8 +emitPrimOp _ [res] Clz16Op [w] = emitClzCall res w W16 +emitPrimOp _ [res] Clz32Op [w] = emitClzCall res w W32 +emitPrimOp _ [res] Clz64Op [w] = emitClzCall res w W64 +emitPrimOp dflags [res] ClzOp [w] = emitClzCall res w (wordWidth dflags) + +-- count trailing zeros +emitPrimOp _ [res] Ctz8Op [w] = emitCtzCall res w W8 +emitPrimOp _ [res] Ctz16Op [w] = emitCtzCall res w W16 +emitPrimOp _ [res] Ctz32Op [w] = emitCtzCall res w W32 +emitPrimOp _ [res] Ctz64Op [w] = emitCtzCall res w W64 +emitPrimOp dflags [res] CtzOp [w] = emitCtzCall res w (wordWidth dflags) + +-- Unsigned int to floating point conversions +emitPrimOp _ [res] Word2FloatOp [w] = emitPrimCall [res] + (MO_UF_Conv W32) [w] +emitPrimOp _ [res] Word2DoubleOp [w] = emitPrimCall [res] + (MO_UF_Conv W64) [w] + +-- SIMD primops +emitPrimOp dflags [res] (VecBroadcastOp vcat n w) [e] = do + checkVecCompatibility dflags vcat n w + doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros (replicate n e) res + where + zeros :: CmmExpr + zeros = CmmLit $ CmmVec (replicate n zero) + + zero :: CmmLit + zero = case vcat of + IntVec -> CmmInt 0 w + WordVec -> CmmInt 0 w + FloatVec -> CmmFloat 0 w + + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags [res] (VecPackOp 
vcat n w) es = do + checkVecCompatibility dflags vcat n w + when (es `lengthIsNot` n) $ + panic "emitPrimOp: VecPackOp has wrong number of arguments" + doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros es res + where + zeros :: CmmExpr + zeros = CmmLit $ CmmVec (replicate n zero) + + zero :: CmmLit + zero = case vcat of + IntVec -> CmmInt 0 w + WordVec -> CmmInt 0 w + FloatVec -> CmmFloat 0 w + + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecUnpackOp vcat n w) [arg] = do + checkVecCompatibility dflags vcat n w + when (res `lengthIsNot` n) $ + panic "emitPrimOp: VecUnpackOp has wrong number of results" + doVecUnpackOp (vecElemProjectCast dflags vcat w) ty arg res + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags [res] (VecInsertOp vcat n w) [v,e,i] = do + checkVecCompatibility dflags vcat n w + doVecInsertOp (vecElemInjectCast dflags vcat w) ty v e i res + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecIndexByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecReadByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecWriteByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doWriteByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecIndexOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecReadOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecWriteOffAddrOp vcat n 
w) args = do + checkVecCompatibility dflags vcat n w + doWriteOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecIndexScalarByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecReadScalarByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecWriteScalarByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doWriteByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecIndexScalarOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecReadScalarOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecWriteScalarOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doWriteOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecCmmCat vcat w + +-- Prefetch +emitPrimOp _ [] PrefetchByteArrayOp3 args = doPrefetchByteArrayOp 3 args +emitPrimOp _ [] PrefetchMutableByteArrayOp3 args = doPrefetchMutableByteArrayOp 3 args +emitPrimOp _ [] PrefetchAddrOp3 args = doPrefetchAddrOp 3 args +emitPrimOp _ [] PrefetchValueOp3 args = doPrefetchValueOp 3 args + +emitPrimOp _ [] PrefetchByteArrayOp2 args = doPrefetchByteArrayOp 2 args +emitPrimOp _ [] PrefetchMutableByteArrayOp2 
args = doPrefetchMutableByteArrayOp 2 args +emitPrimOp _ [] PrefetchAddrOp2 args = doPrefetchAddrOp 2 args +emitPrimOp _ [] PrefetchValueOp2 args = doPrefetchValueOp 2 args + +emitPrimOp _ [] PrefetchByteArrayOp1 args = doPrefetchByteArrayOp 1 args +emitPrimOp _ [] PrefetchMutableByteArrayOp1 args = doPrefetchMutableByteArrayOp 1 args +emitPrimOp _ [] PrefetchAddrOp1 args = doPrefetchAddrOp 1 args +emitPrimOp _ [] PrefetchValueOp1 args = doPrefetchValueOp 1 args + +emitPrimOp _ [] PrefetchByteArrayOp0 args = doPrefetchByteArrayOp 0 args +emitPrimOp _ [] PrefetchMutableByteArrayOp0 args = doPrefetchMutableByteArrayOp 0 args +emitPrimOp _ [] PrefetchAddrOp0 args = doPrefetchAddrOp 0 args +emitPrimOp _ [] PrefetchValueOp0 args = doPrefetchValueOp 0 args + +-- Atomic read-modify-write +emitPrimOp dflags [res] FetchAddByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Add mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchSubByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Sub mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchAndByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_And mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchNandByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Nand mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchOrByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Or mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchXorByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Xor mba ix (bWord dflags) n +emitPrimOp dflags [res] AtomicReadByteArrayOp_Int [mba, ix] = + doAtomicReadByteArray res mba ix (bWord dflags) +emitPrimOp dflags [] AtomicWriteByteArrayOp_Int [mba, ix, val] = + doAtomicWriteByteArray mba ix (bWord dflags) val +emitPrimOp dflags [res] CasByteArrayOp_Int [mba, ix, old, new] = + doCasByteArray res mba ix (bWord dflags) old new + +-- The rest just translate straightforwardly +emitPrimOp dflags [res] op [arg] + | nopOp op + = emitAssign (CmmLocal res) arg + + | Just (mop,rep) <- narrowOp op + = 
emitAssign (CmmLocal res) $ + CmmMachOp (mop rep (wordWidth dflags)) [CmmMachOp (mop (wordWidth dflags) rep) [arg]] + +emitPrimOp dflags r@[res] op args + | Just prim <- callishOp op + = do emitPrimCall r prim args + + | Just mop <- translateOp dflags op + = let stmt = mkAssign (CmmLocal res) (CmmMachOp mop args) in + emit stmt + +emitPrimOp dflags results op args + = case callishPrimOpSupported dflags op args of + Left op -> emit $ mkUnsafeCall (PrimTarget op) results args + Right gen -> gen results args + +-- Note [QuotRem optimization] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- `quot` and `rem` with constant divisor can be implemented with fast bit-ops +-- (shift, .&.). +-- +-- Currently we only support optimization (performed in CmmOpt) when the +-- constant is a power of 2. #9041 tracks the implementation of the general +-- optimization. +-- +-- `quotRem` can be optimized in the same way. However as it returns two values, +-- it is implemented as a "callish" primop which is harder to match and +-- to transform later on. For simplicity, the current implementation detects cases +-- that can be optimized (see `quotRemCanBeOptimized`) and converts STG quotRem +-- primop into two CMM quot and rem primops. 
+ +type GenericOp = [CmmFormal] -> [CmmActual] -> FCode () + +callishPrimOpSupported :: DynFlags -> PrimOp -> [CmmExpr] -> Either CallishMachOp GenericOp +callishPrimOpSupported dflags op args + = case op of + IntQuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_S_QuotRem (wordWidth dflags)) + | otherwise + -> Right (genericIntQuotRemOp (wordWidth dflags)) + + Int8QuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_S_QuotRem W8) + | otherwise -> Right (genericIntQuotRemOp W8) + + Int16QuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_S_QuotRem W16) + | otherwise -> Right (genericIntQuotRemOp W16) + + + WordQuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_U_QuotRem (wordWidth dflags)) + | otherwise + -> Right (genericWordQuotRemOp (wordWidth dflags)) + + WordQuotRem2Op | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_U_QuotRem2 (wordWidth dflags)) + | otherwise -> Right (genericWordQuotRem2Op dflags) + + Word8QuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_U_QuotRem W8) + | otherwise -> Right (genericWordQuotRemOp W8) + + Word16QuotRemOp| ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_U_QuotRem W16) + | otherwise -> Right (genericWordQuotRemOp W16) + + WordAdd2Op | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_Add2 (wordWidth dflags)) + | otherwise -> Right genericWordAdd2Op + + WordAddCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_AddWordC (wordWidth dflags)) + | otherwise -> Right genericWordAddCOp + + WordSubCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_SubWordC (wordWidth dflags)) + | otherwise -> Right genericWordSubCOp + + IntAddCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_AddIntC (wordWidth dflags)) + | otherwise -> Right genericIntAddCOp + + IntSubCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_SubIntC (wordWidth dflags)) + | otherwise -> Right genericIntSubCOp + + 
WordMul2Op | ncg && (x86ish || ppc) + || llvm -> Left (MO_U_Mul2 (wordWidth dflags)) + | otherwise -> Right genericWordMul2Op + FloatFabsOp | (ncg && x86ish || ppc) + || llvm -> Left MO_F32_Fabs + | otherwise -> Right $ genericFabsOp W32 + DoubleFabsOp | (ncg && x86ish || ppc) + || llvm -> Left MO_F64_Fabs + | otherwise -> Right $ genericFabsOp W64 + + _ -> pprPanic "emitPrimOp: can't translate PrimOp " (ppr op) + where + -- See Note [QuotRem optimization] + quotRemCanBeOptimized = case args of + [_, CmmLit (CmmInt n _) ] -> isJust (exactLog2 n) + _ -> False + + ncg = case hscTarget dflags of + HscAsm -> True + _ -> False + llvm = case hscTarget dflags of + HscLlvm -> True + _ -> False + x86ish = case platformArch (targetPlatform dflags) of + ArchX86 -> True + ArchX86_64 -> True + _ -> False + ppc = case platformArch (targetPlatform dflags) of + ArchPPC -> True + ArchPPC_64 _ -> True + _ -> False + +genericIntQuotRemOp :: Width -> GenericOp +genericIntQuotRemOp width [res_q, res_r] [arg_x, arg_y] + = emit $ mkAssign (CmmLocal res_q) + (CmmMachOp (MO_S_Quot width) [arg_x, arg_y]) <*> + mkAssign (CmmLocal res_r) + (CmmMachOp (MO_S_Rem width) [arg_x, arg_y]) +genericIntQuotRemOp _ _ _ = panic "genericIntQuotRemOp" + +genericWordQuotRemOp :: Width -> GenericOp +genericWordQuotRemOp width [res_q, res_r] [arg_x, arg_y] + = emit $ mkAssign (CmmLocal res_q) + (CmmMachOp (MO_U_Quot width) [arg_x, arg_y]) <*> + mkAssign (CmmLocal res_r) + (CmmMachOp (MO_U_Rem width) [arg_x, arg_y]) +genericWordQuotRemOp _ _ _ = panic "genericWordQuotRemOp" + +genericWordQuotRem2Op :: DynFlags -> GenericOp +genericWordQuotRem2Op dflags [res_q, res_r] [arg_x_high, arg_x_low, arg_y] + = emit =<< f (widthInBits (wordWidth dflags)) zero arg_x_high arg_x_low + where ty = cmmExprType dflags arg_x_high + shl x i = CmmMachOp (MO_Shl (wordWidth dflags)) [x, i] + shr x i = CmmMachOp (MO_U_Shr (wordWidth dflags)) [x, i] + or x y = CmmMachOp (MO_Or (wordWidth dflags)) [x, y] + ge x y = CmmMachOp (MO_U_Ge 
(wordWidth dflags)) [x, y] + ne x y = CmmMachOp (MO_Ne (wordWidth dflags)) [x, y] + minus x y = CmmMachOp (MO_Sub (wordWidth dflags)) [x, y] + times x y = CmmMachOp (MO_Mul (wordWidth dflags)) [x, y] + zero = lit 0 + one = lit 1 + negone = lit (fromIntegral (widthInBits (wordWidth dflags)) - 1) + lit i = CmmLit (CmmInt i (wordWidth dflags)) + + f :: Int -> CmmExpr -> CmmExpr -> CmmExpr -> FCode CmmAGraph + f 0 acc high _ = return (mkAssign (CmmLocal res_q) acc <*> + mkAssign (CmmLocal res_r) high) + f i acc high low = + do roverflowedBit <- newTemp ty + rhigh' <- newTemp ty + rhigh'' <- newTemp ty + rlow' <- newTemp ty + risge <- newTemp ty + racc' <- newTemp ty + let high' = CmmReg (CmmLocal rhigh') + isge = CmmReg (CmmLocal risge) + overflowedBit = CmmReg (CmmLocal roverflowedBit) + let this = catAGraphs + [mkAssign (CmmLocal roverflowedBit) + (shr high negone), + mkAssign (CmmLocal rhigh') + (or (shl high one) (shr low negone)), + mkAssign (CmmLocal rlow') + (shl low one), + mkAssign (CmmLocal risge) + (or (overflowedBit `ne` zero) + (high' `ge` arg_y)), + mkAssign (CmmLocal rhigh'') + (high' `minus` (arg_y `times` isge)), + mkAssign (CmmLocal racc') + (or (shl acc one) isge)] + rest <- f (i - 1) (CmmReg (CmmLocal racc')) + (CmmReg (CmmLocal rhigh'')) + (CmmReg (CmmLocal rlow')) + return (this <*> rest) +genericWordQuotRem2Op _ _ _ = panic "genericWordQuotRem2Op" + +genericWordAdd2Op :: GenericOp +genericWordAdd2Op [res_h, res_l] [arg_x, arg_y] + = do dflags <- getDynFlags + r1 <- newTemp (cmmExprType dflags arg_x) + r2 <- newTemp (cmmExprType dflags arg_x) + let topHalf x = CmmMachOp (MO_U_Shr (wordWidth dflags)) [x, hww] + toTopHalf x = CmmMachOp (MO_Shl (wordWidth dflags)) [x, hww] + bottomHalf x = CmmMachOp (MO_And (wordWidth dflags)) [x, hwm] + add x y = CmmMachOp (MO_Add (wordWidth dflags)) [x, y] + or x y = CmmMachOp (MO_Or (wordWidth dflags)) [x, y] + hww = CmmLit (CmmInt (fromIntegral (widthInBits (halfWordWidth dflags))) + (wordWidth dflags)) + hwm = 
CmmLit (CmmInt (halfWordMask dflags) (wordWidth dflags)) + emit $ catAGraphs + [mkAssign (CmmLocal r1) + (add (bottomHalf arg_x) (bottomHalf arg_y)), + mkAssign (CmmLocal r2) + (add (topHalf (CmmReg (CmmLocal r1))) + (add (topHalf arg_x) (topHalf arg_y))), + mkAssign (CmmLocal res_h) + (topHalf (CmmReg (CmmLocal r2))), + mkAssign (CmmLocal res_l) + (or (toTopHalf (CmmReg (CmmLocal r2))) + (bottomHalf (CmmReg (CmmLocal r1))))] +genericWordAdd2Op _ _ = panic "genericWordAdd2Op" + +-- | Implements branchless recovery of the carry flag @c@ by checking the +-- leftmost bits of both inputs @a@ and @b@ and result @r = a + b@: +-- +-- @ +-- c = a&b | (a|b)&~r +-- @ +-- +-- https://brodowsky.it-sky.net/2015/04/02/how-to-recover-the-carry-bit/ +genericWordAddCOp :: GenericOp +genericWordAddCOp [res_r, res_c] [aa, bb] + = do dflags <- getDynFlags + emit $ catAGraphs [ + mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordAdd dflags) [aa,bb]), + mkAssign (CmmLocal res_c) $ + CmmMachOp (mo_wordUShr dflags) [ + CmmMachOp (mo_wordOr dflags) [ + CmmMachOp (mo_wordAnd dflags) [aa,bb], + CmmMachOp (mo_wordAnd dflags) [ + CmmMachOp (mo_wordOr dflags) [aa,bb], + CmmMachOp (mo_wordNot dflags) [CmmReg (CmmLocal res_r)] + ] + ], + mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1) + ] + ] +genericWordAddCOp _ _ = panic "genericWordAddCOp" + +-- | Implements branchless recovery of the carry flag @c@ by checking the +-- leftmost bits of both inputs @a@ and @b@ and result @r = a - b@: +-- +-- @ +-- c = ~a&b | (~a|b)&r +-- @ +-- +-- https://brodowsky.it-sky.net/2015/04/02/how-to-recover-the-carry-bit/ +genericWordSubCOp :: GenericOp +genericWordSubCOp [res_r, res_c] [aa, bb] + = do dflags <- getDynFlags + emit $ catAGraphs [ + mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordSub dflags) [aa,bb]), + mkAssign (CmmLocal res_c) $ + CmmMachOp (mo_wordUShr dflags) [ + CmmMachOp (mo_wordOr dflags) [ + CmmMachOp (mo_wordAnd dflags) [ + CmmMachOp (mo_wordNot dflags) [aa], + bb + ], + CmmMachOp (mo_wordAnd dflags) 
                                     [
                CmmMachOp (mo_wordOr dflags) [
                  CmmMachOp (mo_wordNot dflags) [aa],
                  bb
                ],
                CmmReg (CmmLocal res_r)
              ]
            ],
            mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1)
          ]
        ]
genericWordSubCOp _ _ = panic "genericWordSubCOp"

-- | Signed add with overflow flag: res_r = aa + bb, res_c = 1 on signed
-- overflow and 0 otherwise (derived without branches, see comment below).
genericIntAddCOp :: GenericOp
genericIntAddCOp [res_r, res_c] [aa, bb]
{-
   With some bit-twiddling, we can define int{Add,Sub}Czh portably in
   C, and without needing any comparisons.  This may not be the
   fastest way to do it - if you have better code, please send it! --SDM

   Return : r = a + b,  c = 0 if no overflow, 1 on overflow.

   We currently don't make use of the r value if c is != 0 (i.e.
   overflow), we just convert to big integers and try again.  This
   could be improved by making r and c the correct values for
   plugging into a new J#.

   { r = ((I_)(a)) + ((I_)(b));                                 \
     c = ((StgWord)(~(((I_)(a))^((I_)(b))) & (((I_)(a))^r)))    \
         >> (BITS_IN (I_) - 1);                                 \
   }
   Wading through the mass of bracketry, it seems to reduce to:
   c = ( (~(a^b)) & (a^r) ) >>unsigned (BITS_IN(I_)-1)

-}
 = do dflags <- getDynFlags
      emit $ catAGraphs [
        mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordAdd dflags) [aa,bb]),
        mkAssign (CmmLocal res_c) $
          CmmMachOp (mo_wordUShr dflags) [
            CmmMachOp (mo_wordAnd dflags) [
              CmmMachOp (mo_wordNot dflags) [CmmMachOp (mo_wordXor dflags) [aa,bb]],
              CmmMachOp (mo_wordXor dflags) [aa, CmmReg (CmmLocal res_r)]
            ],
            mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1)
          ]
        ]
genericIntAddCOp _ _ = panic "genericIntAddCOp"

-- | Signed subtract with overflow flag; same scheme as 'genericIntAddCOp'.
genericIntSubCOp :: GenericOp
genericIntSubCOp [res_r, res_c] [aa, bb]
{- Similarly:
   #define subIntCzh(r,c,a,b)                                   \
   { r = ((I_)(a)) - ((I_)(b));                                 \
     c = ((StgWord)((((I_)(a))^((I_)(b))) & (((I_)(a))^r)))     \
         >> (BITS_IN (I_) - 1);                                 \
   }

   c =  ((a^b) & (a^r)) >>unsigned (BITS_IN(I_)-1)
-}
 = do dflags <- getDynFlags
      emit $ catAGraphs [
        mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordSub dflags) [aa,bb]),
        mkAssign (CmmLocal res_c) $
          CmmMachOp (mo_wordUShr dflags) [
            CmmMachOp (mo_wordAnd dflags) [
              CmmMachOp (mo_wordXor dflags) [aa,bb],
              CmmMachOp (mo_wordXor dflags) [aa, CmmReg (CmmLocal res_r)]
            ],
            mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1)
          ]
        ]
genericIntSubCOp _ _ = panic "genericIntSubCOp"

-- | Generic double-word multiply: res_h:res_l = arg_x * arg_y, computed
-- with four half-word partial products so it works on any word width.
genericWordMul2Op :: GenericOp
genericWordMul2Op [res_h, res_l] [arg_x, arg_y]
 = do dflags <- getDynFlags
      let t = cmmExprType dflags arg_x
      xlyl <- liftM CmmLocal $ newTemp t
      xlyh <- liftM CmmLocal $ newTemp t
      xhyl <- liftM CmmLocal $ newTemp t
      r    <- liftM CmmLocal $ newTemp t
      -- This generic implementation is very simple and slow. We might
      -- well be able to do better, but for now this at least works.
      let topHalf x = CmmMachOp (MO_U_Shr (wordWidth dflags)) [x, hww]
          toTopHalf x = CmmMachOp (MO_Shl (wordWidth dflags)) [x, hww]
          bottomHalf x = CmmMachOp (MO_And (wordWidth dflags)) [x, hwm]
          add x y = CmmMachOp (MO_Add (wordWidth dflags)) [x, y]
          sum = foldl1 add
          mul x y = CmmMachOp (MO_Mul (wordWidth dflags)) [x, y]
          or x y = CmmMachOp (MO_Or (wordWidth dflags)) [x, y]
          -- hww = half-word width in bits; hwm = half-word mask
          hww = CmmLit (CmmInt (fromIntegral (widthInBits (halfWordWidth dflags)))
                        (wordWidth dflags))
          hwm = CmmLit (CmmInt (halfWordMask dflags) (wordWidth dflags))
      emit $ catAGraphs
             [mkAssign xlyl
                  (mul (bottomHalf arg_x) (bottomHalf arg_y)),
              mkAssign xlyh
                  (mul (bottomHalf arg_x) (topHalf arg_y)),
              mkAssign xhyl
                  (mul (topHalf arg_x) (bottomHalf arg_y)),
              mkAssign r
                  (sum [topHalf (CmmReg xlyl),
                        bottomHalf (CmmReg xhyl),
                        bottomHalf (CmmReg xlyh)]),
              mkAssign (CmmLocal res_l)
                  (or (bottomHalf (CmmReg xlyl))
                      (toTopHalf (CmmReg r))),
              mkAssign (CmmLocal res_h)
                  (sum [mul (topHalf arg_x) (topHalf arg_y),
                        topHalf (CmmReg xhyl),
                        topHalf (CmmReg xlyh),
                        topHalf (CmmReg r)])]
genericWordMul2Op _ _ = panic "genericWordMul2Op"

-- This replicates what we had in libraries/base/GHC/Float.hs:
--
--    abs x    | x == 0    = 0 -- handles (-0.0)
--             | x >  0    = x
--             | otherwise = negateFloat x
genericFabsOp :: Width -> GenericOp
genericFabsOp w [res_r] [aa]
 = do dflags <- getDynFlags
      let zero   = CmmLit (CmmFloat 0 w)

          eq x y = CmmMachOp (MO_F_Eq w) [x, y]
          gt x y = CmmMachOp (MO_F_Gt w) [x, y]

          neg x  = CmmMachOp (MO_F_Neg w) [x]

          g1 = catAGraphs [mkAssign (CmmLocal res_r) zero]
          g2 = catAGraphs [mkAssign (CmmLocal res_r) aa]

      res_t <- CmmLocal <$> newTemp (cmmExprType dflags aa)
      let g3 = catAGraphs [mkAssign res_t aa,
                           mkAssign (CmmLocal res_r) (neg (CmmReg res_t))]

      g4 <- mkCmmIfThenElse (gt aa zero) g2 g3

      emit =<< mkCmmIfThenElse (eq aa zero) g1 g4

genericFabsOp _ _ _ = panic "genericFabsOp"

-- These PrimOps are NOPs in Cmm

nopOp :: PrimOp -> Bool
nopOp Int2WordOp     = True
nopOp Word2IntOp     = True
nopOp Int2AddrOp     = True
nopOp Addr2IntOp     = True
nopOp ChrOp          = True  -- Int# and Char# are rep'd the same
nopOp OrdOp          = True
nopOp _              = False

-- These PrimOps turn into double casts

narrowOp :: PrimOp -> Maybe (Width -> Width -> MachOp, Width)
narrowOp Narrow8IntOp   = Just (MO_SS_Conv, W8)
narrowOp Narrow16IntOp  = Just (MO_SS_Conv, W16)
narrowOp Narrow32IntOp  = Just (MO_SS_Conv, W32)
narrowOp Narrow8WordOp  = Just (MO_UU_Conv, W8)
narrowOp Narrow16WordOp = Just (MO_UU_Conv, W16)
narrowOp Narrow32WordOp = Just (MO_UU_Conv, W32)
narrowOp _              = Nothing

-- Native word signless ops

-- | Maps a 'PrimOp' directly to a single 'MachOp' where one exists;
-- 'Nothing' for ops needing custom code (see the other tables in this file).
translateOp :: DynFlags -> PrimOp -> Maybe MachOp
translateOp dflags IntAddOp       = Just (mo_wordAdd dflags)
translateOp dflags IntSubOp       = Just (mo_wordSub dflags)
translateOp dflags WordAddOp      = Just (mo_wordAdd dflags)
translateOp dflags WordSubOp      = Just (mo_wordSub dflags)
translateOp dflags AddrAddOp      = Just (mo_wordAdd dflags)
translateOp dflags AddrSubOp      = Just (mo_wordSub dflags)

translateOp dflags IntEqOp        = Just (mo_wordEq dflags)
translateOp dflags IntNeOp        = Just (mo_wordNe dflags)
translateOp dflags WordEqOp       = Just (mo_wordEq dflags)
translateOp dflags WordNeOp       = Just (mo_wordNe dflags)
translateOp dflags AddrEqOp       = Just (mo_wordEq dflags)
translateOp dflags AddrNeOp       = Just (mo_wordNe dflags)

translateOp dflags AndOp          = Just (mo_wordAnd dflags)
translateOp dflags OrOp           = Just (mo_wordOr dflags)
translateOp dflags XorOp          = Just (mo_wordXor dflags)
translateOp dflags NotOp          = Just (mo_wordNot dflags)
translateOp dflags SllOp          = Just (mo_wordShl dflags)
translateOp dflags SrlOp          = Just (mo_wordUShr dflags)

translateOp dflags AddrRemOp      = Just (mo_wordURem dflags)

-- Native word signed ops

translateOp dflags IntMulOp        = Just (mo_wordMul dflags)
translateOp dflags IntMulMayOfloOp = Just (MO_S_MulMayOflo (wordWidth dflags))
translateOp dflags IntQuotOp       = Just (mo_wordSQuot dflags)
translateOp dflags IntRemOp        = Just (mo_wordSRem dflags)
translateOp dflags IntNegOp        = Just (mo_wordSNeg dflags)


translateOp dflags IntGeOp        = Just (mo_wordSGe dflags)
translateOp dflags IntLeOp        = Just (mo_wordSLe dflags)
translateOp dflags IntGtOp        = Just (mo_wordSGt dflags)
translateOp dflags IntLtOp        = Just (mo_wordSLt dflags)

translateOp dflags AndIOp         = Just (mo_wordAnd dflags)
translateOp dflags OrIOp          = Just (mo_wordOr dflags)
translateOp dflags XorIOp         = Just (mo_wordXor dflags)
translateOp dflags NotIOp         = Just (mo_wordNot dflags)
translateOp dflags ISllOp         = Just (mo_wordShl dflags)
translateOp dflags ISraOp         = Just (mo_wordSShr dflags)
translateOp dflags ISrlOp         = Just (mo_wordUShr dflags)

-- Native word unsigned ops

translateOp dflags WordGeOp       = Just (mo_wordUGe dflags)
translateOp dflags WordLeOp       = Just (mo_wordULe dflags)
translateOp dflags WordGtOp       = Just (mo_wordUGt dflags)
translateOp dflags WordLtOp       = Just (mo_wordULt dflags)

translateOp dflags WordMulOp      = Just (mo_wordMul dflags)
translateOp dflags WordQuotOp     = Just (mo_wordUQuot dflags)
translateOp dflags WordRemOp      = Just (mo_wordURem dflags)

translateOp dflags AddrGeOp       = Just (mo_wordUGe dflags)
translateOp dflags AddrLeOp       = Just (mo_wordULe dflags)
translateOp dflags AddrGtOp       = Just (mo_wordUGt dflags)
translateOp dflags AddrLtOp       = Just (mo_wordULt dflags)

-- Int8# signed ops

translateOp dflags Int8Extend     = Just (MO_SS_Conv W8 (wordWidth dflags))
translateOp dflags Int8Narrow     = Just (MO_SS_Conv (wordWidth dflags) W8)
translateOp _      Int8NegOp      = Just (MO_S_Neg W8)
translateOp _      Int8AddOp      = Just (MO_Add W8)
translateOp _      Int8SubOp      = Just (MO_Sub W8)
translateOp _      Int8MulOp      = Just (MO_Mul W8)
translateOp _      Int8QuotOp     = Just (MO_S_Quot W8)
translateOp _      Int8RemOp      = Just (MO_S_Rem W8)

translateOp _      Int8EqOp       = Just (MO_Eq W8)
translateOp _      Int8GeOp       = Just (MO_S_Ge W8)
translateOp _      Int8GtOp       = Just (MO_S_Gt W8)
translateOp _      Int8LeOp       = Just (MO_S_Le W8)
translateOp _      Int8LtOp       = Just (MO_S_Lt W8)
translateOp _      Int8NeOp       = Just (MO_Ne W8)

-- Word8# unsigned ops

translateOp dflags Word8Extend    = Just (MO_UU_Conv W8 (wordWidth dflags))
translateOp dflags Word8Narrow    = Just (MO_UU_Conv (wordWidth dflags) W8)
translateOp _      Word8NotOp     = Just (MO_Not W8)
translateOp _      Word8AddOp     = Just (MO_Add W8)
translateOp _      Word8SubOp     = Just (MO_Sub W8)
translateOp _      Word8MulOp     = Just (MO_Mul W8)
translateOp _      Word8QuotOp    = Just (MO_U_Quot W8)
translateOp _      Word8RemOp     = Just (MO_U_Rem W8)

translateOp _      Word8EqOp      = Just (MO_Eq W8)
translateOp _      Word8GeOp      = Just (MO_U_Ge W8)
translateOp _      Word8GtOp      = Just (MO_U_Gt W8)
translateOp _      Word8LeOp      = Just (MO_U_Le W8)
translateOp _      Word8LtOp      = Just (MO_U_Lt W8)
translateOp _      Word8NeOp      = Just (MO_Ne W8)

-- Int16# signed ops

translateOp dflags Int16Extend    = Just (MO_SS_Conv W16 (wordWidth dflags))
translateOp dflags Int16Narrow    = Just (MO_SS_Conv (wordWidth dflags) W16)
translateOp _      Int16NegOp     = Just (MO_S_Neg W16)
translateOp _      Int16AddOp     = Just (MO_Add W16)
translateOp _      Int16SubOp     = Just (MO_Sub W16)
translateOp _      Int16MulOp     = Just (MO_Mul W16)
translateOp _      Int16QuotOp    = Just (MO_S_Quot W16)
translateOp _      Int16RemOp     = Just (MO_S_Rem W16)

translateOp _      Int16EqOp      = Just (MO_Eq W16)
translateOp _      Int16GeOp      = Just (MO_S_Ge W16)
translateOp _      Int16GtOp      = Just (MO_S_Gt W16)
translateOp _      Int16LeOp      = Just (MO_S_Le W16)
translateOp _      Int16LtOp      = Just (MO_S_Lt W16)
translateOp _      Int16NeOp      = Just (MO_Ne W16)

-- Word16# unsigned ops

translateOp dflags Word16Extend   = Just (MO_UU_Conv W16 (wordWidth dflags))
translateOp dflags Word16Narrow   = Just (MO_UU_Conv (wordWidth dflags) W16)
translateOp _      Word16NotOp    = Just (MO_Not W16)
translateOp _      Word16AddOp    = Just (MO_Add W16)
translateOp _      Word16SubOp    = Just (MO_Sub W16)
translateOp _      Word16MulOp    = Just (MO_Mul W16)
translateOp _      Word16QuotOp   = Just (MO_U_Quot W16)
translateOp _      Word16RemOp    = Just (MO_U_Rem W16)

translateOp _      Word16EqOp     = Just (MO_Eq W16)
translateOp _      Word16GeOp     = Just (MO_U_Ge W16)
translateOp _      Word16GtOp     = Just (MO_U_Gt W16)
translateOp _      Word16LeOp     = Just (MO_U_Le W16)
translateOp _      Word16LtOp     = Just (MO_U_Lt W16)
translateOp _      Word16NeOp     = Just (MO_Ne W16)

-- Char# ops

translateOp dflags CharEqOp       = Just (MO_Eq (wordWidth dflags))
translateOp dflags CharNeOp       = Just (MO_Ne (wordWidth dflags))
translateOp dflags CharGeOp       = Just (MO_U_Ge (wordWidth dflags))
translateOp dflags CharLeOp       = Just (MO_U_Le (wordWidth dflags))
translateOp dflags CharGtOp       = Just (MO_U_Gt (wordWidth dflags))
translateOp dflags CharLtOp       = Just (MO_U_Lt (wordWidth dflags))

-- Double ops

translateOp _      DoubleEqOp     = Just (MO_F_Eq W64)
translateOp _      DoubleNeOp     = Just (MO_F_Ne W64)
translateOp _      DoubleGeOp     = Just (MO_F_Ge W64)
translateOp _      DoubleLeOp     = Just (MO_F_Le W64)
translateOp _      DoubleGtOp     = Just (MO_F_Gt W64)
translateOp _      DoubleLtOp     = Just (MO_F_Lt W64)

translateOp _      DoubleAddOp    = Just (MO_F_Add W64)
translateOp _      DoubleSubOp    = Just (MO_F_Sub W64)
translateOp _      DoubleMulOp    = Just (MO_F_Mul W64)
translateOp _      DoubleDivOp    = Just (MO_F_Quot W64)
translateOp _      DoubleNegOp    = Just (MO_F_Neg W64)

-- Float ops

translateOp _      FloatEqOp      = Just (MO_F_Eq W32)
translateOp _      FloatNeOp      = Just (MO_F_Ne W32)
translateOp _      FloatGeOp      = Just (MO_F_Ge W32)
translateOp _      FloatLeOp      = Just (MO_F_Le W32)
translateOp _      FloatGtOp      = Just (MO_F_Gt W32)
translateOp _      FloatLtOp      = Just (MO_F_Lt W32)

translateOp _      FloatAddOp     = Just (MO_F_Add W32)
translateOp _      FloatSubOp     = Just (MO_F_Sub W32)
translateOp _      FloatMulOp     = Just (MO_F_Mul W32)
translateOp _      FloatDivOp     = Just (MO_F_Quot W32)
translateOp _      FloatNegOp     = Just (MO_F_Neg W32)

-- Vector ops

translateOp _ (VecAddOp FloatVec n w)  = Just (MO_VF_Add n w)
translateOp _ (VecSubOp FloatVec n w)  = Just (MO_VF_Sub n w)
translateOp _ (VecMulOp FloatVec n w)  = Just (MO_VF_Mul n w)
translateOp _ (VecDivOp FloatVec n w)  = Just (MO_VF_Quot n w)
translateOp _ (VecNegOp FloatVec n w)  = Just (MO_VF_Neg n w)

translateOp _ (VecAddOp IntVec n w)    = Just (MO_V_Add n w)
translateOp _ (VecSubOp IntVec n w)    = Just (MO_V_Sub n w)
translateOp _ (VecMulOp IntVec n w)    = Just (MO_V_Mul n w)
translateOp _ (VecQuotOp IntVec n w)   = Just (MO_VS_Quot n w)
translateOp _ (VecRemOp IntVec n w)    = Just (MO_VS_Rem n w)
translateOp _ (VecNegOp IntVec n w)    = Just (MO_VS_Neg n w)

translateOp _ (VecAddOp WordVec n w)   = Just (MO_V_Add n w)
translateOp _ (VecSubOp WordVec n w)   = Just (MO_V_Sub n w)
translateOp _ (VecMulOp WordVec n w)   = Just (MO_V_Mul n w)
translateOp _ (VecQuotOp WordVec n w)  = Just (MO_VU_Quot n w)
translateOp _ (VecRemOp WordVec n w)   = Just (MO_VU_Rem n w)

-- Conversions

translateOp dflags Int2DoubleOp   = Just (MO_SF_Conv (wordWidth dflags) W64)
translateOp dflags Double2IntOp   = Just (MO_FS_Conv W64 (wordWidth dflags))

translateOp dflags Int2FloatOp    = Just (MO_SF_Conv (wordWidth dflags) W32)
translateOp dflags Float2IntOp    = Just (MO_FS_Conv W32 (wordWidth dflags))

translateOp _      Float2DoubleOp = Just (MO_FF_Conv W32 W64)
translateOp _      Double2FloatOp = Just (MO_FF_Conv W64 W32)

-- Word comparisons masquerading as more exotic things.
-- (These "same" ops all compile to plain word equality on the pointers.)

translateOp dflags SameMutVarOp           = Just (mo_wordEq dflags)
translateOp dflags SameMVarOp             = Just (mo_wordEq dflags)
translateOp dflags SameMutableArrayOp     = Just (mo_wordEq dflags)
translateOp dflags SameMutableByteArrayOp = Just (mo_wordEq dflags)
translateOp dflags SameMutableArrayArrayOp= Just (mo_wordEq dflags)
translateOp dflags SameSmallMutableArrayOp= Just (mo_wordEq dflags)
translateOp dflags SameTVarOp             = Just (mo_wordEq dflags)
translateOp dflags EqStablePtrOp          = Just (mo_wordEq dflags)
-- See Note [Comparing stable names]
translateOp dflags EqStableNameOp         = Just (mo_wordEq dflags)

translateOp _ _ = Nothing

-- Note [Comparing stable names]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- A StableName# is actually a pointer to a stable name object (SNO)
-- containing an index into the stable name table (SNT). We
-- used to compare StableName#s by following the pointers to the
-- SNOs and checking whether they held the same SNT indices. However,
-- this is not necessary: there is a one-to-one correspondence
-- between SNOs and entries in the SNT, so simple pointer equality
-- does the trick.

-- These primops are implemented by CallishMachOps, because they sometimes
-- turn into foreign calls depending on the backend.
-- | Floating-point primops that map to a 'CallishMachOp': the backend
-- decides whether each becomes an instruction or a foreign call.
-- Returns 'Nothing' for every other primop.
callishOp :: PrimOp -> Maybe CallishMachOp
callishOp op = case op of
  -- Double# (F64) operations
  DoublePowerOp  -> Just MO_F64_Pwr
  DoubleSinOp    -> Just MO_F64_Sin
  DoubleCosOp    -> Just MO_F64_Cos
  DoubleTanOp    -> Just MO_F64_Tan
  DoubleSinhOp   -> Just MO_F64_Sinh
  DoubleCoshOp   -> Just MO_F64_Cosh
  DoubleTanhOp   -> Just MO_F64_Tanh
  DoubleAsinOp   -> Just MO_F64_Asin
  DoubleAcosOp   -> Just MO_F64_Acos
  DoubleAtanOp   -> Just MO_F64_Atan
  DoubleAsinhOp  -> Just MO_F64_Asinh
  DoubleAcoshOp  -> Just MO_F64_Acosh
  DoubleAtanhOp  -> Just MO_F64_Atanh
  DoubleLogOp    -> Just MO_F64_Log
  DoubleLog1POp  -> Just MO_F64_Log1P
  DoubleExpOp    -> Just MO_F64_Exp
  DoubleExpM1Op  -> Just MO_F64_ExpM1
  DoubleSqrtOp   -> Just MO_F64_Sqrt

  -- Float# (F32) operations
  FloatPowerOp   -> Just MO_F32_Pwr
  FloatSinOp     -> Just MO_F32_Sin
  FloatCosOp     -> Just MO_F32_Cos
  FloatTanOp     -> Just MO_F32_Tan
  FloatSinhOp    -> Just MO_F32_Sinh
  FloatCoshOp    -> Just MO_F32_Cosh
  FloatTanhOp    -> Just MO_F32_Tanh
  FloatAsinOp    -> Just MO_F32_Asin
  FloatAcosOp    -> Just MO_F32_Acos
  FloatAtanOp    -> Just MO_F32_Atan
  FloatAsinhOp   -> Just MO_F32_Asinh
  FloatAcoshOp   -> Just MO_F32_Acosh
  FloatAtanhOp   -> Just MO_F32_Atanh
  FloatLogOp     -> Just MO_F32_Log
  FloatLog1POp   -> Just MO_F32_Log1P
  FloatExpOp     -> Just MO_F32_Exp
  FloatExpM1Op   -> Just MO_F32_ExpM1
  FloatSqrtOp    -> Just MO_F32_Sqrt

  _              -> Nothing

------------------------------------------------------------------------------
-- Helpers for translating various minor variants of array indexing.
-- | Read an element of type @rep@ at @addr + idx@ (no header offset;
-- raw 'Addr#' access).
doIndexOffAddrOp :: Maybe MachOp
                 -> CmmType
                 -> [LocalReg]
                 -> [CmmExpr]
                 -> FCode ()
doIndexOffAddrOp maybe_post_read_cast rep [res] [addr,idx]
   = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr rep idx
doIndexOffAddrOp _ _ _ _
   = panic "GHC.StgToCmm.Prim: doIndexOffAddrOp"

-- | As 'doIndexOffAddrOp', but the index is scaled by @idx_rep@ rather
-- than by the element type @rep@.
doIndexOffAddrOpAs :: Maybe MachOp
                   -> CmmType
                   -> CmmType
                   -> [LocalReg]
                   -> [CmmExpr]
                   -> FCode ()
doIndexOffAddrOpAs maybe_post_read_cast rep idx_rep [res] [addr,idx]
   = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr idx_rep idx
doIndexOffAddrOpAs _ _ _ _ _
   = panic "GHC.StgToCmm.Prim: doIndexOffAddrOpAs"

-- | Read an element from a ByteArray#, skipping the array header.
doIndexByteArrayOp :: Maybe MachOp
                   -> CmmType
                   -> [LocalReg]
                   -> [CmmExpr]
                   -> FCode ()
doIndexByteArrayOp maybe_post_read_cast rep [res] [addr,idx]
   = do dflags <- getDynFlags
        mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr rep idx
doIndexByteArrayOp _ _ _ _
   = panic "GHC.StgToCmm.Prim: doIndexByteArrayOp"

-- | As 'doIndexByteArrayOp', but with a separate index-scaling type.
doIndexByteArrayOpAs :: Maybe MachOp
                     -> CmmType
                     -> CmmType
                     -> [LocalReg]
                     -> [CmmExpr]
                     -> FCode ()
doIndexByteArrayOpAs maybe_post_read_cast rep idx_rep [res] [addr,idx]
   = do dflags <- getDynFlags
        mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr idx_rep idx
doIndexByteArrayOpAs _ _ _ _ _
   = panic "GHC.StgToCmm.Prim: doIndexByteArrayOpAs"

-- | Read a GC-managed pointer out of an Array#, skipping the ptr-array header.
doReadPtrArrayOp :: LocalReg
                 -> CmmExpr
                 -> CmmExpr
                 -> FCode ()
doReadPtrArrayOp res addr idx
   = do dflags <- getDynFlags
        mkBasicIndexedRead (arrPtrsHdrSize dflags) Nothing (gcWord dflags) res addr (gcWord dflags) idx

-- | Write @val@ at @addr + idx@ (raw 'Addr#' access, no header offset).
doWriteOffAddrOp :: Maybe MachOp
                 -> CmmType
                 -> [LocalReg]
                 -> [CmmExpr]
                 -> FCode ()
doWriteOffAddrOp maybe_pre_write_cast idx_ty [] [addr,idx,val]
   = mkBasicIndexedWrite 0 maybe_pre_write_cast addr idx_ty idx val
doWriteOffAddrOp _ _ _ _
   = panic "GHC.StgToCmm.Prim: doWriteOffAddrOp"

-- | Write an element into a MutableByteArray#, skipping the array header.
doWriteByteArrayOp :: Maybe MachOp
                   -> CmmType
                   -> [LocalReg]
                   -> [CmmExpr]
                   -> FCode ()
doWriteByteArrayOp maybe_pre_write_cast idx_ty [] [addr,idx,val]
   = do dflags <- getDynFlags
        mkBasicIndexedWrite (arrWordsHdrSize dflags) maybe_pre_write_cast addr idx_ty idx val
doWriteByteArrayOp _ _ _ _
   = panic "GHC.StgToCmm.Prim: doWriteByteArrayOp"

-- | Write a pointer into a MutableArray#: emits a write barrier, performs
-- the store, marks the array dirty, and sets the card-table byte for the
-- written index.
doWritePtrArrayOp :: CmmExpr
                  -> CmmExpr
                  -> CmmExpr
                  -> FCode ()
doWritePtrArrayOp addr idx val
  = do dflags <- getDynFlags
       let ty = cmmExprType dflags val
       -- This write barrier is to ensure that the heap writes to the object
       -- referred to by val have happened before we write val into the array.
       -- See #12469 for details.
       emitPrimCall [] MO_WriteBarrier []
       mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr ty idx val
       emit (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel)))
       -- the write barrier.  We must write a byte into the mark table:
       -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N]
       emit $ mkStore (
         cmmOffsetExpr dflags
          (cmmOffsetExprW dflags (cmmOffsetB dflags addr (arrPtrsHdrSize dflags))
                         (loadArrPtrsSize dflags addr))
          (CmmMachOp (mo_wordUShr dflags) [idx,
                                           mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags)])
         ) (CmmLit (CmmInt 1 W8))

-- | Load the @ptrs@ field (element count) of a StgMutArrPtrs closure.
loadArrPtrsSize :: DynFlags -> CmmExpr -> CmmExpr
loadArrPtrsSize dflags addr = CmmLoad (cmmOffsetB dflags addr off) (bWord dflags)
 where off = fixedHdrSize dflags + oFFSET_StgMutArrPtrs_ptrs dflags

mkBasicIndexedRead :: ByteOff      -- Initial offset in bytes
                   -> Maybe MachOp -- Optional result cast
                   -> CmmType      -- Type of element we are accessing
                   -> LocalReg     -- Destination
                   -> CmmExpr      -- Base address
                   -> CmmType      -- Type of element by which we are indexing
                   -> CmmExpr      -- Index
                   -> FCode ()
mkBasicIndexedRead off Nothing ty res base idx_ty idx
   = do dflags <- getDynFlags
        emitAssign (CmmLocal res) (cmmLoadIndexOffExpr dflags off ty base idx_ty idx)
mkBasicIndexedRead off (Just cast) ty res base idx_ty idx
   = do dflags <- getDynFlags
        emitAssign (CmmLocal res) (CmmMachOp cast [
                                   cmmLoadIndexOffExpr dflags off ty base idx_ty idx])

mkBasicIndexedWrite :: ByteOff      -- Initial offset in bytes
                    -> Maybe MachOp -- Optional value cast
                    -> CmmExpr      -- Base address
                    -> CmmType      -- Type of element by which we are indexing
                    -> CmmExpr      -- Index
                    -> CmmExpr      -- Value to write
                    -> FCode ()
mkBasicIndexedWrite off Nothing base idx_ty idx val
   = do dflags <- getDynFlags
        emitStore (cmmIndexOffExpr dflags off (typeWidth idx_ty) base idx) val
mkBasicIndexedWrite off (Just cast) base idx_ty idx val
   -- apply the cast to the value, then delegate to the no-cast case
   = mkBasicIndexedWrite off Nothing base idx_ty idx (CmmMachOp cast [val])

-- ----------------------------------------------------------------------------
-- Misc utils

cmmIndexOffExpr :: DynFlags
                -> ByteOff  -- Initial offset in bytes
                -> Width    -- Width of element by which we are indexing
                -> CmmExpr  -- Base address
                -> CmmExpr  -- Index
                -> CmmExpr
cmmIndexOffExpr dflags off width base idx
   = cmmIndexExpr dflags width (cmmOffsetB dflags base off) idx

cmmLoadIndexOffExpr :: DynFlags
                    -> ByteOff  -- Initial offset in bytes
                    -> CmmType  -- Type of element we are accessing
                    -> CmmExpr  -- Base address
                    -> CmmType  -- Type of element by which we are indexing
                    -> CmmExpr  -- Index
                    -> CmmExpr
cmmLoadIndexOffExpr dflags off ty base idx_ty idx
   = CmmLoad (cmmIndexOffExpr dflags off (typeWidth idx_ty) base idx) ty

-- | Overwrite a closure's info pointer (a plain store to the closure address).
setInfo :: CmmExpr -> CmmExpr -> CmmAGraph
setInfo closure_ptr info_ptr = mkStore closure_ptr info_ptr

------------------------------------------------------------------------------
-- Helpers for translating vector primops.
-- | The Cmm type of a vector: @n@ elements of the category/width's type.
vecVmmType :: PrimOpVecCat -> Length -> Width -> CmmType
vecVmmType pocat n w = vec n (vecCmmCat pocat w)

-- | Element type for each vector category: bits for Int/Word, float for Float.
vecCmmCat :: PrimOpVecCat -> Width -> CmmType
vecCmmCat IntVec   = cmmBits
vecCmmCat WordVec  = cmmBits
vecCmmCat FloatVec = cmmFloat

-- | Cast (if any) applied to a scalar before inserting it into a vector:
-- sub-word Int/Word elements are narrowed from the native word.
vecElemInjectCast :: DynFlags -> PrimOpVecCat -> Width -> Maybe MachOp
vecElemInjectCast _      FloatVec _   =  Nothing
vecElemInjectCast dflags IntVec   W8  =  Just (mo_WordTo8  dflags)
vecElemInjectCast dflags IntVec   W16 =  Just (mo_WordTo16 dflags)
vecElemInjectCast dflags IntVec   W32 =  Just (mo_WordTo32 dflags)
vecElemInjectCast _      IntVec   W64 =  Nothing
vecElemInjectCast dflags WordVec  W8  =  Just (mo_WordTo8  dflags)
vecElemInjectCast dflags WordVec  W16 =  Just (mo_WordTo16 dflags)
vecElemInjectCast dflags WordVec  W32 =  Just (mo_WordTo32 dflags)
vecElemInjectCast _      WordVec  W64 =  Nothing
vecElemInjectCast _      _        _   =  Nothing

-- | Cast (if any) applied to a scalar extracted from a vector: sub-word
-- elements are widened to the native word (sign- or zero-extended).
vecElemProjectCast :: DynFlags -> PrimOpVecCat -> Width -> Maybe MachOp
vecElemProjectCast _      FloatVec _   =  Nothing
vecElemProjectCast dflags IntVec   W8  =  Just (mo_s_8ToWord  dflags)
vecElemProjectCast dflags IntVec   W16 =  Just (mo_s_16ToWord dflags)
vecElemProjectCast dflags IntVec   W32 =  Just (mo_s_32ToWord dflags)
vecElemProjectCast _      IntVec   W64 =  Nothing
vecElemProjectCast dflags WordVec  W8  =  Just (mo_u_8ToWord  dflags)
vecElemProjectCast dflags WordVec  W16 =  Just (mo_u_16ToWord dflags)
vecElemProjectCast dflags WordVec  W32 =  Just (mo_u_32ToWord dflags)
vecElemProjectCast _      WordVec  W64 =  Nothing
vecElemProjectCast _      _        _   =  Nothing


-- NOTE [SIMD Design for the future]
-- Check to make sure that we can generate code for the specified vector type
-- given the current set of dynamic flags.
-- Currently these checks are specific to x86 and x86_64 architecture.
-- This should be fixed!
-- In particular,
-- 1) Add better support for other architectures! (this may require a redesign)
-- 2) Decouple design choices from LLVM's pseudo SIMD model!
-- The high level LLVM naive rep makes per CPU family SIMD generation is own
-- optimization problem, and hides important differences in eg ARM vs x86_64 simd
-- 3) Depending on the architecture, the SIMD registers may also support general
--    computations on Float/Double/Word/Int scalars, but currently on
--    for example x86_64, we always put Word/Int (or sized) in GPR
--    (general purpose) registers. Would relaxing that allow for
--    useful optimization opportunities?
--      Phrased differently, it is worth experimenting with supporting
--    different register mapping strategies than we currently have, especially if
--    someday we want SIMD to be a first class denizen in GHC along with scalar
--    values!
--      The current design with respect to register mapping of scalars could
--    very well be the best,but exploring the design space and doing careful
--    measurments is the only only way to validate that.
--      In some next generation CPU ISAs, notably RISC V, the SIMD extension
--    includes  support for a sort of run time CPU dependent vectorization parameter,
--    where a loop may act upon a single scalar each iteration OR some 2,4,8 ...
--    element chunk! Time will tell if that direction sees wide adoption,
--    but it is from that context that unifying our handling of simd and scalars
--    may benefit. It is not likely to benefit current architectures, though
--    it may very well be a design perspective that helps guide improving the NCG.


-- | Reject (via 'sorry') vector primops that the current backend/ISA flags
-- cannot compile: requires the LLVM backend, plus the SSE/AVX level matching
-- the vector's total width.
checkVecCompatibility :: DynFlags -> PrimOpVecCat -> Length -> Width -> FCode ()
checkVecCompatibility dflags vcat l w = do
    when (hscTarget dflags /= HscLlvm) $ do
        sorry $ unlines ["SIMD vector instructions require the LLVM back-end."
                        ,"Please use -fllvm."]
    check vecWidth vcat l w
  where
    check :: Width -> PrimOpVecCat -> Length -> Width -> FCode ()
    check W128 FloatVec 4 W32 | not (isSseEnabled dflags) =
        sorry $ "128-bit wide single-precision floating point " ++
                "SIMD vector instructions require at least -msse."
    check W128 _ _ _ | not (isSse2Enabled dflags) =
        sorry $ "128-bit wide integer and double precision " ++
                "SIMD vector instructions require at least -msse2."
    check W256 FloatVec _ _ | not (isAvxEnabled dflags) =
        sorry $ "256-bit wide floating point " ++
                "SIMD vector instructions require at least -mavx."
    check W256 _ _ _ | not (isAvx2Enabled dflags) =
        sorry $ "256-bit wide integer " ++
                "SIMD vector instructions require at least -mavx2."
    check W512 _ _ _ | not (isAvx512fEnabled dflags) =
        sorry $ "512-bit wide " ++
                "SIMD vector instructions require -mavx512f."
    check _ _ _ _ = return ()

    vecWidth = typeWidth (vecVmmType vcat l w)

------------------------------------------------------------------------------
-- Helpers for translating vector packing and unpacking.

-- | Insert scalar elements one by one into an initial vector @z@, assigning
-- the final vector to @res@.
doVecPackOp :: Maybe MachOp  -- Cast from element to vector component
            -> CmmType       -- Type of vector
            -> CmmExpr       -- Initial vector
            -> [CmmExpr]     -- Elements
            -> CmmFormal     -- Destination for result
            -> FCode ()
doVecPackOp maybe_pre_write_cast ty z es res = do
    dst <- newTemp ty
    emitAssign (CmmLocal dst) z
    vecPack dst es 0
  where
    vecPack :: CmmFormal -> [CmmExpr] -> Int -> FCode ()
    vecPack src [] _ =
        -- all elements inserted; copy the accumulated vector to the result
        emitAssign (CmmLocal res) (CmmReg (CmmLocal src))

    vecPack src (e : es) i = do
        dst <- newTemp ty
        if isFloatType (vecElemType ty)
          then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Insert len wid)
                                          [CmmReg (CmmLocal src), cast e, iLit])
          else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
                                          [CmmReg (CmmLocal src), cast e, iLit])
        vecPack dst es (i + 1)
      where
        -- vector indices are always 32-bits
        iLit = CmmLit (CmmInt (toInteger i) W32)

    cast :: CmmExpr -> CmmExpr
    cast val = case maybe_pre_write_cast of
                 Nothing   -> val
                 Just cast -> CmmMachOp cast [val]

    len :: Length
    len = vecLength ty

    wid :: Width
    wid = typeWidth (vecElemType ty)

-- | Extract every element of vector @e@ into the result registers, in order.
doVecUnpackOp :: Maybe MachOp  -- Cast from vector component to element result
              -> CmmType       -- Type of vector
              -> CmmExpr       -- Vector
              -> [CmmFormal]   -- Element results
              -> FCode ()
doVecUnpackOp maybe_post_read_cast ty e res =
    vecUnpack res 0
  where
    vecUnpack :: [CmmFormal] -> Int -> FCode ()
    vecUnpack [] _ =
        return ()

    vecUnpack (r : rs) i = do
        if isFloatType (vecElemType ty)
          then emitAssign (CmmLocal r) (cast (CmmMachOp (MO_VF_Extract len wid)
                                             [e, iLit]))
          else emitAssign (CmmLocal r) (cast (CmmMachOp (MO_V_Extract len wid)
                                             [e, iLit]))
        vecUnpack rs (i + 1)
      where
        -- vector indices are always 32-bits
        iLit = CmmLit (CmmInt (toInteger i) W32)

    cast :: CmmExpr -> CmmExpr
    cast val = case maybe_post_read_cast of
                 Nothing   -> val
                 Just cast -> CmmMachOp cast [val]

    len :: Length
    len = vecLength ty

    wid :: Width
    wid = typeWidth (vecElemType ty)

-- | Insert one element into @src@ at a runtime index, assigning to @res@.
doVecInsertOp :: Maybe MachOp  -- Cast from element to vector component
              -> CmmType       -- Vector type
              -> CmmExpr       -- Source vector
              -> CmmExpr       -- Element
              -> CmmExpr       -- Index at which to insert element
              -> CmmFormal     -- Destination for result
              -> FCode ()
doVecInsertOp maybe_pre_write_cast ty src e idx res = do
    dflags <- getDynFlags
    -- vector indices are always 32-bits
    let idx' :: CmmExpr
        idx' = CmmMachOp (MO_SS_Conv (wordWidth dflags) W32) [idx]
    if isFloatType (vecElemType ty)
      then emitAssign (CmmLocal res) (CmmMachOp (MO_VF_Insert len wid) [src, cast e, idx'])
      else emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, cast e, idx'])
  where
    cast :: CmmExpr -> CmmExpr
    cast val = case maybe_pre_write_cast of
                 Nothing   -> val
                 Just cast -> CmmMachOp cast [val]

    len :: Length
    len = vecLength ty

    wid :: Width
    wid = typeWidth (vecElemType ty)

------------------------------------------------------------------------------
-- Helpers for translating prefetching.


-- | Translate byte array prefetch operations into proper primcalls.
doPrefetchByteArrayOp :: Int          -- ^ locality level 0-3
                      -> [CmmExpr]    -- ^ [array, byte index]
                      -> FCode ()
doPrefetchByteArrayOp locality  [addr,idx]
   = do dflags <- getDynFlags
        mkBasicPrefetch locality (arrWordsHdrSize dflags)  addr idx
doPrefetchByteArrayOp _ _
   = panic "GHC.StgToCmm.Prim: doPrefetchByteArrayOp"

-- | Translate mutable byte array prefetch operations into proper primcalls.
doPrefetchMutableByteArrayOp :: Int        -- ^ locality level 0-3
                             -> [CmmExpr]  -- ^ [array, byte index]
                             -> FCode ()
doPrefetchMutableByteArrayOp locality  [addr,idx]
   = do dflags <- getDynFlags
        mkBasicPrefetch locality (arrWordsHdrSize dflags)  addr idx
doPrefetchMutableByteArrayOp _ _
   -- Fixed: this fall-through previously panicked with the message
   -- "doPrefetchByteArrayOp" (copy-paste from the function above),
   -- misreporting which primop translation failed.
   = panic "GHC.StgToCmm.Prim: doPrefetchMutableByteArrayOp"

-- | Translate address prefetch operations into proper primcalls.
doPrefetchAddrOp :: Int          -- ^ locality level 0-3
                 -> [CmmExpr]    -- ^ [address, byte index]
                 -> FCode ()
doPrefetchAddrOp locality   [addr,idx]
   = mkBasicPrefetch locality 0  addr idx
doPrefetchAddrOp _ _
   = panic "GHC.StgToCmm.Prim: doPrefetchAddrOp"

-- | Translate value prefetch operations into proper primcalls.
doPrefetchValueOp :: Int         -- ^ locality level 0-3
                  -> [CmmExpr]   -- ^ [address]
                  -> FCode ()
doPrefetchValueOp  locality   [addr]
  =  do dflags <- getDynFlags
        mkBasicPrefetch locality 0 addr  (CmmLit (CmmInt 0 (wordWidth dflags)))
doPrefetchValueOp _ _
  = panic "GHC.StgToCmm.Prim: doPrefetchValueOp"

-- | helper to generate prefetch primcalls
mkBasicPrefetch :: Int          -- Locality level 0-3
                -> ByteOff      -- Initial offset in bytes
                -> CmmExpr      -- Base address
                -> CmmExpr      -- Index
                -> FCode ()
mkBasicPrefetch locality off base idx
   = do dflags <- getDynFlags
        emitPrimCall [] (MO_Prefetch_Data locality) [cmmIndexExpr dflags W8 (cmmOffsetB dflags base off) idx]
        return ()

-- ----------------------------------------------------------------------------
-- Allocating byte arrays

-- | Takes a register to return the newly allocated array in and the
-- size of the new array in bytes. Allocates a new
-- 'MutableByteArray#'.
-- | Allocate a new 'MutableByteArray#' of @n@ bytes and assign the closure
-- pointer to @res_r@.  The byte count is stored in the StgArrBytes header.
doNewByteArrayOp :: CmmFormal -> ByteOff -> FCode ()
doNewByteArrayOp res_r n = do
    dflags <- getDynFlags

    let info_ptr = mkLblExpr mkArrWords_infoLabel
        rep = arrWordsRep dflags n

    -- Ticky accounting: header size, payload size, zero slop.
    tickyAllocPrim (mkIntExpr dflags (arrWordsHdrSize dflags))
        (mkIntExpr dflags (nonHdrSize dflags rep))
        (zeroExpr dflags)

    let hdr_size = fixedHdrSize dflags

    base <- allocHeapClosure rep info_ptr cccsExpr
                     [ (mkIntExpr dflags n,
                        hdr_size + oFFSET_StgArrBytes_bytes dflags)
                     ]

    emit $ mkAssign (CmmLocal res_r) base

-- ----------------------------------------------------------------------------
-- Comparing byte arrays

-- | @compareByteArrays#@: compare @n@ bytes of two byte arrays starting at
-- the given offsets, memcmp(3)-style, writing the result into @res@.
doCompareByteArraysOp :: LocalReg -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                     -> FCode ()
doCompareByteArraysOp res ba1 ba1_off ba2 ba2_off n = do
    dflags <- getDynFlags
    ba1_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba1 (arrWordsHdrSize dflags)) ba1_off
    ba2_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba2 (arrWordsHdrSize dflags)) ba2_off

    -- short-cut in case of equal pointers avoiding a costly
    -- subroutine call to the memcmp(3) routine; the Cmm logic below
    -- results in assembly code being generated for
    --
    --   cmpPrefix10 :: ByteArray# -> ByteArray# -> Int#
    --   cmpPrefix10 ba1 ba2 = compareByteArrays# ba1 0# ba2 0# 10#
    --
    -- that looks like
    --
    --          leaq 16(%r14),%rax
    --          leaq 16(%rsi),%rbx
    --          xorl %ecx,%ecx
    --          cmpq %rbx,%rax
    --          je l_ptr_eq
    --
    --          ; NB: the common case (unequal pointers) falls-through
    --          ; the conditional jump, and therefore matches the
    --          ; usual static branch prediction convention of modern cpus
    --
    --          subq $8,%rsp
    --          movq %rbx,%rsi
    --          movq %rax,%rdi
    --          movl $10,%edx
    --          xorl %eax,%eax
    --          call memcmp
    --          addq $8,%rsp
    --          movslq %eax,%rax
    --          movq %rax,%rcx
    --  l_ptr_eq:
    --          movq %rcx,%rbx
    --          jmp *(%rbp)

    l_ptr_eq <- newBlockId
    l_ptr_ne <- newBlockId

    -- res is pre-set to 0 (the "equal" answer); only the not-equal branch
    -- overwrites it with memcmp's result.
    emit (mkAssign (CmmLocal res) (zeroExpr dflags))
    emit (mkCbranch (cmmEqWord dflags ba1_p ba2_p)
                    l_ptr_eq l_ptr_ne (Just False))

    emitLabel l_ptr_ne
    emitMemcmpCall res ba1_p ba2_p n 1

    emitLabel l_ptr_eq

-- ----------------------------------------------------------------------------
-- Copying byte arrays

-- | Takes a source 'ByteArray#', an offset in the source array, a
-- destination 'MutableByteArray#', an offset into the destination
-- array, and the number of bytes to copy.  Copies the given number of
-- bytes from the source array to the destination array.
doCopyByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                  -> FCode ()
doCopyByteArrayOp = emitCopyByteArray copy
  where
    -- Copy data (we assume the arrays aren't overlapping since
    -- they're of different types)
    copy _src _dst dst_p src_p bytes align =
        emitMemcpyCall dst_p src_p bytes align

-- | Takes a source 'MutableByteArray#', an offset in the source
-- array, a destination 'MutableByteArray#', an offset into the
-- destination array, and the number of bytes to copy.  Copies the
-- given number of bytes from the source array to the destination
-- array.
doCopyMutableByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                         -> FCode ()
doCopyMutableByteArrayOp = emitCopyByteArray copy
  where
    -- The only time the memory might overlap is when the two arrays
    -- we were provided are the same array!
    -- TODO: Optimize branch for common case of no aliasing.
    copy src dst dst_p src_p bytes align = do
        dflags <- getDynFlags
        -- Emit a runtime pointer-equality test: same array => memmove
        -- (overlap-safe), different arrays => memcpy (faster).
        (moveCall, cpyCall) <- forkAltPair
            (getCode $ emitMemmoveCall dst_p src_p bytes align)
            (getCode $ emitMemcpyCall dst_p src_p bytes align)
        emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall

-- | Shared skeleton for the byte-array copy primops: computes payload
-- pointers past the StgArrBytes header, derives the best provable
-- alignment, and hands off to the supplied copy strategy.
emitCopyByteArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                      -> Alignment -> FCode ())
                  -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                  -> FCode ()
emitCopyByteArray copy src src_off dst dst_off n = do
    dflags <- getDynFlags
    let byteArrayAlignment = wordAlignment dflags
        srcOffAlignment = cmmExprAlignment src_off
        dstOffAlignment = cmmExprAlignment dst_off
        align = minimum [byteArrayAlignment, srcOffAlignment, dstOffAlignment]
    dst_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags dst (arrWordsHdrSize dflags)) dst_off
    src_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags src (arrWordsHdrSize dflags)) src_off
    copy src dst dst_p src_p n align

-- | Takes a source 'ByteArray#', an offset in the source array, a
-- destination 'Addr#', and the number of bytes to copy.  Copies the given
-- number of bytes from the source array to the destination memory region.
doCopyByteArrayToAddrOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
doCopyByteArrayToAddrOp src src_off dst_p bytes = do
    -- Use memcpy (we are allowed to assume the arrays aren't overlapping)
    dflags <- getDynFlags
    src_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags src (arrWordsHdrSize dflags)) src_off
    emitMemcpyCall dst_p src_p bytes (mkAlignment 1)

-- | Takes a source 'MutableByteArray#', an offset in the source array, a
-- destination 'Addr#', and the number of bytes to copy.  Copies the given
-- number of bytes from the source array to the destination memory region.
doCopyMutableByteArrayToAddrOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                               -> FCode ()
-- Same code as the immutable version: an Addr# destination can never
-- alias a heap-allocated byte array, so plain memcpy remains safe.
doCopyMutableByteArrayToAddrOp = doCopyByteArrayToAddrOp

-- | Takes a source 'Addr#', a destination 'MutableByteArray#', an offset into
-- the destination array, and the number of bytes to copy.  Copies the given
-- number of bytes from the source memory region to the destination array.
doCopyAddrToByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
doCopyAddrToByteArrayOp src_p dst dst_off bytes = do
    -- Use memcpy (we are allowed to assume the arrays aren't overlapping)
    dflags <- getDynFlags
    dst_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags dst (arrWordsHdrSize dflags)) dst_off
    emitMemcpyCall dst_p src_p bytes (mkAlignment 1)


-- ----------------------------------------------------------------------------
-- Setting byte arrays

-- | Takes a 'MutableByteArray#', an offset into the array, a length,
-- and a byte, and sets each of the selected bytes in the array to the
-- character.
doSetByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                 -> FCode ()
doSetByteArrayOp ba off len c = do
    dflags <- getDynFlags

    let byteArrayAlignment = wordAlignment dflags -- known since BA is allocated on heap
        offsetAlignment = cmmExprAlignment off
        align = min byteArrayAlignment offsetAlignment

    p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off
    emitMemsetCall p c len align

-- ----------------------------------------------------------------------------
-- Allocating arrays

-- | Allocate a new array.
doNewArrayOp :: CmmFormal             -- ^ return register
             -> SMRep                 -- ^ representation of the array
             -> CLabel                -- ^ info pointer
             -> [(CmmExpr, ByteOff)]  -- ^ header payload
             -> WordOff               -- ^ array size
             -> CmmExpr               -- ^ initial element
             -> FCode ()
doNewArrayOp res_r rep info payload n init = do
    dflags <- getDynFlags

    let info_ptr = mkLblExpr info

    tickyAllocPrim (mkIntExpr dflags (hdrSize dflags rep))
        (mkIntExpr dflags (nonHdrSize dflags rep))
        (zeroExpr dflags)

    base <- allocHeapClosure rep info_ptr cccsExpr payload

    -- Keep the closure address in a temporary so the element stores
    -- below all index off a stable register.
    arr <- CmmLocal `fmap` newTemp (bWord dflags)
    emit $ mkAssign arr base

    -- Initialise all elements of the array
    let mkOff off = cmmOffsetW dflags (CmmReg arr) (hdrSizeW dflags rep + off)
        initialization = [ mkStore (mkOff off) init | off <- [0.. n - 1] ]
    emit (catAGraphs initialization)

    emit $ mkAssign (CmmLocal res_r) (CmmReg arr)

-- ----------------------------------------------------------------------------
-- Copying pointer arrays

-- EZY: This code has an unusually high amount of assignTemp calls, seen
-- nowhere else in the code generator. This is mostly because these
-- "primitive" ops result in a surprisingly large amount of code.  It
-- will likely be worthwhile to optimize what is emitted here, so that
-- our optimization passes don't waste time repeatedly optimizing the
-- same bits of code.

-- More closely imitates 'assignTemp' from the old code generator, which
-- returns a CmmExpr rather than a LocalReg.
assignTempE :: CmmExpr -> FCode CmmExpr
assignTempE e = do
    t <- assignTemp e
    return (CmmReg (CmmLocal t))

-- | Takes a source 'Array#', an offset in the source array, a
-- destination 'MutableArray#', an offset into the destination array,
-- and the number of elements to copy.  Copies the given number of
-- elements from the source array to the destination array.
doCopyArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff
              -> FCode ()
doCopyArrayOp = emitCopyArray copy
  where
    -- Copy data (we assume the arrays aren't overlapping since
    -- they're of different types)
    copy _src _dst dst_p src_p bytes =
        do dflags <- getDynFlags
           emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
               (wordAlignment dflags)


-- | Takes a source 'MutableArray#', an offset in the source array, a
-- destination 'MutableArray#', an offset into the destination array,
-- and the number of elements to copy.  Copies the given number of
-- elements from the source array to the destination array.
doCopyMutableArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff
                     -> FCode ()
doCopyMutableArrayOp = emitCopyArray copy
  where
    -- The only time the memory might overlap is when the two arrays
    -- we were provided are the same array!
    -- TODO: Optimize branch for common case of no aliasing.
    copy src dst dst_p src_p bytes = do
        dflags <- getDynFlags
        -- Runtime aliasing test: same array => memmove, else memcpy.
        (moveCall, cpyCall) <- forkAltPair
            (getCode $ emitMemmoveCall dst_p src_p (mkIntExpr dflags bytes)
             (wordAlignment dflags))
            (getCode $ emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
             (wordAlignment dflags))
        emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall

-- | Shared skeleton for the (large) pointer-array copy primops: dirties
-- the destination header, computes payload pointers, runs the supplied
-- copy strategy, then marks the affected card-table entries.
emitCopyArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> ByteOff
                  -> FCode ())  -- ^ copy function
              -> CmmExpr        -- ^ source array
              -> CmmExpr        -- ^ offset in source array
              -> CmmExpr        -- ^ destination array
              -> CmmExpr        -- ^ offset in destination array
              -> WordOff        -- ^ number of elements to copy
              -> FCode ()
emitCopyArray copy src0 src_off dst0 dst_off0 n =
    when (n /= 0) $ do
        dflags <- getDynFlags

        -- Passed as arguments (be careful)
        src     <- assignTempE src0
        dst     <- assignTempE dst0
        dst_off <- assignTempE dst_off0

        -- Set the dirty bit in the header.
        emit (setInfo dst (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel)))

        dst_elems_p <- assignTempE $ cmmOffsetB dflags dst
                       (arrPtrsHdrSize dflags)
        dst_p <- assignTempE $ cmmOffsetExprW dflags dst_elems_p dst_off
        src_p <- assignTempE $ cmmOffsetExprW dflags
                 (cmmOffsetB dflags src (arrPtrsHdrSize dflags)) src_off
        let bytes = wordsToBytes dflags n

        copy src dst dst_p src_p bytes

        -- The base address of the destination card table
        dst_cards_p <- assignTempE $ cmmOffsetExprW dflags dst_elems_p
                       (loadArrPtrsSize dflags dst)

        emitSetCards dst_off dst_cards_p n

-- | Small-array analogue of 'doCopyArrayOp' (no card table involved).
doCopySmallArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff
                   -> FCode ()
doCopySmallArrayOp = emitCopySmallArray copy
  where
    -- Copy data (we assume the arrays aren't overlapping since
    -- they're of different types)
    copy _src _dst dst_p src_p bytes =
        do dflags <- getDynFlags
           emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
               (wordAlignment dflags)


-- | Small-array analogue of 'doCopyMutableArrayOp'.
doCopySmallMutableArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff
                          -> FCode ()
doCopySmallMutableArrayOp = emitCopySmallArray copy
  where
    -- The only time the memory might overlap is when the two arrays
    -- we were provided are the same array!
    -- TODO: Optimize branch for common case of no aliasing.
    copy src dst dst_p src_p bytes = do
        dflags <- getDynFlags
        (moveCall, cpyCall) <- forkAltPair
            (getCode $ emitMemmoveCall dst_p src_p (mkIntExpr dflags bytes)
             (wordAlignment dflags))
            (getCode $ emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
             (wordAlignment dflags))
        emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall

-- | Shared skeleton for the small pointer-array copy primops; like
-- 'emitCopyArray' but for SmallArray#, which has no card table to update.
emitCopySmallArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> ByteOff
                       -> FCode ())  -- ^ copy function
                   -> CmmExpr        -- ^ source array
                   -> CmmExpr        -- ^ offset in source array
                   -> CmmExpr        -- ^ destination array
                   -> CmmExpr        -- ^ offset in destination array
                   -> WordOff        -- ^ number of elements to copy
                   -> FCode ()
emitCopySmallArray copy src0 src_off dst0 dst_off n =
    when (n /= 0) $ do
        dflags <- getDynFlags

        -- Passed as arguments (be careful)
        src     <- assignTempE src0
        dst     <- assignTempE dst0

        -- Set the dirty bit in the header.
        emit (setInfo dst (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))

        dst_p <- assignTempE $ cmmOffsetExprW dflags
                 (cmmOffsetB dflags dst (smallArrPtrsHdrSize dflags)) dst_off
        src_p <- assignTempE $ cmmOffsetExprW dflags
                 (cmmOffsetB dflags src (smallArrPtrsHdrSize dflags)) src_off
        let bytes = wordsToBytes dflags n

        copy src dst dst_p src_p bytes

-- | Takes an info table label, a register to return the newly
-- allocated array in, a source array, an offset in the source array,
-- and the number of elements to copy.  Allocates a new array and
-- initializes it from the source array.
emitCloneArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> WordOff
               -> FCode ()
emitCloneArray info_p res_r src src_off n = do
    dflags <- getDynFlags

    let info_ptr = mkLblExpr info_p
        rep = arrPtrsRep dflags n

    tickyAllocPrim (mkIntExpr dflags (arrPtrsHdrSize dflags))
        (mkIntExpr dflags (nonHdrSize dflags rep))
        (zeroExpr dflags)

    let hdr_size = fixedHdrSize dflags

    -- Fill in the StgMutArrPtrs header: element count and total
    -- (elements + card table) size in words.
    base <- allocHeapClosure rep info_ptr cccsExpr
                     [ (mkIntExpr dflags n,
                        hdr_size + oFFSET_StgMutArrPtrs_ptrs dflags)
                     , (mkIntExpr dflags (nonHdrSizeW rep),
                        hdr_size + oFFSET_StgMutArrPtrs_size dflags)
                     ]

    arr <- CmmLocal `fmap` newTemp (bWord dflags)
    emit $ mkAssign arr base

    dst_p <- assignTempE $ cmmOffsetB dflags (CmmReg arr)
             (arrPtrsHdrSize dflags)
    src_p <- assignTempE $ cmmOffsetExprW dflags src
             (cmmAddWord dflags
              (mkIntExpr dflags (arrPtrsHdrSizeW dflags)) src_off)

    emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n))
        (wordAlignment dflags)

    emit $ mkAssign (CmmLocal res_r) (CmmReg arr)

-- | Takes an info table label, a register to return the newly
-- allocated array in, a source array, an offset in the source array,
-- and the number of elements to copy.  Allocates a new array and
-- initializes it from the source array.
emitCloneSmallArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> WordOff
                    -> FCode ()
emitCloneSmallArray info_p res_r src src_off n = do
    dflags <- getDynFlags

    let info_ptr = mkLblExpr info_p
        rep = smallArrPtrsRep n

    tickyAllocPrim (mkIntExpr dflags (smallArrPtrsHdrSize dflags))
        (mkIntExpr dflags (nonHdrSize dflags rep))
        (zeroExpr dflags)

    let hdr_size = fixedHdrSize dflags

    -- SmallArray# header only records the element count (no card table).
    base <- allocHeapClosure rep info_ptr cccsExpr
                     [ (mkIntExpr dflags n,
                        hdr_size + oFFSET_StgSmallMutArrPtrs_ptrs dflags)
                     ]

    arr <- CmmLocal `fmap` newTemp (bWord dflags)
    emit $ mkAssign arr base

    dst_p <- assignTempE $ cmmOffsetB dflags (CmmReg arr)
             (smallArrPtrsHdrSize dflags)
    src_p <- assignTempE $ cmmOffsetExprW dflags src
             (cmmAddWord dflags
              (mkIntExpr dflags (smallArrPtrsHdrSizeW dflags)) src_off)

    emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n))
        (wordAlignment dflags)

    emit $ mkAssign (CmmLocal res_r) (CmmReg arr)

-- | Takes an offset in the destination array, the base address of
-- the card table, and the number of elements affected (*not* the
-- number of cards).  The number of elements may not be zero.
-- Marks the relevant cards as dirty.
emitSetCards :: CmmExpr -> CmmExpr -> WordOff -> FCode ()
emitSetCards dst_start dst_cards_start n = do
    dflags <- getDynFlags
    start_card <- assignTempE $ cardCmm dflags dst_start
    -- Card index of the *last* affected element (hence the -1).
    let end_card = cardCmm dflags
                   (cmmSubWord dflags
                    (cmmAddWord dflags dst_start (mkIntExpr dflags n))
                    (mkIntExpr dflags 1))
    -- memset the inclusive card range [start_card .. end_card] to 1.
    emitMemsetCall (cmmAddWord dflags dst_cards_start start_card)
        (mkIntExpr dflags 1)
        (cmmAddWord dflags (cmmSubWord dflags end_card start_card) (mkIntExpr dflags 1))
        (mkAlignment 1) -- no alignment (1 byte)

-- Convert an element index to a card index
cardCmm :: DynFlags -> CmmExpr -> CmmExpr
cardCmm dflags i =
    cmmUShrWord dflags i (mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags))

------------------------------------------------------------------------------
-- SmallArray PrimOp implementations

-- | Read an element of a 'SmallMutableArray#' into @res@.
doReadSmallPtrArrayOp :: LocalReg
                      -> CmmExpr
                      -> CmmExpr
                      -> FCode ()
doReadSmallPtrArrayOp res addr idx = do
    dflags <- getDynFlags
    mkBasicIndexedRead (smallArrPtrsHdrSize dflags) Nothing (gcWord dflags) res addr
        (gcWord dflags) idx

-- | Write an element of a 'SmallMutableArray#' and dirty the closure.
doWriteSmallPtrArrayOp :: CmmExpr
                       -> CmmExpr
                       -> CmmExpr
                       -> FCode ()
doWriteSmallPtrArrayOp addr idx val = do
    dflags <- getDynFlags
    let ty = cmmExprType dflags val
    -- Write barrier must precede the store so other capabilities never
    -- observe the new pointer before the writes it depends on.
    emitPrimCall [] MO_WriteBarrier [] -- #12469
    mkBasicIndexedWrite (smallArrPtrsHdrSize dflags) Nothing addr ty idx val
    emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))

------------------------------------------------------------------------------
-- Atomic read-modify-write

-- | Emit an atomic modification to a byte array element. The result
-- reg contains that previous value of the element. Implies a full
-- memory barrier.
doAtomicRMW :: LocalReg      -- ^ Result reg
            -> AtomicMachOp  -- ^ Atomic op (e.g. add)
            -> CmmExpr       -- ^ MutableByteArray#
            -> CmmExpr       -- ^ Index
            -> CmmType       -- ^ Type of element by which we are indexing
            -> CmmExpr       -- ^ Op argument (e.g. amount to add)
            -> FCode ()
doAtomicRMW res amop mba idx idx_ty n = do
    dflags <- getDynFlags
    let width = typeWidth idx_ty
        addr  = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
                width mba idx
    emitPrimCall
        [ res ]
        (MO_AtomicRMW width amop)
        [ addr, n ]

-- | Emit an atomic read to a byte array that acts as a memory barrier.
doAtomicReadByteArray
    :: LocalReg  -- ^ Result reg
    -> CmmExpr   -- ^ MutableByteArray#
    -> CmmExpr   -- ^ Index
    -> CmmType   -- ^ Type of element by which we are indexing
    -> FCode ()
doAtomicReadByteArray res mba idx idx_ty = do
    dflags <- getDynFlags
    let width = typeWidth idx_ty
        addr  = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
                width mba idx
    emitPrimCall
        [ res ]
        (MO_AtomicRead width)
        [ addr ]

-- | Emit an atomic write to a byte array that acts as a memory barrier.
doAtomicWriteByteArray
    :: CmmExpr   -- ^ MutableByteArray#
    -> CmmExpr   -- ^ Index
    -> CmmType   -- ^ Type of element by which we are indexing
    -> CmmExpr   -- ^ Value to write
    -> FCode ()
doAtomicWriteByteArray mba idx idx_ty val = do
    dflags <- getDynFlags
    let width = typeWidth idx_ty
        addr  = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
                width mba idx
    emitPrimCall
        [ {- no results -} ]
        (MO_AtomicWrite width)
        [ addr, val ]

-- | Emit a compare-and-swap on a byte array element; @res@ receives the
-- value observed at the address before the operation.
doCasByteArray
    :: LocalReg  -- ^ Result reg
    -> CmmExpr   -- ^ MutableByteArray#
    -> CmmExpr   -- ^ Index
    -> CmmType   -- ^ Type of element by which we are indexing
    -> CmmExpr   -- ^ Old value
    -> CmmExpr   -- ^ New value
    -> FCode ()
doCasByteArray res mba idx idx_ty old new = do
    dflags <- getDynFlags
    let width = (typeWidth idx_ty)
        addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
               width mba idx
    emitPrimCall
        [ res ]
        (MO_Cmpxchg width)
        [ addr, old, new ]

------------------------------------------------------------------------------
-- Helpers for emitting function calls

-- | Emit a call to @memcpy@.
emitMemcpyCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
emitMemcpyCall dst src n align = do
    emitPrimCall
        [ {-no results-} ]
        (MO_Memcpy (alignmentBytes align))
        [ dst, src, n ]

-- | Emit a call to @memmove@.
emitMemmoveCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
emitMemmoveCall dst src n align = do
    emitPrimCall
        [ {- no results -} ]
        (MO_Memmove (alignmentBytes align))
        [ dst, src, n ]

-- | Emit a call to @memset@.  The second argument must fit inside an
-- unsigned char.
emitMemsetCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
emitMemsetCall dst c n align = do
    emitPrimCall
        [ {- no results -} ]
        (MO_Memset (alignmentBytes align))
        [ dst, c, n ]

-- | Emit a call to @memcmp@, widening the 32-bit C result to a word
-- in @res@ when necessary.
emitMemcmpCall :: LocalReg -> CmmExpr -> CmmExpr -> CmmExpr -> Int -> FCode ()
emitMemcmpCall res ptr1 ptr2 n align = do
    -- 'MO_Memcmp' is assumed to return an 32bit 'CInt' because all
    -- code-gens currently call out to the @memcmp(3)@ C function.
    -- This was easier than moving the sign-extensions into
    -- all the code-gens.
    dflags <- getDynFlags
    let is32Bit = typeWidth (localRegType res) == W32

    cres <- if is32Bit
              then return res
              else newTemp b32

    emitPrimCall
        [ cres ]
        (MO_Memcmp align)
        [ ptr1, ptr2, n ]

    unless is32Bit $ do
      emit $ mkAssign (CmmLocal res)
                      (CmmMachOp
                         (mo_s_32ToWord dflags)
                         [(CmmReg (CmmLocal cres))])

-- | Emit a byte-swap primcall.
emitBSwapCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitBSwapCall res x width = do
    emitPrimCall
        [ res ]
        (MO_BSwap width)
        [ x ]

-- | Emit a bit-reverse primcall.
emitBRevCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitBRevCall res x width = do
    emitPrimCall
        [ res ]
        (MO_BRev width)
        [ x ]

-- | Emit a population-count primcall.
emitPopCntCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitPopCntCall res x width = do
    emitPrimCall
        [ res ]
        (MO_PopCnt width)
        [ x ]

-- | Emit a parallel-bit-deposit primcall.
emitPdepCall :: LocalReg -> CmmExpr -> CmmExpr -> Width -> FCode ()
emitPdepCall res x y width = do
    emitPrimCall
        [ res ]
        (MO_Pdep width)
        [ x, y ]

-- | Emit a parallel-bit-extract primcall.
emitPextCall :: LocalReg -> CmmExpr -> CmmExpr -> Width -> FCode ()
emitPextCall res x y width = do
    emitPrimCall
        [ res ]
        (MO_Pext width)
        [ x, y ]

-- | Emit a count-leading-zeros primcall.
emitClzCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitClzCall res x width = do
    emitPrimCall
        [ res ]
        (MO_Clz width)
        [ x ]

-- | Emit a count-trailing-zeros primcall.
emitCtzCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitCtzCall res x width = do
    emitPrimCall
        [ res ]
        (MO_Ctz width)
        [ x ]
diff --git a/compiler/GHC/StgToCmm/Prof.hs b/compiler/GHC/StgToCmm/Prof.hs
new file mode 100644
index 0000000000..ce8ef61f17
--- /dev/null
+++ b/compiler/GHC/StgToCmm/Prof.hs
@@ -0,0 +1,360 @@
-----------------------------------------------------------------------------
--
-- Code generation for profiling
--
-- (c) The University of Glasgow 2004-2006
--
-----------------------------------------------------------------------------

module GHC.StgToCmm.Prof (
        initCostCentres, ccType, ccsType,
        mkCCostCentre, mkCCostCentreStack,

        -- Cost-centre Profiling
        dynProfHdr, profDynAlloc, profAlloc, staticProfHdr, initUpdFrameProf,
        enterCostCentreThunk, enterCostCentreFun,
        costCentreFrom,
        storeCurCCS,
        emitSetCCC,

        saveCurrentCostCentre, restoreCurrentCostCentre,

        -- Lag/drag/void stuff
        ldvEnter, ldvEnterClosure, ldvRecordCreate
  ) where

import GhcPrelude

import GHC.StgToCmm.Closure
import GHC.StgToCmm.Utils
import GHC.StgToCmm.Monad
import SMRep

import MkGraph
import Cmm
import CmmUtils
import CLabel

import CostCentre
import DynFlags
import FastString
import Module
import Outputable

import Control.Monad
import Data.Char (ord)

-----------------------------------------------------------------------------
--
-- Cost-centre-stack Profiling
--
-----------------------------------------------------------------------------

-- Expression representing the current cost centre stack
ccsType :: DynFlags -> CmmType -- Type of a cost-centre stack
ccsType = bWord

ccType :: DynFlags -> CmmType -- Type of a cost centre
ccType = bWord

storeCurCCS :: CmmExpr -> CmmAGraph
storeCurCCS e = mkAssign cccsReg e

mkCCostCentre :: CostCentre -> CmmLit
mkCCostCentre cc = CmmLabel (mkCCLabel cc)

mkCCostCentreStack :: CostCentreStack -> CmmLit
mkCCostCentreStack ccs = CmmLabel (mkCCSLabel ccs)

costCentreFrom :: DynFlags
               -> CmmExpr -- A closure pointer
               -> CmmExpr -- The cost centre from that closure
costCentreFrom dflags cl = CmmLoad (cmmOffsetB dflags cl (oFFSET_StgHeader_ccs dflags)) (ccsType dflags)

-- | The profiling header words in a static closure
staticProfHdr :: DynFlags -> CostCentreStack -> [CmmLit]
staticProfHdr dflags ccs
  = ifProfilingL dflags [mkCCostCentreStack ccs, staticLdvInit dflags]

-- | Profiling header words in a dynamic closure
dynProfHdr :: DynFlags -> CmmExpr -> [CmmExpr]
dynProfHdr dflags ccs = ifProfilingL dflags [ccs, dynLdvInit dflags]

-- | Initialise the profiling field of an update frame
initUpdFrameProf :: CmmExpr -> FCode ()
initUpdFrameProf frame
  = ifProfiling $        -- frame->header.prof.ccs = CCCS
    do dflags <- getDynFlags
       emitStore (cmmOffset dflags frame (oFFSET_StgHeader_ccs dflags)) cccsExpr
        -- frame->header.prof.hp.rs = NULL (or frame-header.prof.hp.ldvw = 0)
        -- is unnecessary because it is not used anyhow.

---------------------------------------------------------------------------
-- Saving and restoring the current cost centre
---------------------------------------------------------------------------

{-      Note [Saving the current cost centre]
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The current cost centre is like a global register.  Like other
global registers, it's a caller-saves one.  But consider
   case (f x) of (p,q) -> rhs
Since 'f' may set the cost centre, we must restore it
before resuming rhs.  So we want code like this:
        local_cc = CCC  -- save
        r = f( x )
        CCC = local_cc  -- restore
That is, we explicitly "save" the current cost centre in
a LocalReg, local_cc; and restore it after the call. The
C-- infrastructure will arrange to save local_cc across the
call.

The same goes for join points;
   let j x = join-stuff
   in blah-blah
We want this kind of code:
      local_cc = CCC  -- save
      blah-blah
   J:
      CCC = local_cc  -- restore
-}

saveCurrentCostCentre :: FCode (Maybe LocalReg)
        -- Returns Nothing if profiling is off
saveCurrentCostCentre
  = do dflags <- getDynFlags
       if not (gopt Opt_SccProfilingOn dflags)
           then return Nothing
           else do local_cc <- newTemp (ccType dflags)
                   emitAssign (CmmLocal local_cc) cccsExpr
                   return (Just local_cc)

restoreCurrentCostCentre :: Maybe LocalReg -> FCode ()
restoreCurrentCostCentre Nothing
  = return ()
restoreCurrentCostCentre (Just local_cc)
  = emit (storeCurCCS (CmmReg (CmmLocal local_cc)))


-------------------------------------------------------------------------------
-- Recording allocation in a cost centre
-------------------------------------------------------------------------------

-- | Record the allocation of a closure.  The CmmExpr is the cost
-- centre stack to which to attribute the allocation.
profDynAlloc :: SMRep -> CmmExpr -> FCode ()
profDynAlloc rep ccs
  = ifProfiling $
    do dflags <- getDynFlags
       profAlloc (mkIntExpr dflags (heapClosureSizeW dflags rep)) ccs

-- | Record the allocation of a closure (size is given by a CmmExpr)
-- The size must be in words, because the allocation counter in a CCS counts
-- in words.
profAlloc :: CmmExpr -> CmmExpr -> FCode ()
profAlloc words ccs
  = ifProfiling $
        do dflags <- getDynFlags
           let alloc_rep = rEP_CostCentreStack_mem_alloc dflags
           emit (addToMemE alloc_rep
                       (cmmOffsetB dflags ccs (oFFSET_CostCentreStack_mem_alloc dflags))
                       (CmmMachOp (MO_UU_Conv (wordWidth dflags) (typeWidth alloc_rep)) $
                         [CmmMachOp (mo_wordSub dflags) [words,
                                                         mkIntExpr dflags (profHdrSize dflags)]]))
                       -- subtract the "profiling overhead", which is the
                       -- profiling header in a closure.

-- -----------------------------------------------------------------------
-- Setting the current cost centre on entry to a closure

enterCostCentreThunk :: CmmExpr -> FCode ()
enterCostCentreThunk closure =
  ifProfiling $ do
      dflags <- getDynFlags
      emit $ storeCurCCS (costCentreFrom dflags closure)

enterCostCentreFun :: CostCentreStack -> CmmExpr -> FCode ()
enterCostCentreFun ccs closure =
  ifProfiling $ do
    if isCurrentCCS ccs
       then do dflags <- getDynFlags
               emitRtsCall rtsUnitId (fsLit "enterFunCCS")
                   [(baseExpr, AddrHint),
                    (costCentreFrom dflags closure, AddrHint)] False
       else return () -- top-level function, nothing to do

-- | Run the given code only when cost-centre profiling is enabled.
ifProfiling :: FCode () -> FCode ()
ifProfiling code
  = do dflags <- getDynFlags
       if gopt Opt_SccProfilingOn dflags
           then code
           else return ()

-- | Keep the list only when cost-centre profiling is enabled.
ifProfilingL :: DynFlags -> [a] -> [a]
ifProfilingL dflags xs
  | gopt Opt_SccProfilingOn dflags = xs
  | otherwise                      = []


---------------------------------------------------------------
--      Initialising Cost Centres & CCSs
---------------------------------------------------------------

initCostCentres :: CollectedCCs -> FCode ()
-- Emit the declarations
initCostCentres (local_CCs, singleton_CCSs)
  = do dflags <- getDynFlags
       when (gopt Opt_SccProfilingOn dflags) $
           do mapM_ emitCostCentreDecl local_CCs
              mapM_ emitCostCentreStackDecl singleton_CCSs


-- | Emit the static data for one CostCentre (mirrors struct CostCentre
-- in the RTS; fields are padded out with zeroes and linked at startup).
emitCostCentreDecl :: CostCentre -> FCode ()
emitCostCentreDecl cc = do
  { dflags <- getDynFlags
  ; let is_caf | isCafCC cc = mkIntCLit dflags (ord 'c') -- 'c' == is a CAF
               | otherwise  = zero dflags
                        -- NB. bytesFS: we want the UTF-8 bytes here (#5559)
  ; label <- newByteStringCLit (bytesFS $ costCentreUserNameFS cc)
  ; modl  <- newByteStringCLit (bytesFS $ Module.moduleNameFS
                                        $ Module.moduleName
                                        $ cc_mod cc)
  ; loc <- newByteStringCLit $ bytesFS $ mkFastString $
                   showPpr dflags (costCentreSrcSpan cc)
           -- XXX going via FastString to get UTF-8 encoding is silly
  ; let
     lits = [ zero dflags,  -- StgInt ccID,
              label,        -- char *label,
              modl,         -- char *module,
              loc,          -- char *srcloc,
              zero64,       -- StgWord64 mem_alloc
              zero dflags,  -- StgWord time_ticks
              is_caf,       -- StgInt is_caf
              zero dflags   -- struct _CostCentre *link
            ]
  ; emitDataLits (mkCCLabel cc) lits
  }

-- | Emit the static data for a singleton CostCentreStack.
emitCostCentreStackDecl :: CostCentreStack -> FCode ()
emitCostCentreStackDecl ccs
  = case maybeSingletonCCS ccs of
    Just cc ->
        do dflags <- getDynFlags
           let mk_lits cc = zero dflags :
                            mkCCostCentre cc :
                            replicate (sizeof_ccs_words dflags - 2) (zero dflags)
                -- Note: to avoid making any assumptions about how the
                -- C compiler (that compiles the RTS, in particular) does
                -- layouts of structs containing long-longs, simply
                -- pad out the struct with zero words until we hit the
                -- size of the overall struct (which we get via DerivedConstants.h)
           emitDataLits (mkCCSLabel ccs) (mk_lits cc)
    Nothing -> pprPanic "emitCostCentreStackDecl" (ppr ccs)

zero :: DynFlags -> CmmLit
zero dflags = mkIntCLit dflags 0
zero64 :: CmmLit
zero64 = CmmInt 0 W64

sizeof_ccs_words :: DynFlags -> Int
sizeof_ccs_words dflags
    -- round up to the next word.
  | ms == 0   = ws
  | otherwise = ws + 1
  where
   (ws,ms) = sIZEOF_CostCentreStack dflags `divMod` wORD_SIZE dflags

-- ---------------------------------------------------------------------------
-- Set the current cost centre stack

emitSetCCC :: CostCentre -> Bool -> Bool -> FCode ()
emitSetCCC cc tick push
 = do dflags <- getDynFlags
      if not (gopt Opt_SccProfilingOn dflags)
          then return ()
          else do tmp <- newTemp (ccsType dflags)
                  pushCostCentre tmp cccsExpr cc
                  when tick $ emit (bumpSccCount dflags (CmmReg (CmmLocal tmp)))
                  when push $ emit (storeCurCCS (CmmReg (CmmLocal tmp)))

-- | Call the RTS @pushCostCentre@ to push @cc@ onto @ccs@, result in @result@.
pushCostCentre :: LocalReg -> CmmExpr -> CostCentre -> FCode ()
pushCostCentre result ccs cc
  = emitRtsCallWithResult result AddrHint
        rtsUnitId
        (fsLit "pushCostCentre") [(ccs,AddrHint),
                                  (CmmLit (mkCCostCentre cc), AddrHint)]
        False

-- | Increment the scc_count field of the given CCS.
bumpSccCount :: DynFlags -> CmmExpr -> CmmAGraph
bumpSccCount dflags ccs
  = addToMem (rEP_CostCentreStack_scc_count dflags)
         (cmmOffsetB dflags ccs (oFFSET_CostCentreStack_scc_count dflags)) 1

-----------------------------------------------------------------------------
--
--              Lag/drag/void stuff
--
-----------------------------------------------------------------------------

--
-- Initial value for the LDV field in a static closure
--
staticLdvInit :: DynFlags -> CmmLit
staticLdvInit = zeroCLit

--
-- Initial value of the LDV field in a dynamic closure
--
dynLdvInit :: DynFlags -> CmmExpr
dynLdvInit dflags =     -- (era << LDV_SHIFT) | LDV_STATE_CREATE
  CmmMachOp (mo_wordOr dflags) [
      CmmMachOp (mo_wordShl dflags) [loadEra dflags, mkIntExpr dflags (lDV_SHIFT dflags)],
      CmmLit (mkWordCLit dflags (iLDV_STATE_CREATE dflags))
  ]

--
-- Initialise the LDV word of a new closure
--
ldvRecordCreate :: CmmExpr -> FCode ()
ldvRecordCreate closure = do
  dflags <- getDynFlags
  emit $ mkStore (ldvWord dflags closure) (dynLdvInit dflags)

--
-- | Called when a closure is entered, marks the closure as having
-- been "used".  The closure is not an "inherently used" one.  The
-- closure is not @IND@ because that is not considered for LDV profiling.
--
ldvEnterClosure :: ClosureInfo -> CmmReg -> FCode ()
ldvEnterClosure closure_info node_reg = do
    dflags <- getDynFlags
    let tag = funTag dflags closure_info
    -- don't forget to subtract node's tag
    ldvEnter (cmmOffsetB dflags (CmmReg node_reg) (-tag))

ldvEnter :: CmmExpr -> FCode ()
-- Argument is a closure pointer
ldvEnter cl_ptr = do
    dflags <- getDynFlags
    let -- don't forget to subtract node's tag
        ldv_wd = ldvWord dflags cl_ptr
        new_ldv_wd = cmmOrWord dflags (cmmAndWord dflags (CmmLoad ldv_wd (bWord dflags))
                                                  (CmmLit (mkWordCLit dflags (iLDV_CREATE_MASK dflags))))
                     (cmmOrWord dflags (loadEra dflags) (CmmLit (mkWordCLit dflags (iLDV_STATE_USE dflags))))
    ifProfiling $
         -- if (era > 0) {
         --    LDVW((c)) = (LDVW((c)) & LDV_CREATE_MASK) |
         --                era | LDV_STATE_USE }
        emit =<< mkCmmIfThenElse (CmmMachOp (mo_wordUGt dflags) [loadEra dflags, CmmLit (zeroCLit dflags)])
                     (mkStore ldv_wd new_ldv_wd)
                     mkNop

loadEra :: DynFlags -> CmmExpr
loadEra dflags = CmmMachOp (MO_UU_Conv (cIntWidth dflags) (wordWidth dflags))
    [CmmLoad (mkLblExpr (mkCmmDataLabel rtsUnitId (fsLit "era")))
             (cInt dflags)]

ldvWord :: DynFlags -> CmmExpr -> CmmExpr
-- Takes the address of a closure, and returns
-- the address of the LDV word in the closure
ldvWord dflags closure_ptr
    = cmmOffsetB dflags closure_ptr (oFFSET_StgHeader_ldvw dflags)
diff --git a/compiler/GHC/StgToCmm/Ticky.hs b/compiler/GHC/StgToCmm/Ticky.hs
new file mode 100644
index 0000000000..06ef520c0d
--- /dev/null
+++ b/compiler/GHC/StgToCmm/Ticky.hs
@@ -0,0 +1,682 @@
{-# LANGUAGE BangPatterns #-}

-----------------------------------------------------------------------------
--
-- Code generation for ticky-ticky profiling
--
-- (c) The University of
Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +{- OVERVIEW: ticky ticky profiling + +Please see +https://gitlab.haskell.org/ghc/ghc/wikis/debugging/ticky-ticky and also +edit it and the rest of this comment to keep them up-to-date if you +change ticky-ticky. Thanks! + + *** All allocation ticky numbers are in bytes. *** + +Some of the relevant source files: + + ***not necessarily an exhaustive list*** + + * some codeGen/ modules import this one + + * this module imports cmm/CLabel.hs to manage labels + + * cmm/CmmParse.y expands some macros using generators defined in + this module + + * includes/stg/Ticky.h declares all of the global counters + + * includes/rts/Ticky.h declares the C data type for an + STG-declaration's counters + + * some macros defined in includes/Cmm.h (and used within the RTS's + CMM code) update the global ticky counters + + * at the end of execution rts/Ticky.c generates the final report + +RTS -r<report-file> -RTS + +The rts/Ticky.c function that generates the report includes an +STG-declaration's ticky counters if + + * that declaration was entered, or + + * it was allocated (if -ticky-allocd) + +On either of those events, the counter is "registered" by adding it to +a linked list; cf the CMM generated by registerTickyCtr. + +Ticky-ticky profiling has evolved over many years. Many of the +counters from its most sophisticated days are no longer +active/accurate. As the RTS has changed, sometimes the ticky code for +relevant counters was not accordingly updated. Unfortunately, neither +were the comments. + +As of March 2013, there still exist deprecated code and comments in +the code generator as well as the RTS because: + + * I don't know what is out-of-date versus merely commented out for + momentary convenience, and + + * someone else might know how to repair it! 
+ +-} + +module GHC.StgToCmm.Ticky ( + withNewTickyCounterFun, + withNewTickyCounterLNE, + withNewTickyCounterThunk, + withNewTickyCounterStdThunk, + withNewTickyCounterCon, + + tickyDynAlloc, + tickyAllocHeap, + + tickyAllocPrim, + tickyAllocThunk, + tickyAllocPAP, + tickyHeapCheck, + tickyStackCheck, + + tickyUnknownCall, tickyDirectCall, + + tickyPushUpdateFrame, + tickyUpdateFrameOmitted, + + tickyEnterDynCon, + tickyEnterStaticCon, + tickyEnterViaNode, + + tickyEnterFun, + tickyEnterThunk, tickyEnterStdThunk, -- dynamic non-value + -- thunks only + tickyEnterLNE, + + tickyUpdateBhCaf, + tickyBlackHole, + tickyUnboxedTupleReturn, + tickyReturnOldCon, tickyReturnNewCon, + + tickyKnownCallTooFewArgs, tickyKnownCallExact, tickyKnownCallExtraArgs, + tickySlowCall, tickySlowCallPat, + ) where + +import GhcPrelude + +import GHC.StgToCmm.ArgRep ( slowCallPattern , toArgRep , argRepString ) +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Monad + +import StgSyn +import CmmExpr +import MkGraph +import CmmUtils +import CLabel +import SMRep + +import Module +import Name +import Id +import BasicTypes +import FastString +import Outputable +import Util + +import DynFlags + +-- Turgid imports for showTypeCategory +import PrelNames +import TcType +import Type +import TyCon + +import Data.Maybe +import qualified Data.Char +import Control.Monad ( when ) + +----------------------------------------------------------------------------- +-- +-- Ticky-ticky profiling +-- +----------------------------------------------------------------------------- + +data TickyClosureType + = TickyFun + Bool -- True <-> single entry + | TickyCon + | TickyThunk + Bool -- True <-> updateable + Bool -- True <-> standard thunk (AP or selector), has no entry counter + | TickyLNE + +withNewTickyCounterFun :: Bool -> Name -> [NonVoid Id] -> FCode a -> FCode a +withNewTickyCounterFun single_entry = withNewTickyCounter (TickyFun single_entry) + +withNewTickyCounterLNE :: Name -> 
[NonVoid Id] -> FCode a -> FCode a +withNewTickyCounterLNE nm args code = do + b <- tickyLNEIsOn + if not b then code else withNewTickyCounter TickyLNE nm args code + +thunkHasCounter :: Bool -> FCode Bool +thunkHasCounter isStatic = do + b <- tickyDynThunkIsOn + pure (not isStatic && b) + +withNewTickyCounterThunk + :: Bool -- ^ static + -> Bool -- ^ updateable + -> Name + -> FCode a + -> FCode a +withNewTickyCounterThunk isStatic isUpdatable name code = do + has_ctr <- thunkHasCounter isStatic + if not has_ctr + then code + else withNewTickyCounter (TickyThunk isUpdatable False) name [] code + +withNewTickyCounterStdThunk + :: Bool -- ^ updateable + -> Name + -> FCode a + -> FCode a +withNewTickyCounterStdThunk isUpdatable name code = do + has_ctr <- thunkHasCounter False + if not has_ctr + then code + else withNewTickyCounter (TickyThunk isUpdatable True) name [] code + +withNewTickyCounterCon + :: Name + -> FCode a + -> FCode a +withNewTickyCounterCon name code = do + has_ctr <- thunkHasCounter False + if not has_ctr + then code + else withNewTickyCounter TickyCon name [] code + +-- args does not include the void arguments +withNewTickyCounter :: TickyClosureType -> Name -> [NonVoid Id] -> FCode a -> FCode a +withNewTickyCounter cloType name args m = do + lbl <- emitTickyCounter cloType name args + setTickyCtrLabel lbl m + +emitTickyCounter :: TickyClosureType -> Name -> [NonVoid Id] -> FCode CLabel +emitTickyCounter cloType name args + = let ctr_lbl = mkRednCountsLabel name in + (>> return ctr_lbl) $ + ifTicky $ do + { dflags <- getDynFlags + ; parent <- getTickyCtrLabel + ; mod_name <- getModuleName + + -- When printing the name of a thing in a ticky file, we + -- want to give the module name even for *local* things. We + -- print just "x (M)" rather that "M.x" to distinguish them + -- from the global kind. 
+ ; let ppr_for_ticky_name :: SDoc + ppr_for_ticky_name = + let n = ppr name + ext = case cloType of + TickyFun single_entry -> parens $ hcat $ punctuate comma $ + [text "fun"] ++ [text "se"|single_entry] + TickyCon -> parens (text "con") + TickyThunk upd std -> parens $ hcat $ punctuate comma $ + [text "thk"] ++ [text "se"|not upd] ++ [text "std"|std] + TickyLNE | isInternalName name -> parens (text "LNE") + | otherwise -> panic "emitTickyCounter: how is this an external LNE?" + p = case hasHaskellName parent of + -- NB the default "top" ticky ctr does not + -- have a Haskell name + Just pname -> text "in" <+> ppr (nameUnique pname) + _ -> empty + in if isInternalName name + then n <+> parens (ppr mod_name) <+> ext <+> p + else n <+> ext <+> p + + ; fun_descr_lit <- newStringCLit $ showSDocDebug dflags ppr_for_ticky_name + ; arg_descr_lit <- newStringCLit $ map (showTypeCategory . idType . fromNonVoid) args + ; emitDataLits ctr_lbl + -- Must match layout of includes/rts/Ticky.h's StgEntCounter + -- + -- krc: note that all the fields are I32 now; some were I16 + -- before, but the code generator wasn't handling that + -- properly and it led to chaos, panic and disorder. + [ mkIntCLit dflags 0, -- registered? 
+ mkIntCLit dflags (length args), -- Arity + mkIntCLit dflags 0, -- Heap allocated for this thing + fun_descr_lit, + arg_descr_lit, + zeroCLit dflags, -- Entries into this thing + zeroCLit dflags, -- Heap allocated by this thing + zeroCLit dflags -- Link to next StgEntCounter + ] + } + +-- ----------------------------------------------------------------------------- +-- Ticky stack frames + +tickyPushUpdateFrame, tickyUpdateFrameOmitted :: FCode () +tickyPushUpdateFrame = ifTicky $ bumpTickyCounter (fsLit "UPDF_PUSHED_ctr") +tickyUpdateFrameOmitted = ifTicky $ bumpTickyCounter (fsLit "UPDF_OMITTED_ctr") + +-- ----------------------------------------------------------------------------- +-- Ticky entries + +-- NB the name-specific entries are only available for names that have +-- dedicated Cmm code. As far as I know, this just rules out +-- constructor thunks. For them, there is no CMM code block to put the +-- bump of name-specific ticky counter into. On the other hand, we can +-- still track allocation their allocation. 

tickyEnterDynCon, tickyEnterStaticCon, tickyEnterViaNode :: FCode ()
-- Bump the global entry counters for, respectively: dynamically
-- allocated constructors, static constructors, and entries made
-- indirectly "via node".
tickyEnterDynCon      = ifTicky $ bumpTickyCounter (fsLit "ENT_DYN_CON_ctr")
tickyEnterStaticCon   = ifTicky $ bumpTickyCounter (fsLit "ENT_STATIC_CON_ctr")
tickyEnterViaNode     = ifTicky $ bumpTickyCounter (fsLit "ENT_VIA_NODE_ctr")

-- | Called on entry to a thunk: bumps the appropriate global
-- static/dynamic single/many counter and, when this thunk has its own
-- StgEntCounter (cf thunkHasCounter), registers that counter and bumps
-- its entry count as well.
tickyEnterThunk :: ClosureInfo -> FCode ()
tickyEnterThunk cl_info
  = ifTicky $ do
    { bumpTickyCounter ctr
    ; has_ctr <- thunkHasCounter static
    ; when has_ctr $ do
        ticky_ctr_lbl <- getTickyCtrLabel
        registerTickyCtrAtEntryDyn ticky_ctr_lbl
        bumpTickyEntryCount ticky_ctr_lbl }
  where
    -- NOTE(review): despite its name, this flag comes from
    -- closureSingleEntry, i.e. it is True for *single-entry* closures;
    -- the SINGLE/MANY counter selection below relies on exactly that.
    -- Consider renaming to single_entry.
    updatable = closureSingleEntry cl_info
    static    = isStaticClosure cl_info

    ctr | static    = if updatable then fsLit "ENT_STATIC_THK_SINGLE_ctr"
                                   else fsLit "ENT_STATIC_THK_MANY_ctr"
        | otherwise = if updatable then fsLit "ENT_DYN_THK_SINGLE_ctr"
                                   else fsLit "ENT_DYN_THK_MANY_ctr"

-- | Standard (AP/selector) thunks share the ordinary thunk-entry
-- accounting; they never get a per-closure entry counter because
-- thunkHasCounter is consulted inside tickyEnterThunk.
tickyEnterStdThunk :: ClosureInfo -> FCode ()
tickyEnterStdThunk = tickyEnterThunk

-- | Bump the global black-hole update counter.
tickyBlackHole :: Bool{-updatable-} -> FCode ()
tickyBlackHole updatable
  = ifTicky (bumpTickyCounter ctr)
  where
    -- NOTE(review): the mapping looks inverted — the updatable case
    -- bumps the SINGLE_ENTRY counter and vice versa. Confirm against
    -- the counter definitions in includes/stg/Ticky.h before trusting
    -- these two numbers in a ticky report.
    ctr | updatable = (fsLit "UPD_BH_SINGLE_ENTRY_ctr")
        | otherwise = (fsLit "UPD_BH_UPDATABLE_ctr")

-- | Bump the global counter for CAF black-hole updates.
tickyUpdateBhCaf :: ClosureInfo -> FCode ()
tickyUpdateBhCaf cl_info
  = ifTicky (bumpTickyCounter ctr)
  where
    -- NOTE(review): same apparent inversion as in tickyBlackHole:
    -- update-required closures bump the SINGLE_ENTRY counter. Verify
    -- the intended meaning of these counters.
    ctr | closureUpdReqd cl_info = (fsLit "UPD_CAF_BH_SINGLE_ENTRY_ctr")
        | otherwise              = (fsLit "UPD_CAF_BH_UPDATABLE_ctr")

-- | Called on (direct) entry to a function: bumps the static/dynamic
-- direct-entry counter, registers the function's own StgEntCounter
-- (unconditionally for static closures; for dynamic ones only when
-- -ticky-allocd did not already register it at allocation time), and
-- bumps its entry count.
tickyEnterFun :: ClosureInfo -> FCode ()
tickyEnterFun cl_info = ifTicky $ do
  ctr_lbl <- getTickyCtrLabel

  if isStaticClosure cl_info
    then do bumpTickyCounter (fsLit "ENT_STATIC_FUN_DIRECT_ctr")
            registerTickyCtr ctr_lbl
    else do bumpTickyCounter (fsLit "ENT_DYN_FUN_DIRECT_ctr")
            registerTickyCtrAtEntryDyn ctr_lbl

  bumpTickyEntryCount ctr_lbl

-- | Called on entry to a let-no-escape binding: always bumps the
-- global LNE counter; the per-binding counter is maintained only when
-- -ticky-LNE is on.
tickyEnterLNE :: FCode ()
tickyEnterLNE = ifTicky $ do
  bumpTickyCounter (fsLit "ENT_LNE_ctr")
  ifTickyLNE $ do
    ctr_lbl <- getTickyCtrLabel
    registerTickyCtr ctr_lbl
    bumpTickyEntryCount ctr_lbl

-- needn't register a counter upon entry if
--
-- 1) it's for a dynamic closure, and
--
-- 2) -ticky-allocd is on
--
-- since the counter was registered already upon being alloc'd
registerTickyCtrAtEntryDyn :: CLabel -> FCode ()
registerTickyCtrAtEntryDyn ctr_lbl = do
  already_registered <- tickyAllocdIsOn
  when (not already_registered) $ registerTickyCtr ctr_lbl

registerTickyCtr :: CLabel -> FCode ()
-- Register a ticky counter, i.e. hook it onto the front of the global
-- ticky_entry_ctrs list (idempotent via the registeredp flag):
--   if ( ! f_ct.registeredp ) {
--     f_ct.link = ticky_entry_ctrs; /* hook this one onto the front of the list */
--     ticky_entry_ctrs = & (f_ct);  /* mark it as "registered" */
--     f_ct.registeredp = 1 }
registerTickyCtr ctr_lbl = do
  dflags <- getDynFlags
  let
    -- krc: code generator doesn't handle Not, so we test for Eq 0 instead
    test = CmmMachOp (MO_Eq (wordWidth dflags))
             [CmmLoad (CmmLit (cmmLabelOffB ctr_lbl
                                (oFFSET_StgEntCounter_registeredp dflags))) (bWord dflags),
              zeroExpr dflags]
    register_stmts
      = [ mkStore (CmmLit (cmmLabelOffB ctr_lbl (oFFSET_StgEntCounter_link dflags)))
                  (CmmLoad ticky_entry_ctrs (bWord dflags))
        , mkStore ticky_entry_ctrs (mkLblExpr ctr_lbl)
        , mkStore (CmmLit (cmmLabelOffB ctr_lbl
                            (oFFSET_StgEntCounter_registeredp dflags)))
                  (mkIntExpr dflags 1) ]
    -- Head of the RTS-owned linked list of registered counters.
    ticky_entry_ctrs = mkLblExpr (mkCmmDataLabel rtsUnitId (fsLit "ticky_entry_ctrs"))
  emit =<< mkCmmIfThen test (catAGraphs register_stmts)

-- Bump the counter and the return-arity histogram for a return to an
-- existing (old) or freshly allocated (new) constructor.
tickyReturnOldCon, tickyReturnNewCon :: RepArity -> FCode ()
tickyReturnOldCon arity
  = ifTicky $ do { bumpTickyCounter (fsLit "RET_OLD_ctr")
                 ; bumpHistogram    (fsLit "RET_OLD_hst") arity }
tickyReturnNewCon arity
  = ifTicky $ do { bumpTickyCounter (fsLit "RET_NEW_ctr")
                 ; bumpHistogram    (fsLit "RET_NEW_hst") arity }

-- Bump the counter and arity histogram for an unboxed-tuple return.
tickyUnboxedTupleReturn :: RepArity -> FCode ()
tickyUnboxedTupleReturn arity
  = ifTicky $ do { bumpTickyCounter (fsLit "RET_UNBOXED_TUP_ctr")
                 ; bumpHistogram    (fsLit "RET_UNBOXED_TUP_hst") arity }

-- -----------------------------------------------------------------------------
-- Ticky calls

-- Ticks at a *call site*:
-- Known call: exact-arity calls bump KNOWN_CALL_ctr; over-saturated
-- calls additionally record the slow-call pattern of the extra args.
tickyDirectCall :: RepArity -> [StgArg] -> FCode ()
tickyDirectCall arity args
  | args `lengthIs` arity = tickyKnownCallExact
  | otherwise = do tickyKnownCallExtraArgs
                   tickySlowCallPat (map argPrimRep (drop arity args))

tickyKnownCallTooFewArgs :: FCode ()
tickyKnownCallTooFewArgs = ifTicky $ bumpTickyCounter (fsLit "KNOWN_CALL_TOO_FEW_ARGS_ctr")

tickyKnownCallExact :: FCode ()
tickyKnownCallExact = ifTicky $ bumpTickyCounter (fsLit "KNOWN_CALL_ctr")

tickyKnownCallExtraArgs :: FCode ()
tickyKnownCallExtraArgs = ifTicky $ bumpTickyCounter (fsLit "KNOWN_CALL_EXTRA_ARGS_ctr")

tickyUnknownCall :: FCode ()
tickyUnknownCall = ifTicky $ bumpTickyCounter (fsLit "UNKNOWN_CALL_ctr")

-- Tick for the call pattern at a slow call site (i.e. in addition to
-- tickyUnknownCall, tickyKnownCallExtraArgs, etc.)
tickySlowCall :: LambdaFormInfo -> [StgArg] -> FCode ()
tickySlowCall _ [] = return ()
tickySlowCall lf_info args = do
  -- see Note [Ticky for slow calls]
  if isKnownFun lf_info
    then tickyKnownCallTooFewArgs
    else tickyUnknownCall
  tickySlowCallPat (map argPrimRep args)

-- Bump the counter for this call's argument pattern: the dedicated
-- SLOW_CALL_fast_<pattern>_ctr when the whole argument list matches a
-- built-in RTS apply pattern, otherwise VERY_SLOW_CALL_ctr.
tickySlowCallPat :: [PrimRep] -> FCode ()
tickySlowCallPat args = ifTicky $
  let argReps = map toArgRep args
      (_, n_matched) = slowCallPattern argReps
  in if n_matched > 0 && args `lengthIs` n_matched
     then bumpTickyLbl $ mkRtsSlowFastTickyCtrLabel $ concatMap (map Data.Char.toLower . argRepString) argReps
     else bumpTickyCounter $ fsLit "VERY_SLOW_CALL_ctr"

{-

Note [Ticky for slow calls]
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Terminology is unfortunately a bit mixed up for these calls. codeGen
uses "slow call" to refer to unknown calls and under-saturated known
calls.

Nowadays, though (ie as of the eval/apply paper), the significantly
slower calls are actually just a subset of these: the ones with no
built-in argument pattern (cf GHC.StgToCmm.ArgRep.slowCallPattern)

So for ticky profiling, we split slow calls into
"SLOW_CALL_fast_<pattern>_ctr" (those matching a built-in pattern) and
VERY_SLOW_CALL_ctr (those without a built-in pattern; these are very
bad for both space and time).

-}

-- -----------------------------------------------------------------------------
-- Ticky allocation

tickyDynAlloc :: Maybe Id -> SMRep -> LambdaFormInfo -> FCode ()
-- Called when doing a dynamic heap allocation; the LambdaFormInfo
-- used to distinguish between closure types
--
-- TODO what else to count while we're here?
tickyDynAlloc mb_id rep lf = ifTicky $ getDynFlags >>= \dflags ->
  let -- Allocation size in bytes (all ticky allocation numbers are bytes).
      bytes = wORD_SIZE dflags * heapClosureSizeW dflags rep

      -- Bump both the global byte total and the global event counter.
      countGlobal tot ctr = do
        bumpTickyCounterBy tot bytes
        bumpTickyCounter   ctr
      -- Bump the allocatee's own StgEntCounter (only with -ticky-allocd),
      -- registering it at allocation time.
      countSpecific = ifTickyAllocd $ case mb_id of
        Nothing -> return ()
        Just id -> do
          let ctr_lbl = mkRednCountsLabel (idName id)
          registerTickyCtr ctr_lbl
          bumpTickyAllocd ctr_lbl bytes

      -- TODO are we still tracking "good stuff" (_gds) versus
      -- administrative (_adm) versus slop (_slp)? I'm going with all _gds
      -- for now, since I currently know neither whether we do nor how to
      -- distinguish. NSF Mar 2013

  in case () of
    -- NOTE(review): the per-closure bump for constructors is guarded by
    -- ifTickyDynThunk, exactly as for thunks — confirm that gating con
    -- allocation counters on -ticky-dyn-thunk is intended.
    _ | isConRep rep   ->
        ifTickyDynThunk countSpecific >>
        countGlobal (fsLit "ALLOC_CON_gds") (fsLit "ALLOC_CON_ctr")
      | isThunkRep rep ->
        ifTickyDynThunk countSpecific >>
        if lfUpdatable lf
        then countGlobal (fsLit "ALLOC_THK_gds") (fsLit "ALLOC_UP_THK_ctr")
        else countGlobal (fsLit "ALLOC_THK_gds") (fsLit "ALLOC_SE_THK_ctr")
      | isFunRep rep   ->
        countSpecific >>
        countGlobal (fsLit "ALLOC_FUN_gds") (fsLit "ALLOC_FUN_ctr")
      | otherwise -> panic "How is this heap object not a con, thunk, or fun?"
+ + + +tickyAllocHeap :: + Bool -> -- is this a genuine allocation? As opposed to + -- GHC.StgToCmm.Layout.adjustHpBackwards + VirtualHpOffset -> FCode () +-- Called when doing a heap check [TICK_ALLOC_HEAP] +-- Must be lazy in the amount of allocation! +tickyAllocHeap genuine hp + = ifTicky $ + do { dflags <- getDynFlags + ; ticky_ctr <- getTickyCtrLabel + ; emit $ catAGraphs $ + -- only test hp from within the emit so that the monadic + -- computation itself is not strict in hp (cf knot in + -- GHC.StgToCmm.Monad.getHeapUsage) + if hp == 0 then [] + else let !bytes = wORD_SIZE dflags * hp in [ + -- Bump the allocation total in the closure's StgEntCounter + addToMem (rEP_StgEntCounter_allocs dflags) + (CmmLit (cmmLabelOffB ticky_ctr (oFFSET_StgEntCounter_allocs dflags))) + bytes, + -- Bump the global allocation total ALLOC_HEAP_tot + addToMemLbl (bWord dflags) + (mkCmmDataLabel rtsUnitId (fsLit "ALLOC_HEAP_tot")) + bytes, + -- Bump the global allocation counter ALLOC_HEAP_ctr + if not genuine then mkNop + else addToMemLbl (bWord dflags) + (mkCmmDataLabel rtsUnitId (fsLit "ALLOC_HEAP_ctr")) + 1 + ]} + + +-------------------------------------------------------------------------------- +-- these three are only called from CmmParse.y (ie ultimately from the RTS) + +-- the units are bytes + +tickyAllocPrim :: CmmExpr -- ^ size of the full header, in bytes + -> CmmExpr -- ^ size of the payload, in bytes + -> CmmExpr -> FCode () +tickyAllocPrim _hdr _goods _slop = ifTicky $ do + bumpTickyCounter (fsLit "ALLOC_PRIM_ctr") + bumpTickyCounterByE (fsLit "ALLOC_PRIM_adm") _hdr + bumpTickyCounterByE (fsLit "ALLOC_PRIM_gds") _goods + bumpTickyCounterByE (fsLit "ALLOC_PRIM_slp") _slop + +tickyAllocThunk :: CmmExpr -> CmmExpr -> FCode () +tickyAllocThunk _goods _slop = ifTicky $ do + -- TODO is it ever called with a Single-Entry thunk? 
+ bumpTickyCounter (fsLit "ALLOC_UP_THK_ctr") + bumpTickyCounterByE (fsLit "ALLOC_THK_gds") _goods + bumpTickyCounterByE (fsLit "ALLOC_THK_slp") _slop + +tickyAllocPAP :: CmmExpr -> CmmExpr -> FCode () +tickyAllocPAP _goods _slop = ifTicky $ do + bumpTickyCounter (fsLit "ALLOC_PAP_ctr") + bumpTickyCounterByE (fsLit "ALLOC_PAP_gds") _goods + bumpTickyCounterByE (fsLit "ALLOC_PAP_slp") _slop + +tickyHeapCheck :: FCode () +tickyHeapCheck = ifTicky $ bumpTickyCounter (fsLit "HEAP_CHK_ctr") + +tickyStackCheck :: FCode () +tickyStackCheck = ifTicky $ bumpTickyCounter (fsLit "STK_CHK_ctr") + +-- ----------------------------------------------------------------------------- +-- Ticky utils + +ifTicky :: FCode () -> FCode () +ifTicky code = + getDynFlags >>= \dflags -> when (gopt Opt_Ticky dflags) code + +tickyAllocdIsOn :: FCode Bool +tickyAllocdIsOn = gopt Opt_Ticky_Allocd `fmap` getDynFlags + +tickyLNEIsOn :: FCode Bool +tickyLNEIsOn = gopt Opt_Ticky_LNE `fmap` getDynFlags + +tickyDynThunkIsOn :: FCode Bool +tickyDynThunkIsOn = gopt Opt_Ticky_Dyn_Thunk `fmap` getDynFlags + +ifTickyAllocd :: FCode () -> FCode () +ifTickyAllocd code = tickyAllocdIsOn >>= \b -> when b code + +ifTickyLNE :: FCode () -> FCode () +ifTickyLNE code = tickyLNEIsOn >>= \b -> when b code + +ifTickyDynThunk :: FCode () -> FCode () +ifTickyDynThunk code = tickyDynThunkIsOn >>= \b -> when b code + +bumpTickyCounter :: FastString -> FCode () +bumpTickyCounter lbl = bumpTickyLbl (mkCmmDataLabel rtsUnitId lbl) + +bumpTickyCounterBy :: FastString -> Int -> FCode () +bumpTickyCounterBy lbl = bumpTickyLblBy (mkCmmDataLabel rtsUnitId lbl) + +bumpTickyCounterByE :: FastString -> CmmExpr -> FCode () +bumpTickyCounterByE lbl = bumpTickyLblByE (mkCmmDataLabel rtsUnitId lbl) + +bumpTickyEntryCount :: CLabel -> FCode () +bumpTickyEntryCount lbl = do + dflags <- getDynFlags + bumpTickyLit (cmmLabelOffB lbl (oFFSET_StgEntCounter_entry_count dflags)) + +bumpTickyAllocd :: CLabel -> Int -> FCode () +bumpTickyAllocd lbl 
bytes = do + dflags <- getDynFlags + bumpTickyLitBy (cmmLabelOffB lbl (oFFSET_StgEntCounter_allocd dflags)) bytes + +bumpTickyLbl :: CLabel -> FCode () +bumpTickyLbl lhs = bumpTickyLitBy (cmmLabelOffB lhs 0) 1 + +bumpTickyLblBy :: CLabel -> Int -> FCode () +bumpTickyLblBy lhs = bumpTickyLitBy (cmmLabelOffB lhs 0) + +bumpTickyLblByE :: CLabel -> CmmExpr -> FCode () +bumpTickyLblByE lhs = bumpTickyLitByE (cmmLabelOffB lhs 0) + +bumpTickyLit :: CmmLit -> FCode () +bumpTickyLit lhs = bumpTickyLitBy lhs 1 + +bumpTickyLitBy :: CmmLit -> Int -> FCode () +bumpTickyLitBy lhs n = do + dflags <- getDynFlags + emit (addToMem (bWord dflags) (CmmLit lhs) n) + +bumpTickyLitByE :: CmmLit -> CmmExpr -> FCode () +bumpTickyLitByE lhs e = do + dflags <- getDynFlags + emit (addToMemE (bWord dflags) (CmmLit lhs) e) + +bumpHistogram :: FastString -> Int -> FCode () +bumpHistogram lbl n = do + dflags <- getDynFlags + let offset = n `min` (tICKY_BIN_COUNT dflags - 1) + emit (addToMem (bWord dflags) + (cmmIndexExpr dflags + (wordWidth dflags) + (CmmLit (CmmLabel (mkCmmDataLabel rtsUnitId lbl))) + (CmmLit (CmmInt (fromIntegral offset) (wordWidth dflags)))) + 1) + +------------------------------------------------------------------ +-- Showing the "type category" for ticky-ticky profiling + +showTypeCategory :: Type -> Char + {- + + dictionary + + > function + + {C,I,F,D,W} char, int, float, double, word + {c,i,f,d,w} unboxed ditto + + T tuple + + P other primitive type + p unboxed ditto + + L list + E enumeration type + S other single-constructor type + M other multi-constructor data-con type + + . other type + + - reserved for others to mark as "uninteresting" + + Accurate as of Mar 2013, but I eliminated the Array category instead + of updating it, for simplicity. It's in P/p, I think --NSF + + -} +showTypeCategory ty + | isDictTy ty = '+' + | otherwise = case tcSplitTyConApp_maybe ty of + Nothing -> '.' 
+ Just (tycon, _) -> + (if isUnliftedTyCon tycon then Data.Char.toLower else id) $ + let anyOf us = getUnique tycon `elem` us in + case () of + _ | anyOf [funTyConKey] -> '>' + | anyOf [charPrimTyConKey, charTyConKey] -> 'C' + | anyOf [doublePrimTyConKey, doubleTyConKey] -> 'D' + | anyOf [floatPrimTyConKey, floatTyConKey] -> 'F' + | anyOf [intPrimTyConKey, int32PrimTyConKey, int64PrimTyConKey, + intTyConKey, int8TyConKey, int16TyConKey, int32TyConKey, int64TyConKey + ] -> 'I' + | anyOf [wordPrimTyConKey, word32PrimTyConKey, word64PrimTyConKey, wordTyConKey, + word8TyConKey, word16TyConKey, word32TyConKey, word64TyConKey + ] -> 'W' + | anyOf [listTyConKey] -> 'L' + | isTupleTyCon tycon -> 'T' + | isPrimTyCon tycon -> 'P' + | isEnumerationTyCon tycon -> 'E' + | isJust (tyConSingleDataCon_maybe tycon) -> 'S' + | otherwise -> 'M' -- oh, well... diff --git a/compiler/GHC/StgToCmm/Utils.hs b/compiler/GHC/StgToCmm/Utils.hs new file mode 100644 index 0000000000..30e37bb930 --- /dev/null +++ b/compiler/GHC/StgToCmm/Utils.hs @@ -0,0 +1,578 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Code generator utilities; mostly monadic +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Utils ( + cgLit, mkSimpleLit, + emitDataLits, mkDataLits, + emitRODataLits, mkRODataLits, + emitRtsCall, emitRtsCallWithResult, emitRtsCallGen, + assignTemp, newTemp, + + newUnboxedTupleRegs, + + emitMultiAssign, emitCmmLitSwitch, emitSwitch, + + tagToClosure, mkTaggedObjectLoad, + + callerSaves, callerSaveVolatileRegs, get_GlobalReg_addr, + + cmmAndWord, cmmOrWord, cmmNegate, cmmEqWord, cmmNeWord, + cmmUGtWord, cmmSubWord, cmmMulWord, cmmAddWord, cmmUShrWord, + cmmOffsetExprW, cmmOffsetExprB, + cmmRegOffW, cmmRegOffB, + cmmLabelOffW, cmmLabelOffB, + cmmOffsetW, cmmOffsetB, + cmmOffsetLitW, cmmOffsetLitB, + cmmLoadIndexW, + 
cmmConstrTag1, + + cmmUntag, cmmIsTagged, + + addToMem, addToMemE, addToMemLblE, addToMemLbl, + mkWordCLit, + newStringCLit, newByteStringCLit, + blankWord, + ) where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Closure +import Cmm +import BlockId +import MkGraph +import GHC.Platform.Regs +import CLabel +import CmmUtils +import CmmSwitch +import GHC.StgToCmm.CgUtils + +import ForeignCall +import IdInfo +import Type +import TyCon +import SMRep +import Module +import Literal +import Digraph +import Util +import Unique +import UniqSupply (MonadUnique(..)) +import DynFlags +import FastString +import Outputable +import RepType + +import Data.ByteString (ByteString) +import qualified Data.ByteString.Char8 as BS8 +import qualified Data.Map as M +import Data.Char +import Data.List +import Data.Ord + + +------------------------------------------------------------------------- +-- +-- Literals +-- +------------------------------------------------------------------------- + +cgLit :: Literal -> FCode CmmLit +cgLit (LitString s) = newByteStringCLit s + -- not unpackFS; we want the UTF-8 byte stream. +cgLit other_lit = do dflags <- getDynFlags + return (mkSimpleLit dflags other_lit) + +mkSimpleLit :: DynFlags -> Literal -> CmmLit +mkSimpleLit dflags (LitChar c) = CmmInt (fromIntegral (ord c)) + (wordWidth dflags) +mkSimpleLit dflags LitNullAddr = zeroCLit dflags +mkSimpleLit dflags (LitNumber LitNumInt i _) = CmmInt i (wordWidth dflags) +mkSimpleLit _ (LitNumber LitNumInt64 i _) = CmmInt i W64 +mkSimpleLit dflags (LitNumber LitNumWord i _) = CmmInt i (wordWidth dflags) +mkSimpleLit _ (LitNumber LitNumWord64 i _) = CmmInt i W64 +mkSimpleLit _ (LitFloat r) = CmmFloat r W32 +mkSimpleLit _ (LitDouble r) = CmmFloat r W64 +mkSimpleLit _ (LitLabel fs ms fod) + = let -- TODO: Literal labels might not actually be in the current package... 
+ labelSrc = ForeignLabelInThisPackage + in CmmLabel (mkForeignLabel fs ms labelSrc fod) +-- NB: LitRubbish should have been lowered in "CoreToStg" +mkSimpleLit _ other = pprPanic "mkSimpleLit" (ppr other) + +-------------------------------------------------------------------------- +-- +-- Incrementing a memory location +-- +-------------------------------------------------------------------------- + +addToMemLbl :: CmmType -> CLabel -> Int -> CmmAGraph +addToMemLbl rep lbl n = addToMem rep (CmmLit (CmmLabel lbl)) n + +addToMemLblE :: CmmType -> CLabel -> CmmExpr -> CmmAGraph +addToMemLblE rep lbl = addToMemE rep (CmmLit (CmmLabel lbl)) + +addToMem :: CmmType -- rep of the counter + -> CmmExpr -- Address + -> Int -- What to add (a word) + -> CmmAGraph +addToMem rep ptr n = addToMemE rep ptr (CmmLit (CmmInt (toInteger n) (typeWidth rep))) + +addToMemE :: CmmType -- rep of the counter + -> CmmExpr -- Address + -> CmmExpr -- What to add (a word-typed expression) + -> CmmAGraph +addToMemE rep ptr n + = mkStore ptr (CmmMachOp (MO_Add (typeWidth rep)) [CmmLoad ptr rep, n]) + + +------------------------------------------------------------------------- +-- +-- Loading a field from an object, +-- where the object pointer is itself tagged +-- +------------------------------------------------------------------------- + +mkTaggedObjectLoad + :: DynFlags -> LocalReg -> LocalReg -> ByteOff -> DynTag -> CmmAGraph +-- (loadTaggedObjectField reg base off tag) generates assignment +-- reg = bitsK[ base + off - tag ] +-- where K is fixed by 'reg' +mkTaggedObjectLoad dflags reg base offset tag + = mkAssign (CmmLocal reg) + (CmmLoad (cmmOffsetB dflags + (CmmReg (CmmLocal base)) + (offset - tag)) + (localRegType reg)) + +------------------------------------------------------------------------- +-- +-- Converting a closure tag to a closure for enumeration types +-- (this is the implementation of tagToEnum#). 
+-- +------------------------------------------------------------------------- + +tagToClosure :: DynFlags -> TyCon -> CmmExpr -> CmmExpr +tagToClosure dflags tycon tag + = CmmLoad (cmmOffsetExprW dflags closure_tbl tag) (bWord dflags) + where closure_tbl = CmmLit (CmmLabel lbl) + lbl = mkClosureTableLabel (tyConName tycon) NoCafRefs + +------------------------------------------------------------------------- +-- +-- Conditionals and rts calls +-- +------------------------------------------------------------------------- + +emitRtsCall :: UnitId -> FastString -> [(CmmExpr,ForeignHint)] -> Bool -> FCode () +emitRtsCall pkg fun args safe = emitRtsCallGen [] (mkCmmCodeLabel pkg fun) args safe + +emitRtsCallWithResult :: LocalReg -> ForeignHint -> UnitId -> FastString + -> [(CmmExpr,ForeignHint)] -> Bool -> FCode () +emitRtsCallWithResult res hint pkg fun args safe + = emitRtsCallGen [(res,hint)] (mkCmmCodeLabel pkg fun) args safe + +-- Make a call to an RTS C procedure +emitRtsCallGen + :: [(LocalReg,ForeignHint)] + -> CLabel + -> [(CmmExpr,ForeignHint)] + -> Bool -- True <=> CmmSafe call + -> FCode () +emitRtsCallGen res lbl args safe + = do { dflags <- getDynFlags + ; updfr_off <- getUpdFrameOff + ; let (caller_save, caller_load) = callerSaveVolatileRegs dflags + ; emit caller_save + ; call updfr_off + ; emit caller_load } + where + call updfr_off = + if safe then + emit =<< mkCmmCall fun_expr res' args' updfr_off + else do + let conv = ForeignConvention CCallConv arg_hints res_hints CmmMayReturn + emit $ mkUnsafeCall (ForeignTarget fun_expr conv) res' args' + (args', arg_hints) = unzip args + (res', res_hints) = unzip res + fun_expr = mkLblExpr lbl + + +----------------------------------------------------------------------------- +-- +-- Caller-Save Registers +-- +----------------------------------------------------------------------------- + +-- Here we generate the sequence of saves/restores required around a +-- foreign call instruction. 
+ +-- TODO: reconcile with includes/Regs.h +-- * Regs.h claims that BaseReg should be saved last and loaded first +-- * This might not have been tickled before since BaseReg is callee save +-- * Regs.h saves SparkHd, ParkT1, SparkBase and SparkLim +-- +-- This code isn't actually used right now, because callerSaves +-- only ever returns true in the current universe for registers NOT in +-- system_regs (just do a grep for CALLER_SAVES in +-- includes/stg/MachRegs.h). It's all one giant no-op, and for +-- good reason: having to save system registers on every foreign call +-- would be very expensive, so we avoid assigning them to those +-- registers when we add support for an architecture. +-- +-- Note that the old code generator actually does more work here: it +-- also saves other global registers. We can't (nor want) to do that +-- here, as we don't have liveness information. And really, we +-- shouldn't be doing the workaround at this point in the pipeline, see +-- Note [Register parameter passing] and the ToDo on CmmCall in +-- cmm/CmmNode.hs. Right now the workaround is to avoid inlining across +-- unsafe foreign calls in rewriteAssignments, but this is strictly +-- temporary. 
-- | Build the pair of instruction sequences (save, load) that spill and
-- reload the caller-saved global STG registers around a foreign call.
callerSaveVolatileRegs :: DynFlags -> (CmmAGraph, CmmAGraph)
callerSaveVolatileRegs dflags = (caller_save, caller_load)
  where
    platform = targetPlatform dflags

    -- Registers the STG machine depends on; only those that the C
    -- calling convention clobbers (per 'callerSaves') are spilled.
    system_regs = [ Sp,SpLim,Hp,HpLim,CCCS,CurrentTSO,CurrentNursery
                    {- ,SparkHd,SparkTl,SparkBase,SparkLim -}
                  , BaseReg ]

    regs_to_save = filter (callerSaves platform) system_regs

    caller_save = catAGraphs [ save_reg    r | r <- regs_to_save ]
    caller_load = catAGraphs [ restore_reg r | r <- regs_to_save ]

    -- Spill one global register to its fixed memory slot.
    save_reg r
      = mkStore (get_GlobalReg_addr dflags r) (CmmReg (CmmGlobal r))

    -- Reload one global register from its fixed memory slot.
    restore_reg r
      = mkAssign (CmmGlobal r)
                 (CmmLoad (get_GlobalReg_addr dflags r)
                          (globalRegType dflags r))


-------------------------------------------------------------------------
--
-- Strings generate a top-level data block
--
-------------------------------------------------------------------------

-- | Emit a data-segment data block.
emitDataLits :: CLabel -> [CmmLit] -> FCode ()
emitDataLits lbl lits = emitDecl (mkDataLits (Section Data lbl) lbl lits)

-- | Emit a read-only data block.
emitRODataLits :: CLabel -> [CmmLit] -> FCode ()
emitRODataLits lbl lits = emitDecl (mkRODataLits lbl lits)

-- | Make a global definition for the string, and return its label.
newStringCLit :: String -> FCode CmmLit
newStringCLit str = newByteStringCLit (BS8.pack str)

-- | Make a global definition for the byte string, and return its label.
newByteStringCLit :: ByteString -> FCode CmmLit
newByteStringCLit bytes = do
    uniq <- newUnique
    let (lit, decl) = mkByteStringCLit (mkStringLitLabel uniq) bytes
    emitDecl decl
    return lit

-------------------------------------------------------------------------
--
-- Assigning expressions to temporaries
--
-------------------------------------------------------------------------

-- | Make sure the argument is in a local register.
-- We don't bother being particularly aggressive with avoiding
-- unnecessary local registers, since we can rely on a later
-- optimization pass to inline as necessary (and skipping out
-- on things like global registers can be a little dangerous
-- due to them being trashed on foreign calls--though it means
-- the optimization pass doesn't have to do as much work)
assignTemp :: CmmExpr -> FCode LocalReg
assignTemp (CmmReg (CmmLocal reg)) = return reg
assignTemp e = do
    dflags <- getDynFlags
    uniq   <- newUnique
    let reg = LocalReg uniq (cmmExprType dflags e)
    emitAssign (CmmLocal reg) e
    return reg

-- | Allocate a fresh local register of the given Cmm type.
newTemp :: MonadUnique m => CmmType -> m LocalReg
newTemp rep = do
    uniq <- getUniqueM
    return (LocalReg uniq rep)

-- | Choose suitable local regs to use for the components
-- of an unboxed tuple that we are about to return to
-- the Sequel.  If the Sequel is a join point, using the
-- regs it wants will save later assignments.
newUnboxedTupleRegs :: Type -> FCode ([LocalReg], [ForeignHint])
newUnboxedTupleRegs res_ty
  = ASSERT( isUnboxedTupleType res_ty )
    do { dflags <- getDynFlags
       ; sequel <- getSequel
       ; regs   <- choose_regs dflags sequel
       ; ASSERT( regs `equalLength` reps )
         return (regs, map primRepForeignHint reps) }
  where
    reps = typePrimRep res_ty
    -- A join point dictates exactly which registers it expects.
    choose_regs _      (AssignTo regs _) = return regs
    choose_regs dflags _                 = mapM (newTemp . primRepCmmType dflags) reps



-------------------------------------------------------------------------
-- emitMultiAssign
-------------------------------------------------------------------------

emitMultiAssign :: [LocalReg] -> [CmmExpr] -> FCode ()
-- Emit code to perform the assignments in the
-- input simultaneously, using temporary variables when necessary.
type Key  = Int
type Vrtx = (Key, Stmt)          -- Give each vertex a unique number,
                                 -- for fast comparison
type Stmt = (LocalReg, CmmExpr)  -- r := e

-- We use the strongly-connected component algorithm, in which
--      * the vertices are the statements
--      * an edge goes from s1 to s2 iff
--              s1 assigns to something s2 uses
--        that is, if s1 should *follow* s2 in the final order

emitMultiAssign []    []    = return ()
emitMultiAssign [reg] [rhs] = emitAssign (CmmLocal reg) rhs
emitMultiAssign regs rhss = do
  dflags <- getDynFlags
  ASSERT2( equalLength regs rhss, ppr regs $$ ppr rhss )
    unscramble dflags ([1..] `zip` (regs `zip` rhss))

-- | Order (and, for cycles, temporarily rename) a set of parallel
-- assignments so that they can safely be emitted sequentially.
unscramble :: DynFlags -> [Vrtx] -> FCode ()
unscramble dflags vertices = mapM_ emit_component components
  where
    graph_nodes :: [ Node Key Vrtx ]
    graph_nodes = [ DigraphNode v k (succs_of s)
                  | v@(k, s) <- vertices ]

    -- Keys of every statement that the given one must follow.
    succs_of :: Stmt -> [Key]
    succs_of s1 = [ k2 | (k2, s2) <- vertices, s1 `mustFollow` s2 ]

    components :: [SCC Vrtx]
    components = stronglyConnCompFromEdgedVerticesUniq graph_nodes

    -- Deal with one strongly-connected component.
    -- Not cyclic, or singleton?  Just emit it.
    emit_component :: SCC Vrtx -> FCode ()
    emit_component (AcyclicSCC (_,stmt))  = emit_stmt stmt
    emit_component (CyclicSCC [])         = panic "do_component"
    emit_component (CyclicSCC [(_,stmt)]) = emit_stmt stmt
    -- A genuine cycle: break it by routing one assignment through a
    -- fresh temporary, then retry with the remaining statements.
    emit_component (CyclicSCC ((_,first_stmt) : rest)) = do
        dflags' <- getDynFlags
        u <- newUnique
        let (to_tmp, from_tmp) = break_cycle dflags' u first_stmt
        emit_stmt to_tmp
        unscramble dflags' rest
        emit_stmt from_tmp

    -- (r := e)  ==>  ((tmp := e), (r := tmp))
    break_cycle :: DynFlags -> Unique -> Stmt -> (Stmt, Stmt)
    break_cycle dflags' uniq (reg, rhs)
      = ((tmp, rhs), (reg, CmmReg (CmmLocal tmp)))
      where
        tmp = LocalReg uniq (cmmExprType dflags' rhs)

    emit_stmt :: Stmt -> FCode ()
    emit_stmt (reg, rhs) = emitAssign (CmmLocal reg) rhs

    mustFollow :: Stmt -> Stmt -> Bool
    (reg, _) `mustFollow` (_, rhs) = regUsedIn dflags (CmmLocal reg) rhs

-------------------------------------------------------------------------
-- mkSwitch
-------------------------------------------------------------------------


emitSwitch :: CmmExpr                      -- Tag to switch on
           -> [(ConTagZ, CmmAGraphScoped)] -- Tagged branches
           -> Maybe CmmAGraphScoped        -- Default branch (if any)
           -> ConTagZ -> ConTagZ           -- Min and Max possible values;
                                           -- behaviour outside this range is
                                           -- undefined
           -> FCode ()

-- First, two rather common cases in which there is no work to do
emitSwitch _ []         (Just code) _ _ = emit (fst code)
emitSwitch _ [(_,code)] Nothing     _ _ = emit (fst code)

-- Right, off we go
emitSwitch tag_expr branches mb_deflt lo_tag hi_tag = do
    join_lbl      <- newBlockId
    mb_deflt_lbl  <- label_default join_lbl mb_deflt
    branches_lbls <- label_branches join_lbl branches
    scrut         <- assignTemp' tag_expr

    -- mk_discrete_switch expects its branches sorted by tag
    let sorted_lbls = [ (fromIntegral i, l)
                      | (i,l) <- sortBy (comparing fst) branches_lbls ]
        range       = (fromIntegral lo_tag, fromIntegral hi_tag)

    emit $ mk_discrete_switch False scrut sorted_lbls mb_deflt_lbl range

    emitLabel join_lbl

mk_discrete_switch :: Bool -- ^ Use signed comparisons
                   -> CmmExpr
                   -> [(Integer, BlockId)]
                   -> Maybe BlockId
                   -> (Integer, Integer)
                   -> CmmAGraph

-- SINGLETON TAG RANGE: no case analysis to do
mk_discrete_switch _ _tag_expr [(tag, lbl)] _ (lo_tag, hi_tag)
  | lo_tag == hi_tag
  = ASSERT( tag == lo_tag )
    mkBranch lbl

-- SINGLETON BRANCH, NO DEFAULT: no case analysis to do
mk_discrete_switch _ _tag_expr [(_tag,lbl)] Nothing _
  = mkBranch lbl
        -- The simplifier might have eliminated a case
        --       so we may have e.g. case xs of
        --                               [] -> e
        -- In that situation we can be sure the (:) case
        -- can't happen, so no need to test

-- SOMETHING MORE COMPLICATED: defer to CmmImplementSwitchPlans
-- See Note [Cmm Switches, the general plan] in CmmSwitch
mk_discrete_switch signed tag_expr branches mb_deflt range
  = mkSwitch tag_expr $ mkSwitchTargets signed range mb_deflt (M.fromList branches)

-- | Split a (sorted) branch list around its middle tag.
divideBranches :: Ord a => [(a,b)] -> ([(a,b)], a, [(a,b)])
divideBranches branches = (lo_branches, mid, hi_branches)
  where
    -- 2 branches => n_branches `div` 2 = 1
    --            => branches !! 1 give the *second* tag
    -- There are always at least 2 branches here
    (mid,_) = branches !! (length branches `div` 2)
    (lo_branches, hi_branches) = span is_lo branches
    is_lo (t,_) = t < mid

--------------
emitCmmLitSwitch :: CmmExpr                      -- Tag to switch on
                 -> [(Literal, CmmAGraphScoped)] -- Tagged branches
                 -> CmmAGraphScoped              -- Default branch (always)
                 -> FCode ()                     -- Emit the code
emitCmmLitSwitch _scrut []       deflt = emit $ fst deflt
emitCmmLitSwitch scrut  branches deflt = do
    scrut' <- assignTemp' scrut
    join_lbl <- newBlockId
    deflt_lbl <- label_code join_lbl deflt
    branches_lbls <- label_branches join_lbl branches

    dflags <- getDynFlags
    let cmm_ty = cmmExprType dflags scrut
        rep    = typeWidth cmm_ty

    -- We find the necessary type information in the literals in the
    -- branches.  NB: total pattern match on the branch list (the empty
    -- case is dispatched by the first equation) instead of partial 'head'.
    let signed = case branches of
                   ((LitNumber nt _ _, _) : _) -> litNumIsSigned nt
                   _                           -> False

    let range | signed    = (tARGET_MIN_INT dflags, tARGET_MAX_INT dflags)
              | otherwise = (0, tARGET_MAX_WORD dflags)

    if isFloatType cmm_ty
    then emit =<< mk_float_switch rep scrut' deflt_lbl noBound branches_lbls
    else emit $ mk_discrete_switch
        signed
        scrut'
        [(litValue lit,l) | (lit,l) <- branches_lbls]
        (Just deflt_lbl)
        range
    emitLabel join_lbl

-- | lower bound (inclusive), upper bound (exclusive)
type LitBound = (Maybe Literal, Maybe Literal)

noBound :: LitBound
noBound = (Nothing, Nothing)

mk_float_switch :: Width -> CmmExpr -> BlockId
                -> LitBound
                -> [(Literal,BlockId)]
                -> FCode CmmAGraph
-- One branch left: branch to the default if the scrutinee is unequal to
-- the branch's literal, otherwise fall through to the branch itself.
mk_float_switch rep scrut deflt _bounds [(lit,blk)]
  = do dflags <- getDynFlags
       return $ mkCbranch (cond dflags) deflt blk Nothing
  where
    cond dflags = CmmMachOp ne [scrut, CmmLit cmm_lit]
      where
        cmm_lit = mkSimpleLit dflags lit
        ne      = MO_F_Ne rep

-- More than one branch: binary search on the literal value.
mk_float_switch rep scrut deflt_blk_id (lo_bound, hi_bound) branches
  = do dflags <- getDynFlags
       lo_blk <- mk_float_switch rep scrut deflt_blk_id bounds_lo lo_branches
       hi_blk <- mk_float_switch rep scrut deflt_blk_id bounds_hi hi_branches
       mkCmmIfThenElse (cond dflags) lo_blk hi_blk
  where
+ (lo_branches, mid_lit, hi_branches) = divideBranches branches + + bounds_lo = (lo_bound, Just mid_lit) + bounds_hi = (Just mid_lit, hi_bound) + + cond dflags = CmmMachOp lt [scrut, CmmLit cmm_lit] + where + cmm_lit = mkSimpleLit dflags mid_lit + lt = MO_F_Lt rep + + +-------------- +label_default :: BlockId -> Maybe CmmAGraphScoped -> FCode (Maybe BlockId) +label_default _ Nothing + = return Nothing +label_default join_lbl (Just code) + = do lbl <- label_code join_lbl code + return (Just lbl) + +-------------- +label_branches :: BlockId -> [(a,CmmAGraphScoped)] -> FCode [(a,BlockId)] +label_branches _join_lbl [] + = return [] +label_branches join_lbl ((tag,code):branches) + = do lbl <- label_code join_lbl code + branches' <- label_branches join_lbl branches + return ((tag,lbl):branches') + +-------------- +label_code :: BlockId -> CmmAGraphScoped -> FCode BlockId +-- label_code J code +-- generates +-- [L: code; goto J] +-- and returns L +label_code join_lbl (code,tsc) = do + lbl <- newBlockId + emitOutOfLine lbl (code MkGraph.<*> mkBranch join_lbl, tsc) + return lbl + +-------------- +assignTemp' :: CmmExpr -> FCode CmmExpr +assignTemp' e + | isTrivialCmmExpr e = return e + | otherwise = do + dflags <- getDynFlags + lreg <- newTemp (cmmExprType dflags e) + let reg = CmmLocal lreg + emitAssign reg e + return (CmmReg reg) |