diff options
Diffstat (limited to 'compiler/GHC')
-rw-r--r-- | compiler/GHC/ByteCode/Asm.hs | 63 | ||||
-rw-r--r-- | compiler/GHC/ByteCode/Instr.hs | 16 | ||||
-rw-r--r-- | compiler/GHC/ByteCode/Types.hs | 44 | ||||
-rw-r--r-- | compiler/GHC/Cmm/CallConv.hs | 113 | ||||
-rw-r--r-- | compiler/GHC/Cmm/Parser.y | 4 | ||||
-rw-r--r-- | compiler/GHC/Cmm/Reg.hs | 2 | ||||
-rw-r--r-- | compiler/GHC/StgToByteCode.hs | 211 | ||||
-rw-r--r-- | compiler/GHC/StgToCmm/Foreign.hs | 22 |
8 files changed, 346 insertions, 129 deletions
diff --git a/compiler/GHC/ByteCode/Asm.hs b/compiler/GHC/ByteCode/Asm.hs index 24e2645052..391949d448 100644 --- a/compiler/GHC/ByteCode/Asm.hs +++ b/compiler/GHC/ByteCode/Asm.hs @@ -12,7 +12,7 @@ module GHC.ByteCode.Asm ( bcoFreeNames, SizedSeq, sizeSS, ssElts, iNTERP_STACK_CHECK_THRESH, - mkTupleInfoLit + mkNativeCallInfoLit ) where import GHC.Prelude @@ -32,7 +32,6 @@ import GHC.Types.Unique.DSet import GHC.Utils.Outputable import GHC.Utils.Panic -import GHC.Utils.Panic.Plain import GHC.Core.TyCon import GHC.Data.FastString @@ -40,7 +39,7 @@ import GHC.Data.SizedSeq import GHC.StgToCmm.Layout ( ArgRep(..) ) import GHC.Cmm.Expr -import GHC.Cmm.CallConv ( tupleRegsCover ) +import GHC.Cmm.CallConv ( allArgRegsCover ) import GHC.Platform import GHC.Platform.Profile @@ -202,7 +201,8 @@ assembleBCO platform (ProtoBCO { protoBCOName = nm (final_insns, final_lits, final_ptrs) <- flip execStateT initial_state $ runAsm platform long_jumps env asm -- precomputed size should be equal to final size - massert (n_insns == sizeSS final_insns) + massertPpr (n_insns == sizeSS final_insns) + (text "bytecode instruction count mismatch") let asm_insns = ssElts final_insns insns_arr = Array.listArray (0, fromIntegral n_insns - 1) asm_insns @@ -351,7 +351,8 @@ largeArg platform w = case platformWordSize platform of fromIntegral (w `shiftR` 32), fromIntegral (w `shiftR` 16), fromIntegral w] - PW4 -> assert (w < fromIntegral (maxBound :: Word32)) $ + PW4 -> assertPpr (w < fromIntegral (maxBound :: Word32)) + (text "largeArg too big:" <+> ppr w) $ [fromIntegral (w `shiftR` 16), fromIntegral w] @@ -388,14 +389,14 @@ assembleI platform i = case i of -> do let ul_bco = assembleBCO platform proto p <- ioptr (liftM BCOPtrBCO ul_bco) emit (push_alts pk) [Op p] - PUSH_ALTS_TUPLE proto tuple_info tuple_proto + PUSH_ALTS_TUPLE proto call_info tuple_proto -> do let ul_bco = assembleBCO platform proto ul_tuple_bco = assembleBCO platform tuple_proto p <- ioptr (liftM BCOPtrBCO ul_bco) p_tup <- ioptr (liftM BCOPtrBCO ul_tuple_bco) info <- int (fromIntegral $ - mkTupleInfoSig platform tuple_info) + mkNativeCallInfoSig platform call_info) emit bci_PUSH_ALTS_T [Op p, Op info, Op p_tup] PUSH_PAD8 -> emit bci_PUSH_PAD8 [] @@ -491,6 +492,7 @@ assembleI platform i = case i of RETURN_TUPLE -> emit bci_RETURN_T [] CCALL off m_addr i -> do np <- addr m_addr emit bci_CCALL [SmallOp off, Op np, SmallOp i] + PRIMCALL -> emit bci_PRIMCALL [] BRK_FUN index uniq cc -> do p1 <- ptr BCOPtrBreakArray q <- int (getKey uniq) np <- addr cc @@ -580,41 +582,44 @@ return_unlifted V64 = error "return_unlifted: vector" maximum number of tuple elements may be larger. Elements can also take multiple words on the stack (for example Double# on a 32 bit platform). - -} -maxTupleNativeStackSize :: WordOff -maxTupleNativeStackSize = 62 +maxTupleReturnNativeStackSize :: WordOff +maxTupleReturnNativeStackSize = 62 {- - Construct the tuple_info word that stg_ctoi_t and stg_ret_t use - to convert a tuple between the native calling convention and the + Construct the call_info word that stg_ctoi_t, stg_ret_t and stg_primcall + use to convert arguments between the native calling convention and the interpreter. - See Note [GHCi tuple layout] for more information. + See Note [GHCi and native call registers] for more information. -} -mkTupleInfoSig :: Platform -> TupleInfo -> Word32 -mkTupleInfoSig platform TupleInfo{..} - | tupleNativeStackSize > maxTupleNativeStackSize - = pprPanic "mkTupleInfoSig: tuple too big for the bytecode compiler" - (ppr tupleNativeStackSize <+> text "stack words." <+> +mkNativeCallInfoSig :: Platform -> NativeCallInfo -> Word32 +mkNativeCallInfoSig platform NativeCallInfo{..} + | nativeCallType == NativeTupleReturn && nativeCallStackSpillSize > maxTupleReturnNativeStackSize + = pprPanic "mkNativeCallInfoSig: tuple too big for the bytecode compiler" + (ppr nativeCallStackSpillSize <+> text "stack words." <+> text "Use -fobject-code to get around this limit" ) | otherwise - = assert (length regs <= 24) {- 24 bits for bitmap -} - assert (tupleNativeStackSize < 255) {- 8 bits for stack size -} - assert (all (`elem` regs) (regSetToList tupleRegs)) {- all regs accounted for -} - foldl' reg_bit 0 (zip regs [0..]) .|. - (fromIntegral tupleNativeStackSize `shiftL` 24) + = assertPpr (length regs <= 24) (text "too many registers for bitmap:" <+> ppr (length regs)) {- 24 bits for register bitmap -} + assertPpr (cont_offset < 255) (text "continuation offset too large:" <+> ppr cont_offset) {- 8 bits for continuation offset (only for NativeTupleReturn) -} + assertPpr (all (`elem` regs) (regSetToList nativeCallRegs)) (text "not all registers accounted for") {- all regs accounted for -} + foldl' reg_bit 0 (zip regs [0..]) .|. (cont_offset `shiftL` 24) where + cont_offset :: Word32 + cont_offset + | nativeCallType == NativeTupleReturn = fromIntegral nativeCallStackSpillSize + | otherwise = 0 -- there is no continuation for primcalls + reg_bit :: Word32 -> (GlobalReg, Int) -> Word32 reg_bit x (r, n) - | r `elemRegSet` tupleRegs = x .|. 1 `shiftL` n - | otherwise = x - regs = tupleRegsCover platform + | r `elemRegSet` nativeCallRegs = x .|. 1 `shiftL` n + | otherwise = x + regs = allArgRegsCover platform -mkTupleInfoLit :: Platform -> TupleInfo -> Literal -mkTupleInfoLit platform tuple_info = - mkLitWord platform . fromIntegral $ mkTupleInfoSig platform tuple_info +mkNativeCallInfoLit :: Platform -> NativeCallInfo -> Literal +mkNativeCallInfoLit platform call_info = + mkLitWord platform . fromIntegral $ mkNativeCallInfoSig platform call_info -- Make lists of host-sized words for literals, so that when the -- words are placed in memory at increasing addresses, the diff --git a/compiler/GHC/ByteCode/Instr.hs b/compiler/GHC/ByteCode/Instr.hs index 498152c471..34baa57d40 100644 --- a/compiler/GHC/ByteCode/Instr.hs +++ b/compiler/GHC/ByteCode/Instr.hs @@ -90,7 +90,7 @@ data BCInstr | PUSH_ALTS (ProtoBCO Name) | PUSH_ALTS_UNLIFTED (ProtoBCO Name) ArgRep | PUSH_ALTS_TUPLE (ProtoBCO Name) -- continuation - !TupleInfo + !NativeCallInfo (ProtoBCO Name) -- tuple return BCO -- Pushing 8, 16 and 32 bits of padding (for constructors). @@ -184,6 +184,8 @@ data BCInstr -- (XXX: inefficient, but I don't know -- what the alignment constraints are.) + | PRIMCALL + -- For doing magic ByteArray passing to foreign calls | SWIZZLE Word16 -- to the ptr N words down the stack, Word16 -- add M (interpreted as a signed 16-bit entity) @@ -269,8 +271,8 @@ instance Outputable BCInstr where ppr (PUSH_ALTS bco) = hang (text "PUSH_ALTS") 2 (ppr bco) ppr (PUSH_ALTS_UNLIFTED bco pk) = hang (text "PUSH_ALTS_UNLIFTED" <+> ppr pk) 2 (ppr bco) - ppr (PUSH_ALTS_TUPLE bco tuple_info tuple_bco) = - hang (text "PUSH_ALTS_TUPLE" <+> ppr tuple_info) + ppr (PUSH_ALTS_TUPLE bco call_info tuple_bco) = + hang (text "PUSH_ALTS_TUPLE" <+> ppr call_info) 2 (ppr tuple_bco $+$ ppr bco) @@ -340,6 +342,7 @@ instance Outputable BCInstr where 0x1 -> text "(interruptible)" 0x2 -> text "(unsafe)" _ -> empty) + ppr PRIMCALL = text "PRIMCALL" ppr (SWIZZLE stkoff n) = text "SWIZZLE " <+> text "stkoff" <+> ppr stkoff <+> text "by" <+> ppr n ppr ENTER = text "ENTER" @@ -382,11 +385,11 @@ bciStackUse (PUSH_ALTS bco) = 2 {- profiling only, restore CCCS -} + bciStackUse (PUSH_ALTS_UNLIFTED bco _) = 2 {- profiling only, restore CCCS -} + 4 + protoBCOStackUse bco bciStackUse (PUSH_ALTS_TUPLE bco info _) = - -- (tuple_bco, tuple_info word, cont_bco, stg_ctoi_t) + -- (tuple_bco, call_info word, cont_bco, stg_ctoi_t) -- tuple - -- (tuple_info, tuple_bco, stg_ret_t) + -- (call_info, tuple_bco, stg_ret_t) 1 {- profiling only -} + - 7 + fromIntegral (tupleSize info) + protoBCOStackUse bco + 7 + fromIntegral (nativeCallSize info) + protoBCOStackUse bco bciStackUse (PUSH_PAD8) = 1 -- overapproximation bciStackUse (PUSH_PAD16) = 1 -- overapproximation bciStackUse (PUSH_PAD32) = 1 -- overapproximation on 64bit arch @@ -443,6 +446,7 @@ bciStackUse RETURN{} = 0 bciStackUse RETURN_UNLIFTED{} = 1 -- pushes stg_ret_X for some X bciStackUse RETURN_TUPLE{} = 1 -- pushes stg_ret_t header bciStackUse CCALL{} = 0 +bciStackUse PRIMCALL{} = 1 -- pushes stg_primcall bciStackUse SWIZZLE{} = 0 bciStackUse BRK_FUN{} = 0 diff --git a/compiler/GHC/ByteCode/Types.hs b/compiler/GHC/ByteCode/Types.hs index 830b60a4ca..a4b025ce92 100644 --- a/compiler/GHC/ByteCode/Types.hs +++ b/compiler/GHC/ByteCode/Types.hs @@ -10,7 +10,7 @@ module GHC.ByteCode.Types ( CompiledByteCode(..), seqCompiledByteCode , FFIInfo(..) , RegBitmap(..) - , TupleInfo(..), voidTupleInfo + , NativeCallType(..), NativeCallInfo(..), voidTupleReturnInfo, voidPrimCallInfo , ByteOff(..), WordOff(..) , UnlinkedBCO(..), BCOPtr(..), BCONPtr(..) , ItblEnv, ItblPtr(..) @@ -105,22 +105,32 @@ newtype RegBitmap = RegBitmap { unRegBitmap :: Word32 } See GHC.StgToByteCode.layoutTuple for more details. -} -data TupleInfo = TupleInfo - { tupleSize :: !WordOff -- total size of tuple in words - , tupleRegs :: !GlobalRegSet - , tupleNativeStackSize :: !WordOff {- words spilled on the stack by - GHCs native calling convention -} - } deriving (Show) - -instance Outputable TupleInfo where - ppr TupleInfo{..} = text "<size" <+> ppr tupleSize <+> - text "stack" <+> ppr tupleNativeStackSize <+> - text "regs" <+> - ppr (map (text @SDoc . show) $ regSetToList tupleRegs) <> - char '>' - -voidTupleInfo :: TupleInfo -voidTupleInfo = TupleInfo 0 emptyRegSet 0 + +data NativeCallType = NativePrimCall + | NativeTupleReturn + deriving (Eq) + +data NativeCallInfo = NativeCallInfo + { nativeCallType :: !NativeCallType + , nativeCallSize :: !WordOff -- total size of arguments in words + , nativeCallRegs :: !GlobalRegSet + , nativeCallStackSpillSize :: !WordOff {- words spilled on the stack by + GHCs native calling convention -} + } + +instance Outputable NativeCallInfo where + ppr NativeCallInfo{..} = text "<arg_size" <+> ppr nativeCallSize <+> + text "stack" <+> ppr nativeCallStackSpillSize <+> + text "regs" <+> + ppr (map (text @SDoc . show) $ regSetToList nativeCallRegs) <> + char '>' + + +voidTupleReturnInfo :: NativeCallInfo +voidTupleReturnInfo = NativeCallInfo NativeTupleReturn 0 emptyRegSet 0 + +voidPrimCallInfo :: NativeCallInfo +voidPrimCallInfo = NativeCallInfo NativePrimCall 0 emptyRegSet 0 type ItblEnv = NameEnv (Name, ItblPtr) -- We need the Name in the range so we know which diff --git a/compiler/GHC/Cmm/CallConv.hs b/compiler/GHC/Cmm/CallConv.hs index a0fee0e5c6..97cebf99e6 100644 --- a/compiler/GHC/Cmm/CallConv.hs +++ b/compiler/GHC/Cmm/CallConv.hs @@ -3,7 +3,7 @@ module GHC.Cmm.CallConv ( assignArgumentsPos, assignStack, realArgRegsCover, - tupleRegsCover + allArgRegsCover ) where import GHC.Prelude @@ -220,12 +220,109 @@ realArgRegsCover platform realLongRegs platform -- we don't save XMM registers if they are not used for parameter passing --- Like realArgRegsCover but always includes the node. This covers the real --- and virtual registers used for unboxed tuples. --- --- Note: if anything changes in how registers for unboxed tuples overlap, --- make sure to also update GHC.StgToByteCode.layoutTuple. -tupleRegsCover :: Platform -> [GlobalReg] -tupleRegsCover platform = +{- + + Note [GHCi and native call registers] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + The GHCi bytecode interpreter does not have access to the STG registers + that the native calling convention uses for passing arguments. It uses + helper stack frames to move values between the stack and registers. + + If only a single register needs to be moved, GHCi uses a specific stack + frame. For example stg_ctoi_R1p saves a heap pointer value from STG register + R1 and stg_ctoi_D1 saves a double precision floating point value from D1. + In the other direction, helpers stg_ret_p and stg_ret_d move a value from + the stack to the R1 and D1 registers, respectively. + + When GHCi needs to move more than one register it cannot use a specific + helper frame. It would simply be impossible to create a helper for all + possible combinations of register values. Instead, there are generic helper + stack frames that use a call_info word that describes the active registers + and the number of stack words used by the arguments of a call. + + These helper stack frames are currently: + + - stg_ret_t: return a tuple to the continuation at the top of + the stack + - stg_ctoi_t: convert a tuple return value to be used in + bytecode + - stg_primcall: call a function + + + The call_info word contains a bitmap of the active registers + for the call and and a stack offset. The layout is as follows: + + - bit 0-23: Bitmap of active registers for the call, the + order corresponds to the list returned by + allArgRegsCover. For example if bit 0 (the least + significant bit) is set, the first register in the + allArgRegsCover list is active. Bit 1 for the + second register in the list and so on. + + - bit 24-31: Unsigned byte indicating the stack offset + of the continuation in words. For tuple returns + this is the number of words returned on the + stack. For primcalls this field is unused, since + we don't jump to a continuation. + + The upper 32 bits on 64 bit platforms are currently unused. + + If a register is smaller than a word on the stack (for example a + single precision float on a 64 bit system), then the stack slot + is padded to a whole word. + + Example: + + If a tuple is returned in three registers and an additional two + words on the stack, then three bits in the register bitmap + (bits 0-23) would be set. And bit 24-31 would be + 00000010 (two in binary). + + The values on the stack before a call to POP_ARG_REGS would + be as follows: + + ... + continuation + stack_arg_1 + stack_arg_2 + register_arg_3 + register_arg_2 + register_arg_1 <- Sp + + A call to POP_ARG_REGS(call_info) would move register_arg_1 + to the register corresponding to the lowest set bit in the + call_info word. register_arg_2 would be moved to the register + corresponding to the second lowest set bit, and so on. + + After POP_ARG_REGS(call_info), the stack pointer Sp points + to the topmost stack argument, so the stack looks as follows: + + ... + continuation + stack_arg_1 + stack_arg_2 <- Sp + + At this point all the arguments are in place and we are ready + to jump to the continuation, the location (offset from Sp) of + which is found by inspecting the value of bits 24-31. In this + case the offset is two words. + + On x86_64, the double precision (Dn) and single precision + floating (Fn) point registers overlap, e.g. D1 uses the same + physical register as F1. On this platform, the list returned + by allArgRegsCover contains only entries for the double + precision registers. If an argument is passed in register + Fn, the bit corresponding to Dn should be set. + + Note: if anything changes in how registers for native calls overlap, + make sure to also update GHC.StgToByteCode.layoutNativeCall + -} + +-- Like realArgRegsCover but always includes the node. This covers all real +-- and virtual registers actually used for passing arguments. + +allArgRegsCover :: Platform -> [GlobalReg] +allArgRegsCover platform = nub (VanillaReg 1 VGcPtr : realArgRegsCover platform) diff --git a/compiler/GHC/Cmm/Parser.y b/compiler/GHC/Cmm/Parser.y index dbb2e47030..35d8e4c40f 100644 --- a/compiler/GHC/Cmm/Parser.y +++ b/compiler/GHC/Cmm/Parser.y @@ -1233,8 +1233,8 @@ stmtMacros = listToUFM [ ( fsLit "SAVE_REGS", \[] -> emitSaveRegs ), ( fsLit "RESTORE_REGS", \[] -> emitRestoreRegs ), - ( fsLit "PUSH_TUPLE_REGS", \[live_regs] -> emitPushTupleRegs live_regs ), - ( fsLit "POP_TUPLE_REGS", \[live_regs] -> emitPopTupleRegs live_regs ), + ( fsLit "PUSH_ARG_REGS", \[live_regs] -> emitPushArgRegs live_regs ), + ( fsLit "POP_ARG_REGS", \[live_regs] -> emitPopArgRegs live_regs ), ( fsLit "LDV_ENTER", \[e] -> ldvEnter e ), ( fsLit "LDV_RECORD_CREATE", \[e] -> ldvRecordCreate e ), diff --git a/compiler/GHC/Cmm/Reg.hs b/compiler/GHC/Cmm/Reg.hs index a9b3fce101..104702f312 100644 --- a/compiler/GHC/Cmm/Reg.hs +++ b/compiler/GHC/Cmm/Reg.hs @@ -223,7 +223,7 @@ instance Eq GlobalReg where _r1 == _r2 = False -- NOTE: this Ord instance affects the tuple layout in GHCi, see --- Note [GHCi tuple layout] +-- Note [GHCi and native call registers] instance Ord GlobalReg where compare (VanillaReg i _) (VanillaReg j _) = compare i j -- Ignore type when seeking clashes diff --git a/compiler/GHC/StgToByteCode.hs b/compiler/GHC/StgToByteCode.hs index b59cbfe779..de37d987cb 100644 --- a/compiler/GHC/StgToByteCode.hs +++ b/compiler/GHC/StgToByteCode.hs @@ -58,7 +58,7 @@ import GHC.Data.FastString import GHC.Utils.Panic import GHC.Utils.Panic.Plain import GHC.Utils.Exception (evaluate) -import GHC.StgToCmm.Closure ( NonVoid(..), fromNonVoid, nonVoidIds ) +import GHC.StgToCmm.Closure ( NonVoid(..), fromNonVoid, nonVoidIds, argPrimRep ) import GHC.StgToCmm.Layout import GHC.Runtime.Heap.Layout hiding (WordOff, ByteOff, wordsToBytes) import GHC.Data.Bitmap @@ -464,10 +464,10 @@ returnUnliftedReps d s szb reps = do [rep] -> return (unitOL $ RETURN_UNLIFTED (toArgRep platform rep)) -- otherwise use RETURN_TUPLE with a tuple descriptor nv_reps -> do - let (tuple_info, args_offsets) = layoutTuple profile 0 (primRepCmmType platform) nv_reps + let (call_info, args_offsets) = layoutNativeCall profile NativeTupleReturn 0 (primRepCmmType platform) nv_reps args_ptrs = map (\(rep, off) -> (isFollowableArg (toArgRep platform rep), off)) args_offsets - tuple_bco <- emitBc (tupleBCO platform tuple_info args_ptrs) - return $ PUSH_UBX (mkTupleInfoLit platform tuple_info) 1 `consOL` + tuple_bco <- emitBc (tupleBCO platform call_info args_ptrs) + return $ PUSH_UBX (mkNativeCallInfoLit platform call_info) 1 `consOL` PUSH_BCO tuple_bco `consOL` unitOL RETURN_TUPLE return ( mkSlideB platform szb (d - s) -- clear to sequel @@ -484,7 +484,11 @@ returnUnboxedTuple d s p es = do profile <- getProfile let platform = profilePlatform profile arg_ty e = primRepCmmType platform (atomPrimRep e) - (tuple_info, tuple_components) = layoutTuple profile d arg_ty es + (call_info, tuple_components) = layoutNativeCall profile + NativeTupleReturn + d + arg_ty + es go _ pushes [] = return (reverse pushes) go !dd pushes ((a, off):cs) = do (push, szb) <- pushAtom dd p a massert (off == dd + szb) @@ -492,7 +496,7 @@ returnUnboxedTuple d s p es = do pushes <- go d [] tuple_components ret <- returnUnliftedReps d s - (wordsToBytes platform $ tupleSize tuple_info) + (wordsToBytes platform $ nativeCallSize call_info) (map atomPrimRep es) return (mconcat pushes `appOL` ret) @@ -648,14 +652,14 @@ schemeT d s p app -- Case 1 schemeT d s p (StgOpApp (StgFCallOp (CCall ccall_spec) _ty) args result_ty) = if isSupportedCConv ccall_spec - then generateCCall d s p ccall_spec result_ty (reverse args) + then generateCCall d s p ccall_spec result_ty args else unsupportedCConvException schemeT d s p (StgOpApp (StgPrimOp op) args _ty) = doTailCall d s p (primOpId op) (reverse args) -schemeT _d _s _p (StgOpApp StgPrimCallOp{} _args _ty) - = unsupportedCConvException +schemeT d s p (StgOpApp (StgPrimCallOp (PrimCall label unit)) args result_ty) + = generatePrimCall d s p label (Just unit) result_ty args -- Case 2: Unboxed tuple schemeT d s p (StgConApp con _cn args _tys) @@ -840,18 +844,18 @@ doCase d s p scrut bndr alts | ubx_frame = wordSize platform | otherwise = 0 - (bndr_size, tuple_info, args_offsets) + (bndr_size, call_info, args_offsets) | ubx_tuple_frame = let bndr_ty = primRepCmmType platform bndr_reps = filter (not.isVoidRep) (bcIdPrimReps bndr) - (tuple_info, args_offsets) = - layoutTuple profile 0 bndr_ty bndr_reps - in ( wordsToBytes platform (tupleSize tuple_info) - , tuple_info + (call_info, args_offsets) = + layoutNativeCall profile NativeTupleReturn 0 bndr_ty bndr_reps + in ( wordsToBytes platform (nativeCallSize call_info) + , call_info , args_offsets ) | otherwise = ( wordsToBytes platform (idSizeW platform bndr) - , voidTupleInfo + , voidTupleReturnInfo , [] ) @@ -885,17 +889,18 @@ doCase d s p scrut bndr alts | isUnboxedTupleType bndr_ty || isUnboxedSumType bndr_ty = let bndr_ty = primRepCmmType platform . bcIdPrimRep tuple_start = d_bndr - (tuple_info, args_offsets) = - layoutTuple profile - 0 - bndr_ty - bndrs + (call_info, args_offsets) = + layoutNativeCall profile + NativeTupleReturn + 0 + bndr_ty + bndrs stack_bot = d_alts p' = Map.insertList [ (arg, tuple_start - - wordsToBytes platform (tupleSize tuple_info) + + wordsToBytes platform (nativeCallSize call_info) + offset) | (arg, offset) <- args_offsets , not (isVoidRep $ bcIdPrimRep arg)] @@ -981,8 +986,8 @@ doCase d s p scrut bndr alts -- unboxed tuples get two more words, the second is a pointer (tuple_bco) (extra_pointers, extra_slots) - | ubx_tuple_frame && profiling = ([1], 3) -- tuple_info, tuple_BCO, CCCS - | ubx_tuple_frame = ([1], 2) -- tuple_info, tuple_BCO + | ubx_tuple_frame && profiling = ([1], 3) -- call_info, tuple_BCO, CCCS + | ubx_tuple_frame = ([1], 2) -- call_info, tuple_BCO | otherwise = ([], 0) bitmap_size = trunc16W $ fromIntegral extra_slots + @@ -1028,8 +1033,8 @@ doCase d s p scrut bndr alts let args_ptrs = map (\(rep, off) -> (isFollowableArg (toArgRep platform rep), off)) args_offsets - tuple_bco <- emitBc (tupleBCO platform tuple_info args_ptrs) - return (PUSH_ALTS_TUPLE alt_bco' tuple_info tuple_bco + tuple_bco <- emitBc (tupleBCO platform call_info args_ptrs) + return (PUSH_ALTS_TUPLE alt_bco' call_info tuple_bco `consOL` scrut_code) else let push_alts | not ubx_frame @@ -1050,14 +1055,15 @@ doCase d s p scrut bndr alts -- The native calling convention uses registers for tuples, but in the -- bytecode interpreter, all values live on the stack. -layoutTuple :: Profile - -> ByteOff - -> (a -> CmmType) - -> [a] - -> ( TupleInfo -- See Note [GHCi TupleInfo] - , [(a, ByteOff)] -- argument, offset on stack - ) -layoutTuple profile start_off arg_ty reps = +layoutNativeCall :: Profile + -> NativeCallType + -> ByteOff + -> (a -> CmmType) + -> [a] + -> ( NativeCallInfo -- See Note [GHCi TupleInfo] + , [(a, ByteOff)] -- argument, offset on stack + ) +layoutNativeCall profile call_type start_off arg_ty reps = let platform = profilePlatform profile (orig_stk_bytes, pos) = assignArgumentsPos profile 0 @@ -1070,7 +1076,7 @@ layoutTuple profile start_off arg_ty reps = -- sort the register parameters by register and add them to the stack regs_order :: Map.Map GlobalReg Int - regs_order = Map.fromList $ zip (tupleRegsCover platform) [0..] + regs_order = Map.fromList $ zip (allArgRegsCover platform) [0..] reg_order :: GlobalReg -> (Int, GlobalReg) reg_order reg | Just n <- Map.lookup reg regs_order = (n, reg) @@ -1099,10 +1105,11 @@ layoutTuple profile start_off arg_ty reps = get_byte_off _ = panic "GHC.StgToByteCode.layoutTuple get_byte_off" - in ( TupleInfo - { tupleSize = bytesToWords platform (ByteOff new_stk_bytes) - , tupleRegs = regs_set - , tupleNativeStackSize = bytesToWords platform + in ( NativeCallInfo + { nativeCallType = call_type + , nativeCallSize = bytesToWords platform (ByteOff new_stk_bytes) + , nativeCallRegs = regs_set + , nativeCallStackSpillSize = bytesToWords platform (ByteOff orig_stk_bytes) } , sortBy (comparing snd) $ @@ -1127,7 +1134,7 @@ usePlainReturn t ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ We have the bytecode instructions RETURN_TUPLE and PUSH_ALTS_TUPLE to return and receive arbitrary unboxed tuples, respectively. These - instructions use the helper data tuple_BCO and tuple_info. + instructions use the helper data tuple_BCO and call_info. The helper data is used to convert tuples between GHCs native calling convention (object code), which uses stack and registers, and the bytecode @@ -1139,7 +1146,7 @@ usePlainReturn t ================= Bytecode that returns a tuple first pushes all the tuple fields followed - by the appropriate tuple_info and tuple_BCO onto the stack. It then + by the appropriate call_info and tuple_BCO onto the stack. It then executes the RETURN_TUPLE instruction, which causes the interpreter to push stg_ret_t_info to the top of the stack. The stack (growing down) then looks as follows: @@ -1150,14 +1157,14 @@ usePlainReturn t tuple_field_2 ... tuple_field_n - tuple_info + call_info tuple_BCO stg_ret_t_info <- Sp If next_frame is bytecode, the interpreter will start executing it. If it's object code, the interpreter jumps back to the scheduler, which in turn jumps to stg_ret_t. stg_ret_t converts the tuple to the native - calling convention using the description in tuple_info, and then jumps + calling convention using the description in call_info, and then jumps to next_frame. @@ -1169,13 +1176,13 @@ usePlainReturn t tuple. The PUSH_ALTS_TUPLE instuction contains three pieces of data: * cont_BCO: the continuation that receives the tuple - * tuple_info: see below + * call_info: see below * tuple_BCO: see below The interpreter pushes these onto the stack when the PUSH_ALTS_TUPLE instruction is executed, followed by stg_ctoi_tN_info, with N depending on the number of stack words used by the tuple in the GHC native calling - convention. N is derived from tuple_info. + convention. N is derived from call_info. For example if we expect a tuple with three words on the stack, the stack looks as follows after PUSH_ALTS_TUPLE: @@ -1186,7 +1193,7 @@ usePlainReturn t cont_free_var_2 ... cont_free_var_n - tuple_info + call_info tuple_BCO cont_BCO stg_ctoi_t3_info <- Sp @@ -1206,20 +1213,20 @@ usePlainReturn t that is already on the stack. - The tuple_info word + The call_info word =================== - The tuple_info word describes the stack and STG register (e.g. R1..R6, - D1..D6) usage for the tuple. tuple_info contains enough information to + The call_info word describes the stack and STG register (e.g. R1..R6, + D1..D6) usage for the tuple. call_info contains enough information to convert the tuple between the stack-only bytecode and stack+registers GHC native calling conventions. - See Note [GHCi tuple layout] for more details of how the data is packed - in a single word. + See Note [GHCi and native call registers] for more details of how the + data is packed in a single word. -} -tupleBCO :: Platform -> TupleInfo -> [(Bool, ByteOff)] -> [FFIInfo] -> ProtoBCO Name +tupleBCO :: Platform -> NativeCallInfo -> [(Bool, ByteOff)] -> [FFIInfo] -> ProtoBCO Name tupleBCO platform info pointers = mkProtoBCO platform invented_name body_code (Left []) 0{-no arity-} bitmap_size bitmap False{-is alts-} @@ -1233,15 +1240,103 @@ tupleBCO platform info pointers = -} invented_name = mkSystemVarName (mkPseudoUniqueE 0) (fsLit "tuple") - -- the first word in the frame is the tuple_info word, + -- the first word in the frame is the call_info word, -- which is not a pointer - bitmap_size = trunc16W $ 1 + tupleSize info + bitmap_size = trunc16W $ 1 + nativeCallSize info bitmap = intsToReverseBitmap platform (fromIntegral bitmap_size) $ map ((+1) . fromIntegral . bytesToWords platform . snd) (filter fst pointers) body_code = mkSlideW 0 1 -- pop frame header `snocOL` RETURN_TUPLE -- and add it again +primCallBCO :: Platform -> NativeCallInfo -> [(Bool, ByteOff)] -> [FFIInfo] -> ProtoBCO Name +primCallBCO platform args_info pointers = + mkProtoBCO platform invented_name body_code (Left []) + 0{-no arity-} bitmap_size bitmap False{-is alts-} + + where + {- + The primcall BCO is never referred to by name, so we can get away + with using a fake name here. We will need to change this if we want + to save some memory by sharing the BCO between places that have + the same tuple shape + -} + invented_name = mkSystemVarName (mkPseudoUniqueE 0) (fsLit "primcall") + + -- the first three words in the frame are the BCO describing the + -- pointers in the frame, the call_info word and the pointer + -- to the Cmm function being called. None of these is a pointer that + -- should be followed by the garbage collector + bitmap_size = trunc16W $ 2 + nativeCallSize args_info + bitmap = intsToReverseBitmap platform (fromIntegral bitmap_size) $ + map ((+2) . fromIntegral . bytesToWords platform . snd) + (filter fst pointers) + -- if the primcall BCO is ever run it's a bug, since the BCO should only + -- be pushed immediately before running the PRIMCALL bytecode instruction, + -- which immediately leaves the interpreter to jump to the stg_primcall_info + -- Cmm function + body_code = unitOL CASEFAIL + +-- ----------------------------------------------------------------------------- +-- Deal with a primitive call to native code. + +generatePrimCall + :: StackDepth + -> Sequel + -> BCEnv + -> CLabelString -- where to call + -> Maybe Unit + -> Type + -> [StgArg] -- args (atoms) + -> BcM BCInstrList +generatePrimCall d s p target _mb_unit _result_ty args + = do + profile <- getProfile + let + platform = profilePlatform profile + + non_void VoidRep = False + non_void _ = True + + nv_args :: [StgArg] + nv_args = filter (non_void . argPrimRep) args + + (args_info, args_offsets) = + layoutNativeCall profile + NativePrimCall + d + (primRepCmmType platform . argPrimRep) + nv_args + + args_ptrs :: [(Bool, ByteOff)] + args_ptrs = + map (\(r, off) -> + (isFollowableArg (toArgRep platform . argPrimRep $ r), off)) + args_offsets + + push_target = PUSH_UBX (LitLabel target Nothing IsFunction) 1 + push_info = PUSH_UBX (mkNativeCallInfoLit platform args_info) 1 + {- + compute size to move payload (without stg_primcall_info header) + + size of arguments plus three words for: + - function pointer to the target + - call_info word + - BCO to describe the stack frame + -} + szb = wordsToBytes platform (nativeCallSize args_info + 3) + go _ pushes [] = return (reverse pushes) + go !dd pushes ((a, off):cs) = do (push, szb) <- pushAtom dd p a + massert (off == dd + szb) + go (dd + szb) (push:pushes) cs + push_args <- go d [] args_offsets + args_bco <- emitBc (primCallBCO platform args_info args_ptrs) + return $ mconcat push_args `appOL` + (push_target `consOL` + push_info `consOL` + PUSH_BCO args_bco `consOL` + (mkSlideB platform szb (d - s) `appOL` unitOL PRIMCALL)) + -- ----------------------------------------------------------------------------- -- Deal with a CCall. @@ -1259,11 +1354,17 @@ generateCCall -> Type -> [StgArg] -- args (atoms) -> BcM BCInstrList -generateCCall d0 s p (CCallSpec target cconv safety) result_ty args_r_to_l +generateCCall d0 s p (CCallSpec target PrimCallConv _) result_ty args + | (StaticTarget _ label mb_unit _) <- target + = generatePrimCall d0 s p label mb_unit result_ty args + | otherwise + = panic "GHC.StgToByteCode.generateCCall: primcall convention only supports static targets" +generateCCall d0 s p (CCallSpec target cconv safety) result_ty args = do profile <- getProfile let + args_r_to_l = reverse args platform = profilePlatform profile -- useful constants addr_size_b :: ByteOff @@ -2007,7 +2108,7 @@ isSupportedCConv :: CCallSpec -> Bool isSupportedCConv (CCallSpec _ cconv _) = case cconv of CCallConv -> True -- we explicitly pattern match on every StdCallConv -> True -- convention to ensure that a warning - PrimCallConv -> False -- is triggered when a new one is added + PrimCallConv -> True -- is triggered when a new one is added JavaScriptCallConv -> False CApiConv -> True diff --git a/compiler/GHC/StgToCmm/Foreign.hs b/compiler/GHC/StgToCmm/Foreign.hs index e71c418530..95b7d1c5fd 100644 --- a/compiler/GHC/StgToCmm/Foreign.hs +++ b/compiler/GHC/StgToCmm/Foreign.hs @@ -15,8 +15,8 @@ module GHC.StgToCmm.Foreign ( emitLoadThreadState, emitSaveRegs, emitRestoreRegs, - emitPushTupleRegs, - emitPopTupleRegs, + emitPushArgRegs, + emitPopArgRegs, loadThreadState, emitOpenNursery, emitCloseNursery, @@ -349,7 +349,7 @@ emitRestoreRegs = do -- bytecode interpreter. -- -- The "live registers" bitmap corresponds to the list of registers given by --- 'tupleRegsCover', with the least significant bit indicating liveness of +-- 'allArgRegsCover', with the least significant bit indicating liveness of -- the first register in the list. -- -- Each register is saved to a stack slot of one or more machine words, even @@ -362,12 +362,12 @@ emitRestoreRegs = do -- if((mask & 2) != 0) { Sp_adj(-1); Sp(0) = R2; } -- if((mask & 1) != 0) { Sp_adj(-1); Sp(0) = R1; } -- --- See Note [GHCi tuple layout] +-- See Note [GHCi and native call registers] -emitPushTupleRegs :: CmmExpr -> FCode () -emitPushTupleRegs regs_live = do +emitPushArgRegs :: CmmExpr -> FCode () +emitPushArgRegs regs_live = do platform <- getPlatform - let regs = zip (tupleRegsCover platform) [0..] + let regs = zip (allArgRegsCover platform) [0..] save_arg (reg, n) = let mask = CmmLit (CmmInt (1 `shiftL` n) (wordWidth platform)) live = cmmAndWord platform regs_live mask @@ -381,11 +381,11 @@ emitPushTupleRegs regs_live = do in mkCmmIfThen cond $ catAGraphs [adj_sp, save_reg] emit . catAGraphs =<< mapM save_arg (reverse regs) --- | Pop a subset of STG registers from the stack (see 'emitPushTupleRegs') -emitPopTupleRegs :: CmmExpr -> FCode () -emitPopTupleRegs regs_live = do +-- | Pop a subset of STG registers from the stack (see 'emitPushArgRegs') +emitPopArgRegs :: CmmExpr -> FCode () +emitPopArgRegs regs_live = do platform <- getPlatform - let regs = zip (tupleRegsCover platform) [0..] + let regs = zip (allArgRegsCover platform) [0..] save_arg (reg, n) = let mask = CmmLit (CmmInt (1 `shiftL` n) (wordWidth platform)) live = cmmAndWord platform regs_live mask |