8 files changed, 346 insertions, 129 deletions
diff --git a/compiler/GHC/ByteCode/Asm.hs b/compiler/GHC/ByteCode/Asm.hs
index 24e2645052..391949d448 100644
--- a/compiler/GHC/ByteCode/Asm.hs
+++ b/compiler/GHC/ByteCode/Asm.hs
@@ -12,7 +12,7 @@ module GHC.ByteCode.Asm (
         bcoFreeNames,
         SizedSeq, sizeSS, ssElts,
         iNTERP_STACK_CHECK_THRESH,
-        mkTupleInfoLit
+        mkNativeCallInfoLit
   ) where
 
 import GHC.Prelude
@@ -32,7 +32,6 @@ import GHC.Types.Unique.DSet
 
 import GHC.Utils.Outputable
 import GHC.Utils.Panic
-import GHC.Utils.Panic.Plain
 
 import GHC.Core.TyCon
 import GHC.Data.FastString
@@ -40,7 +39,7 @@ import GHC.Data.SizedSeq
 
 import GHC.StgToCmm.Layout     ( ArgRep(..) )
 import GHC.Cmm.Expr
-import GHC.Cmm.CallConv        ( tupleRegsCover )
+import GHC.Cmm.CallConv        ( allArgRegsCover )
 import GHC.Platform
 import GHC.Platform.Profile
 
@@ -202,7 +201,8 @@ assembleBCO platform (ProtoBCO { protoBCOName       = nm
   (final_insns, final_lits, final_ptrs) <- flip execStateT initial_state $ runAsm platform long_jumps env asm
 
   -- precomputed size should be equal to final size
-  massert (n_insns == sizeSS final_insns)
+  massertPpr (n_insns == sizeSS final_insns)
+             (text "bytecode instruction count mismatch")
 
   let asm_insns = ssElts final_insns
       insns_arr = Array.listArray (0, fromIntegral n_insns - 1) asm_insns
@@ -351,7 +351,8 @@ largeArg platform w = case platformWordSize platform of
            fromIntegral (w `shiftR` 32),
            fromIntegral (w `shiftR` 16),
            fromIntegral w]
-   PW4 -> assert (w < fromIntegral (maxBound :: Word32)) $
+   PW4 -> assertPpr (w < fromIntegral (maxBound :: Word32))
+                    (text "largeArg too big:" <+> ppr w) $
           [fromIntegral (w `shiftR` 16),
            fromIntegral w]
 
@@ -388,14 +389,14 @@ assembleI platform i = case i of
                            -> do let ul_bco = assembleBCO platform proto
                                  p <- ioptr (liftM BCOPtrBCO ul_bco)
                                  emit (push_alts pk) [Op p]
-  PUSH_ALTS_TUPLE proto tuple_info tuple_proto
+  PUSH_ALTS_TUPLE proto call_info tuple_proto
                            -> do let ul_bco = assembleBCO platform proto
                                      ul_tuple_bco = assembleBCO platform
                                                                 tuple_proto
                                  p <- ioptr (liftM BCOPtrBCO ul_bco)
                                  p_tup <- ioptr (liftM BCOPtrBCO ul_tuple_bco)
                                  info <- int (fromIntegral $
-                                              mkTupleInfoSig platform tuple_info)
+                                              mkNativeCallInfoSig platform call_info)
                                  emit bci_PUSH_ALTS_T
                                       [Op p, Op info, Op p_tup]
   PUSH_PAD8                -> emit bci_PUSH_PAD8 []
@@ -491,6 +492,7 @@ assembleI platform i = case i of
   RETURN_TUPLE             -> emit bci_RETURN_T []
   CCALL off m_addr i       -> do np <- addr m_addr
                                  emit bci_CCALL [SmallOp off, Op np, SmallOp i]
+  PRIMCALL                 -> emit bci_PRIMCALL []
   BRK_FUN index uniq cc    -> do p1 <- ptr BCOPtrBreakArray
                                  q <- int (getKey uniq)
                                  np <- addr cc
@@ -580,41 +582,44 @@ return_unlifted V64 = error "return_unlifted: vector"
   maximum number of tuple elements may be larger. Elements can also
   take multiple words on the stack (for example Double# on a 32 bit
   platform).
-
  -}
-maxTupleNativeStackSize :: WordOff
-maxTupleNativeStackSize = 62
+maxTupleReturnNativeStackSize :: WordOff
+maxTupleReturnNativeStackSize = 62
 
 {-
-  Construct the tuple_info word that stg_ctoi_t and stg_ret_t use
-  to convert a tuple between the native calling convention and the
+  Construct the call_info word that stg_ctoi_t, stg_ret_t and stg_primcall
+  use to convert arguments between the native calling convention and the
   interpreter.
 
-  See Note [GHCi tuple layout] for more information.
+  See Note [GHCi and native call registers] for more information.
  -}
-mkTupleInfoSig :: Platform -> TupleInfo -> Word32
-mkTupleInfoSig platform TupleInfo{..}
-  | tupleNativeStackSize > maxTupleNativeStackSize
-  = pprPanic "mkTupleInfoSig: tuple too big for the bytecode compiler"
-             (ppr tupleNativeStackSize <+> text "stack words." <+>
+mkNativeCallInfoSig :: Platform -> NativeCallInfo -> Word32
+mkNativeCallInfoSig platform NativeCallInfo{..}
+  | nativeCallType == NativeTupleReturn && nativeCallStackSpillSize > maxTupleReturnNativeStackSize
+  = pprPanic "mkNativeCallInfoSig: tuple too big for the bytecode compiler"
+             (ppr nativeCallStackSpillSize <+> text "stack words." <+>
               text "Use -fobject-code to get around this limit"
              )
   | otherwise
-  = assert (length regs <= 24) {- 24 bits for bitmap -}
-    assert (tupleNativeStackSize < 255) {- 8 bits for stack size -}
-    assert (all (`elem` regs) (regSetToList tupleRegs)) {- all regs accounted for -}
-    foldl' reg_bit 0 (zip regs [0..]) .|.
-      (fromIntegral tupleNativeStackSize `shiftL` 24)
+  = assertPpr (length regs <= 24) (text "too many registers for bitmap:" <+> ppr (length regs)) {- 24 bits for register bitmap -}
+    assertPpr (cont_offset < 255) (text "continuation offset too large:" <+> ppr cont_offset) {- 8 bits for continuation offset (only for NativeTupleReturn) -}
+    assertPpr (all (`elem` regs) (regSetToList nativeCallRegs)) (text "not all registers accounted for") {- all regs accounted for -}
+    foldl' reg_bit 0 (zip regs [0..]) .|. (cont_offset `shiftL` 24)
   where
+    cont_offset :: Word32
+    cont_offset
+      | nativeCallType == NativeTupleReturn = fromIntegral nativeCallStackSpillSize
+      | otherwise                           = 0 -- there is no continuation for primcalls
+
     reg_bit :: Word32 -> (GlobalReg, Int) -> Word32
     reg_bit x (r, n)
-      | r `elemRegSet` tupleRegs = x .|. 1 `shiftL` n
-      | otherwise                = x
-    regs = tupleRegsCover platform
+      | r `elemRegSet` nativeCallRegs = x .|. 1 `shiftL` n
+      | otherwise                     = x
+    regs = allArgRegsCover platform
 
-mkTupleInfoLit :: Platform -> TupleInfo -> Literal
-mkTupleInfoLit platform tuple_info =
-  mkLitWord platform . fromIntegral $ mkTupleInfoSig platform tuple_info
+mkNativeCallInfoLit :: Platform -> NativeCallInfo -> Literal
+mkNativeCallInfoLit platform call_info =
+  mkLitWord platform . fromIntegral $ mkNativeCallInfoSig platform call_info
 
 -- Make lists of host-sized words for literals, so that when the
 -- words are placed in memory at increasing addresses, the
diff --git a/compiler/GHC/ByteCode/Instr.hs b/compiler/GHC/ByteCode/Instr.hs
index 498152c471..34baa57d40 100644
--- a/compiler/GHC/ByteCode/Instr.hs
+++ b/compiler/GHC/ByteCode/Instr.hs
@@ -90,7 +90,7 @@ data BCInstr
    | PUSH_ALTS          (ProtoBCO Name)
    | PUSH_ALTS_UNLIFTED (ProtoBCO Name) ArgRep
    | PUSH_ALTS_TUPLE    (ProtoBCO Name) -- continuation
-                        !TupleInfo
+                        !NativeCallInfo
                         (ProtoBCO Name) -- tuple return BCO
 
    -- Pushing 8, 16 and 32 bits of padding (for constructors).
@@ -184,6 +184,8 @@ data BCInstr
                                 -- (XXX: inefficient, but I don't know
                                 -- what the alignment constraints are.)
 
+   | PRIMCALL
+
    -- For doing magic ByteArray passing to foreign calls
    | SWIZZLE          Word16 -- to the ptr N words down the stack,
                       Word16 -- add M (interpreted as a signed 16-bit entity)
@@ -269,8 +271,8 @@ instance Outputable BCInstr where
 
    ppr (PUSH_ALTS bco)       = hang (text "PUSH_ALTS") 2 (ppr bco)
    ppr (PUSH_ALTS_UNLIFTED bco pk) = hang (text "PUSH_ALTS_UNLIFTED" <+> ppr pk) 2 (ppr bco)
-   ppr (PUSH_ALTS_TUPLE bco tuple_info tuple_bco) =
-                               hang (text "PUSH_ALTS_TUPLE" <+> ppr tuple_info)
+   ppr (PUSH_ALTS_TUPLE bco call_info tuple_bco) =
+                               hang (text "PUSH_ALTS_TUPLE" <+> ppr call_info)
                                     2
                                     (ppr tuple_bco $+$ ppr bco)
 
@@ -340,6 +342,7 @@ instance Outputable BCInstr where
                                                       0x1 -> text "(interruptible)"
                                                       0x2 -> text "(unsafe)"
                                                       _   -> empty)
+   ppr PRIMCALL              = text "PRIMCALL"
    ppr (SWIZZLE stkoff n)    = text "SWIZZLE " <+> text "stkoff" <+> ppr stkoff
                                                <+> text "by" <+> ppr n
    ppr ENTER                 = text "ENTER"
@@ -382,11 +385,11 @@ bciStackUse (PUSH_ALTS bco)       = 2 {- profiling only, restore CCCS -} +
 bciStackUse (PUSH_ALTS_UNLIFTED bco _) = 2 {- profiling only, restore CCCS -} +
                                          4 + protoBCOStackUse bco
 bciStackUse (PUSH_ALTS_TUPLE bco info _) =
-   -- (tuple_bco, tuple_info word, cont_bco, stg_ctoi_t)
+   -- (tuple_bco, call_info word, cont_bco, stg_ctoi_t)
    -- tuple
-   -- (tuple_info, tuple_bco, stg_ret_t)
+   -- (call_info, tuple_bco, stg_ret_t)
    1 {- profiling only -} +
-   7 + fromIntegral (tupleSize info) + protoBCOStackUse bco
+   7 + fromIntegral (nativeCallSize info) + protoBCOStackUse bco
 bciStackUse (PUSH_PAD8)           = 1  -- overapproximation
 bciStackUse (PUSH_PAD16)          = 1  -- overapproximation
 bciStackUse (PUSH_PAD32)          = 1  -- overapproximation on 64bit arch
@@ -443,6 +446,7 @@ bciStackUse RETURN{}              = 0
 bciStackUse RETURN_UNLIFTED{}     = 1 -- pushes stg_ret_X for some X
 bciStackUse RETURN_TUPLE{}        = 1 -- pushes stg_ret_t header
 bciStackUse CCALL{}               = 0
+bciStackUse PRIMCALL{}            = 1 -- pushes stg_primcall
 bciStackUse SWIZZLE{}             = 0
 bciStackUse BRK_FUN{}             = 0
 
diff --git a/compiler/GHC/ByteCode/Types.hs b/compiler/GHC/ByteCode/Types.hs
index 830b60a4ca..a4b025ce92 100644
--- a/compiler/GHC/ByteCode/Types.hs
+++ b/compiler/GHC/ByteCode/Types.hs
@@ -10,7 +10,7 @@ module GHC.ByteCode.Types
   ( CompiledByteCode(..), seqCompiledByteCode
   , FFIInfo(..)
   , RegBitmap(..)
-  , TupleInfo(..), voidTupleInfo
+  , NativeCallType(..), NativeCallInfo(..), voidTupleReturnInfo, voidPrimCallInfo
   , ByteOff(..), WordOff(..)
   , UnlinkedBCO(..), BCOPtr(..), BCONPtr(..)
   , ItblEnv, ItblPtr(..)
@@ -105,22 +105,32 @@ newtype RegBitmap = RegBitmap { unRegBitmap :: Word32 }
 
    See GHC.StgToByteCode.layoutTuple for more details.
 -}
-data TupleInfo = TupleInfo
-  { tupleSize            :: !WordOff   -- total size of tuple in words
-  , tupleRegs            :: !GlobalRegSet
-  , tupleNativeStackSize :: !WordOff {- words spilled on the stack by
-                                        GHCs native calling convention -}
-  } deriving (Show)
-
-instance Outputable TupleInfo where
-  ppr TupleInfo{..} = text "<size" <+> ppr tupleSize <+>
-                      text "stack" <+> ppr tupleNativeStackSize <+>
-                      text "regs"  <+>
-                      ppr (map (text @SDoc . show) $ regSetToList tupleRegs) <>
-                      char '>'
-
-voidTupleInfo :: TupleInfo
-voidTupleInfo = TupleInfo 0 emptyRegSet 0
+
+data NativeCallType = NativePrimCall
+                    | NativeTupleReturn
+  deriving (Eq)
+
+data NativeCallInfo = NativeCallInfo
+  { nativeCallType           :: !NativeCallType
+  , nativeCallSize           :: !WordOff   -- total size of arguments in words
+  , nativeCallRegs           :: !GlobalRegSet
+  , nativeCallStackSpillSize :: !WordOff {- words spilled on the stack by
+                                            GHCs native calling convention -}
+  }
+
+instance Outputable NativeCallInfo where
+  ppr NativeCallInfo{..} = text "<arg_size" <+> ppr nativeCallSize <+>
+                           text "stack" <+> ppr nativeCallStackSpillSize <+>
+                           text "regs"  <+>
+                           ppr (map (text @SDoc . show) $ regSetToList nativeCallRegs) <>
+                           char '>'
+
+
+voidTupleReturnInfo :: NativeCallInfo
+voidTupleReturnInfo = NativeCallInfo NativeTupleReturn 0 emptyRegSet 0
+
+voidPrimCallInfo :: NativeCallInfo
+voidPrimCallInfo = NativeCallInfo NativePrimCall 0 emptyRegSet 0
 
 type ItblEnv = NameEnv (Name, ItblPtr)
         -- We need the Name in the range so we know which
diff --git a/compiler/GHC/Cmm/CallConv.hs b/compiler/GHC/Cmm/CallConv.hs
index a0fee0e5c6..97cebf99e6 100644
--- a/compiler/GHC/Cmm/CallConv.hs
+++ b/compiler/GHC/Cmm/CallConv.hs
@@ -3,7 +3,7 @@ module GHC.Cmm.CallConv (
   assignArgumentsPos,
   assignStack,
   realArgRegsCover,
-  tupleRegsCover
+  allArgRegsCover
 ) where
 
 import GHC.Prelude
@@ -220,12 +220,109 @@ realArgRegsCover platform
       realLongRegs   platform
       -- we don't save XMM registers if they are not used for parameter passing
 
--- Like realArgRegsCover but always includes the node. This covers the real
--- and virtual registers used for unboxed tuples.
---
--- Note: if anything changes in how registers for unboxed tuples overlap,
---       make sure to also update GHC.StgToByteCode.layoutTuple.
 
-tupleRegsCover :: Platform -> [GlobalReg]
-tupleRegsCover platform =
+{-
+
+  Note [GHCi and native call registers]
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  The GHCi bytecode interpreter does not have access to the STG registers
+  that the native calling convention uses for passing arguments. It uses
+  helper stack frames to move values between the stack and registers.
+
+  If only a single register needs to be moved, GHCi uses a specific stack
+  frame. For example stg_ctoi_R1p saves a heap pointer value from STG register
+  R1 and stg_ctoi_D1 saves a double precision floating point value from D1.
+  In the other direction, helpers stg_ret_p and stg_ret_d move a value from
+  the stack to the R1 and D1 registers, respectively.
+
+  When GHCi needs to move more than one register it cannot use a specific
+  helper frame. It would simply be impossible to create a helper for all
+  possible combinations of register values. Instead, there are generic helper
+  stack frames that use a call_info word that describes the active registers
+  and the number of stack words used by the arguments of a call.
+
+  These helper stack frames are currently:
+
+      - stg_ret_t:    return a tuple to the continuation at the top of
+                          the stack
+      - stg_ctoi_t:   convert a tuple return value to be used in
+                          bytecode
+      - stg_primcall: call a function
+
+
+  The call_info word contains a bitmap of the active registers
+  for the call and and a stack offset. The layout is as follows:
+
+      - bit 0-23:  Bitmap of active registers for the call, the
+                   order corresponds to the list returned by
+                   allArgRegsCover. For example if bit 0 (the least
+                   significant bit) is set, the first register in the
+                   allArgRegsCover list is active. Bit 1 for the
+                   second register in the list and so on.
+
+      - bit 24-31: Unsigned byte indicating the stack offset
+                   of the continuation in words. For tuple returns
+                   this is the number of words returned on the
+                   stack. For primcalls this field is unused, since
+                   we don't jump to a continuation.
+
+    The upper 32 bits on 64 bit platforms are currently unused.
+
+    If a register is smaller than a word on the stack (for example a
+    single precision float on a 64 bit system), then the stack slot
+    is padded to a whole word.
+
+    Example:
+
+        If a tuple is returned in three registers and an additional two
+        words on the stack, then three bits in the register bitmap
+        (bits 0-23) would be set. And bit 24-31 would be
+        00000010 (two in binary).
+
+        The values on the stack before a call to POP_ARG_REGS would
+        be as follows:
+
+            ...
+            continuation
+            stack_arg_1
+            stack_arg_2
+            register_arg_3
+            register_arg_2
+            register_arg_1 <- Sp
+
+        A call to POP_ARG_REGS(call_info) would move register_arg_1
+        to the register corresponding to the lowest set bit in the
+        call_info word. register_arg_2 would be moved to the register
+        corresponding to the second lowest set bit, and so on.
+
+        After POP_ARG_REGS(call_info), the stack pointer Sp points
+        to the topmost stack argument, so the stack looks as follows:
+
+            ...
+            continuation
+            stack_arg_1
+            stack_arg_2 <- Sp
+
+        At this point all the arguments are in place and we are ready
+        to jump to the continuation, the location (offset from Sp) of
+        which is found by inspecting the value of bits 24-31. In this
+        case the offset is two words.
+
+    On x86_64, the double precision (Dn) and single precision
+    floating (Fn) point registers overlap, e.g. D1 uses the same
+    physical register as F1. On this platform, the list returned
+    by allArgRegsCover contains only entries for the double
+    precision registers. If an argument is passed in register
+    Fn, the bit corresponding to Dn should be set.
+
+  Note: if anything changes in how registers for native calls overlap,
+           make sure to also update GHC.StgToByteCode.layoutNativeCall
+ -}
+
+-- Like realArgRegsCover but always includes the node. This covers all real
+-- and virtual registers actually used for passing arguments.
+
+allArgRegsCover :: Platform -> [GlobalReg]
+allArgRegsCover platform =
   nub (VanillaReg 1 VGcPtr : realArgRegsCover platform)
diff --git a/compiler/GHC/Cmm/Parser.y b/compiler/GHC/Cmm/Parser.y
index dbb2e47030..35d8e4c40f 100644
--- a/compiler/GHC/Cmm/Parser.y
+++ b/compiler/GHC/Cmm/Parser.y
@@ -1233,8 +1233,8 @@ stmtMacros = listToUFM [
   ( fsLit "SAVE_REGS",             \[] -> emitSaveRegs ),
   ( fsLit "RESTORE_REGS",          \[] -> emitRestoreRegs ),
 
-  ( fsLit "PUSH_TUPLE_REGS",      \[live_regs] -> emitPushTupleRegs live_regs ),
-  ( fsLit "POP_TUPLE_REGS",       \[live_regs] -> emitPopTupleRegs live_regs ),
+  ( fsLit "PUSH_ARG_REGS",         \[live_regs] -> emitPushArgRegs live_regs ),
+  ( fsLit "POP_ARG_REGS",          \[live_regs] -> emitPopArgRegs live_regs ),
 
   ( fsLit "LDV_ENTER",             \[e] -> ldvEnter e ),
   ( fsLit "LDV_RECORD_CREATE",     \[e] -> ldvRecordCreate e ),
diff --git a/compiler/GHC/Cmm/Reg.hs b/compiler/GHC/Cmm/Reg.hs
index a9b3fce101..104702f312 100644
--- a/compiler/GHC/Cmm/Reg.hs
+++ b/compiler/GHC/Cmm/Reg.hs
@@ -223,7 +223,7 @@ instance Eq GlobalReg where
    _r1 == _r2 = False
 
 -- NOTE: this Ord instance affects the tuple layout in GHCi, see
---       Note [GHCi tuple layout]
+--       Note [GHCi and native call registers]
 instance Ord GlobalReg where
    compare (VanillaReg i _) (VanillaReg j _) = compare i j
      -- Ignore type when seeking clashes
diff --git a/compiler/GHC/StgToByteCode.hs b/compiler/GHC/StgToByteCode.hs
index b59cbfe779..de37d987cb 100644
--- a/compiler/GHC/StgToByteCode.hs
+++ b/compiler/GHC/StgToByteCode.hs
@@ -58,7 +58,7 @@ import GHC.Data.FastString
 import GHC.Utils.Panic
 import GHC.Utils.Panic.Plain
 import GHC.Utils.Exception (evaluate)
-import GHC.StgToCmm.Closure ( NonVoid(..), fromNonVoid, nonVoidIds )
+import GHC.StgToCmm.Closure ( NonVoid(..), fromNonVoid, nonVoidIds, argPrimRep )
 import GHC.StgToCmm.Layout
 import GHC.Runtime.Heap.Layout hiding (WordOff, ByteOff, wordsToBytes)
 import GHC.Data.Bitmap
@@ -464,10 +464,10 @@ returnUnliftedReps d s szb reps = do
              [rep] -> return (unitOL $ RETURN_UNLIFTED (toArgRep platform rep))
              -- otherwise use RETURN_TUPLE with a tuple descriptor
              nv_reps -> do
-               let (tuple_info, args_offsets) = layoutTuple profile 0 (primRepCmmType platform) nv_reps
+               let (call_info, args_offsets) = layoutNativeCall profile NativeTupleReturn 0 (primRepCmmType platform) nv_reps
                    args_ptrs = map (\(rep, off) -> (isFollowableArg (toArgRep platform rep), off)) args_offsets
-               tuple_bco <- emitBc (tupleBCO platform tuple_info args_ptrs)
-               return $ PUSH_UBX (mkTupleInfoLit platform tuple_info) 1 `consOL`
+               tuple_bco <- emitBc (tupleBCO platform call_info args_ptrs)
+               return $ PUSH_UBX (mkNativeCallInfoLit platform call_info) 1 `consOL`
                         PUSH_BCO tuple_bco `consOL`
                         unitOL RETURN_TUPLE
     return ( mkSlideB platform szb (d - s) -- clear to sequel
@@ -484,7 +484,11 @@ returnUnboxedTuple d s p es = do
     profile <- getProfile
     let platform = profilePlatform profile
         arg_ty e = primRepCmmType platform (atomPrimRep e)
-        (tuple_info, tuple_components) = layoutTuple profile d arg_ty es
+        (call_info, tuple_components) = layoutNativeCall profile
+                                                         NativeTupleReturn
+                                                         d
+                                                         arg_ty
+                                                         es
         go _   pushes [] = return (reverse pushes)
         go !dd pushes ((a, off):cs) = do (push, szb) <- pushAtom dd p a
                                          massert (off == dd + szb)
@@ -492,7 +496,7 @@ returnUnboxedTuple d s p es = do
     pushes <- go d [] tuple_components
     ret <- returnUnliftedReps d
                               s
-                              (wordsToBytes platform $ tupleSize tuple_info)
+                              (wordsToBytes platform $ nativeCallSize call_info)
                               (map atomPrimRep es)
     return (mconcat pushes `appOL` ret)
 
@@ -648,14 +652,14 @@ schemeT d s p app
    -- Case 1
 schemeT d s p (StgOpApp (StgFCallOp (CCall ccall_spec) _ty) args result_ty)
    = if isSupportedCConv ccall_spec
-      then generateCCall d s p ccall_spec result_ty (reverse args)
+      then generateCCall d s p ccall_spec result_ty args
       else unsupportedCConvException
 
 schemeT d s p (StgOpApp (StgPrimOp op) args _ty)
    = doTailCall d s p (primOpId op) (reverse args)
 
-schemeT _d _s _p (StgOpApp StgPrimCallOp{} _args _ty)
-   = unsupportedCConvException
+schemeT d s p (StgOpApp (StgPrimCallOp (PrimCall label unit)) args result_ty)
+   = generatePrimCall d s p label (Just unit) result_ty args
 
    -- Case 2: Unboxed tuple
 schemeT d s p (StgConApp con _cn args _tys)
@@ -840,18 +844,18 @@ doCase d s p scrut bndr alts
                              | ubx_frame       = wordSize platform
                              | otherwise       = 0
 
-        (bndr_size, tuple_info, args_offsets)
+        (bndr_size, call_info, args_offsets)
            | ubx_tuple_frame =
                let bndr_ty = primRepCmmType platform
                    bndr_reps = filter (not.isVoidRep) (bcIdPrimReps bndr)
-                   (tuple_info, args_offsets) =
-                       layoutTuple profile 0 bndr_ty bndr_reps
-               in ( wordsToBytes platform (tupleSize tuple_info)
-                  , tuple_info
+                   (call_info, args_offsets) =
+                       layoutNativeCall profile NativeTupleReturn 0 bndr_ty bndr_reps
+               in ( wordsToBytes platform (nativeCallSize call_info)
+                  , call_info
                   , args_offsets
                   )
            | otherwise = ( wordsToBytes platform (idSizeW platform bndr)
-                         , voidTupleInfo
+                         , voidTupleReturnInfo
                          , []
                          )
 
@@ -885,17 +889,18 @@ doCase d s p scrut bndr alts
            | isUnboxedTupleType bndr_ty || isUnboxedSumType bndr_ty =
              let bndr_ty = primRepCmmType platform . bcIdPrimRep
                  tuple_start = d_bndr
-                 (tuple_info, args_offsets) =
-                   layoutTuple profile
-                               0
-                               bndr_ty
-                               bndrs
+                 (call_info, args_offsets) =
+                   layoutNativeCall profile
+                                    NativeTupleReturn
+                                    0
+                                    bndr_ty
+                                    bndrs
 
                  stack_bot = d_alts
 
                  p' = Map.insertList
                         [ (arg, tuple_start -
-                                wordsToBytes platform (tupleSize tuple_info) +
+                                wordsToBytes platform (nativeCallSize call_info) +
                                 offset)
                         | (arg, offset) <- args_offsets
                         , not (isVoidRep $ bcIdPrimRep arg)]
@@ -981,8 +986,8 @@ doCase d s p scrut bndr alts
 
         -- unboxed tuples get two more words, the second is a pointer (tuple_bco)
         (extra_pointers, extra_slots)
-           | ubx_tuple_frame && profiling = ([1], 3) -- tuple_info, tuple_BCO, CCCS
-           | ubx_tuple_frame              = ([1], 2) -- tuple_info, tuple_BCO
+           | ubx_tuple_frame && profiling = ([1], 3) -- call_info, tuple_BCO, CCCS
+           | ubx_tuple_frame              = ([1], 2) -- call_info, tuple_BCO
            | otherwise                    = ([], 0)
 
         bitmap_size = trunc16W $ fromIntegral extra_slots +
@@ -1028,8 +1033,8 @@ doCase d s p scrut bndr alts
               let args_ptrs =
                     map (\(rep, off) -> (isFollowableArg (toArgRep platform rep), off))
                         args_offsets
-              tuple_bco <- emitBc (tupleBCO platform tuple_info args_ptrs)
-              return (PUSH_ALTS_TUPLE alt_bco' tuple_info tuple_bco
+              tuple_bco <- emitBc (tupleBCO platform call_info args_ptrs)
+              return (PUSH_ALTS_TUPLE alt_bco' call_info tuple_bco
                       `consOL` scrut_code)
        else let push_alts
                   | not ubx_frame
@@ -1050,14 +1055,15 @@ doCase d s p scrut bndr alts
 -- The native calling convention uses registers for tuples, but in the
 -- bytecode interpreter, all values live on the stack.
 
-layoutTuple :: Profile
-            -> ByteOff
-            -> (a -> CmmType)
-            -> [a]
-            -> ( TupleInfo      -- See Note [GHCi TupleInfo]
-               , [(a, ByteOff)] -- argument, offset on stack
-               )
-layoutTuple profile start_off arg_ty reps =
+layoutNativeCall :: Profile
+                 -> NativeCallType
+                 -> ByteOff
+                 -> (a -> CmmType)
+                 -> [a]
+                 -> ( NativeCallInfo      -- See Note [GHCi TupleInfo]
+                    , [(a, ByteOff)] -- argument, offset on stack
+                    )
+layoutNativeCall profile call_type start_off arg_ty reps =
   let platform = profilePlatform profile
       (orig_stk_bytes, pos) = assignArgumentsPos profile
                                                  0
@@ -1070,7 +1076,7 @@ layoutTuple profile start_off arg_ty reps =
 
       -- sort the register parameters by register and add them to the stack
       regs_order :: Map.Map GlobalReg Int
-      regs_order = Map.fromList $ zip (tupleRegsCover platform) [0..]
+      regs_order = Map.fromList $ zip (allArgRegsCover platform) [0..]
 
       reg_order :: GlobalReg -> (Int, GlobalReg)
       reg_order reg | Just n <- Map.lookup reg regs_order = (n, reg)
@@ -1099,10 +1105,11 @@ layoutTuple profile start_off arg_ty reps =
       get_byte_off _                 =
           panic "GHC.StgToByteCode.layoutTuple get_byte_off"
 
-  in ( TupleInfo
-         { tupleSize        = bytesToWords platform (ByteOff new_stk_bytes)
-         , tupleRegs        = regs_set
-         , tupleNativeStackSize = bytesToWords platform
+  in ( NativeCallInfo
+         { nativeCallType           = call_type
+         , nativeCallSize           = bytesToWords platform (ByteOff new_stk_bytes)
+         , nativeCallRegs           = regs_set
+         , nativeCallStackSpillSize = bytesToWords platform
                                                (ByteOff orig_stk_bytes)
          }
      , sortBy (comparing snd) $
@@ -1127,7 +1134,7 @@ usePlainReturn t
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   We have the bytecode instructions RETURN_TUPLE and PUSH_ALTS_TUPLE to
   return and receive arbitrary unboxed tuples, respectively. These
-  instructions use the helper data tuple_BCO and tuple_info.
+  instructions use the helper data tuple_BCO and call_info.
 
   The helper data is used to convert tuples between GHCs native calling
   convention (object code), which uses stack and registers, and the bytecode
@@ -1139,7 +1146,7 @@ usePlainReturn t
   =================
 
   Bytecode that returns a tuple first pushes all the tuple fields followed
-  by the appropriate tuple_info and tuple_BCO onto the stack. It then
+  by the appropriate call_info and tuple_BCO onto the stack. It then
   executes the RETURN_TUPLE instruction, which causes the interpreter
   to push stg_ret_t_info to the top of the stack. The stack (growing down)
   then looks as follows:
@@ -1150,14 +1157,14 @@ usePlainReturn t
       tuple_field_2
       ...
       tuple_field_n
-      tuple_info
+      call_info
       tuple_BCO
       stg_ret_t_info <- Sp
 
   If next_frame is bytecode, the interpreter will start executing it. If
   it's object code, the interpreter jumps back to the scheduler, which in
   turn jumps to stg_ret_t. stg_ret_t converts the tuple to the native
-  calling convention using the description in tuple_info, and then jumps
+  calling convention using the description in call_info, and then jumps
   to next_frame.
 
 
@@ -1169,13 +1176,13 @@ usePlainReturn t
   tuple. The PUSH_ALTS_TUPLE instuction contains three pieces of data:
 
      * cont_BCO: the continuation that receives the tuple
-     * tuple_info: see below
+     * call_info: see below
      * tuple_BCO: see below
 
   The interpreter pushes these onto the stack when the PUSH_ALTS_TUPLE
   instruction is executed, followed by stg_ctoi_tN_info, with N depending
   on the number of stack words used by the tuple in the GHC native calling
-  convention. N is derived from tuple_info.
+  convention. N is derived from call_info.
 
   For example if we expect a tuple with three words on the stack, the stack
   looks as follows after PUSH_ALTS_TUPLE:
@@ -1186,7 +1193,7 @@ usePlainReturn t
       cont_free_var_2
       ...
       cont_free_var_n
-      tuple_info
+      call_info
       tuple_BCO
       cont_BCO
       stg_ctoi_t3_info <- Sp
@@ -1206,20 +1213,20 @@ usePlainReturn t
   that is already on the stack.
 
 
-  The tuple_info word
+  The call_info word
   ===================
 
-  The tuple_info word describes the stack and STG register (e.g. R1..R6,
-  D1..D6) usage for the tuple. tuple_info contains enough information to
+  The call_info word describes the stack and STG register (e.g. R1..R6,
+  D1..D6) usage for the tuple. call_info contains enough information to
   convert the tuple between the stack-only bytecode and stack+registers
   GHC native calling conventions.
 
-  See Note [GHCi tuple layout] for more details of how the data is packed
-  in a single word.
+  See Note [GHCi and native call registers] for more details of how the
+  data is packed in a single word.
 
  -}
 
-tupleBCO :: Platform -> TupleInfo -> [(Bool, ByteOff)] -> [FFIInfo] -> ProtoBCO Name
+tupleBCO :: Platform -> NativeCallInfo -> [(Bool, ByteOff)] -> [FFIInfo] -> ProtoBCO Name
 tupleBCO platform info pointers =
   mkProtoBCO platform invented_name body_code (Left [])
              0{-no arity-} bitmap_size bitmap False{-is alts-}
@@ -1233,15 +1240,103 @@ tupleBCO platform info pointers =
     -}
     invented_name  = mkSystemVarName (mkPseudoUniqueE 0) (fsLit "tuple")
 
-    -- the first word in the frame is the tuple_info word,
+    -- the first word in the frame is the call_info word,
     -- which is not a pointer
-    bitmap_size = trunc16W $ 1 + tupleSize info
+    bitmap_size = trunc16W $ 1 + nativeCallSize info
     bitmap      = intsToReverseBitmap platform (fromIntegral bitmap_size) $
                   map ((+1) . fromIntegral . bytesToWords platform . snd)
                       (filter fst pointers)
     body_code = mkSlideW 0 1          -- pop frame header
                 `snocOL` RETURN_TUPLE -- and add it again
 
+primCallBCO ::  Platform -> NativeCallInfo -> [(Bool, ByteOff)] -> [FFIInfo] -> ProtoBCO Name
+primCallBCO platform args_info pointers =
+  mkProtoBCO platform invented_name body_code (Left [])
+             0{-no arity-} bitmap_size bitmap False{-is alts-}
+
+  where
+    {-
+      The primcall BCO is never referred to by name, so we can get away
+      with using a fake name here. We will need to change this if we want
+      to save some memory by sharing the BCO between places that have
+      the same tuple shape
+    -}
+    invented_name  = mkSystemVarName (mkPseudoUniqueE 0) (fsLit "primcall")
+
+    -- the first three words in the frame are the BCO describing the
+    -- pointers in the frame, the call_info word and the pointer
+    -- to the Cmm function being called. None of these is a pointer that
+    -- should be followed by the garbage collector
+    bitmap_size = trunc16W $ 2 + nativeCallSize args_info
+    bitmap      = intsToReverseBitmap platform (fromIntegral bitmap_size) $
+                  map ((+2) . fromIntegral . bytesToWords platform . snd)
+                      (filter fst pointers)
+    -- if the primcall BCO is ever run it's a bug, since the BCO should only
+    -- be pushed immediately before running the PRIMCALL bytecode instruction,
+    -- which immediately leaves the interpreter to jump to the stg_primcall_info
+    -- Cmm function
+    body_code =  unitOL CASEFAIL
+
+-- -----------------------------------------------------------------------------
+-- Deal with a primitive call to native code.
+
+generatePrimCall
+    :: StackDepth
+    -> Sequel
+    -> BCEnv
+    -> CLabelString          -- where to call
+    -> Maybe Unit
+    -> Type
+    -> [StgArg]              -- args (atoms)
+    -> BcM BCInstrList
+generatePrimCall d s p target _mb_unit _result_ty args
+ = do
+     profile <- getProfile
+     let
+         platform = profilePlatform profile
+
+         non_void VoidRep = False
+         non_void _       = True
+
+         nv_args :: [StgArg]
+         nv_args = filter (non_void . argPrimRep) args
+
+         (args_info, args_offsets) =
+              layoutNativeCall profile
+                               NativePrimCall
+                               d
+                               (primRepCmmType platform . argPrimRep)
+                               nv_args
+
+         args_ptrs :: [(Bool, ByteOff)]
+         args_ptrs =
+            map (\(r, off) ->
+                  (isFollowableArg (toArgRep platform . argPrimRep $ r), off))
+                args_offsets
+
+         push_target = PUSH_UBX (LitLabel target Nothing IsFunction) 1
+         push_info = PUSH_UBX (mkNativeCallInfoLit platform args_info) 1
+         {-
+            compute size to move payload (without stg_primcall_info header)
+
+            size of arguments plus three words for:
+                - function pointer to the target
+                - call_info word
+                - BCO to describe the stack frame
+          -}
+         szb = wordsToBytes platform (nativeCallSize args_info + 3)
+         go _   pushes [] = return (reverse pushes)
+         go !dd pushes ((a, off):cs) = do (push, szb) <- pushAtom dd p a
+                                          massert (off == dd + szb)
+                                          go (dd + szb) (push:pushes) cs
+     push_args <- go d [] args_offsets
+     args_bco <- emitBc (primCallBCO platform args_info args_ptrs)
+     return $ mconcat push_args `appOL`
+              (push_target `consOL`
+               push_info `consOL`
+               PUSH_BCO args_bco `consOL`
+               (mkSlideB platform szb (d - s) `appOL` unitOL PRIMCALL))
+
 -- -----------------------------------------------------------------------------
 -- Deal with a CCall.
 
@@ -1259,11 +1354,17 @@ generateCCall
     -> Type
     -> [StgArg]              -- args (atoms)
     -> BcM BCInstrList
-generateCCall d0 s p (CCallSpec target cconv safety) result_ty args_r_to_l
+generateCCall d0 s p (CCallSpec target PrimCallConv _) result_ty args
+ | (StaticTarget _ label mb_unit _) <- target
+ = generatePrimCall d0 s p label mb_unit result_ty args
+ | otherwise
+ = panic "GHC.StgToByteCode.generateCCall: primcall convention only supports static targets"
+generateCCall d0 s p (CCallSpec target cconv safety) result_ty args
  = do
      profile <- getProfile
 
      let
+         args_r_to_l = reverse args
          platform = profilePlatform profile
          -- useful constants
          addr_size_b :: ByteOff
@@ -2007,7 +2108,7 @@ isSupportedCConv :: CCallSpec -> Bool
 isSupportedCConv (CCallSpec _ cconv _) = case cconv of
    CCallConv            -> True     -- we explicitly pattern match on every
    StdCallConv          -> True     -- convention to ensure that a warning
-   PrimCallConv         -> False    -- is triggered when a new one is added
+   PrimCallConv         -> True     -- is triggered when a new one is added
    JavaScriptCallConv   -> False
    CApiConv             -> True
 
diff --git a/compiler/GHC/StgToCmm/Foreign.hs b/compiler/GHC/StgToCmm/Foreign.hs
index e71c418530..95b7d1c5fd 100644
--- a/compiler/GHC/StgToCmm/Foreign.hs
+++ b/compiler/GHC/StgToCmm/Foreign.hs
@@ -15,8 +15,8 @@ module GHC.StgToCmm.Foreign (
   emitLoadThreadState,
   emitSaveRegs,
   emitRestoreRegs,
-  emitPushTupleRegs,
-  emitPopTupleRegs,
+  emitPushArgRegs,
+  emitPopArgRegs,
   loadThreadState,
   emitOpenNursery,
   emitCloseNursery,
@@ -349,7 +349,7 @@ emitRestoreRegs = do
 -- bytecode interpreter.
 --
 -- The "live registers" bitmap corresponds to the list of registers given by
--- 'tupleRegsCover', with the least significant bit indicating liveness of
+-- 'allArgRegsCover', with the least significant bit indicating liveness of
 -- the first register in the list.
 --
 -- Each register is saved to a stack slot of one or more machine words, even
@@ -362,12 +362,12 @@ emitRestoreRegs = do
 --    if((mask & 2) != 0) { Sp_adj(-1); Sp(0) = R2; }
 --    if((mask & 1) != 0) { Sp_adj(-1); Sp(0) = R1; }
 --
--- See Note [GHCi tuple layout]
+-- See Note [GHCi and native call registers]
 
-emitPushTupleRegs :: CmmExpr -> FCode ()
-emitPushTupleRegs regs_live = do
+emitPushArgRegs :: CmmExpr -> FCode ()
+emitPushArgRegs regs_live = do
   platform <- getPlatform
-  let regs = zip (tupleRegsCover platform) [0..]
+  let regs = zip (allArgRegsCover platform) [0..]
       save_arg (reg, n) =
         let mask     = CmmLit (CmmInt (1 `shiftL` n) (wordWidth platform))
             live     = cmmAndWord platform regs_live mask
@@ -381,11 +381,11 @@ emitPushTupleRegs regs_live = do
         in mkCmmIfThen cond $ catAGraphs [adj_sp, save_reg]
   emit . catAGraphs =<< mapM save_arg (reverse regs)
 
--- | Pop a subset of STG registers from the stack (see 'emitPushTupleRegs')
-emitPopTupleRegs :: CmmExpr -> FCode ()
-emitPopTupleRegs regs_live = do
+-- | Pop a subset of STG registers from the stack (see 'emitPushArgRegs')
+emitPopArgRegs :: CmmExpr -> FCode ()
+emitPopArgRegs regs_live = do
   platform <- getPlatform
-  let regs = zip (tupleRegsCover platform) [0..]
+  let regs = zip (allArgRegsCover platform) [0..]
       save_arg (reg, n) =
         let mask     = CmmLit (CmmInt (1 `shiftL` n) (wordWidth platform))
             live     = cmmAndWord platform regs_live mask