summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuite Stegeman <stegeman@gmail.com>2021-01-22 00:09:17 +0100
committerMarge Bot <ben+marge-bot@smart-cactus.org>2021-03-20 07:49:15 -0400
commit1f94e0f7601f8e22fdd81a47f130650265a44196 (patch)
treed06d02317049b56763b2f1da27f71f3663efa5a0
parent7de3532f0317032f75b76150c5d3a6f76178be04 (diff)
downloadhaskell-1f94e0f7601f8e22fdd81a47f130650265a44196.tar.gz
Generate GHCi bytecode from STG instead of Core and support unboxed
tuples and sums. fixes #1257
-rw-r--r--compiler/GHC/ByteCode/Asm.hs101
-rw-r--r--compiler/GHC/ByteCode/Instr.hs98
-rw-r--r--compiler/GHC/ByteCode/Types.hs61
-rw-r--r--compiler/GHC/Cmm/Expr.hs2
-rw-r--r--compiler/GHC/Core/Lint.hs11
-rw-r--r--compiler/GHC/CoreToStg.hs34
-rw-r--r--compiler/GHC/CoreToStg/Prep.hs2
-rw-r--r--compiler/GHC/Driver/Backend.hs6
-rw-r--r--compiler/GHC/Driver/Main.hs72
-rw-r--r--compiler/GHC/Driver/Make.hs30
-rw-r--r--compiler/GHC/Runtime/Eval.hs2
-rw-r--r--compiler/GHC/Stg/Lint.hs8
-rw-r--r--compiler/GHC/Stg/Pipeline.hs7
-rw-r--r--compiler/GHC/Stg/Syntax.hs15
-rw-r--r--compiler/GHC/StgToByteCode.hs (renamed from compiler/GHC/CoreToByteCode.hs)1146
-rw-r--r--compiler/ghc.cabal.in2
-rw-r--r--includes/rts/Bytecodes.h3
-rw-r--r--includes/stg/MiscClosures.h72
-rw-r--r--libraries/ghci/GHCi/BreakArray.hs2
-rw-r--r--rts/Disassembler.c16
-rw-r--r--rts/Interpreter.c143
-rw-r--r--rts/Printer.c60
-rw-r--r--rts/RtsSymbols.c2
-rw-r--r--rts/StgMiscClosures.cmm268
-rw-r--r--testsuite/tests/ghci/T16670/T16670_unboxed.hs8
-rw-r--r--testsuite/tests/ghci/prog014/prog014.T1
-rw-r--r--testsuite/tests/ghci/should_run/UnboxedTuples/ByteCode.hs17
-rw-r--r--testsuite/tests/ghci/should_run/UnboxedTuples/Common.hs-incl368
-rw-r--r--testsuite/tests/ghci/should_run/UnboxedTuples/Obj.hs17
-rw-r--r--testsuite/tests/ghci/should_run/UnboxedTuples/UnboxedTuples.hs182
-rw-r--r--testsuite/tests/ghci/should_run/UnboxedTuples/UnboxedTuples.stdout43
-rw-r--r--testsuite/tests/ghci/should_run/UnboxedTuples/unboxedtuples.T10
32 files changed, 2177 insertions, 632 deletions
diff --git a/compiler/GHC/ByteCode/Asm.hs b/compiler/GHC/ByteCode/Asm.hs
index 3f88187960..c58328f57c 100644
--- a/compiler/GHC/ByteCode/Asm.hs
+++ b/compiler/GHC/ByteCode/Asm.hs
@@ -9,10 +9,10 @@
-- | Bytecode assembler and linker
module GHC.ByteCode.Asm (
assembleBCOs, assembleOneBCO,
-
bcoFreeNames,
SizedSeq, sizeSS, ssElts,
- iNTERP_STACK_CHECK_THRESH
+ iNTERP_STACK_CHECK_THRESH,
+ mkTupleInfoLit
) where
#include "HsVersions.h"
@@ -27,7 +27,7 @@ import GHC.ByteCode.InfoTable
import GHC.ByteCode.Types
import GHCi.RemoteTypes
import GHC.Runtime.Interpreter
-import GHC.Runtime.Heap.Layout
+import GHC.Runtime.Heap.Layout hiding ( WordOff )
import GHC.Types.Name
import GHC.Types.Name.Set
@@ -381,6 +381,16 @@ assembleI platform i = case i of
-> do let ul_bco = assembleBCO platform proto
p <- ioptr (liftM BCOPtrBCO ul_bco)
emit (push_alts pk) [Op p]
+ PUSH_ALTS_TUPLE proto tuple_info tuple_proto
+ -> do let ul_bco = assembleBCO platform proto
+ ul_tuple_bco = assembleBCO platform
+ tuple_proto
+ p <- ioptr (liftM BCOPtrBCO ul_bco)
+ p_tup <- ioptr (liftM BCOPtrBCO ul_tuple_bco)
+ info <- int (fromIntegral $
+ mkTupleInfoSig tuple_info)
+ emit bci_PUSH_ALTS_T
+ [Op p, Op info, Op p_tup]
PUSH_PAD8 -> emit bci_PUSH_PAD8 []
PUSH_PAD16 -> emit bci_PUSH_PAD16 []
PUSH_PAD32 -> emit bci_PUSH_PAD32 []
@@ -439,6 +449,7 @@ assembleI platform i = case i of
ENTER -> emit bci_ENTER []
RETURN -> emit bci_RETURN []
RETURN_UBX rep -> emit (return_ubx rep) []
+ RETURN_TUPLE -> emit bci_RETURN_T []
CCALL off m_addr i -> do np <- addr m_addr
emit bci_CCALL [SmallOp off, Op np, SmallOp i]
BRK_FUN index uniq cc -> do p1 <- ptr BCOPtrBreakArray
@@ -516,6 +527,90 @@ return_ubx V16 = error "return_ubx: vector"
return_ubx V32 = error "return_ubx: vector"
return_ubx V64 = error "return_ubx: vector"
+{-
+ we can only handle up to a fixed number of words on the stack,
+ because we need a stg_ctoi_tN stack frame for each size N. See
+ Note [unboxed tuple bytecodes and tuple_BCO].
+
+ If needed, you can support larger tuples by adding more in
+ StgMiscClosures.cmm, Interpreter.c and MiscClosures.h and
+ raising this limit.
+
+ Note that the limit is the number of words passed on the stack.
+ If the calling convention passes part of the tuple in registers, the
+ maximum number of tuple elements may be larger. Elements can also
+ take multiple words on the stack (for example Double# on a 32 bit
+ platform).
+
+ -}
+maxTupleNativeStackSize :: WordOff
+maxTupleNativeStackSize = 62
+
+{-
+ Maximum number of supported registers for returning tuples.
+
+  If GHC uses more than these (because of a change in the calling
+ convention or a new platform) mkTupleInfoSig will panic.
+
+ You can raise the limits after modifying stg_ctoi_t and stg_ret_t
+ (StgMiscClosures.cmm) to save and restore the additional registers.
+ -}
+maxTupleVanillaRegs, maxTupleFloatRegs, maxTupleDoubleRegs,
+ maxTupleLongRegs :: Int
+maxTupleVanillaRegs = 6
+maxTupleFloatRegs = 6
+maxTupleDoubleRegs = 6
+maxTupleLongRegs = 1
+
+{-
+ Construct the tuple_info word that stg_ctoi_t and stg_ret_t use
+ to convert a tuple between the native calling convention and the
+ interpreter.
+
+ See Note [GHCi tuple layout] for more information.
+ -}
+mkTupleInfoSig :: TupleInfo -> Word32
+mkTupleInfoSig ti@TupleInfo{..}
+ | tupleNativeStackSize > maxTupleNativeStackSize =
+ pprPanic "mkTupleInfoSig: tuple too big for the bytecode compiler"
+ (ppr tupleNativeStackSize <+> text "stack words." <+>
+ text "Use -fobject-code to get around this limit"
+ )
+ | tupleVanillaRegs `shiftR` maxTupleVanillaRegs /= 0 =
+ pprPanic "mkTupleInfoSig: too many vanilla registers" (ppr tupleVanillaRegs)
+ | tupleLongRegs `shiftR` maxTupleLongRegs /= 0 =
+ pprPanic "mkTupleInfoSig: too many long registers" (ppr tupleLongRegs)
+ | tupleFloatRegs `shiftR` maxTupleFloatRegs /= 0 =
+ pprPanic "mkTupleInfoSig: too many float registers" (ppr tupleFloatRegs)
+ | tupleDoubleRegs `shiftR` maxTupleDoubleRegs /= 0 =
+ pprPanic "mkTupleInfoSig: too many double registers" (ppr tupleDoubleRegs)
+ {-
+ Check that we can pack the register counts/bitmaps and stack size
+ in the information word. In particular we check that each component
+ fits in the bits we have reserved for it.
+
+ This overlaps with some of the above checks. It's likely that if the
+ number of registers changes, the number of bits will also need to be
+ updated.
+ -}
+ | tupleNativeStackSize < 16384 && -- 14 bits stack usage
+ tupleDoubleRegs < 64 && -- 6 bit bitmap (these can be shared with float)
+ tupleFloatRegs < 64 && -- 6 bit bitmap (these can be shared with double)
+ tupleLongRegs < 4 && -- 2 bit bitmap
+ tupleVanillaRegs < 65536 && -- 4 bit count (tupleVanillaRegs is still a bitmap)
+ -- check that there are no "holes", i.e. that R1..Rn are all in use
+ tupleVanillaRegs .&. (tupleVanillaRegs + 1) == 0
+ = fromIntegral tupleNativeStackSize .|.
+ unRegBitmap (tupleLongRegs `shiftL` 14) .|.
+ unRegBitmap (tupleDoubleRegs `shiftL` 16) .|.
+ unRegBitmap (tupleFloatRegs `shiftL` 22) .|.
+ fromIntegral (countTrailingZeros (1 + tupleVanillaRegs) `shiftL` 28)
+ | otherwise = pprPanic "mkTupleInfoSig: unsupported tuple shape" (ppr ti)
+
+mkTupleInfoLit :: Platform -> TupleInfo -> Literal
+mkTupleInfoLit platform tuple_info =
+ mkLitWord platform . fromIntegral $ mkTupleInfoSig tuple_info
+
-- Make lists of host-sized words for literals, so that when the
-- words are placed in memory at increasing addresses, the
-- bit pattern is correct for the host's word size and endianness.
diff --git a/compiler/GHC/ByteCode/Instr.hs b/compiler/GHC/ByteCode/Instr.hs
index a8cc569548..5b0b20e38d 100644
--- a/compiler/GHC/ByteCode/Instr.hs
+++ b/compiler/GHC/ByteCode/Instr.hs
@@ -1,4 +1,5 @@
-{-# LANGUAGE CPP #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE FlexibleContexts #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}
--
-- (c) The University of Glasgow 2002-2006
@@ -17,22 +18,19 @@ import GHC.ByteCode.Types
import GHCi.RemoteTypes
import GHCi.FFI (C_ffi_cif)
import GHC.StgToCmm.Layout ( ArgRep(..) )
-import GHC.Core.Ppr
import GHC.Utils.Outputable
-import GHC.Data.FastString
import GHC.Types.Name
import GHC.Types.Unique
-import GHC.Types.Id
-import GHC.Core
import GHC.Types.Literal
import GHC.Core.DataCon
-import GHC.Types.Var.Set
import GHC.Builtin.PrimOps
import GHC.Runtime.Heap.Layout
import Data.Word
import GHC.Stack.CCS (CostCentre)
+import GHC.Stg.Syntax
+
-- ----------------------------------------------------------------------------
-- Bytecode instructions
@@ -45,7 +43,7 @@ data ProtoBCO a
protoBCOBitmapSize :: Word16,
protoBCOArity :: Int,
-- what the BCO came from, for debugging only
- protoBCOExpr :: Either [AnnAlt Id DVarSet] (AnnExpr Id DVarSet),
+ protoBCOExpr :: Either [CgStgAlt] CgStgRhs,
-- malloc'd pointers
protoBCOFFIs :: [FFIInfo]
}
@@ -91,6 +89,9 @@ data BCInstr
-- Push an alt continuation
| PUSH_ALTS (ProtoBCO Name)
| PUSH_ALTS_UNLIFTED (ProtoBCO Name) ArgRep
+ | PUSH_ALTS_TUPLE (ProtoBCO Name) -- continuation
+ !TupleInfo
+ (ProtoBCO Name) -- tuple return BCO
-- Pushing 8, 16 and 32 bits of padding (for constructors).
| PUSH_PAD8
@@ -173,8 +174,9 @@ data BCInstr
-- To Infinity And Beyond
| ENTER
- | RETURN -- return a lifted value
+ | RETURN -- return a lifted value
| RETURN_UBX ArgRep -- return an unlifted value, here's its rep
+ | RETURN_TUPLE -- return an unboxed tuple (info already on stack)
-- Breakpoints
| BRK_FUN Word16 Unique (RemotePtr CostCentre)
@@ -193,36 +195,45 @@ instance Outputable a => Outputable (ProtoBCO a) where
= (text "ProtoBCO" <+> ppr name <> char '#' <> int arity
<+> text (show ffis) <> colon)
$$ nest 3 (case origin of
- Left alts -> vcat (zipWith (<+>) (char '{' : repeat (char ';'))
- (map (pprCoreAltShort.deAnnAlt) alts)) <+> char '}'
- Right rhs -> pprCoreExprShort (deAnnotate rhs))
+ Left alts ->
+ vcat (zipWith (<+>) (char '{' : repeat (char ';'))
+ (map (pprStgAltShort shortStgPprOpts) alts))
+ Right rhs ->
+ pprStgRhsShort shortStgPprOpts rhs
+ )
$$ nest 3 (text "bitmap: " <+> text (show bsize) <+> ppr bitmap)
$$ nest 3 (vcat (map ppr instrs))
--- Print enough of the Core expression to enable the reader to find
--- the expression in the -ddump-prep output. That is, we need to
+-- Print enough of the STG expression to enable the reader to find
+-- the expression in the -ddump-stg output. That is, we need to
-- include at least a binder.
-pprCoreExprShort :: CoreExpr -> SDoc
-pprCoreExprShort expr@(Lam _ _)
- = let
- (bndrs, _) = collectBinders expr
- in
- char '\\' <+> sep (map (pprBndr LambdaBind) bndrs) <+> arrow <+> text "..."
-
-pprCoreExprShort (Case _expr var _ty _alts)
- = text "case of" <+> ppr var
-
-pprCoreExprShort (Let (NonRec x _) _) = text "let" <+> ppr x <+> ptext (sLit ("= ... in ..."))
-pprCoreExprShort (Let (Rec bs) _) = text "let {" <+> ppr (fst (head bs)) <+> ptext (sLit ("= ...; ... } in ..."))
-
-pprCoreExprShort (Tick t e) = ppr t <+> pprCoreExprShort e
-pprCoreExprShort (Cast e _) = pprCoreExprShort e <+> text "`cast` T"
+pprStgExprShort :: OutputablePass pass => StgPprOpts -> GenStgExpr pass -> SDoc
+pprStgExprShort _ (StgCase _expr var _ty _alts) =
+ text "case of" <+> ppr var
+pprStgExprShort _ (StgLet _ bnd _) =
+ text "let" <+> pprStgBindShort bnd <+> text "in ..."
+pprStgExprShort _ (StgLetNoEscape _ bnd _) =
+ text "let-no-escape" <+> pprStgBindShort bnd <+> text "in ..."
+pprStgExprShort opts (StgTick t e) = ppr t <+> pprStgExprShort opts e
+pprStgExprShort opts e = pprStgExpr opts e
+
+pprStgBindShort :: OutputablePass pass => GenStgBinding pass -> SDoc
+pprStgBindShort (StgNonRec x _) =
+ ppr x <+> text "= ..."
+pprStgBindShort (StgRec bs) =
+ char '{' <+> ppr (fst (head bs)) <+> text "= ...; ... }"
+
+pprStgAltShort :: OutputablePass pass => StgPprOpts -> GenStgAlt pass -> SDoc
+pprStgAltShort opts (con, args, expr) =
+ ppr con <+> sep (map ppr args) <+> text "->" <+> pprStgExprShort opts expr
+
+pprStgRhsShort :: OutputablePass pass => StgPprOpts -> GenStgRhs pass -> SDoc
+pprStgRhsShort opts (StgRhsClosure _ext _cc upd_flag args body) =
+ hang (hsep [ char '\\' <> ppr upd_flag, brackets (interppSP args) ])
+ 4 (pprStgExprShort opts body)
+pprStgRhsShort opts rhs = pprStgRhs opts rhs
-pprCoreExprShort e = pprCoreExpr e
-
-pprCoreAltShort :: CoreAlt -> SDoc
-pprCoreAltShort (Alt con args expr) = ppr con <+> sep (map ppr args) <+> text "->" <+> pprCoreExprShort expr
instance Outputable BCInstr where
ppr (STKCHECK n) = text "STKCHECK" <+> ppr n
@@ -239,8 +250,13 @@ instance Outputable BCInstr where
ppr (PUSH_PRIMOP op) = text "PUSH_G " <+> text "GHC.PrimopWrappers."
<> ppr op
ppr (PUSH_BCO bco) = hang (text "PUSH_BCO") 2 (ppr bco)
+
ppr (PUSH_ALTS bco) = hang (text "PUSH_ALTS") 2 (ppr bco)
ppr (PUSH_ALTS_UNLIFTED bco pk) = hang (text "PUSH_ALTS_UNLIFTED" <+> ppr pk) 2 (ppr bco)
+ ppr (PUSH_ALTS_TUPLE bco tuple_info tuple_bco) =
+ hang (text "PUSH_ALTS_TUPLE" <+> ppr tuple_info)
+ 2
+ (ppr tuple_bco $+$ ppr bco)
ppr PUSH_PAD8 = text "PUSH_PAD8"
ppr PUSH_PAD16 = text "PUSH_PAD16"
@@ -297,8 +313,11 @@ instance Outputable BCInstr where
ppr ENTER = text "ENTER"
ppr RETURN = text "RETURN"
ppr (RETURN_UBX pk) = text "RETURN_UBX " <+> ppr pk
+ ppr (RETURN_TUPLE) = text "RETURN_TUPLE"
ppr (BRK_FUN index uniq _cc) = text "BRK_FUN" <+> ppr index <+> ppr uniq <+> text "<cc>"
+
+
-- -----------------------------------------------------------------------------
-- The stack use, in words, of each bytecode insn. These _must_ be
-- correct, or overestimates of reality, to be safe.
@@ -326,8 +345,16 @@ bciStackUse PUSH32_W{} = 1 -- takes exactly 1 word
bciStackUse PUSH_G{} = 1
bciStackUse PUSH_PRIMOP{} = 1
bciStackUse PUSH_BCO{} = 1
-bciStackUse (PUSH_ALTS bco) = 2 + protoBCOStackUse bco
-bciStackUse (PUSH_ALTS_UNLIFTED bco _) = 2 + protoBCOStackUse bco
+bciStackUse (PUSH_ALTS bco) = 2 {- profiling only, restore CCCS -} +
+ 3 + protoBCOStackUse bco
+bciStackUse (PUSH_ALTS_UNLIFTED bco _) = 2 {- profiling only, restore CCCS -} +
+ 4 + protoBCOStackUse bco
+bciStackUse (PUSH_ALTS_TUPLE bco info _) =
+ -- (tuple_bco, tuple_info word, cont_bco, stg_ctoi_t)
+ -- tuple
+ -- (tuple_info, tuple_bco, stg_ret_t)
+ 1 {- profiling only -} +
+ 7 + fromIntegral (tupleSize info) + protoBCOStackUse bco
bciStackUse (PUSH_PAD8) = 1 -- overapproximation
bciStackUse (PUSH_PAD16) = 1 -- overapproximation
bciStackUse (PUSH_PAD32) = 1 -- overapproximation on 64bit arch
@@ -365,7 +392,8 @@ bciStackUse CASEFAIL{} = 0
bciStackUse JMP{} = 0
bciStackUse ENTER{} = 0
bciStackUse RETURN{} = 0
-bciStackUse RETURN_UBX{} = 1
+bciStackUse RETURN_UBX{} = 1 -- pushes stg_ret_X for some X
+bciStackUse RETURN_TUPLE{} = 1 -- pushes stg_ret_t header
bciStackUse CCALL{} = 0
bciStackUse SWIZZLE{} = 0
bciStackUse BRK_FUN{} = 0
diff --git a/compiler/GHC/ByteCode/Types.hs b/compiler/GHC/ByteCode/Types.hs
index 97304cb7f4..02c117d716 100644
--- a/compiler/GHC/ByteCode/Types.hs
+++ b/compiler/GHC/ByteCode/Types.hs
@@ -6,7 +6,11 @@
-- | Bytecode assembler types
module GHC.ByteCode.Types
- ( CompiledByteCode(..), seqCompiledByteCode, FFIInfo(..)
+ ( CompiledByteCode(..), seqCompiledByteCode
+ , FFIInfo(..)
+ , RegBitmap(..)
+ , TupleInfo(..), voidTupleInfo
+ , ByteOff(..), WordOff(..)
, UnlinkedBCO(..), BCOPtr(..), BCONPtr(..)
, ItblEnv, ItblPtr(..)
, CgBreakInfo(..)
@@ -68,6 +72,61 @@ seqCompiledByteCode CompiledByteCode{..} =
rnf bc_strs `seq`
rnf (fmap seqModBreaks bc_breaks)
+newtype ByteOff = ByteOff Int
+ deriving (Enum, Eq, Show, Integral, Num, Ord, Real, Outputable)
+
+newtype WordOff = WordOff Int
+ deriving (Enum, Eq, Show, Integral, Num, Ord, Real, Outputable)
+
+newtype RegBitmap = RegBitmap { unRegBitmap :: Word32 }
+ deriving (Enum, Eq, Show, Integral, Num, Ord, Real, Bits, FiniteBits, Outputable)
+
+{- Note [GHCi TupleInfo]
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+ This contains the data we need for passing unboxed tuples between
+ bytecode and native code
+
+ In general we closely follow the native calling convention that
+ GHC uses for unboxed tuples, but we don't use any registers in
+ bytecode. All tuple elements are expanded to use a full register
+ or a full word on the stack.
+
+ The position of tuple elements that are returned on the stack in
+ the native calling convention is unchanged when returning the same
+ tuple in bytecode.
+
+ The order of the remaining elements is determined by the register in
+ which they would have been returned, rather than by their position in
+ the tuple in the Haskell source code. This makes jumping between bytecode
+ and native code easier: A map of live registers is enough to convert the
+ tuple.
+
+ See GHC.StgToByteCode.layoutTuple for more details.
+-}
+data TupleInfo = TupleInfo
+ { tupleSize :: !WordOff -- total size of tuple in words
+ , tupleVanillaRegs :: !RegBitmap -- vanilla registers used
+ , tupleLongRegs :: !RegBitmap -- long registers used
+ , tupleFloatRegs :: !RegBitmap -- float registers used
+ , tupleDoubleRegs :: !RegBitmap -- double registers used
+ , tupleNativeStackSize :: !WordOff {- words spilled on the stack by
+ GHCs native calling convention -}
+ } deriving (Show)
+
+instance Outputable TupleInfo where
+ ppr TupleInfo{..} = text "<size" <+> ppr tupleSize <+>
+ text "stack" <+> ppr tupleNativeStackSize <+>
+ text "regs" <+>
+ char 'R' <> ppr tupleVanillaRegs <+>
+ char 'L' <> ppr tupleLongRegs <+>
+ char 'F' <> ppr tupleFloatRegs <+>
+ char 'D' <> ppr tupleDoubleRegs <>
+ char '>'
+
+voidTupleInfo :: TupleInfo
+voidTupleInfo = TupleInfo 0 0 0 0 0 0
+
type ItblEnv = NameEnv (Name, ItblPtr)
-- We need the Name in the range so we know which
-- elements to filter out when unloading a module
diff --git a/compiler/GHC/Cmm/Expr.hs b/compiler/GHC/Cmm/Expr.hs
index 2c68439dc0..86b06271d1 100644
--- a/compiler/GHC/Cmm/Expr.hs
+++ b/compiler/GHC/Cmm/Expr.hs
@@ -522,6 +522,8 @@ instance Eq GlobalReg where
PicBaseReg == PicBaseReg = True
_r1 == _r2 = False
+-- NOTE: this Ord instance affects the tuple layout in GHCi, see
+-- Note [GHCi tuple layout]
instance Ord GlobalReg where
compare (VanillaReg i _) (VanillaReg j _) = compare i j
-- Ignore type when seeking clashes
diff --git a/compiler/GHC/Core/Lint.hs b/compiler/GHC/Core/Lint.hs
index 7edc0d7a28..a3ea0bb1d3 100644
--- a/compiler/GHC/Core/Lint.hs
+++ b/compiler/GHC/Core/Lint.hs
@@ -17,6 +17,8 @@ module GHC.Core.Lint (
lintPassResult, lintInteractiveExpr, lintExpr,
lintAnnots, lintAxioms,
+ interactiveInScope,
+
-- ** Debug output
endPass, endPassIO,
displayLintResults, dumpPassResult,
@@ -379,7 +381,7 @@ lintPassResult hsc_env pass binds
| not (gopt Opt_DoCoreLinting dflags)
= return ()
| otherwise
- = do { let warns_and_errs = lintCoreBindings dflags pass (interactiveInScope hsc_env) binds
+ = do { let warns_and_errs = lintCoreBindings dflags pass (interactiveInScope $ hsc_IC hsc_env) binds
; Err.showPass logger dflags ("Core Linted result of " ++ showPpr dflags pass)
; displayLintResults logger dflags (showLintWarnings pass) (ppr pass)
(pprCoreBindings binds) warns_and_errs }
@@ -432,7 +434,7 @@ lintInteractiveExpr :: SDoc -- ^ The source of the linted expression
lintInteractiveExpr what hsc_env expr
| not (gopt Opt_DoCoreLinting dflags)
= return ()
- | Just err <- lintExpr dflags (interactiveInScope hsc_env) expr
+ | Just err <- lintExpr dflags (interactiveInScope $ hsc_IC hsc_env) expr
= displayLintResults logger dflags False what (pprCoreExpr expr) (emptyBag, err)
| otherwise
= return ()
@@ -440,7 +442,7 @@ lintInteractiveExpr what hsc_env expr
dflags = hsc_dflags hsc_env
logger = hsc_logger hsc_env
-interactiveInScope :: HscEnv -> [Var]
+interactiveInScope :: InteractiveContext -> [Var]
-- In GHCi we may lint expressions, or bindings arising from 'deriving'
-- clauses, that mention variables bound in the interactive context.
-- These are Local things (see Note [Interactively-bound Ids in GHCi] in GHC.Runtime.Context).
@@ -452,11 +454,10 @@ interactiveInScope :: HscEnv -> [Var]
-- so this is a (cheap) no-op.
--
-- See #8215 for an example
-interactiveInScope hsc_env
+interactiveInScope ictxt
= tyvars ++ ids
where
-- C.f. GHC.Tc.Module.setInteractiveContext, Desugar.deSugarExpr
- ictxt = hsc_IC hsc_env
(cls_insts, _fam_insts) = ic_instances ictxt
te1 = mkTypeEnvWithImplicits (ic_tythings ictxt)
te = extendTypeEnvWithIds te1 (map instanceDFunId cls_insts)
diff --git a/compiler/GHC/CoreToStg.hs b/compiler/GHC/CoreToStg.hs
index b1397fe4e1..d8a6dd0e95 100644
--- a/compiler/GHC/CoreToStg.hs
+++ b/compiler/GHC/CoreToStg.hs
@@ -1,4 +1,6 @@
-{-# LANGUAGE CPP, DeriveFunctor #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE TypeFamilies #-}
--
-- (c) The GRASP/AQUA Project, Glasgow University, 1993-1998
@@ -414,13 +416,9 @@ coreToStgExpr expr@(Lam _ _)
text "Unexpected value lambda:" $$ ppr expr
coreToStgExpr (Tick tick expr)
- = do stg_tick <- case tick of
- HpcTick m i -> return (HpcTick m i)
- ProfNote cc cnt sc -> return (ProfNote cc cnt sc)
- SourceNote span nm -> return (SourceNote span nm)
- Breakpoint{} ->
- panic "coreToStgExpr: breakpoint should not happen"
- expr2 <- coreToStgExpr expr
+ = do
+ let !stg_tick = coreToStgTick (exprType expr) tick
+ !expr2 <- coreToStgExpr expr
return (StgTick stg_tick expr2)
coreToStgExpr (Cast expr _)
@@ -570,12 +568,8 @@ coreToStgApp f args ticks = do
TickBoxOpId {} -> pprPanic "coreToStg TickBox" $ ppr (f,args')
_other -> StgApp f args'
- convert_tick (Breakpoint _ bid fvs) = res_ty `seq` Breakpoint res_ty bid fvs
- convert_tick (HpcTick m i) = HpcTick m i
- convert_tick (SourceNote span nm) = SourceNote span nm
- convert_tick (ProfNote cc cnt scope) = ProfNote cc cnt scope
add_tick !t !e = StgTick t e
- tapp = foldr add_tick app (map convert_tick ticks ++ ticks')
+ tapp = foldr add_tick app (map (coreToStgTick res_ty) ticks ++ ticks')
-- Forcing these fixes a leak in the code generator, noticed while
-- profiling for trac #4367
@@ -601,12 +595,7 @@ coreToStgArgs (Coercion _ : args) -- Coercion argument; See Note [Coercion token
coreToStgArgs (Tick t e : args)
= ASSERT( not (tickishIsCode t) )
do { (args', ts) <- coreToStgArgs (e : args)
- ; let convert_tick (Breakpoint _ bid fvs) =
- let !ty = exprType e in Breakpoint ty bid fvs
- convert_tick (HpcTick m i) = HpcTick m i
- convert_tick (SourceNote span nm) = SourceNote span nm
- convert_tick (ProfNote cc cnt scope) = ProfNote cc cnt scope
- !t' = convert_tick t
+ ; let !t' = coreToStgTick (exprType e) t
; return (args', t':ts) }
coreToStgArgs (arg : args) = do -- Non-type argument
@@ -639,6 +628,13 @@ coreToStgArgs (arg : args) = do -- Non-type argument
WARN( bad_args, text "Dangerous-looking argument. Probable cause: bad unsafeCoerce#" $$ ppr arg )
return (stg_arg : stg_args, ticks ++ aticks)
+coreToStgTick :: Type -- type of the ticked expression
+ -> CoreTickish
+ -> StgTickish
+coreToStgTick _ty (HpcTick m i) = HpcTick m i
+coreToStgTick _ty (SourceNote span nm) = SourceNote span nm
+coreToStgTick _ty (ProfNote cc cnt scope) = ProfNote cc cnt scope
+coreToStgTick !ty (Breakpoint _ bid fvs) = Breakpoint ty bid fvs
-- ---------------------------------------------------------------------------
-- The magic for lets:
diff --git a/compiler/GHC/CoreToStg/Prep.hs b/compiler/GHC/CoreToStg/Prep.hs
index 956175b3ad..af94cb92d7 100644
--- a/compiler/GHC/CoreToStg/Prep.hs
+++ b/compiler/GHC/CoreToStg/Prep.hs
@@ -964,7 +964,7 @@ no further floating will occur. This allows us to safely inline things like
GHC.Magic. This definition is used in cases where runRW is curried.
* In addition to its normal Haskell definition in GHC.Magic, we give it
- a special late inlining here in CorePrep and GHC.CoreToByteCode, avoiding
+ a special late inlining here in CorePrep and GHC.StgToByteCode, avoiding
the incorrect sharing due to float-out noted above.
* It is levity-polymorphic:
diff --git a/compiler/GHC/Driver/Backend.hs b/compiler/GHC/Driver/Backend.hs
index 845a5f36c0..39789607d9 100644
--- a/compiler/GHC/Driver/Backend.hs
+++ b/compiler/GHC/Driver/Backend.hs
@@ -67,10 +67,10 @@ data Backend
-- Produce ByteCode objects (BCO, see "GHC.ByteCode") that
-- can be interpreted. It is used by GHCi.
--
- -- Currently some extensions are not supported (unboxed
- -- tuples/sums, foreign primops).
+ -- Currently some extensions are not supported
+ -- (foreign primops).
--
- -- See "GHC.CoreToByteCode"
+ -- See "GHC.StgToByteCode"
| NoBackend -- ^ No code generated.
diff --git a/compiler/GHC/Driver/Main.hs b/compiler/GHC/Driver/Main.hs
index a910cdf23f..50e5a0a067 100644
--- a/compiler/GHC/Driver/Main.hs
+++ b/compiler/GHC/Driver/Main.hs
@@ -114,7 +114,7 @@ import GHC.Hs.Stats ( ppSourceStats )
import GHC.HsToCore
-import GHC.CoreToByteCode ( byteCodeGen, coreExprToBCOs )
+import GHC.StgToByteCode ( byteCodeGen, stgExprToBCOs )
import GHC.IfaceToCore ( typecheckIface )
@@ -132,6 +132,8 @@ import GHC.Core
import GHC.Core.Tidy ( tidyExpr )
import GHC.Core.Type ( Type, Kind )
import GHC.Core.Lint ( lintInteractiveExpr )
+import GHC.Core.Multiplicity
+import GHC.Core.Utils ( exprType )
import GHC.Core.ConLike
import GHC.Core.Opt.Pipeline
import GHC.Core.TyCon
@@ -156,6 +158,7 @@ import GHC.Stg.Pipeline ( stg2stg )
import GHC.Builtin.Utils
import GHC.Builtin.Names
+import GHC.Builtin.Uniques ( mkPseudoUniqueE )
import qualified GHC.StgToCmm as StgToCmm ( codeGen )
import GHC.StgToCmm.Types (CgInfos (..), ModuleLFInfos)
@@ -1551,7 +1554,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do
withTiming logger dflags
(text "CoreToStg"<+>brackets (ppr this_mod))
(\(a, b, (c,d)) -> a `seqList` b `seq` c `seqList` d `seqList` ())
- (myCoreToStg logger dflags this_mod location prepd_binds)
+ (myCoreToStg logger dflags (hsc_IC hsc_env) this_mod location prepd_binds)
let cost_centre_info =
(S.toList local_ccs ++ caf_ccs, caf_cc_stacks)
@@ -1622,8 +1625,12 @@ hscInteractive hsc_env cgguts location = do
-- Do saturation and convert to A-normal form
(prepd_binds, _) <- {-# SCC "CorePrep" #-}
corePrepPgm hsc_env this_mod location core_binds data_tycons
+
+ (stg_binds, _infotable_prov, _caf_ccs__caf_cc_stacks)
+ <- {-# SCC "CoreToStg" #-}
+ myCoreToStg logger dflags (hsc_IC hsc_env) this_mod location prepd_binds
----------------- Generate byte code ------------------
- comp_bc <- byteCodeGen hsc_env this_mod prepd_binds data_tycons mod_breaks
+ comp_bc <- byteCodeGen hsc_env this_mod stg_binds data_tycons mod_breaks
------------------ Create f-x-dynamic C-side stuff -----
(_istub_h_exists, istub_c_exists)
<- outputForeignStubs logger tmpfs dflags (hsc_units hsc_env) this_mod location foreign_stubs
@@ -1760,22 +1767,43 @@ doCodeGen hsc_env this_mod denv data_tycons
return (Stream.mapM dump2 pipeline_stream)
-myCoreToStg :: Logger -> DynFlags -> Module -> ModLocation -> CoreProgram
+myCoreToStgExpr :: Logger -> DynFlags -> InteractiveContext
+ -> Module -> ModLocation -> CoreExpr
+ -> IO ( StgRhs
+ , InfoTableProvMap
+ , CollectedCCs )
+myCoreToStgExpr logger dflags ictxt this_mod ml prepd_expr = do
+ {- Create a temporary binding (just because myCoreToStg needs a
+ binding for the stg2stg step) -}
+ let bco_tmp_id = mkSysLocal (fsLit "BCO_toplevel")
+ (mkPseudoUniqueE 0)
+ Many
+ (exprType prepd_expr)
+ ([StgTopLifted (StgNonRec _ stg_expr)], prov_map, collected_ccs) <-
+ myCoreToStg logger
+ dflags
+ ictxt
+ this_mod
+ ml
+ [NonRec bco_tmp_id prepd_expr]
+ return (stg_expr, prov_map, collected_ccs)
+
+myCoreToStg :: Logger -> DynFlags -> InteractiveContext
+ -> Module -> ModLocation -> CoreProgram
-> IO ( [StgTopBinding] -- output program
, InfoTableProvMap
, CollectedCCs ) -- CAF cost centre info (declared and used)
-myCoreToStg logger dflags this_mod ml prepd_binds = do
+myCoreToStg logger dflags ictxt this_mod ml prepd_binds = do
let (stg_binds, denv, cost_centre_info)
= {-# SCC "Core2Stg" #-}
coreToStg dflags this_mod ml prepd_binds
stg_binds2
<- {-# SCC "Stg2Stg" #-}
- stg2stg logger dflags this_mod stg_binds
+ stg2stg logger dflags ictxt this_mod stg_binds
return (stg_binds2, denv, cost_centre_info)
-
{- **********************************************************************
%* *
\subsection{Compiling a do-statement}
@@ -1911,9 +1939,18 @@ hscParsedDecls hsc_env decls = runInteractiveHsc hsc_env $ do
(prepd_binds, _) <- {-# SCC "CorePrep" #-}
liftIO $ corePrepPgm hsc_env this_mod iNTERACTIVELoc core_binds data_tycons
+ (stg_binds, _infotable_prov, _caf_ccs__caf_cc_stacks)
+ <- {-# SCC "CoreToStg" #-}
+ liftIO $ myCoreToStg (hsc_logger hsc_env)
+ (hsc_dflags hsc_env)
+ (hsc_IC hsc_env)
+ this_mod
+ iNTERACTIVELoc
+ prepd_binds
+
{- Generate byte code -}
cbc <- liftIO $ byteCodeGen hsc_env this_mod
- prepd_binds data_tycons mod_breaks
+ stg_binds data_tycons mod_breaks
let src_span = srcLocSpan interactiveSrcLoc
liftIO $ loadDecls hsc_env src_span cbc
@@ -2077,10 +2114,25 @@ hscCompileCoreExpr' hsc_env srcspan ds_expr
{- Lint if necessary -}
; lintInteractiveExpr (text "hscCompileExpr") hsc_env prepd_expr
+ ; let iNTERACTIVELoc = ModLocation{ ml_hs_file = Nothing,
+ ml_hi_file = panic "hscCompileCoreExpr':ml_hi_file",
+ ml_obj_file = panic "hscCompileCoreExpr':ml_obj_file",
+ ml_hie_file = panic "hscCompileCoreExpr':ml_hie_file" }
+
+ ; let ictxt = hsc_IC hsc_env
+ ; (stg_expr, _, _) <-
+ myCoreToStgExpr (hsc_logger hsc_env)
+ (hsc_dflags hsc_env)
+ ictxt
+ (icInteractiveModule ictxt)
+ iNTERACTIVELoc
+ prepd_expr
{- Convert to BCOs -}
- ; bcos <- coreExprToBCOs hsc_env
- (icInteractiveModule (hsc_IC hsc_env)) prepd_expr
+ ; bcos <- stgExprToBCOs hsc_env
+ (icInteractiveModule ictxt)
+ (exprType prepd_expr)
+ stg_expr
{- load it -}
; loadExpr hsc_env srcspan bcos }
diff --git a/compiler/GHC/Driver/Make.hs b/compiler/GHC/Driver/Make.hs
index bd885d9042..20fb7ecc86 100644
--- a/compiler/GHC/Driver/Make.hs
+++ b/compiler/GHC/Driver/Make.hs
@@ -2267,7 +2267,6 @@ downsweep hsc_env old_summaries excl_mods allow_dup_roots
let tmpfs = hsc_tmpfs hsc_env
map1 <- case backend dflags of
NoBackend -> enableCodeGenForTH logger tmpfs home_unit default_backend map0
- Interpreter -> enableCodeGenForUnboxedTuplesOrSums logger tmpfs default_backend map0
_ -> return map0
if null errs
then pure $ concat $ modNodeMapElems map1
@@ -2377,33 +2376,8 @@ enableCodeGenForTH logger tmpfs home_unit =
-- can't compile anything anyway! See #16219.
isHomeUnitDefinite home_unit
--- | Update the every ModSummary that is depended on
--- by a module that needs unboxed tuples. We enable codegen to
--- the specified target, disable optimization and change the .hi
--- and .o file locations to be temporary files.
---
--- This is used in order to load code that uses unboxed tuples
--- or sums into GHCi while still allowing some code to be interpreted.
-enableCodeGenForUnboxedTuplesOrSums
- :: Logger
- -> TmpFs
- -> Backend
- -> ModNodeMap [Either ErrorMessages ExtendedModSummary]
- -> IO (ModNodeMap [Either ErrorMessages ExtendedModSummary])
-enableCodeGenForUnboxedTuplesOrSums logger tmpfs =
- enableCodeGenWhen logger tmpfs condition should_modify TFL_GhcSession TFL_CurrentModule
- where
- condition ms =
- unboxed_tuples_or_sums (ms_hspp_opts ms) &&
- not (gopt Opt_ByteCode (ms_hspp_opts ms)) &&
- (isBootSummary ms == NotBoot)
- unboxed_tuples_or_sums d =
- xopt LangExt.UnboxedTuples d || xopt LangExt.UnboxedSums d
- should_modify (ModSummary { ms_hspp_opts = dflags }) =
- backend dflags == Interpreter
-
--- | Helper used to implement 'enableCodeGenForTH' and
--- 'enableCodeGenForUnboxedTuples'. In particular, this enables
+-- | Helper used to implement 'enableCodeGenForTH'.
+-- In particular, this enables
-- unoptimized code generation for all modules that meet some
-- condition (first parameter), or are dependencies of those
-- modules. The second parameter is a condition to check before
diff --git a/compiler/GHC/Runtime/Eval.hs b/compiler/GHC/Runtime/Eval.hs
index db43ff74ac..e3ba232add 100644
--- a/compiler/GHC/Runtime/Eval.hs
+++ b/compiler/GHC/Runtime/Eval.hs
@@ -692,7 +692,7 @@ pushResume hsc_env resume = hsc_env { hsc_IC = ictxt1 }
There are 3 situations where items are removed from the Id list
(or replaced with `Nothing`):
- 1.) If function `GHC.CoreToByteCode.schemeER_wrk` (which creates
+ 1.) If function `GHC.StgToByteCode.schemeER_wrk` (which creates
the Id list) doesn't find an Id in the ByteCode environement.
2.) If function `GHC.Runtime.Eval.bindLocalsAtBreakpoint`
filters out unboxed elements from the Id list, because GHCi cannot
diff --git a/compiler/GHC/Stg/Lint.hs b/compiler/GHC/Stg/Lint.hs
index 1e12e9bab9..8464cb8786 100644
--- a/compiler/GHC/Stg/Lint.hs
+++ b/compiler/GHC/Stg/Lint.hs
@@ -42,6 +42,7 @@ import GHC.Prelude
import GHC.Stg.Syntax
import GHC.Driver.Session
+import GHC.Core.Lint ( interactiveInScope )
import GHC.Data.Bag ( Bag, emptyBag, isEmptyBag, snocBag, bagToList )
import GHC.Types.Basic ( TopLevelFlag(..), isTopLevel )
import GHC.Types.CostCentre ( isCurrentCCS )
@@ -57,6 +58,7 @@ import GHC.Types.SrcLoc
import GHC.Utils.Logger
import GHC.Utils.Outputable
import GHC.Unit.Module ( Module )
+import GHC.Runtime.Context ( InteractiveContext )
import qualified GHC.Utils.Error as Err
import Control.Applicative ((<|>))
import Control.Monad
@@ -64,13 +66,14 @@ import Control.Monad
lintStgTopBindings :: forall a . (OutputablePass a, BinderP a ~ Id)
=> Logger
-> DynFlags
+ -> InteractiveContext
-> Module -- ^ module being compiled
-> Bool -- ^ have we run Unarise yet?
-> String -- ^ who produced the STG?
-> [GenStgTopBinding a]
-> IO ()
-lintStgTopBindings logger dflags this_mod unarised whodunnit binds
+lintStgTopBindings logger dflags ictxt this_mod unarised whodunnit binds
= {-# SCC "StgLint" #-}
case initL this_mod unarised opts top_level_binds (lint_binds binds) of
Nothing ->
@@ -89,7 +92,8 @@ lintStgTopBindings logger dflags this_mod unarised whodunnit binds
opts = initStgPprOpts dflags
-- Bring all top-level binds into scope because CoreToStg does not generate
-- bindings in dependency order (so we may see a use before its definition).
- top_level_binds = mkVarSet (bindersOfTopBinds binds)
+ top_level_binds = extendVarSetList (mkVarSet (bindersOfTopBinds binds))
+ (interactiveInScope ictxt)
lint_binds :: [GenStgTopBinding a] -> LintM ()
diff --git a/compiler/GHC/Stg/Pipeline.hs b/compiler/GHC/Stg/Pipeline.hs
index c05450c0f7..d9f1342b66 100644
--- a/compiler/GHC/Stg/Pipeline.hs
+++ b/compiler/GHC/Stg/Pipeline.hs
@@ -24,6 +24,7 @@ import GHC.Stg.Unarise ( unarise )
import GHC.Stg.CSE ( stgCse )
import GHC.Stg.Lift ( stgLiftLams )
import GHC.Unit.Module ( Module )
+import GHC.Runtime.Context ( InteractiveContext )
import GHC.Driver.Session
import GHC.Utils.Error
@@ -49,11 +50,11 @@ runStgM mask (StgM m) = evalStateT m mask
stg2stg :: Logger
-> DynFlags -- includes spec of what stg-to-stg passes to do
+ -> InteractiveContext
-> Module -- module being compiled
-> [StgTopBinding] -- input program
-> IO [StgTopBinding] -- output program
-
-stg2stg logger dflags this_mod binds
+stg2stg logger dflags ictxt this_mod binds
= do { dump_when Opt_D_dump_stg_from_core "Initial STG:" binds
; showPass logger dflags "Stg2Stg"
-- Do the main business!
@@ -75,7 +76,7 @@ stg2stg logger dflags this_mod binds
where
stg_linter unarised
| gopt Opt_DoStgLinting dflags
- = lintStgTopBindings logger dflags this_mod unarised
+ = lintStgTopBindings logger dflags ictxt this_mod unarised
| otherwise
= \ _whodunnit _binds -> return ()
diff --git a/compiler/GHC/Stg/Syntax.hs b/compiler/GHC/Stg/Syntax.hs
index 03ba9b5549..6e2107e9d6 100644
--- a/compiler/GHC/Stg/Syntax.hs
+++ b/compiler/GHC/Stg/Syntax.hs
@@ -58,7 +58,8 @@ module GHC.Stg.Syntax (
bindersOf, bindersOfTop, bindersOfTopBinds,
-- ppr
- StgPprOpts(..), initStgPprOpts, panicStgPprOpts,
+ StgPprOpts(..), initStgPprOpts,
+ panicStgPprOpts, shortStgPprOpts,
pprStgArg, pprStgExpr, pprStgRhs, pprStgBinding,
pprGenStgTopBinding, pprStgTopBinding,
pprGenStgTopBindings, pprStgTopBindings
@@ -691,6 +692,13 @@ panicStgPprOpts = StgPprOpts
{ stgSccEnabled = True
}
+-- | STG pretty-printing options used for short messages
+shortStgPprOpts :: StgPprOpts
+shortStgPprOpts = StgPprOpts
+ { stgSccEnabled = False
+ }
+
+
pprGenStgTopBinding
:: OutputablePass pass => StgPprOpts -> GenStgTopBinding pass -> SDoc
pprGenStgTopBinding opts b = case b of
@@ -778,9 +786,10 @@ pprStgExpr opts e = case e of
, hang (text "} in ") 2 (pprStgExpr opts expr)
]
- StgTick tickish expr -> sdocOption sdocSuppressTicks $ \case
+ StgTick _tickish expr -> sdocOption sdocSuppressTicks $ \case
True -> pprStgExpr opts expr
- False -> sep [ ppr tickish, pprStgExpr opts expr ]
+ False -> pprStgExpr opts expr
+ -- XXX sep [ ppr tickish, pprStgExpr opts expr ]
-- Don't indent for a single case alternative.
StgCase expr bndr alt_type [alt]
diff --git a/compiler/GHC/CoreToByteCode.hs b/compiler/GHC/StgToByteCode.hs
index dbb64d51d5..e14de72eb5 100644
--- a/compiler/GHC/CoreToByteCode.hs
+++ b/compiler/GHC/StgToByteCode.hs
@@ -10,8 +10,8 @@
-- (c) The University of Glasgow 2002-2006
--
--- | GHC.CoreToByteCode: Generate bytecode from Core
-module GHC.CoreToByteCode ( UnlinkedBCO, byteCodeGen, coreExprToBCOs ) where
+-- | GHC.StgToByteCode: Generate bytecode from STG
+module GHC.StgToByteCode ( UnlinkedBCO, byteCodeGen, stgExprToBCOs ) where
#include "HsVersions.h"
@@ -24,6 +24,11 @@ import GHC.ByteCode.Instr
import GHC.ByteCode.Asm
import GHC.ByteCode.Types
+import GHC.Cmm.CallConv
+import GHC.Cmm.Expr
+import GHC.Cmm.Node
+import GHC.Cmm.Utils
+
import GHC.Platform
import GHC.Platform.Profile
@@ -36,12 +41,9 @@ import GHC.Types.Name
import GHC.Types.Id.Make
import GHC.Types.Id
import GHC.Types.ForeignCall
-import GHC.Core.Utils
import GHC.Core
-import GHC.Core.Ppr
import GHC.Types.Literal
import GHC.Builtin.PrimOps
-import GHC.Core.FVs
import GHC.Core.Type
import GHC.Types.RepType
import GHC.Core.DataCon
@@ -55,6 +57,7 @@ import GHC.Core.TyCo.Ppr ( pprType )
import GHC.Utils.Error
import GHC.Types.Unique
import GHC.Builtin.Uniques
+import GHC.Builtin.Utils ( primOpId )
import GHC.Data.FastString
import GHC.Utils.Panic
import GHC.StgToCmm.Closure ( NonVoid(..), fromNonVoid, nonVoidIds )
@@ -64,7 +67,6 @@ import GHC.Data.Bitmap
import GHC.Data.OrdList
import GHC.Data.Maybe
import GHC.Types.Var.Env
-import GHC.Builtin.Names ( unsafeEqualityProofName )
import GHC.Types.Tickish
import Data.List ( genericReplicate, genericLength, intersperse
@@ -89,35 +91,44 @@ import Data.Ord
import GHC.Stack.CCS
import Data.Either ( partitionEithers )
+import qualified GHC.Types.CostCentre as CC
+import GHC.Stg.Syntax
+import GHC.Stg.FVs
+
-- -----------------------------------------------------------------------------
-- Generating byte code for a complete module
byteCodeGen :: HscEnv
-> Module
- -> CoreProgram
+ -> [StgTopBinding]
-> [TyCon]
-> Maybe ModBreaks
-> IO CompiledByteCode
byteCodeGen hsc_env this_mod binds tycs mb_modBreaks
= withTiming logger dflags
- (text "GHC.CoreToByteCode"<+>brackets (ppr this_mod))
+ (text "GHC.StgToByteCode"<+>brackets (ppr this_mod))
(const ()) $ do
-- Split top-level binds into strings and others.
-- See Note [generating code for top-level string literal bindings].
- let (strings, flatBinds) = partitionEithers $ do -- list monad
- (bndr, rhs) <- flattenBinds binds
- return $ case exprIsTickedString_maybe rhs of
- Just str -> Left (bndr, str)
- _ -> Right (bndr, simpleFreeVars rhs)
+ let (strings, lifted_binds) = partitionEithers $ do -- list monad
+ bnd <- binds
+ case bnd of
+ StgTopLifted bnd -> [Right bnd]
+ StgTopStringLit b str -> [Left (b, str)]
+ flattenBind (StgNonRec b e) = [(b,e)]
+ flattenBind (StgRec bs) = bs
stringPtrs <- allocateTopStrings hsc_env strings
us <- mkSplitUniqSupply 'y'
(BcM_State{..}, proto_bcos) <-
- runBc hsc_env us this_mod mb_modBreaks (mkVarEnv stringPtrs) $
- mapM schemeTopBind flatBinds
+ runBc hsc_env us this_mod mb_modBreaks (mkVarEnv stringPtrs) $ do
+ prepd_binds <- mapM bcPrepBind lifted_binds
+ let flattened_binds =
+ concatMap (flattenBind . annBindingFreeVars) (reverse prepd_binds)
+ mapM schemeTopBind flattened_binds
when (notNull ffis)
- (panic "GHC.CoreToByteCode.byteCodeGen: missing final emitBc?")
+ (panic "GHC.StgToByteCode.byteCodeGen: missing final emitBc?")
dumpIfSet_dyn logger dflags Opt_D_dump_BCOs
"Proto-BCOs" FormatByteCode
@@ -168,27 +179,30 @@ literals:
-- Generating byte code for an expression
-- Returns: the root BCO for this expression
-coreExprToBCOs :: HscEnv
- -> Module
- -> CoreExpr
- -> IO UnlinkedBCO
-coreExprToBCOs hsc_env this_mod expr
+stgExprToBCOs :: HscEnv
+ -> Module
+ -> Type
+ -> StgRhs
+ -> IO UnlinkedBCO
+stgExprToBCOs hsc_env this_mod expr_ty expr
= withTiming logger dflags
- (text "GHC.CoreToByteCode"<+>brackets (ppr this_mod))
+ (text "GHC.StgToByteCode"<+>brackets (ppr this_mod))
(const ()) $ do
- -- create a totally bogus name for the top-level BCO; this
- -- should be harmless, since it's never used for anything
- let invented_name = mkSystemVarName (mkPseudoUniqueE 0) (fsLit "ExprTopLevel")
-- the uniques are needed to generate fresh variables when we introduce new
-- let bindings for ticked expressions
us <- mkSplitUniqSupply 'y'
(BcM_State _dflags _us _this_mod _final_ctr mallocd _ _ _, proto_bco)
- <- runBc hsc_env us this_mod Nothing emptyVarEnv $
- schemeR [] (invented_name, simpleFreeVars expr)
+ <- runBc hsc_env us this_mod Nothing emptyVarEnv $ do
+ prepd_expr <- annBindingFreeVars <$>
+ bcPrepBind (StgNonRec dummy_id expr)
+ case prepd_expr of
+ (StgNonRec _ cg_expr) -> schemeR [] (idName dummy_id, cg_expr)
+ _ ->
+ panic "GHC.StgByteCode.stgExprToBCOs"
when (notNull mallocd)
- (panic "GHC.CoreToByteCode.coreExprToBCOs: missing final emitBc?")
+ (panic "GHC.StgToByteCode.stgExprToBCOs: missing final emitBc?")
dumpIfSet_dyn logger dflags Opt_D_dump_BCOs "Proto-BCOs" FormatByteCode
(ppr proto_bco)
@@ -196,27 +210,110 @@ coreExprToBCOs hsc_env this_mod expr
assembleOneBCO hsc_env proto_bco
where dflags = hsc_dflags hsc_env
logger = hsc_logger hsc_env
-
--- The regular freeVars function gives more information than is useful to
--- us here. We need only the free variables, not everything in an FVAnn.
--- Historical note: At one point FVAnn was more sophisticated than just
--- a set. Now it isn't. So this function is much simpler. Keeping it around
--- so that if someone changes FVAnn, they will get a nice type error right
--- here.
-simpleFreeVars :: CoreExpr -> AnnExpr Id DVarSet
-simpleFreeVars = freeVars
+ -- we need an otherwise unused Id for bytecode generation
+ dummy_id = mkSysLocal (fsLit "BCO_toplevel")
+ (mkPseudoUniqueE 0)
+ Many
+ expr_ty
+{-
+ Prepare the STG for bytecode generation:
+
+ - Ensure that all breakpoints are directly under
+ a let-binding, introducing a new binding for
+ those that aren't already.
+
+ - Protect Not-necessarily lifted join points, see
+ Note [Not-necessarily-lifted join points]
+
+ -}
+
+bcPrepRHS :: StgRhs -> BcM StgRhs
+-- explicitly match all constructors so we get a warning if we miss any
+bcPrepRHS (StgRhsClosure fvs cc upd args (StgTick bp@Breakpoint{} expr)) = do
+ {- If we have a breakpoint directly under an StgRhsClosure we don't
+ need to introduce a new binding for it.
+ -}
+ expr' <- bcPrepExpr expr
+ pure (StgRhsClosure fvs cc upd args (StgTick bp expr'))
+bcPrepRHS (StgRhsClosure fvs cc upd args expr) =
+ StgRhsClosure fvs cc upd args <$> bcPrepExpr expr
+bcPrepRHS con@StgRhsCon{} = pure con
+
+bcPrepExpr :: StgExpr -> BcM StgExpr
+-- explicitly match all constructors so we get a warning if we miss any
+bcPrepExpr (StgTick bp@(Breakpoint tick_ty _ _) rhs)
+ | isLiftedTypeKind (typeKind tick_ty) = do
+ id <- newId tick_ty
+ rhs' <- bcPrepExpr rhs
+ let expr' = StgTick bp rhs'
+ bnd = StgNonRec id (StgRhsClosure noExtFieldSilent
+ CC.dontCareCCS
+ ReEntrant
+ []
+ expr'
+ )
+ letExp = StgLet noExtFieldSilent bnd (StgApp id [])
+ pure letExp
+ | otherwise = do
+ id <- newId (mkVisFunTyMany realWorldStatePrimTy tick_ty)
+ st <- newId realWorldStatePrimTy
+ rhs' <- bcPrepExpr rhs
+ let expr' = StgTick bp rhs'
+ bnd = StgNonRec id (StgRhsClosure noExtFieldSilent
+ CC.dontCareCCS
+ ReEntrant
+ [voidArgId]
+ expr'
+ )
+ pure $ StgLet noExtFieldSilent bnd (StgApp id [StgVarArg st])
+bcPrepExpr (StgTick tick rhs) =
+ StgTick tick <$> bcPrepExpr rhs
+bcPrepExpr (StgLet xlet bnds expr) =
+ StgLet xlet <$> bcPrepBind bnds
+ <*> bcPrepExpr expr
+bcPrepExpr (StgLetNoEscape xlne bnds expr) =
+ StgLet xlne <$> bcPrepBind bnds
+ <*> bcPrepExpr expr
+bcPrepExpr (StgCase expr bndr alt_type alts) =
+ StgCase <$> bcPrepExpr expr
+ <*> pure bndr
+ <*> pure alt_type
+ <*> mapM bcPrepAlt alts
+bcPrepExpr lit@StgLit{} = pure lit
+-- See Note [Not-necessarily-lifted join points], step 3.
+bcPrepExpr (StgApp x [])
+ | isNNLJoinPoint x = pure $
+ StgApp (protectNNLJoinPointId x) [StgVarArg voidPrimId]
+bcPrepExpr app@StgApp{} = pure app
+bcPrepExpr app@StgConApp{} = pure app
+bcPrepExpr app@StgOpApp{} = pure app
+
+bcPrepAlt :: StgAlt -> BcM StgAlt
+bcPrepAlt (ac, bndrs, expr) = (,,) ac bndrs <$> bcPrepExpr expr
+
+bcPrepBind :: StgBinding -> BcM StgBinding
+-- explicitly match all constructors so we get a warning if we miss any
+bcPrepBind (StgNonRec bndr rhs) =
+ let (bndr', rhs') = bcPrepSingleBind (bndr, rhs)
+ in StgNonRec bndr' <$> bcPrepRHS rhs'
+bcPrepBind (StgRec bnds) =
+ StgRec <$> mapM ((\(b,r) -> (,) b <$> bcPrepRHS r) . bcPrepSingleBind)
+ bnds
+
+bcPrepSingleBind :: (Id, StgRhs) -> (Id, StgRhs)
+-- If necessary, modify this Id and body to protect not-necessarily-lifted join points.
+-- See Note [Not-necessarily-lifted join points], step 2.
+bcPrepSingleBind (x, StgRhsClosure ext cc upd_flag args body)
+ | isNNLJoinPoint x
+ = ( protectNNLJoinPointId x
+ , StgRhsClosure ext cc upd_flag (args ++ [voidArgId]) body)
+bcPrepSingleBind bnd = bnd
-- -----------------------------------------------------------------------------
-- Compilation schema for the bytecode generator
type BCInstrList = OrdList BCInstr
-newtype ByteOff = ByteOff Int
- deriving (Enum, Eq, Integral, Num, Ord, Real)
-
-newtype WordOff = WordOff Int
- deriving (Enum, Eq, Integral, Num, Ord, Real)
-
wordsToBytes :: Platform -> WordOff -> ByteOff
wordsToBytes platform = fromIntegral . (* platformWordSizeInBytes platform) . fromIntegral
@@ -226,7 +323,7 @@ bytesToWords platform (ByteOff bytes) =
let (q, r) = bytes `quotRem` (platformWordSizeInBytes platform)
in if r == 0
then fromIntegral q
- else panic $ "GHC.CoreToByteCode.bytesToWords: bytes=" ++ show bytes
+ else panic $ "GHC.StgToByteCode.bytesToWords: bytes=" ++ show bytes
wordSize :: Platform -> ByteOff
wordSize platform = ByteOff (platformWordSizeInBytes platform)
@@ -246,7 +343,7 @@ ppBCEnv p
$$ nest 4 (vcat (map pp_one (sortBy cmp_snd (Map.toList p))))
$$ text "end-env"
where
- pp_one (var, offset) = int offset <> colon <+> ppr var <+> ppr (bcIdArgRep var)
+ pp_one (var, ByteOff offset) = int offset <> colon <+> ppr var <+> ppr (bcIdArgReps var)
cmp_snd x y = compare (snd x) (snd y)
-}
@@ -256,7 +353,7 @@ mkProtoBCO
:: Platform
-> name
-> BCInstrList
- -> Either [AnnAlt Id DVarSet] (AnnExpr Id DVarSet)
+ -> Either [CgStgAlt] (CgStgRhs)
-- ^ original expression; for debugging only
-> Int
-> Word16
@@ -315,12 +412,17 @@ argBits platform (rep : args)
| isFollowableArg rep = False : argBits platform args
| otherwise = take (argRepSizeW platform rep) (repeat True) ++ argBits platform args
+non_void :: [ArgRep] -> [ArgRep]
+non_void = filter nv
+ where nv V = False
+ nv _ = True
+
-- -----------------------------------------------------------------------------
-- schemeTopBind
-- Compile code for the right-hand side of a top-level binding
-schemeTopBind :: (Id, AnnExpr Id DVarSet) -> BcM (ProtoBCO Name)
+schemeTopBind :: (Id, CgStgRhs) -> BcM (ProtoBCO Name)
schemeTopBind (id, rhs)
| Just data_con <- isDataConWorkId_maybe id,
isNullaryRepDataCon data_con = do
@@ -351,44 +453,27 @@ schemeTopBind (id, rhs)
-- Park the resulting BCO in the monad. Also requires the
-- name of the variable to which this value was bound,
-- so as to give the resulting BCO a name.
-
schemeR :: [Id] -- Free vars of the RHS, ordered as they
-- will appear in the thunk. Empty for
-- top-level things, which have no free vars.
- -> (Name, AnnExpr Id DVarSet)
+ -> (Name, CgStgRhs)
-> BcM (ProtoBCO Name)
schemeR fvs (nm, rhs)
-{-
- | trace (showSDoc (
- (char ' '
- $$ (ppr.filter (not.isTyVar).dVarSetElems.fst) rhs
- $$ pprCoreExpr (deAnnotate rhs)
- $$ char ' '
- ))) False
- = undefined
- | otherwise
--}
= schemeR_wrk fvs nm rhs (collect rhs)
-- If an expression is a lambda (after apply bcView), return the
-- list of arguments to the lambda (in R-to-L order) and the
-- underlying expression
-collect :: AnnExpr Id DVarSet -> ([Var], AnnExpr' Id DVarSet)
-collect (_, e) = go [] e
- where
- go xs e | Just e' <- bcView e = go xs e'
- go xs (AnnLam x (_,e))
- | typePrimRep (idType x) `lengthExceeds` 1
- = multiValException
- | otherwise
- = go (x:xs) e
- go xs not_lambda = (reverse xs, not_lambda)
+
+collect :: CgStgRhs -> ([Var], CgStgExpr)
+collect (StgRhsClosure _ _ _ args body) = (args, body)
+collect (StgRhsCon _cc dc cnum _ticks args) = ([], StgConApp dc cnum args [])
schemeR_wrk
:: [Id]
-> Name
- -> AnnExpr Id DVarSet -- expression e, for debugging only
- -> ([Var], AnnExpr' Var DVarSet) -- result of collect on e
+ -> CgStgRhs -- expression e, for debugging only
+ -> ([Var], CgStgExpr) -- result of collect on e
-> BcM (ProtoBCO Name)
schemeR_wrk fvs nm original_body (args, body)
= do
@@ -417,17 +502,16 @@ schemeR_wrk fvs nm original_body (args, body)
arity bitmap_size bitmap False{-not alts-})
-- introduce break instructions for ticked expressions
-schemeER_wrk :: StackDepth -> BCEnv -> AnnExpr' Id DVarSet -> BcM BCInstrList
-schemeER_wrk d p rhs
- | AnnTick (Breakpoint _ext tick_no fvs) (_annot, newRhs) <- rhs
- = do code <- schemeE d 0 p newRhs
+schemeER_wrk :: StackDepth -> BCEnv -> CgStgExpr -> BcM BCInstrList
+schemeER_wrk d p (StgTick (Breakpoint tick_ty tick_no fvs) rhs)
+ = do code <- schemeE d 0 p rhs
cc_arr <- getCCArray
this_mod <- moduleName <$> getCurrentModule
platform <- profilePlatform <$> getProfile
let idOffSets = getVarOffSets platform d p fvs
let breakInfo = CgBreakInfo
{ cgb_vars = idOffSets
- , cgb_resty = exprType (deAnnotate' newRhs)
+ , cgb_resty = tick_ty
}
newBreakInfo tick_no breakInfo
hsc_env <- getHscEnv
@@ -437,7 +521,7 @@ schemeER_wrk d p rhs
| otherwise = toRemotePtr nullPtr
let breakInstr = BRK_FUN (fromIntegral tick_no) (getUnique this_mod) cc
return $ breakInstr `consOL` code
- | otherwise = schemeE d 0 p rhs
+schemeER_wrk d p rhs = schemeE d 0 p rhs
getVarOffSets :: Platform -> StackDepth -> BCEnv -> [Id] -> [Maybe (Id, Word16)]
getVarOffSets platform depth env = map getOffSet
@@ -469,7 +553,7 @@ trunc16B = truncIntegral16
trunc16W :: WordOff -> Word16
trunc16W = truncIntegral16
-fvsToEnv :: BCEnv -> DVarSet -> [Id]
+fvsToEnv :: BCEnv -> CgStgRhs -> [Id]
-- Takes the free variables of a right-hand side, and
-- delivers an ordered list of the local variables that will
-- be captured in the thunk for the RHS
@@ -478,93 +562,128 @@ fvsToEnv :: BCEnv -> DVarSet -> [Id]
--
-- The code that constructs the thunk, and the code that executes
-- it, have to agree about this layout
-fvsToEnv p fvs = [v | v <- dVarSetElems fvs,
- isId v, -- Could be a type variable
- v `Map.member` p]
+
+fvsToEnv p (StgRhsClosure fvs _ _ _ _) =
+ [v | v <- dVarSetElems fvs,
+ v `Map.member` p]
+fvsToEnv _ _ = []
-- -----------------------------------------------------------------------------
-- schemeE
+-- Returning an unlifted value.
+-- Heave it on the stack, SLIDE, and RETURN.
returnUnboxedAtom
:: StackDepth
-> Sequel
-> BCEnv
- -> AnnExpr' Id DVarSet
- -> ArgRep
+ -> StgArg
-> BcM BCInstrList
--- Returning an unlifted value.
--- Heave it on the stack, SLIDE, and RETURN.
-returnUnboxedAtom d s p e e_rep = do
- dflags <- getDynFlags
- let platform = targetPlatform dflags
+returnUnboxedAtom d s p e = do
+ let reps = case e of
+ StgLitArg lit -> typePrimRepArgs (literalType lit)
+ StgVarArg i -> bcIdPrimReps i
(push, szb) <- pushAtom d p e
- return (push -- value onto stack
- `appOL` mkSlideB platform szb (d - s) -- clear to sequel
- `snocOL` RETURN_UBX e_rep) -- go
+ ret <- returnUnboxedReps d s szb reps
+ return (push `appOL` ret)
+
+-- return an unboxed value from the top of the stack
+returnUnboxedReps
+ :: StackDepth
+ -> Sequel
+ -> ByteOff -- size of the thing we're returning
+ -> [PrimRep] -- representations
+ -> BcM BCInstrList
+returnUnboxedReps d s szb reps = do
+ profile <- getProfile
+ let platform = profilePlatform profile
+ non_void VoidRep = False
+ non_void _ = True
+ ret <- case filter non_void reps of
+ -- use RETURN_UBX for unary representations
+ [] -> return (unitOL $ RETURN_UBX V)
+ [rep] -> return (unitOL $ RETURN_UBX (toArgRep platform rep))
+ -- otherwise use RETURN_TUPLE with a tuple descriptor
+ nv_reps -> do
+ let (tuple_info, args_offsets) = layoutTuple profile 0 (primRepCmmType platform) nv_reps
+ args_ptrs = map (\(rep, off) -> (isFollowableArg (toArgRep platform rep), off)) args_offsets
+ tuple_bco <- emitBc (tupleBCO platform tuple_info args_ptrs)
+ return $ PUSH_UBX (mkTupleInfoLit platform tuple_info) 1 `consOL`
+ PUSH_BCO tuple_bco `consOL`
+ unitOL RETURN_TUPLE
+ return ( mkSlideB platform szb (d - s) -- clear to sequel
+ `appOL` ret) -- go
+
+-- construct and return an unboxed tuple
+returnUnboxedTuple
+ :: StackDepth
+ -> Sequel
+ -> BCEnv
+ -> [StgArg]
+ -> BcM BCInstrList
+returnUnboxedTuple d s p es = do
+ profile <- getProfile
+ let platform = profilePlatform profile
+ arg_ty e = primRepCmmType platform (atomPrimRep e)
+ (tuple_info, tuple_components) = layoutTuple profile d arg_ty es
+ go _ pushes [] = return (reverse pushes)
+ go !dd pushes ((a, off):cs) = do (push, szb) <- pushAtom dd p a
+ MASSERT(off == dd + szb)
+ go (dd + szb) (push:pushes) cs
+ pushes <- go d [] tuple_components
+ ret <- returnUnboxedReps d
+ s
+ (wordsToBytes platform $ tupleSize tuple_info)
+ (map atomPrimRep es)
+ return (mconcat pushes `appOL` ret)
-- Compile code to apply the given expression to the remaining args
-- on the stack, returning a HNF.
schemeE
- :: StackDepth -> Sequel -> BCEnv -> AnnExpr' Id DVarSet -> BcM BCInstrList
-schemeE d s p e
- | Just e' <- bcView e
- = schemeE d s p e'
-
+ :: StackDepth -> Sequel -> BCEnv -> CgStgExpr -> BcM BCInstrList
+schemeE d s p (StgLit lit) = returnUnboxedAtom d s p (StgLitArg lit)
+schemeE d s p (StgApp x [])
+ | isUnliftedType (idType x) = returnUnboxedAtom d s p (StgVarArg x)
-- Delegate tail-calls to schemeT.
-schemeE d s p e@(AnnApp _ _) = schemeT d s p e
-
-schemeE d s p e@(AnnLit lit) = do
- platform <- profilePlatform <$> getProfile
- returnUnboxedAtom d s p e (typeArgRep platform (literalType lit))
-schemeE d s p e@(AnnCoercion {}) = returnUnboxedAtom d s p e V
-
-schemeE d s p e@(AnnVar v)
- -- See Note [Not-necessarily-lifted join points], step 3.
- | isNNLJoinPoint v = doTailCall d s p (protectNNLJoinPointId v) [AnnVar voidPrimId]
- | isUnliftedType (idType v) = do
- platform <- profilePlatform <$> getProfile
- returnUnboxedAtom d s p e (bcIdArgRep platform v)
- | otherwise = schemeT d s p e
-
-schemeE d s p (AnnLet (AnnNonRec x (_,rhs)) (_,body))
- | (AnnVar v, args_r_to_l) <- splitApp rhs,
- Just data_con <- isDataConWorkId_maybe v,
- dataConRepArity data_con == length args_r_to_l
+schemeE d s p e@(StgApp {}) = schemeT d s p e
+schemeE d s p e@(StgConApp {}) = schemeT d s p e
+schemeE d s p e@(StgOpApp {}) = schemeT d s p e
+schemeE d s p (StgLetNoEscape xlet bnd body)
+ = schemeE d s p (StgLet xlet bnd body)
+schemeE d s p (StgLet _xlet
+ (StgNonRec x (StgRhsCon _cc data_con _cnum _ticks args))
+ body)
= do -- Special case for a non-recursive let whose RHS is a
-- saturated constructor application.
-- Just allocate the constructor and carry on
- alloc_code <- mkConAppCode d s p data_con args_r_to_l
+ alloc_code <- mkConAppCode d s p data_con args
platform <- targetPlatform <$> getDynFlags
let !d2 = d + wordSize platform
body_code <- schemeE d2 s (Map.insert x d2 p) body
return (alloc_code `appOL` body_code)
-
-- General case for let. Generates correct, if inefficient, code in
-- all situations.
-schemeE d s p (AnnLet binds (_,body)) = do
+schemeE d s p (StgLet _ext binds body) = do
platform <- targetPlatform <$> getDynFlags
- let (xs,rhss) = case binds of AnnNonRec x rhs -> ([x],[rhs])
- AnnRec xs_n_rhss -> unzip xs_n_rhss
+ let (xs,rhss) = case binds of StgNonRec x rhs -> ([x],[rhs])
+ StgRec xs_n_rhss -> unzip xs_n_rhss
n_binds = genericLength xs
- fvss = map (fvsToEnv p' . fst) rhss
-
- -- See Note [Not-necessarily-lifted join points], step 2.
- (xs',rhss') = zipWithAndUnzip protectNNLJoinPointBind xs rhss
+ fvss = map (fvsToEnv p') rhss
-- Sizes of free vars
size_w = trunc16W . idSizeW platform
sizes = map (\rhs_fvs -> sum (map size_w rhs_fvs)) fvss
-- the arity of each rhs
- arities = map (genericLength . fst . collect) rhss'
+ arities = map (genericLength . fst . collect) rhss
-- This p', d' defn is safe because all the items being pushed
-- are ptrs, so all have size 1 word. d' and p' reflect the stack
-- after the closures have been allocated in the heap (but not
-- filled in), and pointers to them parked on the stack.
offsets = mkStackOffsets d (genericReplicate n_binds (wordSize platform))
- p' = Map.insertList (zipE xs' offsets) p
+ p' = Map.insertList (zipE xs offsets) p
d' = d + wordsToBytes platform n_binds
zipE = zipEqual "schemeE"
@@ -583,7 +702,7 @@ schemeE d s p (AnnLet binds (_,body)) = do
mkap | arity == 0 = MKAP
| otherwise = MKPAP
build_thunk dd (fv:fvs) size bco off arity = do
- (push_code, pushed_szb) <- pushAtom dd p' (AnnVar fv)
+ (push_code, pushed_szb) <- pushAtom dd p' (StgVarArg fv)
more_push_code <-
build_thunk (dd + pushed_szb) fvs size bco off arity
return (push_code `appOL` more_push_code)
@@ -595,112 +714,35 @@ schemeE d s p (AnnLet binds (_,body)) = do
mkAlloc sz arity = ALLOC_PAP arity sz
is_tick = case binds of
- AnnNonRec id _ -> occNameFS (getOccName id) == tickFS
+ StgNonRec id _ -> occNameFS (getOccName id) == tickFS
_other -> False
- compile_bind d' fvs x rhs size arity off = do
+ compile_bind d' fvs x (rhs::CgStgRhs) size arity off = do
bco <- schemeR fvs (getName x,rhs)
build_thunk d' fvs size bco off arity
compile_binds =
[ compile_bind d' fvs x rhs size arity (trunc16W n)
| (fvs, x, rhs, size, arity, n) <-
- zip6 fvss xs' rhss' sizes arities [n_binds, n_binds-1 .. 1]
+ zip6 fvss xs rhss sizes arities [n_binds, n_binds-1 .. 1]
]
body_code <- schemeE d' s p' body
thunk_codes <- sequence compile_binds
return (alloc_code `appOL` concatOL thunk_codes `appOL` body_code)
--- Introduce a let binding for a ticked case expression. This rule
--- *should* only fire when the expression was not already let-bound
--- (the code gen for let bindings should take care of that). Todo: we
--- call exprFreeVars on a deAnnotated expression, this may not be the
--- best way to calculate the free vars but it seemed like the least
--- intrusive thing to do
-schemeE d s p exp@(AnnTick (Breakpoint _ext _id _fvs) _rhs)
- | isLiftedTypeKind (typeKind ty)
- = do id <- newId ty
- -- Todo: is emptyVarSet correct on the next line?
- let letExp = AnnLet (AnnNonRec id (fvs, exp)) (emptyDVarSet, AnnVar id)
- schemeE d s p letExp
-
- | otherwise
- = do -- If the result type is not definitely lifted, then we must generate
- -- let f = \s . tick<n> e
- -- in f realWorld#
- -- When we stop at the breakpoint, _result will have an unlifted
- -- type and hence won't be bound in the environment, but the
- -- breakpoint will otherwise work fine.
- --
- -- NB (#12007) this /also/ applies for if (ty :: TYPE r), where
- -- r :: RuntimeRep is a variable. This can happen in the
- -- continuations for a pattern-synonym matcher
- -- match = /\(r::RuntimeRep) /\(a::TYPE r).
- -- \(k :: Int -> a) \(v::T).
- -- case v of MkV n -> k n
- -- Here (k n) :: a :: TYPE r, so we don't know if it's lifted
- -- or not; but that should be fine provided we add that void arg.
-
- id <- newId (mkVisFunTyMany realWorldStatePrimTy ty)
- st <- newId realWorldStatePrimTy
- let letExp = AnnLet (AnnNonRec id (fvs, AnnLam st (emptyDVarSet, exp)))
- (emptyDVarSet, (AnnApp (emptyDVarSet, AnnVar id)
- (emptyDVarSet, AnnVar realWorldPrimId)))
- schemeE d s p letExp
-
- where
- exp' = deAnnotate' exp
- fvs = exprFreeVarsDSet exp'
- ty = exprType exp'
+schemeE _d _s _p (StgTick (Breakpoint _ bp_id _) _rhs)
+ = panic ("schemeE: Breakpoint without let binding: " ++
+ show bp_id ++
+ " forgot to run bcPrep?")
-- ignore other kinds of tick
-schemeE d s p (AnnTick _ (_, rhs)) = schemeE d s p rhs
+schemeE d s p (StgTick _ rhs) = schemeE d s p rhs
-- no alts: scrut is guaranteed to diverge
-schemeE d s p (AnnCase (_,scrut) _ _ []) = schemeE d s p scrut
-
--- handle pairs with one void argument (e.g. state token)
-schemeE d s p (AnnCase scrut bndr _ [AnnAlt (DataAlt dc) [bind1, bind2] rhs])
- | isUnboxedTupleDataCon dc
- -- Convert
- -- case .... of x { (# V'd-thing, a #) -> ... }
- -- to
- -- case .... of a { DEFAULT -> ... }
- -- because the return convention for both are identical.
- --
- -- Note that it does not matter losing the void-rep thing from the
- -- envt (it won't be bound now) because we never look such things up.
- , Just res <- case (typePrimRep (idType bind1), typePrimRep (idType bind2)) of
- ([], [_])
- -> Just $ doCase d s p scrut bind2 [AnnAlt DEFAULT [] rhs] (Just bndr)
- ([_], [])
- -> Just $ doCase d s p scrut bind1 [AnnAlt DEFAULT [] rhs] (Just bndr)
- _ -> Nothing
- = res
-
--- handle unit tuples
-schemeE d s p (AnnCase scrut bndr _ [AnnAlt (DataAlt dc) [bind1] rhs])
- | isUnboxedTupleDataCon dc
- , typePrimRep (idType bndr) `lengthAtMost` 1
- = doCase d s p scrut bind1 [AnnAlt DEFAULT [] rhs] (Just bndr)
-
--- handle nullary tuples
-schemeE d s p (AnnCase scrut bndr _ alt@[AnnAlt DEFAULT [] _])
- | isUnboxedTupleType (idType bndr)
- , Just ty <- case typePrimRep (idType bndr) of
- [_] -> Just (unwrapType (idType bndr))
- [] -> Just unboxedUnitTy
- _ -> Nothing
- -- handles any pattern with a single non-void binder; in particular I/O
- -- monad returns (# RealWorld#, a #)
- = doCase d s p scrut (bndr `setIdType` ty) alt (Just bndr)
-
-schemeE d s p (AnnCase scrut bndr _ alts)
- = doCase d s p scrut bndr alts Nothing{-not an unboxed tuple-}
-
-schemeE _ _ _ expr
- = pprPanic "GHC.CoreToByteCode.schemeE: unhandled case"
- (pprCoreExpr (deAnnotate' expr))
+schemeE d s p (StgCase scrut _ _ []) = schemeE d s p scrut
+
+schemeE d s p (StgCase scrut bndr _ alts)
+ = doCase d s p scrut bndr alts
-- Is this Id a not-necessarily-lifted join point?
-- See Note [Not-necessarily-lifted join points], step 1
@@ -708,16 +750,6 @@ isNNLJoinPoint :: Id -> Bool
isNNLJoinPoint x = isJoinId x &&
Just True /= isLiftedType_maybe (idType x)
--- If necessary, modify this Id and body to protect not-necessarily-lifted join points.
--- See Note [Not-necessarily-lifted join points], step 2.
-protectNNLJoinPointBind :: Id -> AnnExpr Id DVarSet -> (Id, AnnExpr Id DVarSet)
-protectNNLJoinPointBind x rhs@(fvs, _)
- | isNNLJoinPoint x
- = (protectNNLJoinPointId x, (fvs, AnnLam voidArgId rhs))
-
- | otherwise
- = (x, rhs)
-
-- Update an Id's type to take a Void# argument.
-- Precondition: the Id is a not-necessarily-lifted join point.
-- See Note [Not-necessarily-lifted join points]
@@ -763,7 +795,7 @@ isUnliftedType check in the AnnVar case of schemeE.) Here is the strategy:
type to tack on a `(# #) ->`.
Note that functions are never levity-polymorphic, so this transformation
changes an NNLJP to a non-levity-polymorphic join point. This is done
- in protectNNLJoinPointBind, called from the AnnLet case of schemeE.
+ in bcPrepSingleBind.
3. At an occurrence of an NNLJP, add an application to void# (called voidPrimId),
being careful to note the new type of the NNLJP. This is done in the AnnVar
@@ -805,10 +837,8 @@ Right Fix is to take advantage of join points as goto-labels.
--
-- 1. The fn denotes a ccall. Defer to generateCCall.
--
--- 2. (Another nasty hack). Spot (# a::V, b #) and treat
--- it simply as b -- since the representations are identical
--- (the V takes up zero stack space). Also, spot
--- (# b #) and treat it as b.
+-- 2. An unboxed tuple: push the components on the top of
+-- the stack and return.
--
-- 3. Application of a constructor, by defn saturated.
-- Split the args into ptrs and non-ptrs, and push the nonptrs,
@@ -820,59 +850,45 @@ Right Fix is to take advantage of join points as goto-labels.
schemeT :: StackDepth -- Stack depth
-> Sequel -- Sequel depth
-> BCEnv -- stack env
- -> AnnExpr' Id DVarSet
+ -> CgStgExpr
-> BcM BCInstrList
-schemeT d s p app
-
-- Case 0
+schemeT d s p app
| Just (arg, constr_names) <- maybe_is_tagToEnum_call app
= implement_tagToId d s p arg constr_names
-- Case 1
- | Just (CCall ccall_spec) <- isFCallId_maybe fn
+schemeT d s p (StgOpApp (StgFCallOp (CCall ccall_spec) _ty) args result_ty)
= if isSupportedCConv ccall_spec
- then generateCCall d s p ccall_spec fn args_r_to_l
+ then generateCCall d s p ccall_spec result_ty (reverse args)
else unsupportedCConvException
+schemeT d s p (StgOpApp (StgPrimOp op) args _ty)
+ = doTailCall d s p (primOpId op) (reverse args)
- -- Case 2: Constructor application
- | Just con <- maybe_saturated_dcon
- , isUnboxedTupleDataCon con
- = do
- platform <- profilePlatform <$> getProfile
- case args_r_to_l of
- [arg1,arg2] | isVAtom platform arg1 ->
- unboxedTupleReturn d s p arg2
- [arg1,arg2] | isVAtom platform arg2 ->
- unboxedTupleReturn d s p arg1
- _other -> multiValException
+schemeT _d _s _p (StgOpApp StgPrimCallOp{} _args _ty)
+ = unsupportedCConvException
+
+ -- Case 2: Unboxed tuple
+schemeT d s p (StgConApp con _ext args _tys)
+ | isUnboxedTupleDataCon con || isUnboxedSumDataCon con
+ = returnUnboxedTuple d s p args
-- Case 3: Ordinary data constructor
- | Just con <- maybe_saturated_dcon
- = do alloc_con <- mkConAppCode d s p con args_r_to_l
+ | otherwise
+ = do alloc_con <- mkConAppCode d s p con args
platform <- profilePlatform <$> getProfile
return (alloc_con `appOL`
mkSlideW 1 (bytesToWords platform $ d - s) `snocOL`
ENTER)
-- Case 4: Tail call of function
- | otherwise
- = doTailCall d s p fn args_r_to_l
-
- where
- -- Extract the args (R->L) and fn
- -- The function will necessarily be a variable,
- -- because we are compiling a tail call
- (AnnVar fn, args_r_to_l) = splitApp app
-
- -- Only consider this to be a constructor application iff it is
- -- saturated. Otherwise, we'll call the constructor wrapper.
- n_args = length args_r_to_l
- maybe_saturated_dcon
- = case isDataConWorkId_maybe fn of
- Just con | dataConRepArity con == n_args -> Just con
- _ -> Nothing
+schemeT d s p (StgApp fn args)
+ = doTailCall d s p fn (reverse args)
+
+schemeT _ _ _ e = pprPanic "GHC.StgToByteCode.schemeT"
+ (pprStgExpr shortStgPprOpts e)
-- -----------------------------------------------------------------------------
-- Generate code to build a constructor application,
@@ -883,26 +899,17 @@ mkConAppCode
-> Sequel
-> BCEnv
-> DataCon -- The data constructor
- -> [AnnExpr' Id DVarSet] -- Args, in *reverse* order
+ -> [StgArg] -- Args, in *reverse* order
-> BcM BCInstrList
-mkConAppCode _ _ _ con [] -- Nullary constructor
- = ASSERT( isNullaryRepDataCon con )
- return (unitOL (PUSH_G (getName (dataConWorkId con))))
- -- Instead of doing a PACK, which would allocate a fresh
- -- copy of this constructor, use the single shared version.
-
-mkConAppCode orig_d _ p con args_r_to_l =
- ASSERT( args_r_to_l `lengthIs` dataConRepArity con ) app_code
+mkConAppCode orig_d _ p con args = app_code
where
app_code = do
profile <- getProfile
let platform = profilePlatform profile
- -- The args are initially in reverse order, but mkVirtHeapOffsets
- -- expects them to be left-to-right.
- let non_voids =
+ non_voids =
[ NonVoid (prim_rep, arg)
- | arg <- reverse args_r_to_l
+ | arg <- args
, let prim_rep = atomPrimRep arg
, not (isVoidRep prim_rep)
]
@@ -922,20 +929,6 @@ mkConAppCode orig_d _ p con args_r_to_l =
-- Push on the stack in the reverse order.
do_pushery orig_d (reverse args_offsets)
-
--- -----------------------------------------------------------------------------
--- Returning an unboxed tuple with one non-void component (the only
--- case we can handle).
---
--- Remember, we don't want to *evaluate* the component that is being
--- returned, even if it is a pointed type. We always just return.
-
-unboxedTupleReturn
- :: StackDepth -> Sequel -> BCEnv -> AnnExpr' Id DVarSet -> BcM BCInstrList
-unboxedTupleReturn d s p arg = do
- platform <- profilePlatform <$> getProfile
- returnUnboxedAtom d s p arg (atomRep platform arg)
-
-- -----------------------------------------------------------------------------
-- Generate code for a tail-call
@@ -944,7 +937,7 @@ doTailCall
-> Sequel
-> BCEnv
-> Id
- -> [AnnExpr' Id DVarSet]
+ -> [StgArg]
-> BcM BCInstrList
doTailCall init_d s p fn args = do
platform <- profilePlatform <$> getProfile
@@ -952,7 +945,7 @@ doTailCall init_d s p fn args = do
where
do_pushes !d [] reps = do
ASSERT( null reps ) return ()
- (push_fn, sz) <- pushAtom d p (AnnVar fn)
+ (push_fn, sz) <- pushAtom d p (StgVarArg fn)
platform <- profilePlatform <$> getProfile
ASSERT( sz == wordSize platform ) return ()
let slide = mkSlideB platform (d - init_d + wordSize platform) (init_d - s)
@@ -997,7 +990,7 @@ findPushSeq (D: rest)
findPushSeq (L: rest)
= (PUSH_APPLY_L, 1, rest)
findPushSeq _
- = panic "GHC.CoreToByteCode.findPushSeq"
+ = panic "GHC.StgToByteCode.findPushSeq"
-- -----------------------------------------------------------------------------
-- Case expressions
@@ -1006,23 +999,31 @@ doCase
:: StackDepth
-> Sequel
-> BCEnv
- -> AnnExpr Id DVarSet
+ -> CgStgExpr
-> Id
- -> [AnnAlt Id DVarSet]
- -> Maybe Id -- Just x <=> is an unboxed tuple case with scrut binder,
- -- don't enter the result
+ -> [CgStgAlt]
-> BcM BCInstrList
-doCase d s p (_,scrut) bndr alts is_unboxed_tuple
- | typePrimRep (idType bndr) `lengthExceeds` 1
- = multiValException
-
- | otherwise
+doCase d s p scrut bndr alts
= do
profile <- getProfile
hsc_env <- getHscEnv
let
platform = profilePlatform profile
+ -- Are we dealing with an unboxed tuple with a tuple return frame?
+ --
+ -- 'Simple' tuples with at most one non-void component,
+ -- like (# Word# #) or (# Int#, State# RealWorld# #) do not have a
+ -- tuple return frame. This is because (# foo #) and (# foo, Void# #)
+ -- have the same runtime rep. We have more efficient specialized
+ -- return frames for the situations with one non-void element.
+
+ ubx_tuple_frame =
+ (isUnboxedTupleType bndr_ty || isUnboxedSumType bndr_ty) &&
+ length non_void_arg_reps > 1
+
+ non_void_arg_reps = non_void (typeArgReps platform bndr_ty)
+
profiling
| Just interp <- hsc_interp hsc_env
= interpreterProfiled interp
@@ -1033,53 +1034,84 @@ doCase d s p (_,scrut) bndr alts is_unboxed_tuple
-- When an alt is entered, it assumes the returned value is
-- on top of the itbl.
ret_frame_size_b :: StackDepth
- ret_frame_size_b = 2 * wordSize platform
+ ret_frame_size_b | ubx_tuple_frame =
+ (if profiling then 5 else 4) * wordSize platform
+ | otherwise = 2 * wordSize platform
- -- The extra frame we push to save/restore the CCCS when profiling
- save_ccs_size_b | profiling = 2 * wordSize platform
+ -- The stack space used to save/restore the CCCS when profiling
+ save_ccs_size_b | profiling &&
+ not ubx_tuple_frame = 2 * wordSize platform
| otherwise = 0
-- An unlifted value gets an extra info table pushed on top
-- when it is returned.
unlifted_itbl_size_b :: StackDepth
- unlifted_itbl_size_b | isAlgCase = 0
- | otherwise = wordSize platform
+ unlifted_itbl_size_b | isAlgCase = 0
+ | ubx_tuple_frame = 3 * wordSize platform
+ | otherwise = wordSize platform
+
+ (bndr_size, tuple_info, args_offsets)
+ | ubx_tuple_frame =
+ let bndr_ty = primRepCmmType platform
+ bndr_reps = filter (not.isVoidRep) (bcIdPrimReps bndr)
+ (tuple_info, args_offsets) =
+ layoutTuple profile 0 bndr_ty bndr_reps
+ in ( wordsToBytes platform (tupleSize tuple_info)
+ , tuple_info
+ , args_offsets
+ )
+ | otherwise = ( wordsToBytes platform (idSizeW platform bndr)
+ , voidTupleInfo
+ , []
+ )
-- depth of stack after the return value has been pushed
d_bndr =
- d + ret_frame_size_b + wordsToBytes platform (idSizeW platform bndr)
+ d + ret_frame_size_b + bndr_size
-- depth of stack after the extra info table for an unboxed return
-- has been pushed, if any. This is the stack depth at the
-- continuation.
- d_alts = d_bndr + unlifted_itbl_size_b
+ d_alts = d + ret_frame_size_b + bndr_size + unlifted_itbl_size_b
-- Env in which to compile the alts, not including
-- any vars bound by the alts themselves
- p_alts0 = Map.insert bndr d_bndr p
-
- p_alts = case is_unboxed_tuple of
- Just ubx_bndr -> Map.insert ubx_bndr d_bndr p_alts0
- Nothing -> p_alts0
+ p_alts = Map.insert bndr d_bndr p
bndr_ty = idType bndr
- isAlgCase = not (isUnliftedType bndr_ty) && isNothing is_unboxed_tuple
+ isAlgCase = not (isUnliftedType bndr_ty)
-- given an alt, return a discr and code for it.
- codeAlt (AnnAlt DEFAULT _ (_,rhs))
+ codeAlt (DEFAULT, _, rhs)
= do rhs_code <- schemeE d_alts s p_alts rhs
return (NoDiscr, rhs_code)
- codeAlt alt@(AnnAlt _ bndrs (_,rhs))
+ codeAlt alt@(_, bndrs, rhs)
-- primitive or nullary constructor alt: no need to UNPACK
| null real_bndrs = do
rhs_code <- schemeE d_alts s p_alts rhs
return (my_discr alt, rhs_code)
- -- If an alt attempts to match on an unboxed tuple or sum, we must
- -- bail out, as the bytecode compiler can't handle them.
- -- (See #14608.)
- | any (\bndr -> typePrimRep (idType bndr) `lengthExceeds` 1) bndrs
- = multiValException
+ | isUnboxedTupleType bndr_ty || isUnboxedSumType bndr_ty =
+ let bndr_ty = primRepCmmType platform . bcIdPrimRep
+ tuple_start = d_bndr
+ (tuple_info, args_offsets) =
+ layoutTuple profile
+ 0
+ bndr_ty
+ bndrs
+
+ stack_bot = d_alts
+
+ p' = Map.insertList
+ [ (arg, tuple_start -
+ wordsToBytes platform (tupleSize tuple_info) +
+ offset)
+ | (arg, offset) <- args_offsets
+ , not (isVoidRep $ bcIdPrimRep arg)]
+ p_alts
+ in do
+ rhs_code <- schemeE stack_bot s p' rhs
+ return (NoDiscr, rhs_code)
-- algebraic alt with some binders
| otherwise =
let (tot_wds, _ptrs_wds, args_offsets) =
@@ -1104,24 +1136,24 @@ doCase d s p (_,scrut) bndr alts is_unboxed_tuple
where
real_bndrs = filterOut isTyVar bndrs
- my_discr (AnnAlt DEFAULT _ _) = NoDiscr {-shouldn't really happen-}
- my_discr (AnnAlt (DataAlt dc) _ _)
+ my_discr (DEFAULT, _, _) = NoDiscr {-shouldn't really happen-}
+ my_discr (DataAlt dc, _, _)
| isUnboxedTupleDataCon dc || isUnboxedSumDataCon dc
- = multiValException
+ = NoDiscr
| otherwise
= DiscrP (fromIntegral (dataConTag dc - fIRST_TAG))
- my_discr (AnnAlt (LitAlt l) _ _)
+ my_discr (LitAlt l, _, _)
= case l of LitNumber LitNumInt i -> DiscrI (fromInteger i)
LitNumber LitNumWord w -> DiscrW (fromInteger w)
LitFloat r -> DiscrF (fromRational r)
LitDouble r -> DiscrD (fromRational r)
LitChar i -> DiscrI (ord i)
- _ -> pprPanic "schemeE(AnnCase).my_discr" (ppr l)
+ _ -> pprPanic "schemeE(StgCase).my_discr" (ppr l)
maybe_ncons
| not isAlgCase = Nothing
| otherwise
- = case [dc | AnnAlt (DataAlt dc) _ _ <- alts] of
+ = case [dc | (DataAlt dc, _, _) <- alts] of
[] -> Nothing
(dc:_) -> Just (tyConFamilySize (dataConTyCon dc))
@@ -1139,20 +1171,36 @@ doCase d s p (_,scrut) bndr alts is_unboxed_tuple
-- really want a bitmap up to depth (d-s). This affects compilation of
-- case-of-case expressions, which is the only time we can be compiling a
-- case expression with s /= 0.
- bitmap_size = trunc16W $ bytesToWords platform (d - s)
+
+ -- unboxed tuples get two more words, the second is a pointer (tuple_bco)
+ (extra_pointers, extra_slots)
+ | ubx_tuple_frame && profiling = ([1], 3) -- tuple_info, tuple_BCO, CCCS
+ | ubx_tuple_frame = ([1], 2) -- tuple_info, tuple_BCO
+ | otherwise = ([], 0)
+
+ bitmap_size = trunc16W $ fromIntegral extra_slots +
+ bytesToWords platform (d - s)
+
bitmap_size' :: Int
bitmap_size' = fromIntegral bitmap_size
- bitmap = intsToReverseBitmap platform bitmap_size'{-size-}
- (sort (filter (< bitmap_size') rel_slots))
+
+
+ pointers =
+ extra_pointers ++
+ sort (filter (< bitmap_size') (map (+extra_slots) rel_slots))
where
binds = Map.toList p
-- NB: unboxed tuple cases bind the scrut binder to the same offset
-- as one of the alt binders, so we have to remove any duplicates here:
rel_slots = nub $ map fromIntegral $ concatMap spread binds
- spread (id, offset) | isFollowableArg (bcIdArgRep platform id) = [ rel_offset ]
- | otherwise = []
+ spread (id, offset) | isUnboxedTupleType (idType id) ||
+ isUnboxedSumType (idType id) = []
+ | isFollowableArg (bcIdArgRep platform id) = [ rel_offset ]
+ | otherwise = []
where rel_offset = trunc16W $ bytesToWords platform (d - offset)
+ bitmap = intsToReverseBitmap platform bitmap_size'{-size-} pointers
+
alt_stuff <- mapM codeAlt alts
alt_final <- mkMultiBranch maybe_ncons alt_stuff
@@ -1160,20 +1208,218 @@ doCase d s p (_,scrut) bndr alts is_unboxed_tuple
alt_bco_name = getName bndr
alt_bco = mkProtoBCO platform alt_bco_name alt_final (Left alts)
0{-no arity-} bitmap_size bitmap True{-is alts-}
--- trace ("case: bndr = " ++ showSDocDebug (ppr bndr) ++ "\ndepth = " ++ show d ++ "\nenv = \n" ++ showSDocDebug (ppBCEnv p) ++
--- "\n bitmap = " ++ show bitmap) $ do
-
scrut_code <- schemeE (d + ret_frame_size_b + save_ccs_size_b)
(d + ret_frame_size_b + save_ccs_size_b)
p scrut
alt_bco' <- emitBc alt_bco
- let push_alts
- | isAlgCase = PUSH_ALTS alt_bco'
- | otherwise = PUSH_ALTS_UNLIFTED alt_bco' (typeArgRep platform bndr_ty)
- return (push_alts `consOL` scrut_code)
+ if ubx_tuple_frame
+ then do
+ let args_ptrs =
+ map (\(rep, off) -> (isFollowableArg (toArgRep platform rep), off))
+ args_offsets
+ tuple_bco <- emitBc (tupleBCO platform tuple_info args_ptrs)
+ return (PUSH_ALTS_TUPLE alt_bco' tuple_info tuple_bco
+ `consOL` scrut_code)
+ else let push_alts
+ | isAlgCase
+ = PUSH_ALTS alt_bco'
+ | otherwise
+ = let unlifted_rep =
+ case non_void_arg_reps of
+ [] -> V
+ [rep] -> rep
+ _ -> panic "schemeE(StgCase).push_alts"
+ in PUSH_ALTS_UNLIFTED alt_bco' unlifted_rep
+ in return (push_alts `consOL` scrut_code)
-- -----------------------------------------------------------------------------
+-- Deal with tuples
+
+-- The native calling convention uses registers for tuples, but in the
+-- bytecode interpreter, all values live on the stack.
+
+layoutTuple :: Profile
+ -> ByteOff
+ -> (a -> CmmType)
+ -> [a]
+ -> ( TupleInfo -- See Note [GHCi TupleInfo]
+ , [(a, ByteOff)] -- argument, offset on stack
+ )
+layoutTuple profile start_off arg_ty reps =
+ let platform = profilePlatform profile
+ (orig_stk_bytes, pos) = assignArgumentsPos profile
+ 0
+ NativeReturn
+ arg_ty
+ reps
+
+ -- keep the stack parameters in the same place
+ orig_stk_params = [(x, fromIntegral off) | (x, StackParam off) <- pos]
+
+ -- sort the register parameters by register and add them to the stack
+ (regs, reg_params)
+ = unzip $ sortBy (comparing fst)
+ [(reg, x) | (x, RegisterParam reg) <- pos]
+
+ (new_stk_bytes, new_stk_params) = assignStack platform
+ orig_stk_bytes
+ arg_ty
+ reg_params
+
+ -- make live register bitmaps
+ bmp_reg r ~(v, f, d, l)
+ = case r of VanillaReg n _ -> (a v n, f, d, l )
+ FloatReg n -> (v, a f n, d, l )
+ DoubleReg n -> (v, f, a d n, l )
+ LongReg n -> (v, f, d, a l n)
+ _ ->
+ pprPanic "GHC.StgToByteCode.layoutTuple unsupported register type"
+ (ppr r)
+ where a bmp n = bmp .|. (1 `shiftL` (n-1))
+
+ (vanilla_regs, float_regs, double_regs, long_regs)
+ = foldr bmp_reg (0, 0, 0, 0) regs
+
+ get_byte_off (x, StackParam y) = (x, fromIntegral y)
+ get_byte_off _ =
+ panic "GHC.StgToByteCode.layoutTuple get_byte_off"
+
+ in ( TupleInfo
+ { tupleSize = bytesToWords platform (ByteOff new_stk_bytes)
+ , tupleVanillaRegs = vanilla_regs
+ , tupleLongRegs = long_regs
+ , tupleFloatRegs = float_regs
+ , tupleDoubleRegs = double_regs
+ , tupleNativeStackSize = bytesToWords platform
+ (ByteOff orig_stk_bytes)
+ }
+ , sortBy (comparing snd) $
+ map (\(x, o) -> (x, o + start_off))
+ (orig_stk_params ++ map get_byte_off new_stk_params)
+ )
+
+{- Note [unboxed tuple bytecodes and tuple_BCO]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ We have the bytecode instructions RETURN_TUPLE and PUSH_ALTS_TUPLE to
+ return and receive arbitrary unboxed tuples, respectively. These
+ instructions use the helper data tuple_BCO and tuple_info.
+
+ The helper data is used to convert tuples between GHC's native calling
+ convention (object code), which uses stack and registers, and the bytecode
+ calling convention, which only uses the stack. See Note [GHCi TupleInfo]
+ for more details.
+
+
+ Returning a tuple
+ =================
+
+ Bytecode that returns a tuple first pushes all the tuple fields followed
+ by the appropriate tuple_info and tuple_BCO onto the stack. It then
+ executes the RETURN_TUPLE instruction, which causes the interpreter
+ to push stg_ret_t_info to the top of the stack. The stack (growing down)
+ then looks as follows:
+
+ ...
+ next_frame
+ tuple_field_1
+ tuple_field_2
+ ...
+ tuple_field_n
+ tuple_info
+ tuple_BCO
+ stg_ret_t_info <- Sp
+
+ If next_frame is bytecode, the interpreter will start executing it. If
+ it's object code, the interpreter jumps back to the scheduler, which in
+ turn jumps to stg_ret_t. stg_ret_t converts the tuple to the native
+ calling convention using the description in tuple_info, and then jumps
+ to next_frame.
+
+
+ Receiving a tuple
+ =================
+
+ Bytecode that receives a tuple uses the PUSH_ALTS_TUPLE instruction to
+ push a continuation, followed by jumping to the code that produces the
+ tuple. The PUSH_ALTS_TUPLE instruction contains three pieces of data:
+
+ * cont_BCO: the continuation that receives the tuple
+ * tuple_info: see below
+ * tuple_BCO: see below
+
+ The interpreter pushes these onto the stack when the PUSH_ALTS_TUPLE
+ instruction is executed, followed by stg_ctoi_tN_info, with N depending
+ on the number of stack words used by the tuple in the GHC native calling
+ convention. N is derived from tuple_info.
+
+ For example if we expect a tuple with three words on the stack, the stack
+ looks as follows after PUSH_ALTS_TUPLE:
+
+ ...
+ next_frame
+ cont_free_var_1
+ cont_free_var_2
+ ...
+ cont_free_var_n
+ tuple_info
+ tuple_BCO
+ cont_BCO
+ stg_ctoi_t3_info <- Sp
+
+ If the tuple is returned by object code, stg_ctoi_t3 will deal with
+ adjusting the stack pointer and converting the tuple to the bytecode
+ calling convention. See Note [GHCi unboxed tuples stack spills] for more
+ details.
+
+
+ The tuple_BCO
+ =============
+
+ The tuple_BCO is a helper bytecode object. Its main purpose is describing
+ the contents of the stack frame containing the tuple for the storage
+ manager. It contains only instructions to immediately return the tuple
+ that is already on the stack.
+
+
+ The tuple_info word
+ ===================
+
+ The tuple_info word describes the stack and STG register (e.g. R1..R6,
+ D1..D6) usage for the tuple. tuple_info contains enough information to
+ convert the tuple between the stack-only bytecode and stack+registers
+ GHC native calling conventions.
+
+ See Note [GHCi tuple layout] for more details of how the data is packed
+ in a single word.
+
+ -}
+
+tupleBCO :: Platform -> TupleInfo -> [(Bool, ByteOff)] -> [FFIInfo] -> ProtoBCO Name
+tupleBCO platform info pointers =
+ mkProtoBCO platform invented_name body_code (Left [])
+ 0{-no arity-} bitmap_size bitmap False{-is alts-}
+
+ where
+ {-
+ The tuple BCO is never referred to by name, so we can get away
+ with using a fake name here. We will need to change this if we want
+ to save some memory by sharing the BCO between places that have
+ the same tuple shape
+ -}
+ invented_name = mkSystemVarName (mkPseudoUniqueE 0) (fsLit "tuple")
+
+ -- the first word in the frame is the tuple_info word,
+ -- which is not a pointer
+ bitmap_size = trunc16W $ 1 + tupleSize info
+ bitmap = intsToReverseBitmap platform (fromIntegral bitmap_size) $
+ map ((+1) . fromIntegral . bytesToWords platform . snd)
+ (filter fst pointers)
+ body_code = mkSlideW 0 1 -- pop frame header
+ `snocOL` RETURN_TUPLE -- and add it again
+
+-- -----------------------------------------------------------------------------
-- Deal with a CCall.
-- Taggedly push the args onto the stack R->L,
@@ -1187,10 +1433,10 @@ generateCCall
-> Sequel
-> BCEnv
-> CCallSpec -- where to call
- -> Id -- of target, for type info
- -> [AnnExpr' Id DVarSet] -- args (atoms)
+ -> Type
+ -> [StgArg] -- args (atoms)
-> BcM BCInstrList
-generateCCall d0 s p (CCallSpec target cconv safety) fn args_r_to_l
+generateCCall d0 s p (CCallSpec target cconv safety) result_ty args_r_to_l
= do
profile <- getProfile
@@ -1200,56 +1446,40 @@ generateCCall d0 s p (CCallSpec target cconv safety) fn args_r_to_l
addr_size_b :: ByteOff
addr_size_b = wordSize platform
+ arrayish_rep_hdr_size :: TyCon -> Maybe Int
+ arrayish_rep_hdr_size t
+ | t == arrayPrimTyCon || t == mutableArrayPrimTyCon
+ = Just (arrPtrsHdrSize profile)
+ | t == smallArrayPrimTyCon || t == smallMutableArrayPrimTyCon
+ = Just (smallArrPtrsHdrSize profile)
+ | t == byteArrayPrimTyCon || t == mutableByteArrayPrimTyCon
+ = Just (arrWordsHdrSize profile)
+ | otherwise
+ = Nothing
+
-- Get the args on the stack, with tags and suitably
-- dereferenced for the CCall. For each arg, return the
-- depth to the first word of the bits for that arg, and the
-- ArgRep of what was actually pushed.
pargs
- :: ByteOff -> [AnnExpr' Id DVarSet] -> BcM [(BCInstrList, PrimRep)]
+ :: ByteOff -> [StgArg] -> BcM [(BCInstrList, PrimRep)]
pargs _ [] = return []
- pargs d (a:az)
- = let arg_ty = unwrapType (exprType (deAnnotate' a))
-
- in case tyConAppTyCon_maybe arg_ty of
- -- Don't push the FO; instead push the Addr# it
- -- contains.
- Just t
- | t == arrayPrimTyCon || t == mutableArrayPrimTyCon
- -> do rest <- pargs (d + addr_size_b) az
- code <- parg_ArrayishRep (fromIntegral (arrPtrsHdrSize profile)) d p a
- return ((code,AddrRep):rest)
-
- | t == smallArrayPrimTyCon || t == smallMutableArrayPrimTyCon
- -> do rest <- pargs (d + addr_size_b) az
- code <- parg_ArrayishRep (fromIntegral (smallArrPtrsHdrSize profile)) d p a
- return ((code,AddrRep):rest)
-
- | t == byteArrayPrimTyCon || t == mutableByteArrayPrimTyCon
- -> do rest <- pargs (d + addr_size_b) az
- code <- parg_ArrayishRep (fromIntegral (arrWordsHdrSize profile)) d p a
- return ((code,AddrRep):rest)
-
- -- Default case: push taggedly, but otherwise intact.
- _
- -> do (code_a, sz_a) <- pushAtom d p a
- rest <- pargs (d + sz_a) az
- return ((code_a, atomPrimRep a) : rest)
-
- -- Do magic for Ptr/Byte arrays. Push a ptr to the array on
- -- the stack but then advance it over the headers, so as to
- -- point to the payload.
- parg_ArrayishRep
- :: Word16
- -> StackDepth
- -> BCEnv
- -> AnnExpr' Id DVarSet
- -> BcM BCInstrList
- parg_ArrayishRep hdrSize d p a
- = do (push_fo, _) <- pushAtom d p a
+ pargs d (aa@(StgVarArg a):az)
+ | Just t <- tyConAppTyCon_maybe (idType a)
+ , Just hdr_sz <- arrayish_rep_hdr_size t
+ -- Do magic for Ptr/Byte arrays. Push a ptr to the array on
+ -- the stack but then advance it over the headers, so as to
+ -- point to the payload.
+ = do rest <- pargs (d + addr_size_b) az
+ (push_fo, _) <- pushAtom d p aa
-- The ptr points at the header. Advance it over the
-- header and then pretend this is an Addr#.
- return (push_fo `snocOL` SWIZZLE 0 hdrSize)
+ let code = push_fo `snocOL` SWIZZLE 0 (fromIntegral hdr_sz)
+ return ((code, AddrRep) : rest)
+ pargs d (aa:az) = do (code_a, sz_a) <- pushAtom d p aa
+ rest <- pargs (d + sz_a) az
+ return ((code_a, atomPrimRep aa) : rest)
code_n_reps <- pargs d0 args_r_to_l
let
@@ -1260,7 +1490,7 @@ generateCCall d0 s p (CCallSpec target cconv safety) fn args_r_to_l
!d_after_args = d0 + wordsToBytes platform a_reps_sizeW
a_reps_pushed_RAW
| null a_reps_pushed_r_to_l || not (isVoidRep (head a_reps_pushed_r_to_l))
- = panic "GHC.CoreToByteCode.generateCCall: missing or invalid World token?"
+ = panic "GHC.StgToByteCode.generateCCall: missing or invalid World token?"
| otherwise
= reverse (tail a_reps_pushed_r_to_l)
@@ -1270,7 +1500,7 @@ generateCCall d0 s p (CCallSpec target cconv safety) fn args_r_to_l
-- Get the result rep.
(returns_void, r_rep)
- = case maybe_getCCallReturnRep (idType fn) of
+ = case maybe_getCCallReturnRep result_ty of
Nothing -> (True, VoidRep)
Just rr -> (False, rr)
{-
@@ -1332,7 +1562,7 @@ generateCCall d0 s p (CCallSpec target cconv safety) fn args_r_to_l
a_reps -- | trace (showSDoc (ppr a_reps_pushed_RAW)) False = error "???"
| is_static = a_reps_pushed_RAW
| otherwise = if null a_reps_pushed_RAW
- then panic "GHC.CoreToByteCode.generateCCall: dyn with no args"
+ then panic "GHC.StgToByteCode.generateCCall: dyn with no args"
else tail a_reps_pushed_RAW
-- push the Addr#
@@ -1362,7 +1592,7 @@ generateCCall d0 s p (CCallSpec target cconv safety) fn args_r_to_l
conv = case cconv of
CCallConv -> FFICCall
StdCallConv -> FFIStdCall
- _ -> panic "GHC.CoreToByteCode: unexpected calling convention"
+ _ -> panic "GHC.StgToByteCode: unexpected calling convention"
-- the only difference in libffi mode is that we prepare a cif
-- describing the call type by calling libffi, and we attach the
@@ -1472,14 +1702,10 @@ maybe_getCCallReturnRep fn_ty
-- valid return value placeholder on the stack
_ -> blargh
-maybe_is_tagToEnum_call :: AnnExpr' Id DVarSet -> Maybe (AnnExpr' Id DVarSet, [Name])
+maybe_is_tagToEnum_call :: CgStgExpr -> Maybe (Id, [Name])
-- Detect and extract relevant info for the tagToEnum kludge.
-maybe_is_tagToEnum_call app
- | AnnApp (_, AnnApp (_, AnnVar v) (_, AnnType t)) arg <- app
- , Just TagToEnumOp <- isPrimOpId_maybe v
- = Just (snd arg, extract_constr_Names t)
- | otherwise
- = Nothing
+maybe_is_tagToEnum_call (StgOpApp (StgPrimOp TagToEnumOp) [StgVarArg v] t)
+ = Just (v, extract_constr_Names t)
where
extract_constr_Names ty
| rep_ty <- unwrapType ty
@@ -1490,6 +1716,7 @@ maybe_is_tagToEnum_call app
-- the DataCon. See "GHC.Core.DataCon" for details.
| otherwise
= pprPanic "maybe_is_tagToEnum_call.extract_constr_Ids" (ppr ty)
+maybe_is_tagToEnum_call _ = Nothing
{- -----------------------------------------------------------------------------
Note [Implementing tagToEnum#]
@@ -1533,13 +1760,13 @@ implement_tagToId
:: StackDepth
-> Sequel
-> BCEnv
- -> AnnExpr' Id DVarSet
+ -> Id
-> [Name]
-> BcM BCInstrList
-- See Note [Implementing tagToEnum#]
implement_tagToId d s p arg names
= ASSERT( notNull names )
- do (push_arg, arg_bytes) <- pushAtom d p arg
+ do (push_arg, arg_bytes) <- pushAtom d p (StgVarArg arg)
labels <- getLabelsBc (genericLength names)
label_fail <- getLabelBc
label_exit <- getLabelBc
@@ -1582,21 +1809,12 @@ implement_tagToId d s p arg names
-- depth 6 stack has valid words 0 .. 5.
pushAtom
- :: StackDepth -> BCEnv -> AnnExpr' Id DVarSet -> BcM (BCInstrList, ByteOff)
-pushAtom d p e
- | Just e' <- bcView e
- = pushAtom d p e'
-
-pushAtom _ _ (AnnCoercion {}) -- Coercions are zero-width things,
- = return (nilOL, 0) -- treated just like a variable V
+ :: StackDepth -> BCEnv -> StgArg -> BcM (BCInstrList, ByteOff)
-- See Note [Empty case alternatives] in GHC.Core
-- and Note [Bottoming expressions] in GHC.Core.Utils:
-- The scrutinee of an empty case evaluates to bottom
-pushAtom d p (AnnCase (_, a) _ _ []) -- trac #12128
- = pushAtom d p a
-
-pushAtom d p (AnnVar var)
+pushAtom d p (StgVarArg var)
| [] <- typePrimRep (idType var)
= return (nilOL, 0)
@@ -1635,15 +1853,14 @@ pushAtom d p (AnnVar var)
= do topStrings <- getTopStrings
platform <- targetPlatform <$> getDynFlags
case lookupVarEnv topStrings var of
- Just ptr -> pushAtom d p $ AnnLit $ mkLitWord platform $
+ Just ptr -> pushAtom d p $ StgLitArg $ mkLitWord platform $
fromIntegral $ ptrToWordPtr $ fromRemotePtr ptr
Nothing -> do
let sz = idSizeCon platform var
MASSERT( sz == wordSize platform )
return (unitOL (PUSH_G (getName var)), sz)
-
-pushAtom _ _ (AnnLit lit) = do
+pushAtom _ _ (StgLitArg lit) = do
platform <- targetPlatform <$> getDynFlags
let code :: PrimRep -> BcM (BCInstrList, ByteOff)
code rep =
@@ -1684,21 +1901,15 @@ pushAtom _ _ (AnnLit lit) = do
LitNumInteger -> panic "pushAtom: LitInteger"
LitNumNatural -> panic "pushAtom: LitNatural"
-pushAtom _ _ expr
- = pprPanic "GHC.CoreToByteCode.pushAtom"
- (pprCoreExpr (deAnnotate' expr))
-
-
-- | Push an atom for constructor (i.e., PACK instruction) onto the stack.
-- This is slightly different to @pushAtom@ due to the fact that we allow
-- packing constructor fields. See also @mkConAppCode@ and @pushPadding@.
pushConstrAtom
- :: StackDepth -> BCEnv -> AnnExpr' Id DVarSet -> BcM (BCInstrList, ByteOff)
-
-pushConstrAtom _ _ (AnnLit lit@(LitFloat _)) =
+ :: StackDepth -> BCEnv -> StgArg -> BcM (BCInstrList, ByteOff)
+pushConstrAtom _ _ (StgLitArg lit@(LitFloat _)) =
return (unitOL (PUSH_UBX32 lit), 4)
-pushConstrAtom d p (AnnVar v)
+pushConstrAtom d p va@(StgVarArg v)
| Just d_v <- lookupBCEnv_maybe v p = do -- v is a local variable
platform <- targetPlatform <$> getDynFlags
let !szb = idSizeCon platform v
@@ -1709,7 +1920,7 @@ pushConstrAtom d p (AnnVar v)
1 -> done PUSH8
2 -> done PUSH16
4 -> done PUSH32
- _ -> pushAtom d p (AnnVar v)
+ _ -> pushAtom d p va
pushConstrAtom d p expr = pushAtom d p expr
@@ -1869,7 +2080,14 @@ idSizeW :: Platform -> Id -> WordOff
idSizeW platform = WordOff . argRepSizeW platform . bcIdArgRep platform
idSizeCon :: Platform -> Id -> ByteOff
-idSizeCon platform = ByteOff . primRepSizeB platform . bcIdPrimRep
+idSizeCon platform var
+ -- unboxed tuple components are padded to word size
+ | isUnboxedTupleType (idType var) ||
+ isUnboxedSumType (idType var) =
+ wordsToBytes platform .
+ WordOff . sum . map (argRepSizeW platform . toArgRep platform) .
+ bcIdPrimReps $ var
+ | otherwise = ByteOff (primRepSizeB platform (bcIdPrimRep var))
bcIdArgRep :: Platform -> Id -> ArgRep
bcIdArgRep platform = toArgRep platform . bcIdPrimRep
@@ -1881,6 +2099,10 @@ bcIdPrimRep id
| otherwise
= pprPanic "bcIdPrimRep" (ppr id <+> dcolon <+> ppr (idType id))
+
+bcIdPrimReps :: Id -> [PrimRep]
+bcIdPrimReps id = typePrimRepArgs (idType id)
+
repSizeWords :: Platform -> PrimRep -> WordOff
repSizeWords platform rep = WordOff $ argRepSizeW platform (toArgRep platform rep)
@@ -1888,17 +2110,6 @@ isFollowableArg :: ArgRep -> Bool
isFollowableArg P = True
isFollowableArg _ = False
-isVoidArg :: ArgRep -> Bool
-isVoidArg V = True
-isVoidArg _ = False
-
--- See bug #1257
-multiValException :: a
-multiValException = throwGhcException (ProgramError
- ("Error: bytecode compiler can't handle unboxed tuples and sums.\n"++
- " Possibly due to foreign import/export decls in source.\n"++
- " Workaround: use -fobject-code, or compile this module to .o separately."))
-
-- | Indicate if the calling convention is supported
isSupportedCConv :: CCallSpec -> Bool
isSupportedCConv (CCallSpec _ cconv _) = case cconv of
@@ -1934,62 +2145,11 @@ mkSlideW !n !ws
limit :: Word16
limit = maxBound
-splitApp :: AnnExpr' Var ann -> (AnnExpr' Var ann, [AnnExpr' Var ann])
- -- The arguments are returned in *right-to-left* order
-splitApp e | Just e' <- bcView e = splitApp e'
-splitApp (AnnApp (_,f) (_,a)) = case splitApp f of
- (f', as) -> (f', a:as)
-splitApp e = (e, [])
-
-
-bcView :: AnnExpr' Var ann -> Maybe (AnnExpr' Var ann)
--- The "bytecode view" of a term discards
--- a) type abstractions
--- b) type applications
--- c) casts
--- d) ticks (but not breakpoints)
--- e) case unsafeEqualityProof of UnsafeRefl -> e ==> e
--- Type lambdas *can* occur in random expressions,
--- whereas value lambdas cannot; that is why they are nuked here
-bcView (AnnCast (_,e) _) = Just e
-bcView (AnnLam v (_,e)) | isTyVar v = Just e
-bcView (AnnApp (_,e) (_, AnnType _)) = Just e
-bcView (AnnTick Breakpoint{} _) = Nothing
-bcView (AnnTick _other_tick (_,e)) = Just e
-bcView (AnnCase (_,e) _ _ alts) -- Handle unsafe equality proof
- | AnnVar id <- bcViewLoop e
- , idName id == unsafeEqualityProofName
- , [AnnAlt _ _ (_, rhs)] <- alts
- = Just rhs
-bcView _ = Nothing
-
-bcViewLoop :: AnnExpr' Var ann -> AnnExpr' Var ann
-bcViewLoop e =
- case bcView e of
- Nothing -> e
- Just e' -> bcViewLoop e'
-
-isVAtom :: Platform -> AnnExpr' Var ann -> Bool
-isVAtom platform expr = case expr of
- e | Just e' <- bcView e -> isVAtom platform e'
- (AnnVar v) -> isVoidArg (bcIdArgRep platform v)
- (AnnCoercion {}) -> True
- _ -> False
-
-atomPrimRep :: AnnExpr' Id ann -> PrimRep
-atomPrimRep e | Just e' <- bcView e = atomPrimRep e'
-atomPrimRep (AnnVar v) = bcIdPrimRep v
-atomPrimRep (AnnLit l) = typePrimRep1 (literalType l)
-
--- #12128:
--- A case expression can be an atom because empty cases evaluate to bottom.
--- See Note [Empty case alternatives] in GHC.Core
-atomPrimRep (AnnCase _ _ ty _) =
- ASSERT(case typePrimRep ty of [LiftedRep] -> True; _ -> False) LiftedRep
-atomPrimRep (AnnCoercion {}) = VoidRep
-atomPrimRep other = pprPanic "atomPrimRep" (ppr (deAnnotate' other))
+atomPrimRep :: StgArg -> PrimRep
+atomPrimRep (StgVarArg v) = bcIdPrimRep v
+atomPrimRep (StgLitArg l) = typePrimRep1 (literalType l)
-atomRep :: Platform -> AnnExpr' Id ann -> ArgRep
+atomRep :: Platform -> StgArg -> ArgRep
atomRep platform e = toArgRep platform (atomPrimRep e)
-- | Let szsw be the sizes in bytes of some items pushed onto the stack, which
@@ -1998,8 +2158,8 @@ atomRep platform e = toArgRep platform (atomPrimRep e)
mkStackOffsets :: ByteOff -> [ByteOff] -> [ByteOff]
mkStackOffsets original_depth szsb = tail (scanl' (+) original_depth szsb)
-typeArgRep :: Platform -> Type -> ArgRep
-typeArgRep platform = toArgRep platform . typePrimRep1
+typeArgReps :: Platform -> Type -> [ArgRep]
+typeArgReps platform = map (toArgRep platform) . typePrimRepArgs
-- -----------------------------------------------------------------------------
-- The bytecode generator's monad
@@ -2088,7 +2248,7 @@ getLabelsBc n
getCCArray :: BcM (Array BreakIndex (RemotePtr CostCentre))
getCCArray = BcM $ \st ->
- let breaks = expectJust "GHC.CoreToByteCode.getCCArray" $ modBreaks st in
+ let breaks = expectJust "GHC.StgToByteCode.getCCArray" $ modBreaks st in
return (st, modBreaks_ccs breaks)
diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in
index 76d225bd57..ded5bc4c07 100644
--- a/compiler/ghc.cabal.in
+++ b/compiler/ghc.cabal.in
@@ -335,7 +335,6 @@ Library
GHC.Core.Stats
GHC.Core.Subst
GHC.Core.Tidy
- GHC.CoreToByteCode
GHC.CoreToIface
GHC.CoreToStg
GHC.CoreToStg.Prep
@@ -536,6 +535,7 @@ Library
GHC.Stg.Stats
GHC.Stg.Subst
GHC.Stg.Syntax
+ GHC.StgToByteCode
GHC.StgToCmm
GHC.StgToCmm.ArgRep
GHC.StgToCmm.Bind
diff --git a/includes/rts/Bytecodes.h b/includes/rts/Bytecodes.h
index f7a0d6f151..859892de2d 100644
--- a/includes/rts/Bytecodes.h
+++ b/includes/rts/Bytecodes.h
@@ -91,6 +91,9 @@
#define bci_BRK_FUN 66
#define bci_TESTLT_W 67
#define bci_TESTEQ_W 68
+
+#define bci_RETURN_T 69
+#define bci_PUSH_ALTS_T 70
/* If you need to go past 255 then you will run into the flags */
/* If you need to go below 0x0100 then you will run into the instructions */
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index 43e099a0d6..d8aefd8035 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -87,6 +87,77 @@ RTS_RET(stg_ctoi_D1);
RTS_RET(stg_ctoi_L1);
RTS_RET(stg_ctoi_V);
+RTS_FUN_DECL(stg_ctoi_t);
+RTS_RET(stg_ctoi_t0);
+RTS_RET(stg_ctoi_t1);
+RTS_RET(stg_ctoi_t2);
+RTS_RET(stg_ctoi_t3);
+RTS_RET(stg_ctoi_t4);
+RTS_RET(stg_ctoi_t5);
+RTS_RET(stg_ctoi_t6);
+RTS_RET(stg_ctoi_t7);
+RTS_RET(stg_ctoi_t8);
+RTS_RET(stg_ctoi_t9);
+
+RTS_RET(stg_ctoi_t10);
+RTS_RET(stg_ctoi_t11);
+RTS_RET(stg_ctoi_t12);
+RTS_RET(stg_ctoi_t13);
+RTS_RET(stg_ctoi_t14);
+RTS_RET(stg_ctoi_t15);
+RTS_RET(stg_ctoi_t16);
+RTS_RET(stg_ctoi_t17);
+RTS_RET(stg_ctoi_t18);
+RTS_RET(stg_ctoi_t19);
+
+RTS_RET(stg_ctoi_t20);
+RTS_RET(stg_ctoi_t21);
+RTS_RET(stg_ctoi_t22);
+RTS_RET(stg_ctoi_t23);
+RTS_RET(stg_ctoi_t24);
+RTS_RET(stg_ctoi_t25);
+RTS_RET(stg_ctoi_t26);
+RTS_RET(stg_ctoi_t27);
+RTS_RET(stg_ctoi_t28);
+RTS_RET(stg_ctoi_t29);
+
+RTS_RET(stg_ctoi_t30);
+RTS_RET(stg_ctoi_t31);
+RTS_RET(stg_ctoi_t32);
+RTS_RET(stg_ctoi_t33);
+RTS_RET(stg_ctoi_t34);
+RTS_RET(stg_ctoi_t35);
+RTS_RET(stg_ctoi_t36);
+RTS_RET(stg_ctoi_t37);
+RTS_RET(stg_ctoi_t38);
+RTS_RET(stg_ctoi_t39);
+
+RTS_RET(stg_ctoi_t40);
+RTS_RET(stg_ctoi_t41);
+RTS_RET(stg_ctoi_t42);
+RTS_RET(stg_ctoi_t43);
+RTS_RET(stg_ctoi_t44);
+RTS_RET(stg_ctoi_t45);
+RTS_RET(stg_ctoi_t46);
+RTS_RET(stg_ctoi_t47);
+RTS_RET(stg_ctoi_t48);
+RTS_RET(stg_ctoi_t49);
+
+RTS_RET(stg_ctoi_t50);
+RTS_RET(stg_ctoi_t51);
+RTS_RET(stg_ctoi_t52);
+RTS_RET(stg_ctoi_t53);
+RTS_RET(stg_ctoi_t54);
+RTS_RET(stg_ctoi_t55);
+RTS_RET(stg_ctoi_t56);
+RTS_RET(stg_ctoi_t57);
+RTS_RET(stg_ctoi_t58);
+RTS_RET(stg_ctoi_t59);
+
+RTS_RET(stg_ctoi_t60);
+RTS_RET(stg_ctoi_t61);
+RTS_RET(stg_ctoi_t62);
+
RTS_RET(stg_apply_interp);
RTS_ENTRY(stg_IND);
@@ -292,6 +363,7 @@ RTS_RET(stg_ret_n);
RTS_RET(stg_ret_f);
RTS_RET(stg_ret_d);
RTS_RET(stg_ret_l);
+RTS_RET(stg_ret_t);
RTS_FUN_DECL(stg_gc_prim);
RTS_FUN_DECL(stg_gc_prim_p);
diff --git a/libraries/ghci/GHCi/BreakArray.hs b/libraries/ghci/GHCi/BreakArray.hs
index 51bf3466eb..2c13928801 100644
--- a/libraries/ghci/GHCi/BreakArray.hs
+++ b/libraries/ghci/GHCi/BreakArray.hs
@@ -24,7 +24,7 @@
module GHCi.BreakArray
(
BreakArray
- (BA) -- constructor is exported only for GHC.CoreToByteCode
+ (BA) -- constructor is exported only for GHC.StgToByteCode
, newBreakArray
, getBreak
, setupBreakpoint
diff --git a/rts/Disassembler.c b/rts/Disassembler.c
index 67a451e7e6..451521d57e 100644
--- a/rts/Disassembler.c
+++ b/rts/Disassembler.c
@@ -148,6 +148,13 @@ disInstr ( StgBCO *bco, int pc )
debugBelch("PUSH_ALTS_V " ); printPtr( ptrs[instrs[pc]] );
debugBelch("\n");
pc += 1; break;
+ case bci_PUSH_ALTS_T:
+ debugBelch("PUSH_ALTS_T ");
+ printPtr( ptrs[instrs[pc]] );
+ debugBelch(" 0x%" FMT_HexWord " ", literals[instrs[pc+1]] );
+ printPtr( ptrs[instrs[pc+2]] );
+ debugBelch("\n");
+ pc += 3; break;
case bci_PUSH_PAD8:
debugBelch("PUSH_PAD8\n");
pc += 1; break;
@@ -310,6 +317,9 @@ disInstr ( StgBCO *bco, int pc )
case bci_RETURN_V:
debugBelch("RETURN_V\n" );
break;
+ case bci_RETURN_T:
+ debugBelch("RETURN_T\n" );
+ break;
default:
barf("disInstr: unknown opcode %u", (unsigned int) instr);
@@ -317,12 +327,6 @@ disInstr ( StgBCO *bco, int pc )
return pc;
}
-
-/* Something of a kludge .. how do we know where the end of the insn
- array is, since it isn't recorded anywhere? Answer: the first
- short is the number of bytecodes which follow it.
- See GHC.CoreToByteCode.linkBCO.insns_arr for construction ...
-*/
void disassemble( StgBCO *bco )
{
uint32_t i, j;
diff --git a/rts/Interpreter.c b/rts/Interpreter.c
index 6929aec5fd..efbfd091d8 100644
--- a/rts/Interpreter.c
+++ b/rts/Interpreter.c
@@ -4,6 +4,7 @@
* Copyright (c) The GHC Team, 1994-2002.
* ---------------------------------------------------------------------------*/
+
#include "PosixSource.h"
#include "Rts.h"
#include "RtsAPI.h"
@@ -681,12 +682,13 @@ do_return_unboxed:
|| SpW(0) == (W_)&stg_ret_f_info
|| SpW(0) == (W_)&stg_ret_d_info
|| SpW(0) == (W_)&stg_ret_l_info
+ || SpW(0) == (W_)&stg_ret_t_info
);
IF_DEBUG(interpreter,
debugBelch(
"\n---------------------------------------------------------------\n");
- debugBelch("Returning: "); printObj(obj);
+ debugBelch("Returning unboxed\n");
debugBelch("Sp = %p\n", Sp);
#if defined(PROFILING)
fprintCCS(stderr, cap->r.rCCCS);
@@ -697,7 +699,7 @@ do_return_unboxed:
debugBelch("\n\n");
);
- // get the offset of the stg_ctoi_ret_XXX itbl
+ // get the offset of the header of the next stack frame
offset = stack_frame_sizeW((StgClosure *)Sp);
switch (get_itbl((StgClosure*)(Sp_plusW(offset)))->type) {
@@ -934,6 +936,43 @@ run_BCO_return_unboxed:
// Stack checks aren't necessary at return points, the stack use
// is aggregated into the enclosing function entry point.
+#if defined(PROFILING)
+ /*
+ Restore the current cost centre stack if a tuple is being returned.
+
+ When a "simple" unboxed value is returned, the cccs is restored with
+ an stg_restore_cccs frame on the stack, for example:
+
+ ...
+ stg_ctoi_D1
+ <CCCS>
+ stg_restore_cccs
+
+ But stg_restore_cccs cannot deal with tuples, which may have more
+ things on the stack. Therefore we store the CCCS inside the
+ stg_ctoi_t frame.
+
+ If we have a tuple being returned, the stack looks like this:
+
+ ...
+ <CCCS> <- to restore, Sp offset <next frame + 4 words>
+ tuple_BCO
+ tuple_info
+ cont_BCO
+ stg_ctoi_t <- next frame
+ tuple_data_1
+ ...
+ tuple_data_n
+ tuple_info
+ tuple_BCO
+ stg_ret_t <- Sp
+ */
+
+ if(SpW(0) == (W_)&stg_ret_t_info) {
+ cap->r.rCCCS = (CostCentreStack*)SpW(stack_frame_sizeW((StgClosure *)Sp) + 4);
+ }
+#endif
+
goto run_BCO;
run_BCO_fun:
@@ -1329,6 +1368,100 @@ run_BCO:
goto nextInsn;
}
+ case bci_PUSH_ALTS_T: {
+ int o_bco = BCO_GET_LARGE_ARG;
+ W_ tuple_info = (W_)BCO_LIT(BCO_GET_LARGE_ARG);
+ int o_tuple_bco = BCO_GET_LARGE_ARG;
+
+#if defined(PROFILING)
+ SpW(-1) = (W_)cap->r.rCCCS;
+ Sp_subW(1);
+#endif
+
+ SpW(-1) = BCO_PTR(o_tuple_bco);
+ SpW(-2) = tuple_info;
+ SpW(-3) = BCO_PTR(o_bco);
+ W_ ctoi_t_offset;
+ int tuple_stack_words = tuple_info & 0x3fff;
+ switch(tuple_stack_words) {
+ case 0: ctoi_t_offset = (W_)&stg_ctoi_t0_info; break;
+ case 1: ctoi_t_offset = (W_)&stg_ctoi_t1_info; break;
+ case 2: ctoi_t_offset = (W_)&stg_ctoi_t2_info; break;
+ case 3: ctoi_t_offset = (W_)&stg_ctoi_t3_info; break;
+ case 4: ctoi_t_offset = (W_)&stg_ctoi_t4_info; break;
+ case 5: ctoi_t_offset = (W_)&stg_ctoi_t5_info; break;
+ case 6: ctoi_t_offset = (W_)&stg_ctoi_t6_info; break;
+ case 7: ctoi_t_offset = (W_)&stg_ctoi_t7_info; break;
+ case 8: ctoi_t_offset = (W_)&stg_ctoi_t8_info; break;
+ case 9: ctoi_t_offset = (W_)&stg_ctoi_t9_info; break;
+
+ case 10: ctoi_t_offset = (W_)&stg_ctoi_t10_info; break;
+ case 11: ctoi_t_offset = (W_)&stg_ctoi_t11_info; break;
+ case 12: ctoi_t_offset = (W_)&stg_ctoi_t12_info; break;
+ case 13: ctoi_t_offset = (W_)&stg_ctoi_t13_info; break;
+ case 14: ctoi_t_offset = (W_)&stg_ctoi_t14_info; break;
+ case 15: ctoi_t_offset = (W_)&stg_ctoi_t15_info; break;
+ case 16: ctoi_t_offset = (W_)&stg_ctoi_t16_info; break;
+ case 17: ctoi_t_offset = (W_)&stg_ctoi_t17_info; break;
+ case 18: ctoi_t_offset = (W_)&stg_ctoi_t18_info; break;
+ case 19: ctoi_t_offset = (W_)&stg_ctoi_t19_info; break;
+
+ case 20: ctoi_t_offset = (W_)&stg_ctoi_t20_info; break;
+ case 21: ctoi_t_offset = (W_)&stg_ctoi_t21_info; break;
+ case 22: ctoi_t_offset = (W_)&stg_ctoi_t22_info; break;
+ case 23: ctoi_t_offset = (W_)&stg_ctoi_t23_info; break;
+ case 24: ctoi_t_offset = (W_)&stg_ctoi_t24_info; break;
+ case 25: ctoi_t_offset = (W_)&stg_ctoi_t25_info; break;
+ case 26: ctoi_t_offset = (W_)&stg_ctoi_t26_info; break;
+ case 27: ctoi_t_offset = (W_)&stg_ctoi_t27_info; break;
+ case 28: ctoi_t_offset = (W_)&stg_ctoi_t28_info; break;
+ case 29: ctoi_t_offset = (W_)&stg_ctoi_t29_info; break;
+
+ case 30: ctoi_t_offset = (W_)&stg_ctoi_t30_info; break;
+ case 31: ctoi_t_offset = (W_)&stg_ctoi_t31_info; break;
+ case 32: ctoi_t_offset = (W_)&stg_ctoi_t32_info; break;
+ case 33: ctoi_t_offset = (W_)&stg_ctoi_t33_info; break;
+ case 34: ctoi_t_offset = (W_)&stg_ctoi_t34_info; break;
+ case 35: ctoi_t_offset = (W_)&stg_ctoi_t35_info; break;
+ case 36: ctoi_t_offset = (W_)&stg_ctoi_t36_info; break;
+ case 37: ctoi_t_offset = (W_)&stg_ctoi_t37_info; break;
+ case 38: ctoi_t_offset = (W_)&stg_ctoi_t38_info; break;
+ case 39: ctoi_t_offset = (W_)&stg_ctoi_t39_info; break;
+
+ case 40: ctoi_t_offset = (W_)&stg_ctoi_t40_info; break;
+ case 41: ctoi_t_offset = (W_)&stg_ctoi_t41_info; break;
+ case 42: ctoi_t_offset = (W_)&stg_ctoi_t42_info; break;
+ case 43: ctoi_t_offset = (W_)&stg_ctoi_t43_info; break;
+ case 44: ctoi_t_offset = (W_)&stg_ctoi_t44_info; break;
+ case 45: ctoi_t_offset = (W_)&stg_ctoi_t45_info; break;
+ case 46: ctoi_t_offset = (W_)&stg_ctoi_t46_info; break;
+ case 47: ctoi_t_offset = (W_)&stg_ctoi_t47_info; break;
+ case 48: ctoi_t_offset = (W_)&stg_ctoi_t48_info; break;
+ case 49: ctoi_t_offset = (W_)&stg_ctoi_t49_info; break;
+
+ case 50: ctoi_t_offset = (W_)&stg_ctoi_t50_info; break;
+ case 51: ctoi_t_offset = (W_)&stg_ctoi_t51_info; break;
+ case 52: ctoi_t_offset = (W_)&stg_ctoi_t52_info; break;
+ case 53: ctoi_t_offset = (W_)&stg_ctoi_t53_info; break;
+ case 54: ctoi_t_offset = (W_)&stg_ctoi_t54_info; break;
+ case 55: ctoi_t_offset = (W_)&stg_ctoi_t55_info; break;
+ case 56: ctoi_t_offset = (W_)&stg_ctoi_t56_info; break;
+ case 57: ctoi_t_offset = (W_)&stg_ctoi_t57_info; break;
+ case 58: ctoi_t_offset = (W_)&stg_ctoi_t58_info; break;
+ case 59: ctoi_t_offset = (W_)&stg_ctoi_t59_info; break;
+
+ case 60: ctoi_t_offset = (W_)&stg_ctoi_t60_info; break;
+ case 61: ctoi_t_offset = (W_)&stg_ctoi_t61_info; break;
+ case 62: ctoi_t_offset = (W_)&stg_ctoi_t62_info; break;
+
+ default: barf("unsupported tuple size %d", tuple_stack_words);
+ }
+
+ SpW(-4) = ctoi_t_offset;
+ Sp_subW(4);
+ goto nextInsn;
+ }
+
case bci_PUSH_APPLY_N:
Sp_subW(1); SpW(0) = (W_)&stg_ap_n_info;
goto nextInsn;
@@ -1708,6 +1841,12 @@ run_BCO:
Sp_subW(1);
SpW(0) = (W_)&stg_ret_v_info;
goto do_return_unboxed;
+ case bci_RETURN_T: {
+ /* tuple_info and tuple_bco must already be on the stack */
+ Sp_subW(1);
+ SpW(0) = (W_)&stg_ret_t_info;
+ goto do_return_unboxed;
+ }
case bci_SWIZZLE: {
int stkoff = BCO_NEXT;
diff --git a/rts/Printer.c b/rts/Printer.c
index ef9a52719b..7d9614cfd7 100644
--- a/rts/Printer.c
+++ b/rts/Printer.c
@@ -529,17 +529,7 @@ printStackChunk( StgPtr sp, StgPtr spBottom )
case RET_SMALL: {
StgWord c = *sp;
- if (c == (StgWord)&stg_ctoi_R1p_info) {
- debugBelch("tstg_ctoi_ret_R1p_info\n" );
- } else if (c == (StgWord)&stg_ctoi_R1n_info) {
- debugBelch("stg_ctoi_ret_R1n_info\n" );
- } else if (c == (StgWord)&stg_ctoi_F1_info) {
- debugBelch("stg_ctoi_ret_F1_info\n" );
- } else if (c == (StgWord)&stg_ctoi_D1_info) {
- debugBelch("stg_ctoi_ret_D1_info\n" );
- } else if (c == (StgWord)&stg_ctoi_V_info) {
- debugBelch("stg_ctoi_ret_V_info\n" );
- } else if (c == (StgWord)&stg_ap_v_info) {
+ if (c == (StgWord)&stg_ap_v_info) {
debugBelch("stg_ap_v_info\n" );
} else if (c == (StgWord)&stg_ap_f_info) {
debugBelch("stg_ap_f_info\n" );
@@ -595,11 +585,51 @@ printStackChunk( StgPtr sp, StgPtr spBottom )
}
case RET_BCO: {
- StgBCO *bco;
-
- bco = ((StgBCO *)sp[1]);
+ StgWord c = *sp;
+ StgBCO *bco = ((StgBCO *)sp[1]);
- debugBelch("RET_BCO (%p)\n", sp);
+ if (c == (StgWord)&stg_ctoi_R1p_info) {
+ debugBelch("stg_ctoi_R1p_info" );
+ } else if (c == (StgWord)&stg_ctoi_R1unpt_info) {
+ debugBelch("stg_ctoi_R1unpt_info" );
+ } else if (c == (StgWord)&stg_ctoi_R1n_info) {
+ debugBelch("stg_ctoi_R1n_info" );
+ } else if (c == (StgWord)&stg_ctoi_F1_info) {
+ debugBelch("stg_ctoi_F1_info" );
+ } else if (c == (StgWord)&stg_ctoi_D1_info) {
+ debugBelch("stg_ctoi_D1_info" );
+ } else if (c == (StgWord)&stg_ctoi_V_info) {
+ debugBelch("stg_ctoi_V_info" );
+ } else if (c == (StgWord)&stg_BCO_info) {
+ debugBelch("stg_BCO_info" );
+ } else if (c == (StgWord)&stg_apply_interp_info) {
+ debugBelch("stg_apply_interp_info" );
+ } else if (c == (StgWord)&stg_ret_t_info) {
+ debugBelch("stg_ret_t_info" );
+ } else if (c == (StgWord)&stg_ctoi_t0_info) {
+ debugBelch("stg_ctoi_t0_info" );
+ } else if (c == (StgWord)&stg_ctoi_t1_info) {
+ debugBelch("stg_ctoi_t1_info" );
+ } else if (c == (StgWord)&stg_ctoi_t2_info) {
+ debugBelch("stg_ctoi_t2_info" );
+ } else if (c == (StgWord)&stg_ctoi_t3_info) {
+ debugBelch("stg_ctoi_t3_info" );
+ } else if (c == (StgWord)&stg_ctoi_t4_info) {
+ debugBelch("stg_ctoi_t4_info" );
+ } else if (c == (StgWord)&stg_ctoi_t5_info) {
+ debugBelch("stg_ctoi_t5_info" );
+ } else if (c == (StgWord)&stg_ctoi_t6_info) {
+ debugBelch("stg_ctoi_t6_info" );
+ } else if (c == (StgWord)&stg_ctoi_t7_info) {
+ debugBelch("stg_ctoi_t7_info" );
+ } else if (c == (StgWord)&stg_ctoi_t8_info) {
+ debugBelch("stg_ctoi_t8_info" );
+ /* there are more stg_ctoi_tN_info frames,
+ but we don't print them all */
+ } else {
+ debugBelch("RET_BCO");
+ }
+ debugBelch(" (%p)\n", sp);
printLargeBitmap(spBottom, sp+2,
BCO_BITMAP(bco), BCO_BITMAP_SIZE(bco));
continue;
diff --git a/rts/RtsSymbols.c b/rts/RtsSymbols.c
index 50a3bae267..3a9f568ed4 100644
--- a/rts/RtsSymbols.c
+++ b/rts/RtsSymbols.c
@@ -561,6 +561,8 @@
SymI_HasProto(stg_ret_f_info) \
SymI_HasProto(stg_ret_d_info) \
SymI_HasProto(stg_ret_l_info) \
+ SymI_HasProto(stg_ret_t_info) \
+ SymI_HasProto(stg_ctoi_t) \
SymI_HasProto(stg_gc_prim_p) \
SymI_HasProto(stg_gc_prim_pp) \
SymI_HasProto(stg_gc_prim_n) \
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
index 7a8f20dded..b9379ab3e6 100644
--- a/rts/StgMiscClosures.cmm
+++ b/rts/StgMiscClosures.cmm
@@ -195,6 +195,274 @@ INFO_TABLE_RET( stg_ctoi_V, RET_BCO )
jump stg_yield_to_interpreter [];
}
+/* Note [GHCi unboxed tuples stack spills]
+
+ In the calling convention for compiled code, a tuple is returned
+ in registers, with everything that doesn't fit spilled onto the STG
+ stack.
+
+ At the time the continuation is called, Sp points to the highest word
+ used on the stack:
+
+ ...
+ stg_ctoi_t (next stack frame, continuation)
+ spilled_1
+ spilled_2
+ spilled_3 <- Sp
+
+ This makes it difficult to write a procedure that can handle tuples of
+ any size.
+
+ To get around this, we use a Cmm procedure that adjusts the stack pointer
+ to skip over the tuple:
+
+ ...
+ stg_ctoi_t3 (advances Sp by 3 words, then calls stg_ctoi_t)
+ spilled_1
+ spilled_2
+ spilled_3 <- Sp
+
+ When stg_ctoi_t is called, the stack looks like:
+
+ ...
+ tuple_BCO
+ tuple_info
+ cont_BCO (continuation in bytecode)
+ stg_ctoi_t3 <- Sp
+ spilled_1
+ spilled_2
+ spilled_3
+
+ stg_ctoi_t then reads the tuple_info word to determine the registers
+ to save onto the stack and constructs a call to tuple_BCO. Afterwards the
+ stack looks as follows:
+
+ ...
+ tuple_BCO
+ tuple_info
+ cont_BCO
+ stg_ctoi_t3
+ spilled_1
+ spilled_2
+ spilled_3
+ saved_R2
+ saved_R1
+ saved_D3
+ ...
+ tuple_BCO
+ stg_apply_interp <- Sp
+
+
+ tuple_BCO contains the bytecode instructions to return the tuple to
+ cont_BCO. The bitmap in tuple_BCO describes the contents of
+ the tuple to the storage manager.
+
+ At this point we can safely jump to the interpreter.
+
+ */
+
+#define MK_STG_CTOI_T(N) INFO_TABLE_RET( \
+ stg_ctoi_t ## N, RET_BCO ) \
+ { Sp_adj(N); jump stg_ctoi_t [*]; }
+
+MK_STG_CTOI_T(0)
+MK_STG_CTOI_T(1)
+MK_STG_CTOI_T(2)
+MK_STG_CTOI_T(3)
+MK_STG_CTOI_T(4)
+MK_STG_CTOI_T(5)
+MK_STG_CTOI_T(6)
+MK_STG_CTOI_T(7)
+MK_STG_CTOI_T(8)
+MK_STG_CTOI_T(9)
+
+MK_STG_CTOI_T(10)
+MK_STG_CTOI_T(11)
+MK_STG_CTOI_T(12)
+MK_STG_CTOI_T(13)
+MK_STG_CTOI_T(14)
+MK_STG_CTOI_T(15)
+MK_STG_CTOI_T(16)
+MK_STG_CTOI_T(17)
+MK_STG_CTOI_T(18)
+MK_STG_CTOI_T(19)
+
+MK_STG_CTOI_T(20)
+MK_STG_CTOI_T(21)
+MK_STG_CTOI_T(22)
+MK_STG_CTOI_T(23)
+MK_STG_CTOI_T(24)
+MK_STG_CTOI_T(25)
+MK_STG_CTOI_T(26)
+MK_STG_CTOI_T(27)
+MK_STG_CTOI_T(28)
+MK_STG_CTOI_T(29)
+
+MK_STG_CTOI_T(30)
+MK_STG_CTOI_T(31)
+MK_STG_CTOI_T(32)
+MK_STG_CTOI_T(33)
+MK_STG_CTOI_T(34)
+MK_STG_CTOI_T(35)
+MK_STG_CTOI_T(36)
+MK_STG_CTOI_T(37)
+MK_STG_CTOI_T(38)
+MK_STG_CTOI_T(39)
+
+MK_STG_CTOI_T(40)
+MK_STG_CTOI_T(41)
+MK_STG_CTOI_T(42)
+MK_STG_CTOI_T(43)
+MK_STG_CTOI_T(44)
+MK_STG_CTOI_T(45)
+MK_STG_CTOI_T(46)
+MK_STG_CTOI_T(47)
+MK_STG_CTOI_T(48)
+MK_STG_CTOI_T(49)
+
+MK_STG_CTOI_T(50)
+MK_STG_CTOI_T(51)
+MK_STG_CTOI_T(52)
+MK_STG_CTOI_T(53)
+MK_STG_CTOI_T(54)
+MK_STG_CTOI_T(55)
+MK_STG_CTOI_T(56)
+MK_STG_CTOI_T(57)
+MK_STG_CTOI_T(58)
+MK_STG_CTOI_T(59)
+
+MK_STG_CTOI_T(60)
+MK_STG_CTOI_T(61)
+MK_STG_CTOI_T(62)
+
+/*
+ Note [GHCi tuple layout]
+
+ the tuple_info word describes the register and stack usage of the tuple:
+
+ [ rrrr ffff ffdd dddd llss ssss ssss ssss ]
+
+ - r: number of vanilla registers R1..Rn
+ - f: bitmap of float registers F1..F6
+ - d: bitmap of double registers D1..D6
+ - l: bitmap of long registers L1..Ln
+ - s: number of words on stack (in addition to registers)
+
+ The order in which the registers are pushed on the stack is determined by
+ the Ord instance of GHC.Cmm.Expr.GlobalReg. If you change the Ord instance,
+ the order in stg_ctoi_t and stg_ret_t needs to be adjusted accordingly.
+
+ */
+
+stg_ctoi_t
+ /* explicit stack */
+{
+
+ W_ tuple_info, tuple_stack, tuple_regs_R,
+ tuple_regs_F, tuple_regs_D, tuple_regs_L;
+ P_ tuple_BCO;
+
+ tuple_info = Sp(2); /* tuple information word */
+ tuple_BCO = Sp(3); /* bytecode object that returns the tuple in
+ the interpreter */
+
+#if defined(PROFILING)
+ CCCS = Sp(4);
+#endif
+
+ tuple_stack = tuple_info & 0x3fff; /* number of words spilled on stack */
+ tuple_regs_R = (tuple_info >> 28) & 0xf; /* number of R1..Rn */
+ tuple_regs_F = (tuple_info >> 22) & 0x3f; /* 6 bits bitmap */
+ tuple_regs_D = (tuple_info >> 16) & 0x3f; /* 6 bits bitmap */
+ tuple_regs_L = (tuple_info >> 14) & 0x3; /* 2 bits bitmap */
+
+ Sp = Sp - WDS(tuple_stack);
+
+ /* save long registers */
+ /* fixme L2 ? */
+ if((tuple_regs_L & 1) != 0) { Sp = Sp - 8; L_[Sp] = L1; }
+
+ /* save double registers */
+ if((tuple_regs_D & 32) != 0) { Sp = Sp - SIZEOF_DOUBLE; D_[Sp] = D6; }
+ if((tuple_regs_D & 16) != 0) { Sp = Sp - SIZEOF_DOUBLE; D_[Sp] = D5; }
+ if((tuple_regs_D & 8) != 0) { Sp = Sp - SIZEOF_DOUBLE; D_[Sp] = D4; }
+ if((tuple_regs_D & 4) != 0) { Sp = Sp - SIZEOF_DOUBLE; D_[Sp] = D3; }
+ if((tuple_regs_D & 2) != 0) { Sp = Sp - SIZEOF_DOUBLE; D_[Sp] = D2; }
+ if((tuple_regs_D & 1) != 0) { Sp = Sp - SIZEOF_DOUBLE; D_[Sp] = D1; }
+
+ /* save float registers */
+ if((tuple_regs_F & 32) != 0) { Sp_adj(-1); F_[Sp] = F6; }
+ if((tuple_regs_F & 16) != 0) { Sp_adj(-1); F_[Sp] = F5; }
+ if((tuple_regs_F & 8) != 0) { Sp_adj(-1); F_[Sp] = F4; }
+ if((tuple_regs_F & 4) != 0) { Sp_adj(-1); F_[Sp] = F3; }
+ if((tuple_regs_F & 2) != 0) { Sp_adj(-1); F_[Sp] = F2; }
+ if((tuple_regs_F & 1) != 0) { Sp_adj(-1); F_[Sp] = F1; }
+
+ /* save vanilla registers */
+ if(tuple_regs_R >= 6) { Sp_adj(-1); Sp(0) = R6; }
+ if(tuple_regs_R >= 5) { Sp_adj(-1); Sp(0) = R5; }
+ if(tuple_regs_R >= 4) { Sp_adj(-1); Sp(0) = R4; }
+ if(tuple_regs_R >= 3) { Sp_adj(-1); Sp(0) = R3; }
+ if(tuple_regs_R >= 2) { Sp_adj(-1); Sp(0) = R2; }
+ if(tuple_regs_R >= 1) { Sp_adj(-1); Sp(0) = R1; }
+
+ /* jump to the BCO that will finish the return of the tuple */
+ Sp_adj(-3);
+ Sp(2) = tuple_info;
+ Sp(1) = tuple_BCO;
+ Sp(0) = stg_ret_t_info;
+
+ jump stg_yield_to_interpreter [];
+}
+
+INFO_TABLE_RET( stg_ret_t, RET_BCO )
+{
+ W_ tuple_info, tuple_stack, tuple_regs_R, tuple_regs_F,
+ tuple_regs_D, tuple_regs_L;
+
+ tuple_info = Sp(2);
+ Sp_adj(3);
+
+ tuple_stack = tuple_info & 0x3fff; /* number of words spilled on stack */
+ tuple_regs_R = (tuple_info >> 28) & 0xf; /* number of R1..Rn */
+ tuple_regs_F = (tuple_info >> 22) & 0x3f; /* 6 bits bitmap */
+ tuple_regs_D = (tuple_info >> 16) & 0x3f; /* 6 bits bitmap */
+ tuple_regs_L = (tuple_info >> 14) & 0x3; /* 2 bits bitmap */
+
+ /* restore everything in the reverse order of stg_ctoi_t */
+
+ /* restore vanilla registers */
+ if(tuple_regs_R >= 1) { R1 = Sp(0); Sp_adj(1); }
+ if(tuple_regs_R >= 2) { R2 = Sp(0); Sp_adj(1); }
+ if(tuple_regs_R >= 3) { R3 = Sp(0); Sp_adj(1); }
+ if(tuple_regs_R >= 4) { R4 = Sp(0); Sp_adj(1); }
+ if(tuple_regs_R >= 5) { R5 = Sp(0); Sp_adj(1); }
+ if(tuple_regs_R >= 6) { R6 = Sp(0); Sp_adj(1); }
+
+ /* restore float registers */
+ if((tuple_regs_F & 1) != 0) { F1 = F_[Sp]; Sp_adj(1); }
+ if((tuple_regs_F & 2) != 0) { F2 = F_[Sp]; Sp_adj(1); }
+ if((tuple_regs_F & 4) != 0) { F3 = F_[Sp]; Sp_adj(1); }
+ if((tuple_regs_F & 8) != 0) { F4 = F_[Sp]; Sp_adj(1); }
+ if((tuple_regs_F & 16) != 0) { F5 = F_[Sp]; Sp_adj(1); }
+ if((tuple_regs_F & 32) != 0) { F6 = F_[Sp]; Sp_adj(1); }
+
+ /* restore double registers */
+ if((tuple_regs_D & 1) != 0) { D1 = D_[Sp]; Sp = Sp + SIZEOF_DOUBLE; }
+ if((tuple_regs_D & 2) != 0) { D2 = D_[Sp]; Sp = Sp + SIZEOF_DOUBLE; }
+ if((tuple_regs_D & 4) != 0) { D3 = D_[Sp]; Sp = Sp + SIZEOF_DOUBLE; }
+ if((tuple_regs_D & 8) != 0) { D4 = D_[Sp]; Sp = Sp + SIZEOF_DOUBLE; }
+ if((tuple_regs_D & 16) != 0) { D5 = D_[Sp]; Sp = Sp + SIZEOF_DOUBLE; }
+ if((tuple_regs_D & 32) != 0) { D6 = D_[Sp]; Sp = Sp + SIZEOF_DOUBLE; }
+
+ /* restore long registers */
+ if((tuple_regs_L & 1) != 0) { L1 = L_[Sp]; Sp = Sp + 8; }
+
+ /* Sp points to the topmost argument now */
+ jump %ENTRY_CODE(Sp(tuple_stack)) [*]; // NB. all registers live!
+}
+
+
/*
* Dummy info table pushed on the top of the stack when the interpreter
* should apply the BCO on the stack to its arguments, also on the
diff --git a/testsuite/tests/ghci/T16670/T16670_unboxed.hs b/testsuite/tests/ghci/T16670/T16670_unboxed.hs
index 2e903959bb..93816795e0 100644
--- a/testsuite/tests/ghci/T16670/T16670_unboxed.hs
+++ b/testsuite/tests/ghci/T16670/T16670_unboxed.hs
@@ -1,5 +1,13 @@
{-# LANGUAGE UnboxedTuples #-}
+
{-# OPTIONS_GHC -fwrite-interface #-}
+{-
+ GHCi doesn't automatically switch to object code anymore now that
+ UnboxedTuples are supported in bytecode. But we test for the
+ existence of the file.
+ -}
+{-# OPTIONS_GHC -fobject-code #-}
+
module T16670_unboxed where
data UnboxedTupleData = MkUTD (# (),() #)
diff --git a/testsuite/tests/ghci/prog014/prog014.T b/testsuite/tests/ghci/prog014/prog014.T
index d9dee7eac7..1b583e8c19 100644
--- a/testsuite/tests/ghci/prog014/prog014.T
+++ b/testsuite/tests/ghci/prog014/prog014.T
@@ -1,5 +1,6 @@
test('prog014',
[extra_files(['Primop.hs', 'dummy.c']),
+ expect_fail, # bytecode compiler doesn't support foreign import prim
extra_run_opts('dummy.o'),
pre_cmd('$MAKE -s --no-print-directory prog014')],
ghci_script, ['prog014.script'])
diff --git a/testsuite/tests/ghci/should_run/UnboxedTuples/ByteCode.hs b/testsuite/tests/ghci/should_run/UnboxedTuples/ByteCode.hs
new file mode 100644
index 0000000000..a1bce35ad0
--- /dev/null
+++ b/testsuite/tests/ghci/should_run/UnboxedTuples/ByteCode.hs
@@ -0,0 +1,17 @@
+{-# LANGUAGE CPP, UnboxedTuples, MagicHash, ScopedTypeVariables, PolyKinds #-}
+{-# OPTIONS_GHC -fbyte-code #-}
+
+#include "MachDeps.h"
+
+#if WORD_SIZE_IN_BITS < 64
+#define WW Word64
+#else
+#define WW Word
+#endif
+
+module ByteCode where
+
+import GHC.Exts
+import GHC.Word
+
+#include "Common.hs-incl"
diff --git a/testsuite/tests/ghci/should_run/UnboxedTuples/Common.hs-incl b/testsuite/tests/ghci/should_run/UnboxedTuples/Common.hs-incl
new file mode 100644
index 0000000000..6931397f09
--- /dev/null
+++ b/testsuite/tests/ghci/should_run/UnboxedTuples/Common.hs-incl
@@ -0,0 +1,368 @@
+swap :: (# a, b #) -> (# b, a #)
+swap (# x, y #) = (# y, x #)
+
+type T1 a = a -> (# a #)
+tuple1 :: T1 a
+tuple1 x = (# x #)
+
+tuple1_a :: T1 a -> a -> a
+tuple1_a f x = case f x of (# y #) -> y
+
+tuple1_b :: T1 a -> a -> String -> IO ()
+tuple1_b f x msg = case f x of (# _ #) -> putStrLn msg
+
+-- can still be returned in registers, pointers
+type T2p a = a -> a -> a -> a -> (# a, a, a, a #)
+
+tuple2p :: T2p a
+tuple2p x1 x2 x3 x4 = (# x1, x2, x3, x4 #)
+
+tuple2p_a :: T2p a -> a -> a -> a -> a -> (a, a, a, a)
+tuple2p_a f x1 x2 x3 x4 =
+ case f x1 x2 x3 x4 of (# y1, y2, y3, y4 #) -> (y1, y2, y3, y4)
+
+-- can still be returned in registers, non-pointers
+type T2n = Int -> Int -> Int -> Int -> (# Int#, Int#, Int#, Int# #)
+
+tuple2n :: T2n
+tuple2n (I# x1) (I# x2) (I# x3) (I# x4) = (# x1, x2, x3, x4 #)
+
+tuple2n_a :: T2n -> Int -> Int -> Int -> Int -> (Int, Int, Int, Int)
+tuple2n_a f x1 x2 x3 x4 =
+ case f x1 x2 x3 x4 of
+ (# y1, y2, y3, y4 #) -> (I# y1, I# y2, I# y3, I# y4)
+
+
+-- too big to fit in registers
+type T3 a = a -> a -> a -> a
+ -> a -> a -> a -> a
+ -> a -> a -> a -> a
+ -> (# a, a, a, a
+ , a, a, a, a
+ , a, a, a, a #)
+tuple3 :: T3 a
+tuple3 x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 =
+ (# x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12 #)
+
+tuple3_a :: T3 a
+ -> a -> a -> a -> a
+ -> a -> a -> a -> a
+ -> a -> a -> a -> a
+ -> ( a, a, a, a
+ , a, a, a, a
+ , a, a, a, a
+ )
+tuple3_a f x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 =
+ case f x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 of
+ (# y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12 #) ->
+ (y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12)
+
+type T4a = Float -> Double -> Float -> Double
+ -> (# Float#, Double#, Float#, Double# #)
+
+tuple4a :: T4a
+tuple4a (F# f1) (D# d1) (F# f2) (D# d2) = (# f1, d1, f2, d2 #)
+
+tuple4a_a :: T4a
+ -> Float -> Double -> Float -> Double
+ -> (Float, Double, Float, Double)
+tuple4a_a h f1 d1 f2 d2 =
+ case h f1 d1 f2 d2 of (# g1, e1, g2, e2 #) -> (F# g1, D# e1, F# g2, D# e2 )
+
+
+-- this should fill the floating point registers
+type T4b = Float -> Double -> Float -> Double
+ -> Float -> Double -> Float -> Double
+ -> Float -> Double -> Float -> Double
+ -> Float -> Double -> Float -> Double
+ -> Float -> Double -> Float -> Double
+ -> (# Float#, Double#, Float#, Double#
+ , Float#, Double#, Float#, Double#
+ , Float#, Double#, Float#, Double#
+ , Float#, Double#, Float#, Double#
+ , Float#, Double#, Float#, Double# #)
+tuple4b :: T4b
+tuple4b (F# f1) (D# d1) (F# f2) (D# d2)
+ (F# f3) (D# d3) (F# f4) (D# d4)
+ (F# f5) (D# d5) (F# f6) (D# d6)
+ (F# f7) (D# d7) (F# f8) (D# d8)
+ (F# f9) (D# d9) (F# f10) (D# d10) =
+ (# f1, d1, f2, d2
+ , f3, d3, f4, d4
+ , f5, d5, f6, d6
+ , f7, d7, f8, d8
+ , f9, d9, f10, d10
+ #)
+
+tuple4b_a :: T4b
+ -> Float -> Double -> Float -> Double
+ -> Float -> Double -> Float -> Double
+ -> Float -> Double -> Float -> Double
+ -> Float -> Double -> Float -> Double
+ -> Float -> Double -> Float -> Double
+ -> ( (Float, Double, Float, Double)
+ , (Float, Double, Float, Double)
+ , (Float, Double, Float, Double)
+ , (Float, Double, Float, Double)
+ , (Float, Double, Float, Double)
+ )
+tuple4b_a h f1 d1 f2 d2
+ f3 d3 f4 d4
+ f5 d5 f6 d6
+ f7 d7 f8 d8
+ f9 d9 f10 d10 =
+ case h f1 d1 f2 d2
+ f3 d3 f4 d4
+ f5 d5 f6 d6
+ f7 d7 f8 d8
+ f9 d9 f10 d10 of
+ (# g1, e1, g2, e2
+ , g3, e3, g4, e4
+ , g5, e5, g6, e6
+ , g7, e7, g8, e8
+ , g9, e9, g10, e10 #) ->
+ ( (F# g1, D# e1, F# g2, D# e2)
+ , (F# g3, D# e3, F# g4, D# e4)
+ , (F# g5, D# e5, F# g6, D# e6)
+ , (F# g7, D# e7, F# g8, D# e8)
+ , (F# g9, D# e9, F# g10, D# e10))
+
+type T4c = Float -> Double -> Word64 -> Integer
+ -> Float -> Double -> Word64 -> Integer
+ -> Float -> Double -> Word64 -> Integer
+ -> Float -> Double -> Word64 -> Integer
+ -> (# Float#, Double#, WW#, Integer
+ , Float#, Double#, WW#, Integer
+ , Float#, Double#, WW#, Integer
+ , Float#, Double#, WW#, Integer
+ #)
+tuple4c :: T4c
+tuple4c (F# f1) (D# d1) (W64# w1) i1
+ (F# f2) (D# d2) (W64# w2) i2
+ (F# f3) (D# d3) (W64# w3) i3
+ (F# f4) (D# d4) (W64# w4) i4 =
+ (# f1, d1, w1, i1
+ , f2, d2, w2, i2
+ , f3, d3, w3, i3
+ , f4, d4, w4, i4
+ #)
+
+tuple4c_a :: T4c
+ -> Float -> Double -> Word64 -> Integer
+ -> Float -> Double -> Word64 -> Integer
+ -> Float -> Double -> Word64 -> Integer
+ -> Float -> Double -> Word64 -> Integer
+ -> ( ( Float, Double, Word64, Integer)
+ , ( Float, Double, Word64, Integer)
+ , ( Float, Double, Word64, Integer)
+ , ( Float, Double, Word64, Integer)
+ )
+tuple4c_a h f1 d1 w1 i1
+ f2 d2 w2 i2
+ f3 d3 w3 i3
+ f4 d4 w4 i4 =
+ case h f1 d1 w1 i1
+ f2 d2 w2 i2
+ f3 d3 w3 i3
+ f4 d4 w4 i4 of
+ (# f1', d1', w1', i1'
+ , f2', d2', w2', i2'
+ , f3', d3', w3', i3'
+ , f4', d4', w4', i4' #) ->
+ ( (F# f1', D# d1', W64# w1', i1')
+ , (F# f2', D# d2', W64# w2', i2')
+ , (F# f3', D# d3', W64# w3', i3')
+ , (F# f4', D# d4', W64# w4', i4')
+ )
+
+type T5 = Int -> Word64 -> Int -> Word64
+ -> Int -> Word64 -> Int -> Word64
+ -> Int -> Word64 -> Int -> Word64
+ -> Int -> Word64 -> Int -> Word64
+ -> (# Int, WW#, Int, WW#
+ , Int, WW#, Int, WW#
+ , Int, WW#, Int, WW#
+ , Int, WW#, Int, WW#
+ #)
+
+tuple5 :: T5
+tuple5 i1 (W64# w1) i2 (W64# w2)
+ i3 (W64# w3) i4 (W64# w4)
+ i5 (W64# w5) i6 (W64# w6)
+ i7 (W64# w7) i8 (W64# w8) =
+ (# i1, w1, i2, w2
+ , i3, w3, i4, w4
+ , i5, w5, i6, w6
+ , i7, w7, i8, w8 #)
+
+tuple5_a :: T5
+ -> Int -> Word64 -> Int -> Word64
+ -> Int -> Word64 -> Int -> Word64
+ -> Int -> Word64 -> Int -> Word64
+ -> Int -> Word64 -> Int -> Word64
+ -> ( (Int, Word64, Int, Word64)
+ , (Int, Word64, Int, Word64)
+ , (Int, Word64, Int, Word64)
+ , (Int, Word64, Int, Word64)
+ )
+tuple5_a f i1 w1 i2 w2
+ i3 w3 i4 w4
+ i5 w5 i6 w6
+ i7 w7 i8 w8 =
+ case f i1 w1 i2 w2
+ i3 w3 i4 w4
+ i5 w5 i6 w6
+ i7 w7 i8 w8 of
+ (# j1, x1, j2, x2
+ , j3, x3, j4, x4
+ , j5, x5, j6, x6
+ , j7, x7, j8, x8
+ #) ->
+ ( (j1, W64# x1, j2, W64# x2)
+ , (j3, W64# x3, j4, W64# x4)
+ , (j5, W64# x5, j6, W64# x6)
+ , (j7, W64# x7, j8, W64# x8)
+ )
+
+type T6 = Int ->
+ (# Int#, (# Int, (# Int#, (# #) #) #) #)
+tuple6 :: T6
+tuple6 x@(I# x#) = (# x#, (# x, (# x#, (# #) #) #) #)
+
+tuple6_a :: T6 -> Int -> String
+tuple6_a f x =
+ case f x of
+ (# x1, (# x2, (# x3, (# #) #) #) #) -> show (I# x1, (x2, (I# x3, ())))
+
+-- empty tuples and tuples with void
+
+type TV1 = Bool -> (# #)
+
+{-# NOINLINE tuple_v1 #-}
+tuple_v1 :: TV1
+tuple_v1 _ = (# #)
+
+{-# NOINLINE tuple_v1_a #-}
+tuple_v1_a :: TV1 -> Bool -> Bool
+tuple_v1_a f x = case f x of (# #) -> True
+
+
+type TV2 = Bool -> (# (# #) #)
+
+{-# NOINLINE tuple_v2 #-}
+tuple_v2 :: TV2
+tuple_v2 _ = (# (# #) #)
+
+{-# NOINLINE tuple_v2_a #-}
+tuple_v2_a :: TV2 -> Bool -> Bool
+tuple_v2_a f x = case f x of (# _ #) -> True
+
+
+type TV3 a = a -> (# (# #), a #)
+
+{-# NOINLINE tuple_v3 #-}
+tuple_v3 :: TV3 a
+tuple_v3 x = (# (# #), x #)
+
+{-# NOINLINE tuple_v3_a #-}
+tuple_v3_a :: TV3 a -> a -> a
+tuple_v3_a f x = case f x of (# _, y #) -> y
+
+
+type TV4 a = a -> (# a, (# #) #)
+
+{-# NOINLINE tuple_v4 #-}
+tuple_v4 :: TV4 a
+tuple_v4 x = (# x, (# #) #)
+
+{-# NOINLINE tuple_v4_a #-}
+tuple_v4_a :: TV4 a -> a -> a
+tuple_v4_a f x = case f x of (# y, _ #) -> y
+
+
+type TV5 a = a -> (# (# #), a, (# #) #)
+
+{-# NOINLINE tuple_v5 #-}
+tuple_v5 :: TV5 a
+tuple_v5 x = (# (# #), x, (# #) #)
+
+{-# NOINLINE tuple_v5_a #-}
+tuple_v5_a :: TV5 a -> a -> a
+tuple_v5_a f x = case f x of (# _, x, _ #) -> x
+
+
+type TV6 = Int -> Double -> Int -> Double
+ -> (# Int#, (# #), Double#, (# #)
+ , Int#, (# #), Double#, (# #) #)
+
+{-# NOINLINE tuple_v6 #-}
+tuple_v6 :: TV6
+tuple_v6 (I# x) (D# y) (I# z) (D# w) = (# x, (# #), y, (# #), z, (# #), w, (# #) #)
+
+{-# NOINLINE tuple_v6_a #-}
+tuple_v6_a :: TV6 -> Int -> Double -> Int -> Double
+ -> (Int, Double, Int, Double)
+tuple_v6_a f x y z w = case f x y z w of (# x', _, y', _, z', _, w', _ #) ->
+ (I# x', D# y', I# z', D# w')
+
+-- some levity polymorphic things
+{-# NOINLINE lev_poly #-}
+lev_poly :: forall r a (b :: TYPE r).
+ (a -> a -> a -> a ->
+ a -> a -> a -> a ->
+ a -> a -> a -> a -> b) -> a -> b
+lev_poly f x = f x x x x x x x x x x x x
+
+{-# NOINLINE lev_poly_a #-}
+lev_poly_a :: (t1
+ -> t2 -> (# a, b, c, d, e, f, g, h, i, j, k, l #))
+ -> t1 -> t2 -> (a, b, c, d, e, f, g, h, i, j, k, l)
+lev_poly_a lp t x =
+ case lp t x of (# x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12 #) ->
+ (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12)
+
+{-# NOINLINE lev_poly_boxed #-}
+lev_poly_boxed x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12
+ = (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12)
+
+{-# NOINLINE lev_poly_b #-}
+lev_poly_b lp t x =
+ case lp t x of (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12)
+ -> (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12)
+
+-- some unboxed sums
+type S1 = (# (# Int#, String #) | Bool #)
+
+{-# NOINLINE sum1 #-}
+sum1 :: Int -> Int -> String -> Bool -> S1
+sum1 0 (I# x) y _ = (# (# x, y #) | #)
+sum1 _ _ _ b = (# | b #)
+
+{-# NOINLINE sum1_a #-}
+sum1_a :: (Int -> Int -> String -> Bool -> S1) -> Int -> Int -> String -> Bool -> Either (Int, String) Bool
+sum1_a f n x y b =
+ case f n x y b of
+ (# (# x, y #) | #) -> Left (I# x, y)
+ (# | b #) -> Right b
+
+
+type S2 a = (# (# a, a, a, a #) | (# a, a #) | (# #) | Int# | Int #)
+
+{-# NOINLINE sum2 #-}
+sum2 :: Int -> a -> S2 a
+sum2 0 x = (# (# x, x, x, x #) | | | | #)
+sum2 1 x = (# | (# x, x #) | | | #)
+sum2 2 _ = (# | | (# #) | | #)
+sum2 n@(I# n#) _
+ | even n = (# | | | n# | #)
+ | otherwise = (# | | | | n #)
+
+{-# NOINLINE sum2_a #-}
+sum2_a :: Show a => (Int -> a -> S2 a) -> Int -> a -> String
+sum2_a f n x =
+ case f n x of
+ (# (# x1, x2, x3, x4 #) | | | | #) -> show (x1, x2, x3, x4)
+ (# | (# x1, x2 #) | | | #) -> show (x1, x2)
+ (# | | (# #) | | #) -> "(# #)"
+ (# | | | x# | #) -> show (I# x#) ++ "#"
+ (# | | | | x #) -> show x
diff --git a/testsuite/tests/ghci/should_run/UnboxedTuples/Obj.hs b/testsuite/tests/ghci/should_run/UnboxedTuples/Obj.hs
new file mode 100644
index 0000000000..190b8f1683
--- /dev/null
+++ b/testsuite/tests/ghci/should_run/UnboxedTuples/Obj.hs
@@ -0,0 +1,17 @@
+{-# LANGUAGE CPP, UnboxedTuples, MagicHash, ScopedTypeVariables, PolyKinds #-}
+{-# OPTIONS_GHC -fobject-code #-}
+
+#include "MachDeps.h"
+
+#if WORD_SIZE_IN_BITS < 64
+#define WW Word64
+#else
+#define WW Word
+#endif
+
+module Obj where
+
+import GHC.Exts
+import GHC.Word
+
+#include "Common.hs-incl"
diff --git a/testsuite/tests/ghci/should_run/UnboxedTuples/UnboxedTuples.hs b/testsuite/tests/ghci/should_run/UnboxedTuples/UnboxedTuples.hs
new file mode 100644
index 0000000000..1daec7f207
--- /dev/null
+++ b/testsuite/tests/ghci/should_run/UnboxedTuples/UnboxedTuples.hs
@@ -0,0 +1,182 @@
+{-# LANGUAGE UnboxedTuples, MagicHash #-}
+{-# OPTIONS_GHC -fbyte-code #-}
+
+{-
+ Test unboxed tuples and sums in the bytecode interpreter.
+
+ The bytecode interpreter uses the stack for everything, while
+ compiled code uses STG registers for arguments and return values.
+ -}
+
+module Main where
+
+import qualified Obj as O
+import qualified ByteCode as B
+
+import GHC.Exts
+import GHC.Word
+
+main :: IO ()
+main = do
+
+ case B.swap (O.swap (B.swap (O.swap (# "x", 1 #)))) of
+ (# y1, y2 #) -> print (y1, y2)
+
+ -- one-tuples
+ testX "tuple1"
+ B.tuple1_a O.tuple1_a
+ B.tuple1 O.tuple1
+ (\f -> f 90053)
+
+ -- check that the contents of a one-tuple aren't evaluated
+ B.tuple1_b B.tuple1 (error "error tuple1_b") "tuple1_b"
+ B.tuple1_b O.tuple1 (error "error tuple1_b") "tuple1_b"
+ O.tuple1_b B.tuple1 (error "error tuple1_b") "tuple1_b"
+ O.tuple1_b O.tuple1 (error "error tuple1_b") "tuple1_b"
+
+ -- various size tuples with boxed/unboxed elements
+ testX "tuple2p"
+ B.tuple2p_a O.tuple2p_a
+ B.tuple2p O.tuple2p
+ (\f -> f (1234::Integer) 1235 1236 1237)
+
+ testX "tuple2n"
+ B.tuple2n_a O.tuple2n_a
+ B.tuple2n O.tuple2n
+ (\f -> f 7654 7653 7652 7651)
+
+ testX "tuple3"
+ B.tuple3_a O.tuple3_a
+ B.tuple3 O.tuple3
+ (\f -> f (1000::Integer) 1001 1002 1003
+ 1004 1005 1006 1007
+ 1008 1009 1010 1011)
+
+ testX "tuple4a"
+ B.tuple4a_a O.tuple4a_a
+ B.tuple4a O.tuple4a
+ (\f -> f 2000 2001 2002 2003)
+
+ testX "tuple4b"
+ B.tuple4b_a O.tuple4b_a
+ B.tuple4b O.tuple4b
+ (\f -> f 3000 3001 3002 3003
+ 3004 3005 3006 3007
+ 3008 3009 3010 3011
+ 3012 3013 3014 3015
+ 3016 3017 3018 3019)
+
+ testX "tuple4c"
+ B.tuple4c_a O.tuple4c_a
+ B.tuple4c O.tuple4c
+ (\f -> f 3000 3001 3002 3003
+ 3004 3005 3006 3007
+ 3008 3009 3010 3011
+ 3012 3013 3014 3015)
+
+ testX "tuple5"
+ B.tuple5_a O.tuple5_a
+ B.tuple5 O.tuple5
+ (\f -> f 4000 4001 4002 4003
+ 4004 4005 4006 4007
+ 4008 4009 4010 4011
+ 4012 4013 4014 4015)
+
+ testX "tuple6"
+ B.tuple6_a O.tuple6_a
+ B.tuple6 O.tuple6
+ (\f -> f 6006)
+
+ -- tuples with void and empty tuples
+ testX "tuplev1"
+ B.tuple_v1_a O.tuple_v1_a
+ B.tuple_v1 O.tuple_v1
+ (\f -> f False)
+
+ testX "tuplev2"
+ B.tuple_v2_a O.tuple_v2_a
+ B.tuple_v2 O.tuple_v2
+ (\f -> f False)
+
+ testX "tuplev3"
+ B.tuple_v3_a O.tuple_v3_a
+ B.tuple_v3 O.tuple_v3
+ (\f -> f 30001)
+
+ testX "tuplev4"
+ B.tuple_v4_a O.tuple_v4_a
+ B.tuple_v4 O.tuple_v4
+ (\f -> f 40001)
+
+ testX "tuplev5"
+ B.tuple_v5_a O.tuple_v5_a
+ B.tuple_v5 O.tuple_v5
+ (\f -> f 50001)
+
+ testX "tuplev6"
+ B.tuple_v6_a O.tuple_v6_a
+ B.tuple_v6 O.tuple_v6
+ (\f -> f 601 602 603 604)
+
+ -- levity polymorphic
+ print $ B.lev_poly_a B.lev_poly B.tuple3 991
+ print $ B.lev_poly_a B.lev_poly O.tuple3 992
+ print $ B.lev_poly_a O.lev_poly B.tuple3 993
+ print $ B.lev_poly_a O.lev_poly O.tuple3 994
+ print $ O.lev_poly_a B.lev_poly B.tuple3 995
+ print $ O.lev_poly_a B.lev_poly O.tuple3 996
+ print $ O.lev_poly_a O.lev_poly B.tuple3 997
+ print $ O.lev_poly_a O.lev_poly O.tuple3 998
+
+ print $ B.lev_poly_b B.lev_poly B.lev_poly_boxed 981
+ print $ B.lev_poly_b B.lev_poly O.lev_poly_boxed 982
+ print $ B.lev_poly_b O.lev_poly B.lev_poly_boxed 983
+ print $ B.lev_poly_b O.lev_poly O.lev_poly_boxed 984
+ print $ O.lev_poly_b B.lev_poly B.lev_poly_boxed 985
+ print $ O.lev_poly_b B.lev_poly O.lev_poly_boxed 986
+ print $ O.lev_poly_b O.lev_poly B.lev_poly_boxed 987
+ print $ O.lev_poly_b O.lev_poly O.lev_poly_boxed 988
+
+ -- sums
+ testX "sum1a"
+ B.sum1_a O.sum1_a
+ B.sum1 O.sum1
+ (\f -> f 0 1 "23" True)
+
+ testX "sum1b"
+ B.sum1_a O.sum1_a
+ B.sum1 O.sum1
+ (\f -> f 1 1 "23" True)
+
+ testX "sum2a"
+ B.sum2_a O.sum2_a
+ B.sum2 O.sum2
+ (\f -> f 0 "sum2")
+
+ testX "sum2b"
+ B.sum2_a O.sum2_a
+ B.sum2 O.sum2
+ (\f -> f 1 "sum2")
+
+ testX "sum2c"
+ B.sum2_a O.sum2_a
+ B.sum2 O.sum2
+ (\f -> f 2 "sum2")
+
+ testX "sum2d"
+ B.sum2_a O.sum2_a
+ B.sum2 O.sum2
+ (\f -> f 3 "sum2")
+
+ testX "sum2e"
+ B.sum2_a O.sum2_a
+ B.sum2 O.sum2
+ (\f -> f 4 "sum2")
+
+
+
+testX :: (Eq a, Show a)
+ => String -> (p -> t) -> (p -> t) -> p -> p -> (t -> a) -> IO ()
+testX msg a1 a2 b1 b2 ap =
+ let (r:rs) = [ap (f g) | f <- [a1,a2], g <- [b1,b2]]
+ in putStrLn (msg ++ " " ++ (show $ all (==r) rs) ++ " " ++ show r)
diff --git a/testsuite/tests/ghci/should_run/UnboxedTuples/UnboxedTuples.stdout b/testsuite/tests/ghci/should_run/UnboxedTuples/UnboxedTuples.stdout
new file mode 100644
index 0000000000..82619b86fc
--- /dev/null
+++ b/testsuite/tests/ghci/should_run/UnboxedTuples/UnboxedTuples.stdout
@@ -0,0 +1,43 @@
+("x",1)
+tuple1 True 90053
+tuple1_b
+tuple1_b
+tuple1_b
+tuple1_b
+tuple2p True (1234,1235,1236,1237)
+tuple2n True (7654,7653,7652,7651)
+tuple3 True (1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011)
+tuple4a True (2000.0,2001.0,2002.0,2003.0)
+tuple4b True ((3000.0,3001.0,3002.0,3003.0),(3004.0,3005.0,3006.0,3007.0),(3008.0,3009.0,3010.0,3011.0),(3012.0,3013.0,3014.0,3015.0),(3016.0,3017.0,3018.0,3019.0))
+tuple4c True ((3000.0,3001.0,3002,3003),(3004.0,3005.0,3006,3007),(3008.0,3009.0,3010,3011),(3012.0,3013.0,3014,3015))
+tuple5 True ((4000,4001,4002,4003),(4004,4005,4006,4007),(4008,4009,4010,4011),(4012,4013,4014,4015))
+tuple6 True "(6006,(6006,(6006,())))"
+tuplev1 True True
+tuplev2 True True
+tuplev3 True 30001
+tuplev4 True 40001
+tuplev5 True 50001
+tuplev6 True (601,602.0,603,604.0)
+(991,991,991,991,991,991,991,991,991,991,991,991)
+(992,992,992,992,992,992,992,992,992,992,992,992)
+(993,993,993,993,993,993,993,993,993,993,993,993)
+(994,994,994,994,994,994,994,994,994,994,994,994)
+(995,995,995,995,995,995,995,995,995,995,995,995)
+(996,996,996,996,996,996,996,996,996,996,996,996)
+(997,997,997,997,997,997,997,997,997,997,997,997)
+(998,998,998,998,998,998,998,998,998,998,998,998)
+(981,981,981,981,981,981,981,981,981,981,981,981)
+(982,982,982,982,982,982,982,982,982,982,982,982)
+(983,983,983,983,983,983,983,983,983,983,983,983)
+(984,984,984,984,984,984,984,984,984,984,984,984)
+(985,985,985,985,985,985,985,985,985,985,985,985)
+(986,986,986,986,986,986,986,986,986,986,986,986)
+(987,987,987,987,987,987,987,987,987,987,987,987)
+(988,988,988,988,988,988,988,988,988,988,988,988)
+sum1a True Left (1,"23")
+sum1b True Right True
+sum2a True "(\"sum2\",\"sum2\",\"sum2\",\"sum2\")"
+sum2b True "(\"sum2\",\"sum2\")"
+sum2c True "(# #)"
+sum2d True "3"
+sum2e True "4#"
diff --git a/testsuite/tests/ghci/should_run/UnboxedTuples/unboxedtuples.T b/testsuite/tests/ghci/should_run/UnboxedTuples/unboxedtuples.T
new file mode 100644
index 0000000000..4166c82f7f
--- /dev/null
+++ b/testsuite/tests/ghci/should_run/UnboxedTuples/unboxedtuples.T
@@ -0,0 +1,10 @@
+test('UnboxedTuples',
+ [ extra_files(['Obj.hs', 'ByteCode.hs', 'Common.hs-incl']),
+ req_interp,
+ extra_ways(['ghci']),
+ when(config.have_ext_interp, extra_ways(['ghci', 'ghci-ext'])),
+ when(config.have_ext_interp and config.have_profiling, extra_ways(['ghci', 'ghci-ext', 'ghci-ext-prof']))
+ ],
+ compile_and_run,
+ ['']
+ )