From d85ed900b271109185251cb0494d51048a4cf213 Mon Sep 17 00:00:00 2001
From: Alexis King <lexi.lambda@gmail.com>
Date: Fri, 12 May 2023 15:08:18 -0500
Subject: Use a uniform return convention in bytecode for unary results

fixes #22958
---
 compiler/GHC/ByteCode/Asm.hs     |  28 +++++------
 compiler/GHC/ByteCode/Instr.hs   |  23 ++++-----
 compiler/GHC/StgToByteCode.hs    | 102 +++++++++++++++++++--------------------
 compiler/GHC/StgToCmm/Closure.hs |   3 +-
 4 files changed, 71 insertions(+), 85 deletions(-)

(limited to 'compiler')

diff --git a/compiler/GHC/ByteCode/Asm.hs b/compiler/GHC/ByteCode/Asm.hs
index f020f0af0a..c0359cacb9 100644
--- a/compiler/GHC/ByteCode/Asm.hs
+++ b/compiler/GHC/ByteCode/Asm.hs
@@ -395,10 +395,7 @@ assembleI platform i = case i of
   PUSH_BCO proto           -> do let ul_bco = assembleBCO platform proto
                                  p <- ioptr (liftM BCOPtrBCO ul_bco)
                                  emit bci_PUSH_G [Op p]
-  PUSH_ALTS proto          -> do let ul_bco = assembleBCO platform proto
-                                 p <- ioptr (liftM BCOPtrBCO ul_bco)
-                                 emit bci_PUSH_ALTS [Op p]
-  PUSH_ALTS_UNLIFTED proto pk
+  PUSH_ALTS proto pk
                            -> do let ul_bco = assembleBCO platform proto
                                  p <- ioptr (liftM BCOPtrBCO ul_bco)
                                  emit (push_alts pk) [Op p]
@@ -504,8 +501,7 @@ assembleI platform i = case i of
   SWIZZLE   stkoff n       -> emit bci_SWIZZLE [SmallOp stkoff, SmallOp n]
   JMP       l              -> emit bci_JMP [LabelOp l]
   ENTER                    -> emit bci_ENTER []
-  RETURN                   -> emit bci_RETURN []
-  RETURN_UNLIFTED rep      -> emit (return_unlifted rep) []
+  RETURN rep               -> emit (return_non_tuple rep) []
   RETURN_TUPLE             -> emit bci_RETURN_T []
   CCALL off m_addr i       -> do np <- addr m_addr
                                  emit bci_CCALL [SmallOp off, Op np, SmallOp i]
@@ -574,16 +570,16 @@ push_alts V16 = error "push_alts: vector"
 push_alts V32 = error "push_alts: vector"
 push_alts V64 = error "push_alts: vector"
 
-return_unlifted :: ArgRep -> Word16
-return_unlifted V   = bci_RETURN_V
-return_unlifted P   = bci_RETURN_P
-return_unlifted N   = bci_RETURN_N
-return_unlifted L   = bci_RETURN_L
-return_unlifted F   = bci_RETURN_F
-return_unlifted D   = bci_RETURN_D
-return_unlifted V16 = error "return_unlifted: vector"
-return_unlifted V32 = error "return_unlifted: vector"
-return_unlifted V64 = error "return_unlifted: vector"
+return_non_tuple :: ArgRep -> Word16
+return_non_tuple V   = bci_RETURN_V
+return_non_tuple P   = bci_RETURN_P
+return_non_tuple N   = bci_RETURN_N
+return_non_tuple L   = bci_RETURN_L
+return_non_tuple F   = bci_RETURN_F
+return_non_tuple D   = bci_RETURN_D
+return_non_tuple V16 = error "return_non_tuple: vector"
+return_non_tuple V32 = error "return_non_tuple: vector"
+return_non_tuple V64 = error "return_non_tuple: vector"
 
 {-
   we can only handle up to a fixed number of words on the stack,
diff --git a/compiler/GHC/ByteCode/Instr.hs b/compiler/GHC/ByteCode/Instr.hs
index a35c4d1fd9..e8d377ed69 100644
--- a/compiler/GHC/ByteCode/Instr.hs
+++ b/compiler/GHC/ByteCode/Instr.hs
@@ -88,8 +88,7 @@ data BCInstr
    | PUSH_BCO     (ProtoBCO Name)
 
    -- Push an alt continuation
-   | PUSH_ALTS          (ProtoBCO Name)
-   | PUSH_ALTS_UNLIFTED (ProtoBCO Name) ArgRep
+   | PUSH_ALTS          (ProtoBCO Name) ArgRep
    | PUSH_ALTS_TUPLE    (ProtoBCO Name) -- continuation
                         !NativeCallInfo
                         (ProtoBCO Name) -- tuple return BCO
@@ -197,9 +196,10 @@ data BCInstr
 
    -- To Infinity And Beyond
    | ENTER
-   | RETURN                 -- return a lifted value
-   | RETURN_UNLIFTED ArgRep -- return an unlifted value, here's its rep
-   | RETURN_TUPLE           -- return an unboxed tuple (info already on stack)
+   | RETURN ArgRep -- return a non-tuple value, here's its rep; see
+                   -- Note [Return convention for non-tuple values] in GHC.StgToByteCode
+   | RETURN_TUPLE  -- return an unboxed tuple (info already on stack); see
+                   -- Note [unboxed tuple bytecodes and tuple_BCO] in GHC.StgToByteCode
 
    -- Breakpoints
    | BRK_FUN          Word16 Unique (RemotePtr CostCentre)
@@ -274,8 +274,7 @@ instance Outputable BCInstr where
                                                <> ppr op
    ppr (PUSH_BCO bco)        = hang (text "PUSH_BCO") 2 (ppr bco)
 
-   ppr (PUSH_ALTS bco)       = hang (text "PUSH_ALTS") 2 (ppr bco)
-   ppr (PUSH_ALTS_UNLIFTED bco pk) = hang (text "PUSH_ALTS_UNLIFTED" <+> ppr pk) 2 (ppr bco)
+   ppr (PUSH_ALTS bco pk)    = hang (text "PUSH_ALTS" <+> ppr pk) 2 (ppr bco)
    ppr (PUSH_ALTS_TUPLE bco call_info tuple_bco) =
                                hang (text "PUSH_ALTS_TUPLE" <+> ppr call_info)
                                     2
@@ -352,8 +351,7 @@ instance Outputable BCInstr where
    ppr (SWIZZLE stkoff n)    = text "SWIZZLE " <+> text "stkoff" <+> ppr stkoff
                                                <+> text "by" <+> ppr n
    ppr ENTER                 = text "ENTER"
-   ppr RETURN                = text "RETURN"
-   ppr (RETURN_UNLIFTED pk)  = text "RETURN_UNLIFTED  " <+> ppr pk
+   ppr (RETURN pk)           = text "RETURN  " <+> ppr pk
    ppr (RETURN_TUPLE)        = text "RETURN_TUPLE"
    ppr (BRK_FUN index uniq _cc) = text "BRK_FUN" <+> ppr index <+> mb_uniq <+> text "<cc>"
      where mb_uniq = sdocOption sdocSuppressUniques $ \case
@@ -389,10 +387,8 @@ bciStackUse PUSH32_W{}            = 1  -- takes exactly 1 word
 bciStackUse PUSH_G{}              = 1
 bciStackUse PUSH_PRIMOP{}         = 1
 bciStackUse PUSH_BCO{}            = 1
-bciStackUse (PUSH_ALTS bco)       = 2 {- profiling only, restore CCCS -} +
+bciStackUse (PUSH_ALTS bco _)     = 2 {- profiling only, restore CCCS -} +
                                     3 + protoBCOStackUse bco
-bciStackUse (PUSH_ALTS_UNLIFTED bco _) = 2 {- profiling only, restore CCCS -} +
-                                         4 + protoBCOStackUse bco
 bciStackUse (PUSH_ALTS_TUPLE bco info _) =
    -- (tuple_bco, call_info word, cont_bco, stg_ctoi_t)
    -- tuple
@@ -452,8 +448,7 @@ bciStackUse TESTEQ_P{}            = 0
 bciStackUse CASEFAIL{}            = 0
 bciStackUse JMP{}                 = 0
 bciStackUse ENTER{}               = 0
-bciStackUse RETURN{}              = 0
-bciStackUse RETURN_UNLIFTED{}     = 1 -- pushes stg_ret_X for some X
+bciStackUse RETURN{}              = 1 -- pushes stg_ret_X for some X
 bciStackUse RETURN_TUPLE{}        = 1 -- pushes stg_ret_t header
 bciStackUse CCALL{}               = 0
 bciStackUse PRIMCALL{}            = 1 -- pushes stg_primcall
diff --git a/compiler/GHC/StgToByteCode.hs b/compiler/GHC/StgToByteCode.hs
index a6eebe9bc9..33a16903c3 100644
--- a/compiler/GHC/StgToByteCode.hs
+++ b/compiler/GHC/StgToByteCode.hs
@@ -319,7 +319,7 @@ schemeTopBind (id, rhs)
         -- by just re-using the single top-level definition.  So
         -- for the worker itself, we must allocate it directly.
     -- ioToBc (putStrLn $ "top level BCO")
-    emitBc (mkProtoBCO platform (getName id) (toOL [PACK data_con 0, RETURN])
+    emitBc (mkProtoBCO platform (getName id) (toOL [PACK data_con 0, RETURN P])
                        (Right rhs) 0 0 [{-no bitmap-}] False{-not alts-})
 
   | otherwise
@@ -480,9 +480,9 @@ returnUnliftedReps d s szb reps = do
         non_void VoidRep = False
         non_void _ = True
     ret <- case filter non_void reps of
-             -- use RETURN_UBX for unary representations
-             []    -> return (unitOL $ RETURN_UNLIFTED V)
-             [rep] -> return (unitOL $ RETURN_UNLIFTED (toArgRep platform rep))
+             -- use RETURN for nullary/unary representations
+             []    -> return (unitOL $ RETURN V)
+             [rep] -> return (unitOL $ RETURN (toArgRep platform rep))
              -- otherwise use RETURN_TUPLE with a tuple descriptor
              nv_reps -> do
                let (call_info, args_offsets) = layoutNativeCall profile NativeTupleReturn 0 (primRepCmmType platform) nv_reps
@@ -526,7 +526,7 @@ schemeE
     :: StackDepth -> Sequel -> BCEnv -> CgStgExpr -> BcM BCInstrList
 schemeE d s p (StgLit lit) = returnUnliftedAtom d s p (StgLitArg lit)
 schemeE d s p (StgApp x [])
-   | not (usePlainReturn (idType x)) = returnUnliftedAtom d s p (StgVarArg x)
+   | isUnliftedType (idType x) = returnUnliftedAtom d s p (StgVarArg x)
 -- Delegate tail-calls to schemeT.
 schemeE d s p e@(StgApp {}) = schemeT d s p e
 schemeE d s p e@(StgConApp {}) = schemeT d s p e
@@ -681,8 +681,8 @@ schemeT d s p (StgOpApp (StgPrimOp op) args _ty)
 schemeT d s p (StgOpApp (StgPrimCallOp (PrimCall label unit)) args result_ty)
    = generatePrimCall d s p label (Just unit) result_ty args
 
-   -- Case 2: Unboxed tuple
 schemeT d s p (StgConApp con _cn args _tys)
+   -- Case 2: Unboxed tuple
    | isUnboxedTupleDataCon con || isUnboxedSumDataCon con
    = returnUnboxedTuple d s p args
 
@@ -691,7 +691,7 @@ schemeT d s p (StgConApp con _cn args _tys)
    = do alloc_con <- mkConAppCode d s p con args
         platform <- profilePlatform <$> getProfile
         return (alloc_con         `appOL`
-                mkSlideW 1 (bytesToWords platform $ d - s) `snocOL` RETURN)
+                mkSlideW 1 (bytesToWords platform $ d - s) `snocOL` RETURN P)
 
    -- Case 4: Tail call of function
 schemeT d s p (StgApp fn args)
@@ -831,14 +831,11 @@ doCase d s p scrut bndr alts
         -- have the same runtime rep. We have more efficient specialized
         -- return frames for the situations with one non-void element.
 
+        non_void_arg_reps = non_void (typeArgReps platform bndr_ty)
         ubx_tuple_frame =
           (isUnboxedTupleType bndr_ty || isUnboxedSumType bndr_ty) &&
           length non_void_arg_reps > 1
 
-        ubx_frame = not ubx_tuple_frame && not (usePlainReturn bndr_ty)
-
-        non_void_arg_reps = non_void (typeArgReps platform bndr_ty)
-
         profiling
           | Just interp <- hsc_interp hsc_env
           = interpreterProfiled interp
@@ -847,7 +844,8 @@ doCase d s p scrut bndr alts
         -- Top of stack is the return itbl, as usual.
         -- underneath it is the pointer to the alt_code BCO.
         -- When an alt is entered, it assumes the returned value is
-        -- on top of the itbl.
+        -- on top of the itbl; see Note [Return convention for non-tuple values]
+        -- for details.
         ret_frame_size_b :: StackDepth
         ret_frame_size_b | ubx_tuple_frame =
                              (if profiling then 5 else 4) * wordSize platform
@@ -861,7 +859,6 @@ doCase d s p scrut bndr alts
         -- The size of the return frame info table pointer if one exists
         unlifted_itbl_size_b :: StackDepth
         unlifted_itbl_size_b | ubx_tuple_frame = wordSize platform
-                             | ubx_frame       = wordSize platform
                              | otherwise       = 0
 
         (bndr_size, call_info, args_offsets)
@@ -1052,17 +1049,11 @@ doCase d s p scrut bndr alts
        then do tuple_bco <- emitBc (tupleBCO platform call_info args_offsets)
                return (PUSH_ALTS_TUPLE alt_bco' call_info tuple_bco
                        `consOL` scrut_code)
-       else let push_alts
-                  | not ubx_frame
-                  = PUSH_ALTS alt_bco'
-                  | otherwise
-                  = let unlifted_rep =
-                          case non_void_arg_reps of
-                            []    -> V
-                            [rep] -> rep
-                            _     -> panic "schemeE(StgCase).push_alts"
-                    in PUSH_ALTS_UNLIFTED alt_bco' unlifted_rep
-            in return (push_alts `consOL` scrut_code)
+       else let scrut_rep = case non_void_arg_reps of
+                  []    -> V
+                  [rep] -> rep
+                  _     -> panic "schemeE(StgCase).push_alts"
+            in return (PUSH_ALTS alt_bco' scrut_rep `consOL` scrut_code)
 
 
 -- -----------------------------------------------------------------------------
@@ -1130,21 +1121,38 @@ layoutNativeCall profile call_type start_off arg_ty reps =
                   (orig_stk_params ++ map get_byte_off new_stk_params)
      )
 
-{-
-  We use the plain return convention (ENTER/PUSH_ALTS) for
-  lifted types and unlifted algebraic types.
-
-  Other types use PUSH_ALTS_UNLIFTED/PUSH_ALTS_TUPLE which expect
-  additional data on the stack.
- -}
-usePlainReturn :: Type -> Bool
-usePlainReturn t
-  | isUnboxedTupleType t || isUnboxedSumType t = False
-  | otherwise = typePrimRep t == [LiftedRep] ||
-                (typePrimRep t == [UnliftedRep] && isAlgType t)
-
-{- Note [unboxed tuple bytecodes and tuple_BCO]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+{- Note [Return convention for non-tuple values]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The RETURN and ENTER instructions are used to return values. RETURN directly
+returns the value at the top of the stack while ENTER evaluates it first (so
+RETURN is only used when the result is already known to be evaluated), but the
+end result is the same: control returns to the enclosing stack frame with the
+result at the top of the stack.
+
+The PUSH_ALTS instruction pushes a two-word stack frame that receives a single
+lifted value. Its payload is a BCO that is executed when control returns, with
+the stack set up as if a RETURN instruction had just been executed: the returned
+value is at the top of the stack, and beneath it is the two-word frame being
+returned to. It is the continuation BCO’s job to pop its own frame off the
+stack, so the simplest possible continuation consists of two instructions:
+
+    SLIDE 1 2   -- pop the return frame off the stack, keeping the returned value
+    RETURN P    -- return the returned value to our caller
+
+RETURN and PUSH_ALTS are not really instructions but are in fact representation-
+polymorphic *families* of instructions indexed by ArgRep. ENTER, however, is a
+single real instruction, since it is only used to return lifted values, which
+are always pointers.
+
+The RETURN, ENTER, and PUSH_ALTS instructions are only used when the returned
+value has nullary or unary representation. Returning/receiving an unboxed
+tuple (or, indirectly, an unboxed sum, since unboxed sums have been desugared to
+unboxed tuples by Unarise) containing two or more results uses the special
+RETURN_TUPLE/PUSH_ALTS_TUPLE instructions, which use a different return
+convention. See Note [unboxed tuple bytecodes and tuple_BCO] for details.
+
+Note [unboxed tuple bytecodes and tuple_BCO]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   We have the bytecode instructions RETURN_TUPLE and PUSH_ALTS_TUPLE to
   return and receive arbitrary unboxed tuples, respectively. These
   instructions use the helper data tuple_BCO and call_info.
@@ -1580,7 +1588,7 @@ generateCCall d0 s p (CCallSpec target cconv safety) result_ty args
          -- slide and return
          d_after_r_min_s = bytesToWords platform (d_after_r - s)
          wrapup       = mkSlideW (trunc16W r_sizeW) (d_after_r_min_s - r_sizeW)
-                        `snocOL` RETURN_UNLIFTED (toArgRep platform r_rep)
+                        `snocOL` RETURN (toArgRep platform r_rep)
          --trace (show (arg1_offW, args_offW  ,  (map argRepSizeW a_reps) )) $
      return (
          push_args `appOL`
@@ -1694,7 +1702,6 @@ as a consequence.  The [Name] is a list of the constructors of this
 
 The code we generate is this:
                 push arg
-                push bogus-word
 
                 TESTEQ_I 0 L1
                   PUSH_G <lbl for first data con>
@@ -1712,13 +1719,6 @@ The code we generate is this:
 
         L_exit: SLIDE 1 n
                 ENTER
-
-The 'bogus-word' push is because TESTEQ_I expects the top of the stack
-to have an info-table, and the next word to have the value to be
-tested.  This is very weird, but it's the way it is right now.  See
-Interpreter.c.  We don't actually need an info-table here; we just
-need to have the argument to be one-from-top on the stack, hence pushing
-a 1-word null. See #8383.
 -}
 
 
@@ -1744,14 +1744,10 @@ implement_tagToId d s p arg names
            slide_ws = bytesToWords platform (d - s + arg_bytes)
 
        return (push_arg
-               `appOL` unitOL (PUSH_UBX LitNullAddr 1)
-                   -- Push bogus word (see Note [Implementing tagToEnum#])
                `appOL` concatOL steps
                `appOL` toOL [ LABEL label_fail, CASEFAIL,
                               LABEL label_exit ]
-               `appOL` mkSlideW 1 (slide_ws + 1)
-                   -- "+1" to account for bogus word
-                   --      (see Note [Implementing tagToEnum#])
+               `appOL` mkSlideW 1 slide_ws
                `appOL` unitOL ENTER)
   where
         mkStep l_exit (my_label, next_label, n, name_for_n)
diff --git a/compiler/GHC/StgToCmm/Closure.hs b/compiler/GHC/StgToCmm/Closure.hs
index fc76664d94..c9e4967be0 100644
--- a/compiler/GHC/StgToCmm/Closure.hs
+++ b/compiler/GHC/StgToCmm/Closure.hs
@@ -308,8 +308,7 @@ type DynTag = Int       -- The tag on a *pointer*
 -- Also see Note [Tagging big families] in GHC.StgToCmm.Expr
 --
 -- The interpreter also needs to be updated if we change the
--- tagging strategy. See Note [Data constructor dynamic tags] in
--- rts/Interpreter.c
+-- tagging strategy; see tagConstr in rts/Interpreter.c.
 
 isSmallFamily :: Platform -> Int -> Bool
 isSmallFamily platform fam_size = fam_size <= mAX_PTR_TAG platform
-- 
cgit v1.2.1