From 351de169e14ad9277aaca653df4a3753c151f7bb Mon Sep 17 00:00:00 2001
From: Ben Gamari <bgamari.foss@gmail.com>
Date: Thu, 12 Nov 2015 14:52:11 +0100
Subject: New magic function for applying realWorld#

Test Plan: validate

Reviewers: goldfire, erikd, rwbarton, simonpj, austin, simonmar, hvr

Reviewed By: simonpj

Subscribers: simonmar, thomie

Differential Revision: https://phabricator.haskell.org/D1103

GHC Trac Issues: #10678
---
 compiler/basicTypes/MkId.hs                   | 59 +++++++++++++++++++++++++--
 compiler/coreSyn/CorePrep.hs                  | 13 +++++-
 compiler/prelude/PrelNames.hs                 |  3 +-
 includes/stg/MiscClosures.h                   |  2 +
 libraries/base/GHC/IO.hs                      |  5 +--
 libraries/base/GHC/ST.hs                      | 58 ++------------------------
 libraries/ghc-prim/GHC/Magic.hs               | 19 ++++++++-
 libraries/ghc-prim/changelog.md               |  1 +
 libraries/integer-gmp/src/GHC/Integer/Type.hs |  3 +-
 testsuite/tests/perf/should_run/all.T         |  3 +-
 testsuite/tests/primops/should_run/T10678.hs  | 22 ++++++++++
 testsuite/tests/primops/should_run/all.T      |  9 ++++
 12 files changed, 129 insertions(+), 68 deletions(-)
 create mode 100644 testsuite/tests/primops/should_run/T10678.hs

diff --git a/compiler/basicTypes/MkId.hs b/compiler/basicTypes/MkId.hs
index c2a3678f02..989d79707e 100644
--- a/compiler/basicTypes/MkId.hs
+++ b/compiler/basicTypes/MkId.hs
@@ -30,7 +30,7 @@ module MkId (
         wiredInIds, ghcPrimIds,
         unsafeCoerceName, unsafeCoerceId, realWorldPrimId,
         voidPrimId, voidArgId,
-        nullAddrId, seqId, lazyId, lazyIdKey,
+        nullAddrId, seqId, lazyId, lazyIdKey, runRWId,
         coercionTokenId, magicDictId, coerceId,
         proxyHashId,
 
@@ -120,7 +120,7 @@ is right here.
 
 wiredInIds :: [Id]
 wiredInIds
-  =  [lazyId, dollarId, oneShotId]
+  =  [lazyId, dollarId, oneShotId, runRWId]
   ++ errorIds           -- Defined in MkCore
   ++ ghcPrimIds
 
@@ -1057,7 +1057,8 @@ another gun with which to shoot yourself in the foot.
 
 lazyIdName, unsafeCoerceName, nullAddrName, seqName,
    realWorldName, voidPrimIdName, coercionTokenName,
-   magicDictName, coerceName, proxyName, dollarName, oneShotName :: Name
+   magicDictName, coerceName, proxyName, dollarName, oneShotName,
+   runRWName :: Name
 unsafeCoerceName  = mkWiredInIdName gHC_PRIM  (fsLit "unsafeCoerce#")  unsafeCoerceIdKey  unsafeCoerceId
 nullAddrName      = mkWiredInIdName gHC_PRIM  (fsLit "nullAddr#")      nullAddrIdKey      nullAddrId
 seqName           = mkWiredInIdName gHC_PRIM  (fsLit "seq")            seqIdKey           seqId
@@ -1070,6 +1071,7 @@ coerceName        = mkWiredInIdName gHC_PRIM  (fsLit "coerce")         coerceKey
 proxyName         = mkWiredInIdName gHC_PRIM  (fsLit "proxy#")         proxyHashKey       proxyHashId
 dollarName        = mkWiredInIdName gHC_BASE  (fsLit "$")              dollarIdKey        dollarId
 oneShotName       = mkWiredInIdName gHC_MAGIC (fsLit "oneShot")        oneShotKey         oneShotId
+runRWName         = mkWiredInIdName gHC_MAGIC (fsLit "runRW#")         runRWKey           runRWId
 
 dollarId :: Id  -- Note [dollarId magic]
 dollarId = pcMiscPrelId dollarName ty
@@ -1182,6 +1184,19 @@ oneShotId = pcMiscPrelId oneShotName ty info
     x' = setOneShotLambda x
     rhs = mkLams [openAlphaTyVar, openBetaTyVar, body, x'] $ Var body `App` Var x
 
+runRWId :: Id -- See Note [runRW magic] in this module
+runRWId = pcMiscPrelId runRWName ty info
+  where
+    info    = noCafIdInfo `setInlinePragInfo` neverInlinePragma
+    -- State# RealWorld
+    stateRW = mkTyConApp statePrimTyCon [realWorldTy]
+    -- (# State# RealWorld, o #)
+    ret_ty  = mkTyConApp unboxedPairTyCon [stateRW, openAlphaTy]
+    -- State# RealWorld -> (# State# RealWorld, o #)
+    arg_ty  = stateRW `mkFunTy` ret_ty
+    -- (State# RealWorld -> (# State# RealWorld, o #))
+    --   -> (# State# RealWorld, o #)
+    ty      = mkForAllTys [openAlphaTyVar] (arg_ty `mkFunTy` ret_ty)
 
 --------------------------------------------------------------------------------
 magicDictId :: Id  -- See Note [magicDictId magic]
@@ -1322,6 +1337,44 @@ See Trac #3259 for a real world example.
 lazyId is defined in GHC.Base, so we don't *have* to inline it.  If it
 appears un-applied, we'll end up just calling it.
 
+Note [runRW magic]
+~~~~~~~~~~~~~~~~~~
+Some definitions, for instance @runST@, must have careful control over float out
+of the bindings in their body. Consider this use of @runST@,
+
+    f x = runST ( \ s -> let (a, s')  = newArray# 100 [] s
+                             (_, s'') = fill_in_array_or_something a x s'
+                         in freezeArray# a s'' )
+
+If we inline @runST@, we'll get:
+
+    f x = let (a, s')  = newArray# 100 [] realWorld#{-NB-}
+              (_, s'') = fill_in_array_or_something a x s'
+          in freezeArray# a s''
+
+And now if we allow the @newArray#@ binding to float out to become a CAF,
+we end up with a result that is totally and utterly wrong:
+
+    f = let (a, s')  = newArray# 100 [] realWorld#{-NB-} -- YIKES!!!
+        in \ x ->
+            let (_, s'') = fill_in_array_or_something a x s'
+            in freezeArray# a s''
+
+All calls to @f@ will share a {\em single} array! Clearly this is nonsense and
+must be prevented.
+
+This is what @runRW#@ gives us: by being inlined extremely late in the
+optimization (right before lowering to STG, in CorePrep), we can ensure that
+no further floating will occur. This allows us to safely inline things like
+@runST@, which are otherwise needlessly expensive (see #10678 and #5916).
+
+While the definition of @GHC.Magic.runRW#@, we override its type in @MkId@
+to be open-kinded,
+
+    runRW# :: (o :: OpenKind) => (State# RealWorld -> (# State# RealWorld, o #))
+                              -> (# State# RealWorld, o #)
+
+
 Note [The oneShot function]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 In the context of making left-folds fuse somewhat okish (see ticket #7994
diff --git a/compiler/coreSyn/CorePrep.hs b/compiler/coreSyn/CorePrep.hs
index 23afcdfb04..e49ece43d2 100644
--- a/compiler/coreSyn/CorePrep.hs
+++ b/compiler/coreSyn/CorePrep.hs
@@ -18,6 +18,7 @@ import OccurAnal
 
 import HscTypes
 import PrelNames
+import MkId             ( realWorldPrimId )
 import CoreUtils
 import CoreArity
 import CoreFVs
@@ -511,10 +512,20 @@ cpeRhsE env (Lit (LitInteger i _))
 cpeRhsE _env expr@(Lit {}) = return (emptyFloats, expr)
 cpeRhsE env expr@(Var {})  = cpeApp env expr
 
-cpeRhsE env (Var f `App` _ `App` arg)
+cpeRhsE env (Var f `App` _{-type-} `App` arg)
   | f `hasKey` lazyIdKey          -- Replace (lazy a) by a
   = cpeRhsE env arg               -- See Note [lazyId magic] in MkId
 
+    -- See Note [runRW magic] in MkId
+  | f `hasKey` runRWKey           -- Replace (runRW# f) by (f realWorld#),
+  = case arg of                   -- beta reducing if possible
+      Lam s body -> cpeRhsE env (substExpr (text "runRW#") subst body)
+        where subst = extendIdSubst emptySubst s (Var realWorldPrimId)
+                      -- XXX I think we can use emptySubst here
+                      -- because realWorldPrimId is a global variable
+                      -- and so cannot be bound by a lambda in body
+      _          -> cpeRhsE env (arg `App` Var realWorldPrimId)
+
 cpeRhsE env expr@(App {}) = cpeApp env expr
 
 cpeRhsE env (Let bind expr)
diff --git a/compiler/prelude/PrelNames.hs b/compiler/prelude/PrelNames.hs
index 05a38ffec9..7229f76401 100644
--- a/compiler/prelude/PrelNames.hs
+++ b/compiler/prelude/PrelNames.hs
@@ -1834,11 +1834,12 @@ rootMainKey, runMainKey :: Unique
 rootMainKey                   = mkPreludeMiscIdUnique 101
 runMainKey                    = mkPreludeMiscIdUnique 102
 
-thenIOIdKey, lazyIdKey, assertErrorIdKey, oneShotKey :: Unique
+thenIOIdKey, lazyIdKey, assertErrorIdKey, oneShotKey, runRWKey :: Unique
 thenIOIdKey                   = mkPreludeMiscIdUnique 103
 lazyIdKey                     = mkPreludeMiscIdUnique 104
 assertErrorIdKey              = mkPreludeMiscIdUnique 105
 oneShotKey                    = mkPreludeMiscIdUnique 106
+runRWKey                      = mkPreludeMiscIdUnique 107
 
 breakpointIdKey, breakpointCondIdKey, breakpointAutoIdKey,
     breakpointJumpIdKey, breakpointCondJumpIdKey,
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index 6fd7181426..06d937a6c9 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -423,6 +423,8 @@ RTS_FUN_DECL(stg_addCFinalizzerToWeakzh);
 RTS_FUN_DECL(stg_finalizzeWeakzh);
 RTS_FUN_DECL(stg_deRefWeakzh);
 
+RTS_FUN_DECL(stg_runRWzh);
+
 RTS_FUN_DECL(stg_newBCOzh);
 RTS_FUN_DECL(stg_mkApUpd0zh);
 
diff --git a/libraries/base/GHC/IO.hs b/libraries/base/GHC/IO.hs
index 1e8c74e5c1..f38c88f009 100644
--- a/libraries/base/GHC/IO.hs
+++ b/libraries/base/GHC/IO.hs
@@ -176,11 +176,8 @@ like 'bracket' cannot be used safely within 'unsafeDupablePerformIO'.
 
 @since 4.4.0.0
 -}
-{-# NOINLINE unsafeDupablePerformIO #-}
-    -- See Note [unsafeDupablePerformIO is NOINLINE]
 unsafeDupablePerformIO  :: IO a -> a
-unsafeDupablePerformIO (IO m) = lazy (case m realWorld# of (# _, r #) -> r)
-     -- See Note [unsafeDupablePerformIO has a lazy RHS]
+unsafeDupablePerformIO (IO m) = case runRW# m of (# _, a #) -> a
 
 -- Note [unsafeDupablePerformIO is NOINLINE]
 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/libraries/base/GHC/ST.hs b/libraries/base/GHC/ST.hs
index 46c5196c9e..d84dd4d9d1 100644
--- a/libraries/base/GHC/ST.hs
+++ b/libraries/base/GHC/ST.hs
@@ -18,7 +18,7 @@
 
 module GHC.ST (
         ST(..), STret(..), STRep,
-        fixST, runST, runSTRep,
+        fixST, runST,
 
         -- * Unsafe functions
         liftST, unsafeInterleaveST
@@ -103,62 +103,10 @@ instance  Show (ST s a)  where
     showsPrec _ _  = showString "<<ST action>>"
     showList       = showList__ (showsPrec 0)
 
-{-
-Definition of runST
-~~~~~~~~~~~~~~~~~~~
-
-SLPJ 95/04: Why @runST@ must not have an unfolding; consider:
-\begin{verbatim}
-f x =
-  runST ( \ s -> let
-                    (a, s')  = newArray# 100 [] s
-                    (_, s'') = fill_in_array_or_something a x s'
-                  in
-                  freezeArray# a s'' )
-\end{verbatim}
-If we inline @runST@, we'll get:
-\begin{verbatim}
-f x = let
-        (a, s')  = newArray# 100 [] realWorld#{-NB-}
-        (_, s'') = fill_in_array_or_something a x s'
-      in
-      freezeArray# a s''
-\end{verbatim}
-And now the @newArray#@ binding can be floated to become a CAF, which
-is totally and utterly wrong:
-\begin{verbatim}
-f = let
-    (a, s')  = newArray# 100 [] realWorld#{-NB-} -- YIKES!!!
-    in
-    \ x ->
-        let (_, s'') = fill_in_array_or_something a x s' in
-        freezeArray# a s''
-\end{verbatim}
-All calls to @f@ will share a {\em single} array!  End SLPJ 95/04.
--}
-
 {-# INLINE runST #-}
--- The INLINE prevents runSTRep getting inlined in *this* module
--- so that it is still visible when runST is inlined in an importing
--- module.  Regrettably delicate.  runST is behaving like a wrapper.
-
 -- | Return the value computed by a state transformer computation.
 -- The @forall@ ensures that the internal state used by the 'ST'
 -- computation is inaccessible to the rest of the program.
 runST :: (forall s. ST s a) -> a
-runST st = runSTRep (case st of { ST st_rep -> st_rep })
-
--- I'm only letting runSTRep be inlined right at the end, in particular *after* full laziness
--- That's what the "INLINE [0]" says.
---              SLPJ Apr 99
--- {-# INLINE [0] runSTRep #-}
-
--- SDM: further to the above, inline phase 0 is run *before*
--- full-laziness at the moment, which means that the above comment is
--- invalid.  Inlining runSTRep doesn't make a huge amount of
--- difference, anyway.  Hence:
-
-{-# NOINLINE runSTRep #-}
-runSTRep :: (forall s. STRep s a) -> a
-runSTRep st_rep = case st_rep realWorld# of
-                        (# _, r #) -> r
+runST (ST st_rep) = case runRW# st_rep of (# _, a #) -> a
+-- See Note [Definition of runRW#] in GHC.Magic
diff --git a/libraries/ghc-prim/GHC/Magic.hs b/libraries/ghc-prim/GHC/Magic.hs
index 740abb729e..495705b3b4 100644
--- a/libraries/ghc-prim/GHC/Magic.hs
+++ b/libraries/ghc-prim/GHC/Magic.hs
@@ -1,5 +1,9 @@
+{-# LANGUAGE CPP #-}
 {-# LANGUAGE Trustworthy #-}
 {-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+
 -----------------------------------------------------------------------------
 -- |
 -- Module      :  GHC.Magic
@@ -17,8 +21,9 @@
 --
 -----------------------------------------------------------------------------
 
-module GHC.Magic ( inline, lazy, oneShot ) where
+module GHC.Magic ( inline, lazy, oneShot, runRW# ) where
 
+import GHC.Prim
 import GHC.CString ()
 
 -- | The call @inline f@ arranges that 'f' is inlined, regardless of
@@ -82,3 +87,15 @@ oneShot :: (a -> b) -> (a -> b)
 oneShot f = f
 -- Implementation note: This is wired in in MkId.lhs, so the code here is
 -- mostly there to have a place for the documentation.
+
+-- | Apply a function to a 'RealWorld' token.
+runRW# :: (State# RealWorld -> (# State# RealWorld, o #))
+       -> (# State# RealWorld, o #)
+-- See Note [runRW magic] in MkId
+#if !defined(__HADDOCK_VERSION__)
+runRW# m = m realWorld#
+#else
+runRW# = runRW#   -- The realWorld# is too much for haddock
+#endif
+{-# NOINLINE runRW# #-}
+-- This is inlined manually in CorePrep
diff --git a/libraries/ghc-prim/changelog.md b/libraries/ghc-prim/changelog.md
index 2e4288622b..45daa64b26 100644
--- a/libraries/ghc-prim/changelog.md
+++ b/libraries/ghc-prim/changelog.md
@@ -27,6 +27,7 @@
 
         getSizeofMutableByteArray# :: MutableByteArray# d -> State# d -> (# State# d, Int# #)
         subWordC# :: Word# -> Word# -> (# Word#, Int# #)
+        runRW# :: (State# RealWorld -> (# State# RealWorld, o #)) -> (# State# RealWorld, o #)
 
 - Added to `GHC.Types`:
 
diff --git a/libraries/integer-gmp/src/GHC/Integer/Type.hs b/libraries/integer-gmp/src/GHC/Integer/Type.hs
index 5bc52539fd..167492d34d 100644
--- a/libraries/integer-gmp/src/GHC/Integer/Type.hs
+++ b/libraries/integer-gmp/src/GHC/Integer/Type.hs
@@ -1934,8 +1934,7 @@ liftIO (IO m) = m
 
 -- NB: equivalent of GHC.IO.unsafeDupablePerformIO, see notes there
 runS :: S RealWorld a -> a
-runS m = lazy (case m realWorld# of (# _, r #) -> r)
-{-# NOINLINE runS #-}
+runS m = case runRW# m of (# _, a #) -> a
 
 -- stupid hack
 fail :: [Char] -> S s a
diff --git a/testsuite/tests/perf/should_run/all.T b/testsuite/tests/perf/should_run/all.T
index 6ac8861450..a86d61f51d 100644
--- a/testsuite/tests/perf/should_run/all.T
+++ b/testsuite/tests/perf/should_run/all.T
@@ -271,8 +271,9 @@ test('T7257',
      [stats_num_field('bytes allocated',
                       [(wordsize(32), 1150000000, 10),
                     # expected value: 1246287228 (i386/Linux)
-                       (wordsize(64), 1774893760, 5)]),
+                       (wordsize(64), 1654893248, 5)]),
                         # 2012-09-21: 1774893760 (amd64/Linux)
+                        # 2015-11-03: 1654893248 (amd64/Linux)
       stats_num_field('peak_megabytes_allocated',
                       [(wordsize(32), 217, 5),
                         # 2012-10-08: 217 (x86/Linux)
diff --git a/testsuite/tests/primops/should_run/T10678.hs b/testsuite/tests/primops/should_run/T10678.hs
new file mode 100644
index 0000000000..9019ab6345
--- /dev/null
+++ b/testsuite/tests/primops/should_run/T10678.hs
@@ -0,0 +1,22 @@
+{-# LANGUAGE MagicHash #-}
+
+import GHC.Prim
+
+main :: IO ()
+main = go 1000000# 10 (2^100)
+
+go :: Int# -> Integer -> Integer -> IO ()
+go 0# _ _ = return ()
+go n# a b = (a + b) `seq` go (n# -# 1#) a b
+{-# NOINLINE go #-}
+
+{-
+This test is based on a strategy from rwbarton relying on the inefficiency
+of `Integer` addition as defined by `integer-gmp` without `runRW#`.
+
+    When I was testing the patch interactively, I measured allocations for,
+    say, a million (large Integer) + (small Integer) additions.  If that
+    addition allocates, say, 6 words, then one can fairly reliably write the
+    program so that it will allocate between 6 million and 7 million words,
+    total.
+-}
diff --git a/testsuite/tests/primops/should_run/all.T b/testsuite/tests/primops/should_run/all.T
index b21279aa99..b0001d6d32 100644
--- a/testsuite/tests/primops/should_run/all.T
+++ b/testsuite/tests/primops/should_run/all.T
@@ -3,3 +3,12 @@ test('T7689', normal, compile_and_run, [''])
 # The test is using unboxed tuples, so omit ghci
 test('T9430', omit_ways(['ghci']), compile_and_run, [''])
 test('T10481', exit_code(1), compile_and_run, [''])
+test('T10678',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(64), 88041768, 5)
+                       # 2015-11-04: 88041768 +/- 5%  (before runRW#)
+                       # 2015-11-04: 64004171         (after runRW#)
+                      ]),
+      only_ways('normal')
+     ],
+     compile_and_run, ['-O'])
-- 
cgit v1.2.1