2 files changed, 84 insertions, 16 deletions
diff --git a/compiler/GHC/Core/Opt/ConstantFold.hs b/compiler/GHC/Core/Opt/ConstantFold.hs
index de98dd0842..e6d23f3d0a 100644
--- a/compiler/GHC/Core/Opt/ConstantFold.hs
+++ b/compiler/GHC/Core/Opt/ConstantFold.hs
@@ -1519,18 +1519,40 @@ match_cstring_length env id_unf _ [lit1]
 match_cstring_length _ _ _ _ = Nothing
 
 ---------------------------------------------------
--- The rule is this:
---      inline f_ty (f a b c) = <f's unfolding> a b c
--- (if f has an unfolding, EVEN if it's a loop breaker)
---
--- It's important to allow the argument to 'inline' to have args itself
--- (a) because its more forgiving to allow the programmer to write
---       inline f a b c
---   or  inline (f a b c)
--- (b) because a polymorphic f wll get a type argument that the
---     programmer can't avoid
---
--- Also, don't forget about 'inline's type argument!
+{- Note [inlineId magic]
+~~~~~~~~~~~~~~~~~~~~~~~~
+The call 'inline f' arranges that 'f' is inlined, regardless of
+its size. More precisely, the call 'inline f' rewrites to the
+right-hand side of 'f's definition. This allows the programmer to
+control inlining from a particular call site rather than the
+definition site of the function.
+
+The moving parts are simple:
+
+* A very simple definition in the library base:GHC.Magic
+     {-# NOINLINE[0] inline #-}
+     inline :: a -> a
+     inline x = x
+  So in phase 0, 'inline' will be inlined, so its use imposes
+  no overhead.
+
+* A rewrite rule, in GHC.Core.Opt.ConstantFold, which makes
+  (inline f) inline, implemented by match_inline.
+  The rule for the 'inline' function is this:
+     inline f_ty (f a b c) = <f's unfolding> a b c
+  (if f has an unfolding, EVEN if it's a loop breaker)
+
+  It's important to allow the argument to 'inline' to have args itself
+  (a) because it's more forgiving to allow the programmer to write
+      either  inline f a b c
+      or      inline (f a b c)
+  (b) because a polymorphic f will get a type argument that the
+      programmer can't avoid, so the call may look like
+        inline (map @Int @Bool) g xs
+
+  Also, don't forget about 'inline's type argument!
+-}
+
 match_inline :: [Expr CoreBndr] -> Maybe (Expr CoreBndr)
 match_inline (Type _ : e : _)
   | (Var f, args1) <- collectArgs e,
@@ -1540,7 +1562,7 @@ match_inline (Type _ : e : _)
 
 match_inline _ = Nothing
 
-
+---------------------------------------------------
 -- See Note [magicDictId magic] in "GHC.Types.Id.Make"
 -- for a description of what is going on here.
 match_magicDict :: [Expr CoreBndr] -> Maybe (Expr CoreBndr)
diff --git a/compiler/GHC/Types/Unique/Supply.hs b/compiler/GHC/Types/Unique/Supply.hs
index e42edd8b83..1ccb3c0fd1 100644
--- a/compiler/GHC/Types/Unique/Supply.hs
+++ b/compiler/GHC/Types/Unique/Supply.hs
@@ -48,6 +48,7 @@ import GHC.Utils.Monad
 import Control.Monad
 import Data.Bits
 import Data.Char
+import GHC.Exts( inline )
 
 #include "Unique.h"
 
@@ -111,8 +112,15 @@ Why doesn't full laziness float out the (\s2...)?  Because of
 the state hack (#18238).
 
 So for this module we switch the state hack off -- it's an example
-of when it makes things worse rather than better. Now full laziness
-can float that lambda out, and we get
+of when it makes things worse rather than better.  And we use
+multiShotIO (see Note [multiShotIO]) thus:
+
+     mk_supply = multiShotIO $
+                 unsafeInterleaveIO $
+                 genSym      >>= \ u ->
+                 ...
+
+Now full laziness can float that lambda out, and we get
 
   $wmkSplitUniqSupply c# s
     = letrec
@@ -146,6 +154,38 @@ bit slower.  (Test perf/should_run/UniqLoop had a 20% perf change.)
 
 Sigh.  The test perf/should_run/UniqLoop keeps track of this loop.
 Watch it carefully.
+
+Note [multiShotIO]
+~~~~~~~~~~~~~~~~~~
+The function multiShotIO :: IO a -> IO a
+says that the argument IO action may be invoked repeatedly (is
+multi-shot), and so there should be a multi-shot lambda around it.
+It's quite easy to define, in any module with `-fno-state-hack`:
+    multiShotIO :: IO a -> IO a
+    {-# INLINE multiShotIO #-}
+    multiShotIO (IO m) = IO (\s -> inline m s)
+
+Because of -fno-state-hack, that '\s' will be multi-shot. Now,
+ignoring the casts from IO:
+    multiShotIO (\ss{one-shot}. blah)
+    ==> let m = \ss{one-shot}. blah
+        in \s. inline m s
+    ==> \s. (\ss{one-shot}.blah) s
+    ==> \s. blah[s/ss]
+
+The magic `inline` function does two things:
+* It prevents eta reduction.  If we wrote just
+      multiShotIO (IO m) = IO (\s -> m s)
+  the lambda would eta-reduce to 'm' and all would be lost.
+
+* It helps ensure that 'm' really does inline.
+
+Note that 'inline' evaporates in phase 0.  See Note [inlineId magic]
+in GHC.Core.Opt.ConstantFold (implemented by match_inline).
+
+The INLINE pragma on multiShotIO is very important, else the
+'inline' call will evaporate when compiling the module that
+defines 'multiShotIO', before it is ever exported.
 -}
 
 
@@ -176,12 +216,18 @@ mkSplitUniqSupply c
         -- This is one of the most hammered bits in the whole compiler
         -- See Note [Optimising the unique supply]
         -- NB: Use unsafeInterleaveIO for thread-safety.
-     mk_supply = unsafeInterleaveIO $
+     mk_supply = multiShotIO $
+                 unsafeInterleaveIO $
                  genSym      >>= \ u ->
                  mk_supply   >>= \ s1 ->
                  mk_supply   >>= \ s2 ->
                  return (MkSplitUniqSupply (mask .|. u) s1 s2)
 
+multiShotIO :: IO a -> IO a
+{-# INLINE multiShotIO #-}
+-- Re-wraps the IO action as (\s -> inline m s); see Note [multiShotIO]
+multiShotIO (IO m) = IO (\s -> inline m s)
+
 foreign import ccall unsafe "genSym" genSym :: IO Int
 foreign import ccall unsafe "initGenSym" initUniqSupply :: Int -> Int -> IO ()