16 files changed, 2993 insertions, 1899 deletions
diff --git a/compiler/simplCore/CSE.hs b/compiler/simplCore/CSE.hs
index 83f5ee6a3b..96fbd07454 100644
--- a/compiler/simplCore/CSE.hs
+++ b/compiler/simplCore/CSE.hs
@@ -10,21 +10,24 @@ module CSE (cseProgram, cseOneExpr) where
 
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import CoreSubst
 import Var              ( Var )
-import VarEnv           ( elemInScopeSet )
-import Id               ( Id, idType, idInlineActivation, isDeadBinder
+import VarEnv           ( elemInScopeSet, mkInScopeSet )
+import Id               ( Id, idType, isDeadBinder
+                        , idInlineActivation, setInlineActivation
                         , zapIdOccInfo, zapIdUsageInfo, idInlinePragma
-                        , isJoinId )
+                        , isJoinId, isJoinId_maybe )
 import CoreUtils        ( mkAltExpr, eqExpr
-                        , exprIsLiteralString
+                        , exprIsTickedString
                         , stripTicksE, stripTicksT, mkTicks )
+import CoreFVs          ( exprFreeVars )
 import Type             ( tyConAppArgs )
 import CoreSyn
 import Outputable
-import BasicTypes       ( TopLevelFlag(..), isTopLevel
-                        , isAlwaysActive, isAnyInlinePragma )
-import TrieMap
+import BasicTypes
+import CoreMap
 import Util             ( filterOut )
 import Data.List        ( mapAccumL )
 
@@ -204,8 +207,12 @@ is small).  The conclusion here is this:
   might replace <rhs> by 'bar', and then later be unable to see that it
   really was <rhs>.
 
+An except to the rule is when the INLINE pragma is not from the user, e.g. from
+WorkWrap (see Note [Wrapper activation]). We can tell because noUserInlineSpec
+is then true.
+
 Note that we do not (currently) do CSE on the unfolding stored inside
-an Id, even if is a 'stable' unfolding.  That means that when an
+an Id, even if it is a 'stable' unfolding.  That means that when an
 unfolding happens, it is always faithful to what the stable unfolding
 originally was.
 
@@ -266,7 +273,28 @@ compiling ppHtml in Haddock.Backends.Xhtml).
 
 We could try and be careful by tracking which join points are still valid at
 each subexpression, but since join points aren't allocated or shared, there's
-less to gain by trying to CSE them.
+less to gain by trying to CSE them. (#13219)
+
+Note [Look inside join-point binders]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Another way how CSE for joint points is tricky is
+
+  let join foo x = (x, 42)
+      join bar x = (x, 42)
+  in … jump foo 1 … jump bar 2 …
+
+naively, CSE would turn this into
+
+  let join foo x = (x, 42)
+      join bar = foo
+  in … jump foo 1 … jump bar 2 …
+
+but now bar is a join point that claims arity one, but its right-hand side
+is not a lambda, breaking the join-point invariant (this was #15002).
+
+So `cse_bind` must zoom past the lambdas of a join point (using
+`collectNBinders`) and resume searching for CSE opportunities only in
+the body of the join point.
 
 Note [CSE for recursive bindings]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -306,14 +334,16 @@ cseBind toplevel env (NonRec b e)
     (env1, b1)       = addBinder env b
     (env2, (b2, e2)) = cse_bind toplevel env1 (b,e) b1
 
-cseBind _ env (Rec [(in_id, rhs)])
+cseBind toplevel env (Rec [(in_id, rhs)])
   | noCSE in_id
   = (env1, Rec [(out_id, rhs')])
 
   -- See Note [CSE for recursive bindings]
   | Just previous <- lookupCSRecEnv env out_id rhs''
   , let previous' = mkTicks ticks previous
-  = (extendCSSubst env1 in_id previous', NonRec out_id previous')
+        out_id'   = delayInlining toplevel out_id
+  = -- We have a hit in the recursive-binding cache
+    (extendCSSubst env1 in_id previous', NonRec out_id' previous')
 
   | otherwise
   = (extendCSRecEnv env1 out_id rhs'' id_expr', Rec [(zapped_id, rhs')])
@@ -341,15 +371,33 @@ cseBind toplevel env (Rec pairs)
 -- which are equal to @out_rhs@.
 cse_bind :: TopLevelFlag -> CSEnv -> (InId, InExpr) -> OutId -> (CSEnv, (OutId, OutExpr))
 cse_bind toplevel env (in_id, in_rhs) out_id
-  | isTopLevel toplevel, exprIsLiteralString in_rhs
+  | isTopLevel toplevel, exprIsTickedString in_rhs
       -- See Note [Take care with literal strings]
-  = (env', (out_id, in_rhs))
+  = (env', (out_id', in_rhs))
+
+  | Just arity <- isJoinId_maybe in_id
+      -- See Note [Look inside join-point binders]
+  = let (params, in_body) = collectNBinders arity in_rhs
+        (env', params') = addBinders env params
+        out_body = tryForCSE env' in_body
+    in (env, (out_id, mkLams params' out_body))
 
   | otherwise
-  = (env', (out_id', out_rhs))
+  = (env', (out_id'', out_rhs))
   where
-    out_rhs         = tryForCSE env in_rhs
     (env', out_id') = addBinding env in_id out_id out_rhs
+    (cse_done, out_rhs) = try_for_cse env in_rhs
+    out_id'' | cse_done  = delayInlining toplevel out_id'
+             | otherwise = out_id'
+
+delayInlining :: TopLevelFlag -> Id -> Id
+-- Add a NOINLINE[2] if the Id doesn't have an INLNE pragma already
+delayInlining top_lvl bndr
+  | isTopLevel top_lvl
+  , isAlwaysActive (idInlineActivation bndr)
+  = bndr `setInlineActivation` activeAfterInitial
+  | otherwise
+  = bndr
 
 addBinding :: CSEnv                      -- Includes InId->OutId cloning
            -> InVar                      -- Could be a let-bound type
@@ -384,8 +432,11 @@ addBinding env in_id out_id rhs'
                    Var {} -> True
                    _      -> False
 
+-- | Given a binder `let x = e`, this function
+-- determines whether we should add `e -> x` to the cs_map
 noCSE :: InId -> Bool
-noCSE id =  not (isAlwaysActive (idInlineActivation id))
+noCSE id =  not (isAlwaysActive (idInlineActivation id)) &&
+            not (noUserInlineSpec (inlinePragmaSpec (idInlinePragma id)))
              -- See Note [CSE for INLINE and NOINLINE]
          || isAnyInlinePragma (idInlinePragma id)
              -- See Note [CSE for stable unfoldings]
@@ -425,12 +476,46 @@ The net effect is that for the y-binding we want to
   - but leave the original binding for y undisturbed
 
 This is done by cse_bind.  I got it wrong the first time (Trac #13367).
+
+Note [Delay inlining after CSE]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Suppose (Trac #15445) we have
+   f,g :: Num a => a -> a
+   f x = ...f (x-1).....
+   g y = ...g (y-1) ....
+
+and we make some specialisations of 'g', either automatically, or via
+a SPECIALISE pragma.  Then CSE kicks in and notices that the RHSs of
+'f' and 'g' are identical, so we get
+   f x = ...f (x-1)...
+   g = f
+   {-# RULES g @Int _ = $sg #-}
+
+Now there is terrible danger that, in an importing module, we'll inline
+'g' before we have a chance to run its specialisation!
+
+Solution: during CSE, when adding a top-level
+  g = f
+binding after a "hit" in the CSE cache, add a NOINLINE[2] activation
+to it, to ensure it's not inlined right away.
+
+Why top level only?  Because for nested bindings we are already past
+phase 2 and will never return there.
 -}
 
 tryForCSE :: CSEnv -> InExpr -> OutExpr
-tryForCSE env expr
-  | Just e <- lookupCSEnv env expr'' = mkTicks ticks e
-  | otherwise                        = expr'
+tryForCSE env expr = snd (try_for_cse env expr)
+
+try_for_cse :: CSEnv -> InExpr -> (Bool, OutExpr)
+-- (False, e') => We did not CSE the entire expression,
+--                but we might have CSE'd some sub-expressions,
+--                yielding e'
+--
+-- (True, te') => We CSE'd the entire expression,
+--                yielding the trivial expression te'
+try_for_cse env expr
+  | Just e <- lookupCSEnv env expr'' = (True,  mkTicks ticks e)
+  | otherwise                        = (False, expr')
     -- The varToCoreExpr is needed if we have
     --   case e of xco { ...case e of yco { ... } ... }
     -- Then CSE will substitute yco -> xco;
@@ -444,8 +529,13 @@ tryForCSE env expr
     -- top of the replaced sub-expression. This is probably not too
     -- useful in practice, but upholds our semantics.
 
+-- | Runs CSE on a single expression.
+--
+-- This entry point is not used in the compiler itself, but is provided
+-- as a convenient entry point for users of the GHC API.
 cseOneExpr :: InExpr -> OutExpr
-cseOneExpr = cseExpr emptyCSEnv
+cseOneExpr e = cseExpr env e
+  where env = emptyCSEnv {cs_subst = mkEmptySubst (mkInScopeSet (exprFreeVars e)) }
 
 cseExpr :: CSEnv -> InExpr -> OutExpr
 cseExpr env (Type t)              = Type (substTy (csEnvSubst env) t)
@@ -454,7 +544,7 @@ cseExpr _   (Lit lit)             = Lit lit
 cseExpr env (Var v)               = lookupSubst env v
 cseExpr env (App f a)             = App (cseExpr env f) (tryForCSE env a)
 cseExpr env (Tick t e)            = Tick t (cseExpr env e)
-cseExpr env (Cast e co)           = Cast (cseExpr env e) (substCo (csEnvSubst env) co)
+cseExpr env (Cast e co)           = Cast (tryForCSE env e) (substCo (csEnvSubst env) co)
 cseExpr env (Lam b e)             = let (env', b') = addBinder env b
                                     in Lam b' (cseExpr env' e)
 cseExpr env (Let bind e)          = let (env', bind') = cseBind NotTopLevel env bind
@@ -530,9 +620,9 @@ to transform
    W y z   -> e2
 
 In the simplifier we use cheapEqExpr, because it is called a lot.
-But here in CSE we use the full eqExpr.  After all, two alterantives usually
+But here in CSE we use the full eqExpr.  After all, two alternatives usually
 differ near the root, so it probably isn't expensive to compare the full
-alternative.  It seems like the the same kind of thing that CSE is supposed
+alternative.  It seems like the same kind of thing that CSE is supposed
 to be doing, which is why I put it here.
 
 I acutally saw some examples in the wild, where some inlining made e1 too
diff --git a/compiler/simplCore/CallArity.hs b/compiler/simplCore/CallArity.hs
index 0cf0c2f44f..ba1aa243ac 100644
--- a/compiler/simplCore/CallArity.hs
+++ b/compiler/simplCore/CallArity.hs
@@ -7,6 +7,8 @@ module CallArity
     , callArityRHS -- for testing
     ) where
 
+import GhcPrelude
+
 import VarSet
 import VarEnv
 import DynFlags ( DynFlags )
@@ -340,7 +342,7 @@ For a mutually recursive let, we begin by
  3. We combine the analysis result from the body and the memoized results for
     the arguments (if already present).
  4. For each variable, we find out the incoming arity and whether it is called
-    once, based on the the current analysis result. If this differs from the
+    once, based on the current analysis result. If this differs from the
     memoized results, we re-analyse the rhs and update the memoized table.
  5. If nothing had to be reanalyzed, we are done.
     Otherwise, repeat from step 3.
@@ -350,7 +352,7 @@ Note [Thunks in recursive groups]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 We never eta-expand a thunk in a recursive group, on the grounds that if it is
-part of a recursive group, then it will be called multipe times.
+part of a recursive group, then it will be called multiple times.
 
 This is not necessarily true, e.g.  it would be safe to eta-expand t2 (but not
 t1) in the following code:
@@ -404,7 +406,7 @@ published papers on Call Arity describe it.
 
 In practice, there are thunks that do a just little work, such as
 pattern-matching on a variable, and the benefits of eta-expansion likely
-oughtweigh the cost of doing that repeatedly. Therefore, this implementation of
+outweigh the cost of doing that repeatedly. Therefore, this implementation of
 Call Arity considers everything that is not cheap (`exprIsCheap`) as a thunk.
 
 Note [Call Arity and Join Points]
@@ -733,7 +735,7 @@ domRes (_, ae) = varEnvDom ae
 lookupCallArityRes :: CallArityRes -> Var -> (Arity, Bool)
 lookupCallArityRes (g, ae) v
     = case lookupVarEnv ae v of
-        Just a -> (a, not (v `elemUnVarSet` (neighbors g v)))
+        Just a -> (a, not (g `hasLoopAt` v))
         Nothing -> (0, False)
 
 calledWith :: CallArityRes -> Var -> UnVarSet
@@ -758,4 +760,4 @@ lubArityEnv :: VarEnv Arity -> VarEnv Arity -> VarEnv Arity
 lubArityEnv = plusVarEnv_C min
 
 lubRess :: [CallArityRes] -> CallArityRes
-lubRess = foldl lubRes emptyArityRes
+lubRess = foldl' lubRes emptyArityRes
diff --git a/compiler/simplCore/CoreMonad.hs b/compiler/simplCore/CoreMonad.hs
index c689eea346..0c5d8d9fd2 100644
--- a/compiler/simplCore/CoreMonad.hs
+++ b/compiler/simplCore/CoreMonad.hs
@@ -9,12 +9,12 @@
 module CoreMonad (
     -- * Configuration of the core-to-core passes
     CoreToDo(..), runWhen, runMaybe,
-    SimplifierMode(..),
+    SimplMode(..),
     FloatOutSwitches(..),
     pprPassDetails,
 
     -- * Plugins
-    PluginPass, bindsOnlyPass,
+    CorePluginPass, bindsOnlyPass,
 
     -- * Counting
     SimplCount, doSimplTick, doFreeSimplTick, simplCountN,
@@ -47,14 +47,11 @@ module CoreMonad (
     putMsg, putMsgS, errorMsg, errorMsgS, warnMsg,
     fatalErrorMsg, fatalErrorMsgS,
     debugTraceMsg, debugTraceMsgS,
-    dumpIfSet_dyn,
-
-    -- * Getting 'Name's
-    thNameToGhcName
+    dumpIfSet_dyn
   ) where
 
-import Name( Name )
-import TcRnMonad        ( initTcForLookup )
+import GhcPrelude hiding ( read )
+
 import CoreSyn
 import HscTypes
 import Module
@@ -64,13 +61,11 @@ import Annotations
 
 import IOEnv hiding     ( liftIO, failM, failWithM )
 import qualified IOEnv  ( liftIO )
-import TcEnv            ( lookupGlobal )
 import Var
 import Outputable
 import FastString
 import qualified ErrUtils as Err
 import ErrUtils( Severity(..) )
-import Maybes
 import UniqSupply
 import UniqFM       ( UniqFM, mapUFM, filterUFM )
 import MonadUtils
@@ -87,11 +82,6 @@ import Data.Word
 import Control.Monad
 import Control.Applicative ( Alternative(..) )
 
-import Prelude hiding   ( read )
-
-import {-# SOURCE #-} TcSplice ( lookupThName_maybe )
-import qualified Language.Haskell.TH as TH
-
 {-
 ************************************************************************
 *                                                                      *
@@ -107,14 +97,15 @@ data CoreToDo           -- These are diff core-to-core passes,
 
   = CoreDoSimplify      -- The core-to-core simplifier.
         Int                    -- Max iterations
-        SimplifierMode
-  | CoreDoPluginPass String PluginPass
+        SimplMode
+  | CoreDoPluginPass String CorePluginPass
   | CoreDoFloatInwards
   | CoreDoFloatOutwards FloatOutSwitches
   | CoreLiberateCase
   | CoreDoPrintCore
   | CoreDoStaticArgs
   | CoreDoCallArity
+  | CoreDoExitify
   | CoreDoStrictness
   | CoreDoWorkerWrapper
   | CoreDoSpecialising
@@ -122,7 +113,6 @@ data CoreToDo           -- These are diff core-to-core passes,
   | CoreCSE
   | CoreDoRuleCheck CompilerPhase String   -- Check for non-application of rules
                                            -- matching this string
-  | CoreDoVectorisation
   | CoreDoNothing                -- Useful when building up
   | CoreDoPasses [CoreToDo]      -- lists of these things
 
@@ -142,12 +132,12 @@ instance Outputable CoreToDo where
   ppr CoreLiberateCase         = text "Liberate case"
   ppr CoreDoStaticArgs         = text "Static argument"
   ppr CoreDoCallArity          = text "Called arity analysis"
+  ppr CoreDoExitify            = text "Exitification transformation"
   ppr CoreDoStrictness         = text "Demand analysis"
   ppr CoreDoWorkerWrapper      = text "Worker Wrapper binds"
   ppr CoreDoSpecialising       = text "Specialise"
   ppr CoreDoSpecConstr         = text "SpecConstr"
   ppr CoreCSE                  = text "Common sub-expression"
-  ppr CoreDoVectorisation      = text "Vectorisation"
   ppr CoreDesugar              = text "Desugar (before optimization)"
   ppr CoreDesugarOpt           = text "Desugar (after optimization)"
   ppr CoreTidy                 = text "Tidy Core"
@@ -163,17 +153,19 @@ pprPassDetails (CoreDoSimplify n md) = vcat [ text "Max iterations =" <+> int n
                                             , ppr md ]
 pprPassDetails _ = Outputable.empty
 
-data SimplifierMode             -- See comments in SimplMonad
+data SimplMode             -- See comments in SimplMonad
   = SimplMode
         { sm_names      :: [String] -- Name(s) of the phase
         , sm_phase      :: CompilerPhase
+        , sm_dflags     :: DynFlags -- Just for convenient non-monadic
+                                    -- access; we don't override these
         , sm_rules      :: Bool     -- Whether RULES are enabled
         , sm_inline     :: Bool     -- Whether inlining is enabled
         , sm_case_case  :: Bool     -- Whether case-of-case is enabled
         , sm_eta_expand :: Bool     -- Whether eta-expansion is enabled
         }
 
-instance Outputable SimplifierMode where
+instance Outputable SimplMode where
     ppr (SimplMode { sm_phase = p, sm_names = ss
                    , sm_rules = r, sm_inline = i
                    , sm_eta_expand = eta, sm_case_case = cc })
@@ -235,7 +227,7 @@ runMaybe Nothing  _ = CoreDoNothing
 -}
 
 -- | A description of the plugin pass itself
-type PluginPass = ModGuts -> CoreM ModGuts
+type CorePluginPass = ModGuts -> CoreM ModGuts
 
 bindsOnlyPass :: (CoreProgram -> CoreM CoreProgram) -> ModGuts -> CoreM ModGuts
 bindsOnlyPass pass guts
@@ -251,7 +243,7 @@ bindsOnlyPass pass guts
 -}
 
 getVerboseSimplStats :: (Bool -> SDoc) -> SDoc
-getVerboseSimplStats = sdocWithPprDebug          -- For now, anyway
+getVerboseSimplStats = getPprDebug          -- For now, anyway
 
 zeroSimplCount     :: DynFlags -> SimplCount
 isZeroSimplCount   :: SimplCount -> Bool
@@ -341,6 +333,79 @@ pprSimplCount (SimplCount { ticks = tks, details = dts, log1 = l1, log2 = l2 })
           else Outputable.empty
     ]
 
+{- Note [Which transformations are innocuous]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+At one point (Jun 18) I wondered if some transformations (ticks)
+might be  "innocuous", in the sense that they do not unlock a later
+transformation that does not occur in the same pass.  If so, we could
+refrain from bumping the overall tick-count for such innocuous
+transformations, and perhaps terminate the simplifier one pass
+earlier.
+
+BUt alas I found that virtually nothing was innocuous!  This Note
+just records what I learned, in case anyone wants to try again.
+
+These transformations are not innocuous:
+
+*** NB: I think these ones could be made innocuous
+          EtaExpansion
+          LetFloatFromLet
+
+LetFloatFromLet
+    x = K (let z = e2 in Just z)
+  prepareRhs transforms to
+    x2 = let z=e2 in Just z
+    x  = K xs
+  And now more let-floating can happen in the
+  next pass, on x2
+
+PreInlineUnconditionally
+  Example in spectral/cichelli/Auxil
+     hinsert = ...let lo = e in
+                  let j = ...lo... in
+                  case x of
+                    False -> ()
+                    True -> case lo of I# lo' ->
+                              ...j...
+  When we PreInlineUnconditionally j, lo's occ-info changes to once,
+  so it can be PreInlineUnconditionally in the next pass, and a
+  cascade of further things can happen.
+
+PostInlineUnconditionally
+  let x = e in
+  let y = ...x.. in
+  case .. of { A -> ...x...y...
+               B -> ...x...y... }
+  Current postinlineUnconditinaly will inline y, and then x; sigh.
+
+  But PostInlineUnconditionally might also unlock subsequent
+  transformations for the same reason as PreInlineUnconditionally,
+  so it's probably not innocuous anyway.
+
+KnownBranch, BetaReduction:
+  May drop chunks of code, and thereby enable PreInlineUnconditionally
+  for some let-binding which now occurs once
+
+EtaExpansion:
+  Example in imaginary/digits-of-e1
+    fail = \void. e          where e :: IO ()
+  --> etaExpandRhs
+    fail = \void. (\s. (e |> g) s) |> sym g      where g :: IO () ~ S -> (S,())
+  --> Next iteration of simplify
+    fail1 = \void. \s. (e |> g) s
+    fail = fail1 |> Void#->sym g
+  And now inline 'fail'
+
+CaseMerge:
+  case x of y {
+    DEFAULT -> case y of z { pi -> ei }
+    alts2 }
+  ---> CaseMerge
+    case x of { pi -> let z = y in ei
+              ; alts2 }
+  The "let z=y" case-binder-swap gets dealt with in the next pass
+-}
+
 pprTickCounts :: Map Tick Int -> SDoc
 pprTickCounts counts
   = vcat (map pprTickGroup groups)
@@ -358,7 +423,7 @@ pprTickGroup group@((tick1,_):_)
                | (tick,n) <- sortBy (flip (comparing snd)) group])
 pprTickGroup [] = panic "pprTickGroup"
 
-data Tick
+data Tick  -- See Note [Which transformations are innocuous]
   = PreInlineUnconditionally    Id
   | PostInlineUnconditionally   Id
 
@@ -379,7 +444,6 @@ data Tick
   | CaseIdentity                Id      -- Case binder
   | FillInCaseDefault           Id      -- Case binder
 
-  | BottomFound
   | SimplifierDone              -- Ticked at each iteration of the simplifier
 
 instance Outputable Tick where
@@ -408,7 +472,6 @@ tickToTag (CaseMerge _)                 = 10
 tickToTag (CaseElim _)                  = 11
 tickToTag (CaseIdentity _)              = 12
 tickToTag (FillInCaseDefault _)         = 13
-tickToTag BottomFound                   = 14
 tickToTag SimplifierDone                = 16
 tickToTag (AltMerge _)                  = 17
 
@@ -428,7 +491,6 @@ tickString (AltMerge _)                 = "AltMerge"
 tickString (CaseElim _)                 = "CaseElim"
 tickString (CaseIdentity _)             = "CaseIdentity"
 tickString (FillInCaseDefault _)        = "FillInCaseDefault"
-tickString BottomFound                  = "BottomFound"
 tickString SimplifierDone               = "SimplifierDone"
 
 pprTickCts :: Tick -> SDoc
@@ -780,34 +842,3 @@ dumpIfSet_dyn flag str doc
        ; unqual <- getPrintUnqualified
        ; when (dopt flag dflags) $ liftIO $
          Err.dumpSDoc dflags unqual flag str doc }
-
-{-
-************************************************************************
-*                                                                      *
-               Finding TyThings
-*                                                                      *
-************************************************************************
--}
-
-instance MonadThings CoreM where
-    lookupThing name = do { hsc_env <- getHscEnv
-                          ; liftIO $ lookupGlobal hsc_env name }
-
-{-
-************************************************************************
-*                                                                      *
-               Template Haskell interoperability
-*                                                                      *
-************************************************************************
--}
-
--- | Attempt to convert a Template Haskell name to one that GHC can
--- understand. Original TH names such as those you get when you use
--- the @'foo@ syntax will be translated to their equivalent GHC name
--- exactly. Qualified or unqualified TH names will be dynamically bound
--- to names in the module being compiled, if possible. Exact TH names
--- will be bound to the name they represent, exactly.
-thNameToGhcName :: TH.Name -> CoreM (Maybe Name)
-thNameToGhcName th_name = do
-    hsc_env <- getHscEnv
-    liftIO $ initTcForLookup hsc_env (lookupThName_maybe th_name)
diff --git a/compiler/simplCore/CoreMonad.hs-boot b/compiler/simplCore/CoreMonad.hs-boot
new file mode 100644
index 0000000000..206675e5e2
--- /dev/null
+++ b/compiler/simplCore/CoreMonad.hs-boot
@@ -0,0 +1,37 @@
+-- Created this hs-boot file to remove circular dependencies from the use of
+-- Plugins. Plugins needs CoreToDo and CoreM types to define core-to-core
+-- transformations.
+-- However CoreMonad does much more than defining these, and because Plugins are
+-- activated in various modules, the imports become circular. To solve this I
+-- extracted CoreToDo and CoreM into this file.
+-- I needed to write the whole definition of these types, otherwise it created
+-- a data-newtype conflict.
+
+module CoreMonad ( CoreToDo, CoreM ) where
+
+import GhcPrelude
+
+import IOEnv ( IOEnv )
+import UniqSupply ( UniqSupply )
+
+newtype CoreState = CoreState {
+        cs_uniq_supply :: UniqSupply
+}
+
+type CoreIOEnv = IOEnv CoreReader
+
+data CoreReader
+
+newtype CoreWriter = CoreWriter {
+        cw_simpl_count :: SimplCount
+}
+
+data SimplCount
+
+newtype CoreM a
+          = CoreM { unCoreM :: CoreState
+                                 -> CoreIOEnv (a, CoreState, CoreWriter) }
+
+instance Monad CoreM
+
+data CoreToDo
diff --git a/compiler/simplCore/Exitify.hs b/compiler/simplCore/Exitify.hs
new file mode 100644
index 0000000000..3e7d503d31
--- /dev/null
+++ b/compiler/simplCore/Exitify.hs
@@ -0,0 +1,499 @@
+module Exitify ( exitifyProgram ) where
+
+{-
+Note [Exitification]
+~~~~~~~~~~~~~~~~~~~~
+
+This module implements Exitification. The goal is to pull as much code out of
+recursive functions as possible, as the simplifier is better at inlining into
+call-sites that are not in recursive functions.
+
+Example:
+
+  let t = foo bar
+  joinrec go 0     x y = t (x*x)
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+
+We’d like to inline `t`, but that does not happen: Because t is a thunk and is
+used in a recursive function, doing so might lose sharing in general. In
+this case, however, `t` is on the _exit path_ of `go`, so called at most once.
+How do we make this clearly visible to the simplifier?
+
+A code path (i.e., an expression in a tail-recursive position) in a recursive
+function is an exit path if it does not contain a recursive call. We can bind
+this expression outside the recursive function, as a join-point.
+
+Example result:
+
+  let t = foo bar
+  join exit x = t (x*x)
+  joinrec go 0     x y = jump exit x
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+
+Now `t` is no longer in a recursive function, and good things happen!
+-}
+
+import GhcPrelude
+import Var
+import Id
+import IdInfo
+import CoreSyn
+import CoreUtils
+import State
+import Unique
+import VarSet
+import VarEnv
+import CoreFVs
+import FastString
+import Type
+import Util( mapSnd )
+
+import Data.Bifunctor
+import Control.Monad
+
+-- | Traverses the AST, simply to find all joinrecs and call 'exitify' on them.
+-- The really interesting function is exitifyRec
+exitifyProgram :: CoreProgram -> CoreProgram
+exitifyProgram binds = map goTopLvl binds
+  where
+    goTopLvl (NonRec v e) = NonRec v (go in_scope_toplvl e)
+    goTopLvl (Rec pairs) = Rec (map (second (go in_scope_toplvl)) pairs)
+      -- Top-level bindings are never join points
+
+    in_scope_toplvl = emptyInScopeSet `extendInScopeSetList` bindersOfBinds binds
+
+    go :: InScopeSet -> CoreExpr -> CoreExpr
+    go _    e@(Var{})       = e
+    go _    e@(Lit {})      = e
+    go _    e@(Type {})     = e
+    go _    e@(Coercion {}) = e
+    go in_scope (Cast e' c) = Cast (go in_scope e') c
+    go in_scope (Tick t e') = Tick t (go in_scope e')
+    go in_scope (App e1 e2) = App (go in_scope e1) (go in_scope e2)
+
+    go in_scope (Lam v e')
+      = Lam v (go in_scope' e')
+      where in_scope' = in_scope `extendInScopeSet` v
+
+    go in_scope (Case scrut bndr ty alts)
+      = Case (go in_scope scrut) bndr ty (map go_alt alts)
+      where
+        in_scope1 = in_scope `extendInScopeSet` bndr
+        go_alt (dc, pats, rhs) = (dc, pats, go in_scope' rhs)
+           where in_scope' = in_scope1 `extendInScopeSetList` pats
+
+    go in_scope (Let (NonRec bndr rhs) body)
+      = Let (NonRec bndr (go in_scope rhs)) (go in_scope' body)
+      where
+        in_scope' = in_scope `extendInScopeSet` bndr
+
+    go in_scope (Let (Rec pairs) body)
+      | is_join_rec = mkLets (exitifyRec in_scope' pairs') body'
+      | otherwise   = Let (Rec pairs') body'
+      where
+        is_join_rec = any (isJoinId . fst) pairs
+        in_scope'   = in_scope `extendInScopeSetList` bindersOf (Rec pairs)
+        pairs'      = mapSnd (go in_scope') pairs
+        body'       = go in_scope' body
+
+
+-- | State Monad used inside `exitify`
+type ExitifyM =  State [(JoinId, CoreExpr)]
+
+-- | Given a recursive group of a joinrec, identifies “exit paths” and binds them as
+--   join-points outside the joinrec.
+exitifyRec :: InScopeSet -> [(Var,CoreExpr)] -> [CoreBind]
+exitifyRec in_scope pairs
+  = [ NonRec xid rhs | (xid,rhs) <- exits ] ++ [Rec pairs']
+  where
+    -- We need the set of free variables of many subexpressions here, so
+    -- annotate the AST with them
+    -- see Note [Calculating free variables]
+    ann_pairs = map (second freeVars) pairs
+
+    -- Which are the recursive calls?
+    recursive_calls = mkVarSet $ map fst pairs
+
+    (pairs',exits) = (`runState` []) $ do
+        forM ann_pairs $ \(x,rhs) -> do
+            -- go past the lambdas of the join point
+            let (args, body) = collectNAnnBndrs (idJoinArity x) rhs
+            body' <- go args body
+            let rhs' = mkLams args body'
+            return (x, rhs')
+
+    ---------------------
+    -- 'go' is the main working function.
+    -- It goes through the RHS (tail-call positions only),
+    -- checks if there are no more recursive calls, if so, abstracts over
+    -- variables bound on the way and lifts it out as a join point.
+    --
+    -- ExitifyM is a state monad to keep track of floated binds
+    go :: [Var]           -- ^ Variables that are in-scope here, but
+                          -- not in scope at the joinrec; that is,
+                          -- we must potentially abstract over them.
+                          -- Invariant: they are kept in dependency order
+       -> CoreExprWithFVs -- ^ Current expression in tail position
+       -> ExitifyM CoreExpr
+
+    -- We first look at the expression (no matter what it shape is)
+    -- and determine if we can turn it into a exit join point
+    go captured ann_e
+        | -- An exit expression has no recursive calls
+          let fvs = dVarSetToVarSet (freeVarsOf ann_e)
+        , disjointVarSet fvs recursive_calls
+        = go_exit captured (deAnnotate ann_e) fvs
+
+    -- We could not turn it into a exit joint point. So now recurse
+    -- into all expression where eligible exit join points might sit,
+    -- i.e. into all tail-call positions:
+
+    -- Case right hand sides are in tail-call position
+    go captured (_, AnnCase scrut bndr ty alts) = do
+        alts' <- forM alts $ \(dc, pats, rhs) -> do
+            rhs' <- go (captured ++ [bndr] ++ pats) rhs
+            return (dc, pats, rhs')
+        return $ Case (deAnnotate scrut) bndr ty alts'
+
+    go captured (_, AnnLet ann_bind body)
+        -- join point, RHS and body are in tail-call position
+        | AnnNonRec j rhs <- ann_bind
+        , Just join_arity <- isJoinId_maybe j
+        = do let (params, join_body) = collectNAnnBndrs join_arity rhs
+             join_body' <- go (captured ++ params) join_body
+             let rhs' = mkLams params join_body'
+             body' <- go (captured ++ [j]) body
+             return $ Let (NonRec j rhs') body'
+
+        -- rec join point, RHSs and body are in tail-call position
+        | AnnRec pairs <- ann_bind
+        , isJoinId (fst (head pairs))
+        = do let js = map fst pairs
+             pairs' <- forM pairs $ \(j,rhs) -> do
+                 let join_arity = idJoinArity j
+                     (params, join_body) = collectNAnnBndrs join_arity rhs
+                 join_body' <- go (captured ++ js ++ params) join_body
+                 let rhs' = mkLams params join_body'
+                 return (j, rhs')
+             body' <- go (captured ++ js) body
+             return $ Let (Rec pairs') body'
+
+        -- normal Let, only the body is in tail-call position
+        | otherwise
+        = do body' <- go (captured ++ bindersOf bind ) body
+             return $ Let bind body'
+      where bind = deAnnBind ann_bind
+
+    -- Cannot be turned into an exit join point, but also has no
+    -- tail-call subexpression. Nothing to do here.
+    go _ ann_e = return (deAnnotate ann_e)
+
+    ---------------------
+    go_exit :: [Var]      -- Variables captured locally
+            -> CoreExpr   -- An exit expression
+            -> VarSet     -- Free vars of the expression
+            -> ExitifyM CoreExpr
+    -- go_exit deals with a tail expression that is floatable
+    -- out as an exit point; that is, it mentions no recursive calls
+    go_exit captured e fvs
+      -- Do not touch an expression that is already a join jump where all arguments
+      -- are captured variables. See Note [Idempotency]
+      -- But _do_ float join jumps with interesting arguments.
+      -- See Note [Jumps can be interesting]
+      | (Var f, args) <- collectArgs e
+      , isJoinId f
+      , all isCapturedVarArg args
+      = return e
+
+      -- Do not touch a boring expression (see Note [Interesting expression])
+      | not is_interesting
+      = return e
+
+      -- Cannot float out if local join points are used, as
+      -- we cannot abstract over them
+      | captures_join_points
+      = return e
+
+      -- We have something to float out!
+      | otherwise
+      = do { -- Assemble the RHS of the exit join point
+             let rhs   = mkLams abs_vars e
+                 avoid = in_scope `extendInScopeSetList` captured
+             -- Remember this binding under a suitable name
+           ; v <- addExit avoid (length abs_vars) rhs
+             -- And jump to it from here
+           ; return $ mkVarApps (Var v) abs_vars }
+
+      where
+        -- Used to detect exit expressoins that are already proper exit jumps
+        isCapturedVarArg (Var v) = v `elem` captured
+        isCapturedVarArg _ = False
+
+        -- An interesting exit expression has free, non-imported
+        -- variables from outside the recursive group
+        -- See Note [Interesting expression]
+        is_interesting = anyVarSet isLocalId $
+                         fvs `minusVarSet` mkVarSet captured
+
+        -- The arguments of this exit join point
+        -- See Note [Picking arguments to abstract over]
+        abs_vars = snd $ foldr pick (fvs, []) captured
+          where
+            pick v (fvs', acc) | v `elemVarSet` fvs' = (fvs' `delVarSet` v, zap v : acc)
+                               | otherwise           = (fvs',               acc)
+
+        -- We are going to abstract over these variables, so we must
+        -- zap any IdInfo they have; see Trac #15005
+        -- cf. SetLevels.abstractVars
+        zap v | isId v = setIdInfo v vanillaIdInfo
+              | otherwise = v
+
+        -- We cannot abstract over join points
+        captures_join_points = any isJoinId abs_vars
+
+
+-- Picks a new unique, which is disjoint from
+--  * the free variables of the whole joinrec
+--  * any bound variables (captured)
+--  * any exit join points created so far.
+mkExitJoinId :: InScopeSet -> Type -> JoinArity -> ExitifyM JoinId
+mkExitJoinId in_scope ty join_arity = do
+    fs <- get
+    let avoid = in_scope `extendInScopeSetList` (map fst fs)
+                         `extendInScopeSet` exit_id_tmpl -- just cosmetics
+    return (uniqAway avoid exit_id_tmpl)
+  where
+    exit_id_tmpl = mkSysLocal (fsLit "exit") initExitJoinUnique ty
+                    `asJoinId` join_arity
+
+addExit :: InScopeSet -> JoinArity -> CoreExpr -> ExitifyM JoinId
+addExit in_scope join_arity rhs = do
+    -- Pick a suitable name
+    let ty = exprType rhs
+    v <- mkExitJoinId in_scope ty join_arity
+    fs <- get
+    put ((v,rhs):fs)
+    return v
+
+{-
+Note [Interesting expression]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We do not want this to happen:
+
+  joinrec go 0     x y = x
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+==>
+  join exit x = x
+  joinrec go 0     x y = jump exit x
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+
+because the floated exit path (`x`) is simply a parameter of `go`; there are
+not useful interactions exposed this way.
+
+Neither do we want this to happen
+
+  joinrec go 0     x y = x+x
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+==>
+  join exit x = x+x
+  joinrec go 0     x y = jump exit x
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+
+where the floated expression `x+x` is a bit more complicated, but still not
+intersting.
+
+Expressions are interesting when they move an occurrence of a variable outside
+the recursive `go` that can benefit from being obviously called once, for example:
+ * a local thunk that can then be inlined (see example in note [Exitification])
+ * the parameter of a function, where the demand analyzer then can then
+   see that it is called at most once, and hence improve the function’s
+   strictness signature
+
+So we only hoist an exit expression out if it mentiones at least one free,
+non-imported variable.
+
+Note [Jumps can be interesting]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+A jump to a join point can be interesting, if its arguments contain free
+non-exported variables (z in the following example):
+
+  joinrec go 0     x y = jump j (x+z)
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+==>
+  join exit x y = jump j (x+z)
+  joinrec go 0     x y = jump exit x
+          go (n-1) x y = jump go (n-1) (x+y)
+
+
+The join point itself can be interesting, even if none if its
+arguments have free variables free in the joinrec.  For example
+
+  join j p = case p of (x,y) -> x+y
+  joinrec go 0     x y = jump j (x,y)
+          go (n-1) x y = jump go (n-1) (x+y) y
+  in …
+
+Here, `j` would not be inlined because we do not inline something that looks
+like an exit join point (see Note [Do not inline exit join points]). But
+if we exitify the 'jump j (x,y)' we get
+
+  join j p = case p of (x,y) -> x+y
+  join exit x y = jump j (x,y)
+  joinrec go 0     x y = jump exit x y
+          go (n-1) x y = jump go (n-1) (x+y) y
+  in …
+
+and now 'j' can inline, and we get rid of the pair. Here's another
+example (assume `g` to be an imported function that, on its own,
+does not make this interesting):
+
+  join j y = map f y
+  joinrec go 0     x y = jump j (map g x)
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+
+Again, `j` would not be inlined because we do not inline something that looks
+like an exit join point (see Note [Do not inline exit join points]).
+
+But after exitification we have
+
+  join j y = map f y
+  join exit x = jump j (map g x)
+  joinrec go 0     x y = jump j (map g x)
+              go (n-1) x y = jump go (n-1) (x+y)
+  in …
+
+and now we can inline `j` and this will allow `map/map` to fire.
+
+
+Note [Idempotency]
+~~~~~~~~~~~~~~~~~~
+
+We do not want this to happen, where we replace the floated expression with
+essentially the same expression:
+
+  join exit x = t (x*x)
+  joinrec go 0     x y = jump exit x
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+==>
+  join exit x = t (x*x)
+  join exit' x = jump exit x
+  joinrec go 0     x y = jump exit' x
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+
+So when the RHS is a join jump, and all of its arguments are captured variables,
+then we leave it in place.
+
+Note that `jump exit x` in this example looks interesting, as `exit` is a free
+variable. Therefore, idempotency does not simply follow from floating only
+interesting expressions.
+
+Note [Calculating free variables]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We have two options where to annotate the tree with free variables:
+
+ A) The whole tree.
+ B) Each individual joinrec as we come across it.
+
+Downside of A: We pay the price on the whole module, even outside any joinrecs.
+Downside of B: We pay the price per joinrec, possibly multiple times when
+joinrecs are nested.
+
+Further downside of A: If the exitify function returns annotated expressions,
+it would have to ensure that the annotations are correct.
+
+We therefore choose B, and calculate the free variables in `exitify`.
+
+
+Note [Do not inline exit join points]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When we have
+
+  let t = foo bar
+  join exit x = t (x*x)
+  joinrec go 0     x y = jump exit x
+          go (n-1) x y = jump go (n-1) (x+y)
+  in …
+
+we do not want the simplifier to simply inline `exit` back in (which it happily
+would).
+
+To prevent this, we need to recognize exit join points, and then disable
+inlining.
+
+Exit join points, recognizeable using `isExitJoinId` are join points with an
+occurence in a recursive group, and can be recognized (after the occurence
+analyzer ran!) using `isExitJoinId`.
+This function detects joinpoints with `occ_in_lam (idOccinfo id) == True`,
+because the lambdas of a non-recursive join point are not considered for
+`occ_in_lam`.  For example, in the following code, `j1` is /not/ marked
+occ_in_lam, because `j2` is called only once.
+
+  join j1 x = x+1
+  join j2 y = join j1 (y+2)
+
+To prevent inlining, we check for isExitJoinId
+* In `preInlineUnconditionally` directly.
+* In `simplLetUnfolding` we simply give exit join points no unfolding, which
+  prevents inlining in `postInlineUnconditionally` and call sites.
+
+Note [Placement of the exitification pass]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+I (Joachim) experimented with multiple positions for the Exitification pass in
+the Core2Core pipeline:
+
+ A) Before the `simpl_phases`
+ B) Between the `simpl_phases` and the "main" simplifier pass
+ C) After demand_analyser
+ D) Before the final simplification phase
+
+Here is the table (this is without inlining join exit points in the final
+simplifier run):
+
+        Program |                       Allocs                      |                      Instrs
+                | ABCD.log     A.log     B.log     C.log     D.log  | ABCD.log     A.log     B.log     C.log     D.log
+----------------|---------------------------------------------------|-------------------------------------------------
+ fannkuch-redux |   -99.9%     +0.0%    -99.9%    -99.9%    -99.9%  |    -3.9%     +0.5%     -3.0%     -3.9%     -3.9%
+          fasta |    -0.0%     +0.0%     +0.0%     -0.0%     -0.0%  |    -8.5%     +0.0%     +0.0%     -0.0%     -8.5%
+            fem |     0.0%      0.0%      0.0%      0.0%     +0.0%  |    -2.2%     -0.1%     -0.1%     -2.1%     -2.1%
+           fish |     0.0%      0.0%      0.0%      0.0%     +0.0%  |    -3.1%     +0.0%     -1.1%     -1.1%     -0.0%
+   k-nucleotide |   -91.3%    -91.0%    -91.0%    -91.3%    -91.3%  |    -6.3%    +11.4%    +11.4%     -6.3%     -6.2%
+            scs |    -0.0%     -0.0%     -0.0%     -0.0%     -0.0%  |    -3.4%     -3.0%     -3.1%     -3.3%     -3.3%
+         simple |    -6.0%      0.0%     -6.0%     -6.0%     +0.0%  |    -3.4%     +0.0%     -5.2%     -3.4%     -0.1%
+  spectral-norm |    -0.0%      0.0%      0.0%     -0.0%     +0.0%  |    -2.7%     +0.0%     -2.7%     -5.4%     -5.4%
+----------------|---------------------------------------------------|-------------------------------------------------
+            Min |   -95.0%    -91.0%    -95.0%    -95.0%    -95.0%  |    -8.5%     -3.0%     -5.2%     -6.3%     -8.5%
+            Max |    +0.2%     +0.2%     +0.2%     +0.2%     +1.5%  |    +0.4%    +11.4%    +11.4%     +0.4%     +1.5%
+ Geometric Mean |    -4.7%     -2.1%     -4.7%     -4.7%     -4.6%  |    -0.4%     +0.1%     -0.1%     -0.3%     -0.2%
+
+Position A is disqualified, as it does not get rid of the allocations in
+fannkuch-redux.
+Position A and B are disqualified because it increases instructions in k-nucleotide.
+Positions C and D have their advantages: C decreases allocations in simpl, but D instructions in fasta.
+
+Assuming we have a budget of _one_ run of Exitification, then C wins (but we
+could get more from running it multiple times, as seen in fish).
+
+Note [Picking arguments to abstract over]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When we create an exit join point, so we need to abstract over those of its
+free variables that are be out-of-scope at the destination of the exit join
+point. So we go through the list `captured` and pick those that are actually
+free variables of the join point.
+
+We do not just `filter (`elemVarSet` fvs) captured`, as there might be
+shadowing, and `captured` may contain multiple variables with the same Unique. I
+these cases we want to abstract only over the last occurence, hence the `foldr`
+(with emphasis on the `r`). This is #15110.
+
+-}
diff --git a/compiler/simplCore/FloatIn.hs b/compiler/simplCore/FloatIn.hs
index 3e44e81cea..2593b1d7a1 100644
--- a/compiler/simplCore/FloatIn.hs
+++ b/compiler/simplCore/FloatIn.hs
@@ -19,6 +19,8 @@ module FloatIn ( floatInwards ) where
 
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import CoreSyn
 import MkCore
 import HscTypes         ( ModGuts(..) )
@@ -179,7 +181,7 @@ fiExpr dflags to_drop ann_expr@(_,AnnApp {})
          -- lists without evaluating extra_fvs, and hence without
          -- peering into each argument
 
-    (_, extra_fvs) = foldl add_arg (fun_ty, extra_fvs0) ann_args
+    (_, extra_fvs) = foldl' add_arg (fun_ty, extra_fvs0) ann_args
     extra_fvs0 = case ann_fun of
                    (_, AnnVar _) -> fun_fvs
                    _             -> emptyDVarSet
@@ -413,6 +415,16 @@ But there are wrinkles
   cases like Trac #5658.   This is implemented in sepBindsByJoinPoint;
   if is_case is False we dump all floating cases right here.
 
+* Trac #14511 is another example of why we want to restrict float-in
+  of case-expressions.  Consider
+     case indexArray# a n of (# r #) -> writeArray# ma i (f r)
+  Now, floating that indexing operation into the (f r) thunk will
+  not create any new thunks, but it will keep the array 'a' alive
+  for much longer than the programmer expected.
+
+  So again, not floating a case into a let or argument seems like
+  the Right Thing
+
 For @Case@, the possible drop points for the 'to_drop'
 bindings are:
   (a) inside the scrutinee
@@ -459,7 +471,7 @@ fiExpr dflags to_drop (_, AnnCase scrut case_bndr ty alts)
     alts_fvs     = map alt_fvs alts
     all_alts_fvs = unionDVarSets alts_fvs
     alt_fvs (_con, args, rhs)
-      = foldl delDVarSet (freeVarsOf rhs) (case_bndr:args)
+      = foldl' delDVarSet (freeVarsOf rhs) (case_bndr:args)
            -- Delete case_bndr and args from free vars of rhs
            -- to get free vars of alt
 
diff --git a/compiler/simplCore/FloatOut.hs b/compiler/simplCore/FloatOut.hs
index 72fc0d1ff7..6cb21f9470 100644
--- a/compiler/simplCore/FloatOut.hs
+++ b/compiler/simplCore/FloatOut.hs
@@ -11,6 +11,8 @@
 
 module FloatOut ( floatOutwards ) where
 
+import GhcPrelude
+
 import CoreSyn
 import CoreUtils
 import MkCore
@@ -21,7 +23,6 @@ import DynFlags
 import ErrUtils         ( dumpIfSet_dyn )
 import Id               ( Id, idArity, idType, isBottomingId,
                           isJoinId, isJoinId_maybe )
-import BasicTypes       ( TopLevelFlag(..), isTopLevel )
 import SetLevels
 import UniqSupply       ( UniqSupply )
 import Bag
@@ -735,26 +736,19 @@ atJoinCeiling (fs, floats, expr')
 
 wrapTick :: Tickish Id -> FloatBinds -> FloatBinds
 wrapTick t (FB tops ceils defns)
-  = FB (mapBag (wrap_bind TopLevel) tops)
-       (wrap_defns NotTopLevel ceils)
-       (M.map (M.map (wrap_defns NotTopLevel)) defns)
+  = FB (mapBag wrap_bind tops) (wrap_defns ceils)
+       (M.map (M.map wrap_defns) defns)
   where
-    wrap_defns toplvl = mapBag (wrap_one toplvl)
-
-    wrap_bind toplvl (NonRec binder rhs) = NonRec binder (maybe_tick toplvl rhs)
-    wrap_bind toplvl (Rec pairs)         = Rec (mapSnd (maybe_tick toplvl) pairs)
-
-    wrap_one toplvl (FloatLet bind)      = FloatLet (wrap_bind toplvl bind)
-    wrap_one toplvl (FloatCase e b c bs) = FloatCase (maybe_tick toplvl e) b c bs
-
-    maybe_tick :: TopLevelFlag -> CoreExpr -> CoreExpr
-    maybe_tick toplvl e
-        -- We must take care not to tick top-level literal
-        -- strings as this violated the Core invariants. See Note [CoreSyn
-        -- top-level string literals].
-      | isTopLevel toplvl && exprIsLiteralString e = e
-      | exprIsHNF e = tickHNFArgs t e
-      | otherwise   = mkTick t e
+    wrap_defns = mapBag wrap_one
+
+    wrap_bind (NonRec binder rhs) = NonRec binder (maybe_tick rhs)
+    wrap_bind (Rec pairs)         = Rec (mapSnd maybe_tick pairs)
+
+    wrap_one (FloatLet bind)      = FloatLet (wrap_bind bind)
+    wrap_one (FloatCase e b c bs) = FloatCase (maybe_tick e) b c bs
+
+    maybe_tick e | exprIsHNF e = tickHNFArgs t e
+                 | otherwise   = mkTick t e
       -- we don't need to wrap a tick around an HNF when we float it
       -- outside a tick: that is an invariant of the tick semantics
       -- Conversely, inlining of HNFs inside an SCC is allowed, and
diff --git a/compiler/simplCore/LiberateCase.hs b/compiler/simplCore/LiberateCase.hs
index 1776db51fd..b484de3bc3 100644
--- a/compiler/simplCore/LiberateCase.hs
+++ b/compiler/simplCore/LiberateCase.hs
@@ -9,9 +9,12 @@ module LiberateCase ( liberateCase ) where
 
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import DynFlags
 import CoreSyn
 import CoreUnfold       ( couldBeSmallEnoughToInline )
+import TysWiredIn       ( unitDataConId )
 import Id
 import VarEnv
 import Util             ( notNull )
@@ -66,24 +69,6 @@ Exactly the same optimisation (unrolling one call to f) will work here,
 despite the cast.  See mk_alt_env in the Case branch of libCase.
 
 
-Note [Only functions!]
-~~~~~~~~~~~~~~~~~~~~~~
-Consider the following code
-
-       f = g (case v of V a b -> a : t f)
-
-where g is expensive. If we aren't careful, liberate case will turn this into
-
-       f = g (case v of
-               V a b -> a : t (letrec f = g (case v of V a b -> a : f t)
-                                in f)
-             )
-
-Yikes! We evaluate g twice. This leads to a O(2^n) explosion
-if g calls back to the same code recursively.
-
-Solution: make sure that we only do the liberate-case thing on *functions*
-
 To think about (Apr 94)
 ~~~~~~~~~~~~~~
 Main worry: duplicating code excessively.  At the moment we duplicate
@@ -154,18 +139,63 @@ libCaseBind env (Rec pairs)
         -- We extend the rec-env by binding each Id to its rhs, first
         -- processing the rhs with an *un-extended* environment, so
         -- that the same process doesn't occur for ever!
-    env_rhs = addRecBinds env [ (localiseId binder, libCase env_body rhs)
-                              | (binder, rhs) <- pairs
-                              , rhs_small_enough binder rhs ]
+    env_rhs | is_dupable_bind = addRecBinds env dup_pairs
+            | otherwise       = env
+
+    dup_pairs = [ (localiseId binder, libCase env_body rhs)
+                | (binder, rhs) <- pairs ]
         -- localiseID : see Note [Need to localiseId in libCaseBind]
 
+    is_dupable_bind = small_enough && all ok_pair pairs
 
-    rhs_small_enough id rhs     -- Note [Small enough]
-        =  idArity id > 0       -- Note [Only functions!]
-        && maybe True (\size -> couldBeSmallEnoughToInline (lc_dflags env) size rhs)
-                      (bombOutSize env)
+    -- Size: we are going to duplicate dup_pairs; to find their
+    --       size, build a fake binding (let { dup_pairs } in (),
+    --       and find the size of that
+    -- See Note [Small enough]
+    small_enough = case bombOutSize env of
+                      Nothing   -> True   -- Infinity
+                      Just size -> couldBeSmallEnoughToInline (lc_dflags env) size $
+                                   Let (Rec dup_pairs) (Var unitDataConId)
+
+    ok_pair (id,_)
+        =  idArity id > 0          -- Note [Only functions!]
+        && not (isBottomingId id)  -- Note [Not bottoming ids]
+
+{- Note [Not bottoming Ids]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Do not specialise error-functions (this is unusual, but I once saw it,
+(acually in Data.Typable.Internal)
+
+Note [Only functions!]
+~~~~~~~~~~~~~~~~~~~~~~
+Consider the following code
+
+       f = g (case v of V a b -> a : t f)
+
+where g is expensive. If we aren't careful, liberate case will turn this into
+
+       f = g (case v of
+               V a b -> a : t (letrec f = g (case v of V a b -> a : f t)
+                                in f)
+             )
+
+Yikes! We evaluate g twice. This leads to a O(2^n) explosion
+if g calls back to the same code recursively.
+
+Solution: make sure that we only do the liberate-case thing on *functions*
+
+Note [Small enough]
+~~~~~~~~~~~~~~~~~~~
+Consider
+  \fv. letrec
+         f = \x. BIG...(case fv of { (a,b) -> ...g.. })...
+         g = \y. SMALL...f...
+
+Then we *can* in principle do liberate-case on 'g' (small RHS) but not
+for 'f' (too big).  But doing so is not profitable, because duplicating
+'g' at its call site in 'f' doesn't get rid of any cases.  So we just
+ask for the whole group to be small enough.
 
-{-
 Note [Need to localiseId in libCaseBind]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The call to localiseId is needed for two subtle reasons
@@ -179,16 +209,6 @@ The call to localiseId is needed for two subtle reasons
         nested; if it were floated to the top level, we'd get a name
         clash at code generation time.
 
-Note [Small enough]
-~~~~~~~~~~~~~~~~~~~
-Consider
-  \fv. letrec
-         f = \x. BIG...(case fv of { (a,b) -> ...g.. })...
-         g = \y. SMALL...f...
-Then we *can* do liberate-case on g (small RHS) but not for f (too big).
-But we can choose on a item-by-item basis, and that's what the
-rhs_small_enough call in the comprehension for env_rhs does.
-
 Expressions
 ~~~~~~~~~~~
 -}
diff --git a/compiler/simplCore/OccurAnal.hs b/compiler/simplCore/OccurAnal.hs
index 5dd30aa668..236bb81066 100644
--- a/compiler/simplCore/OccurAnal.hs
+++ b/compiler/simplCore/OccurAnal.hs
@@ -19,10 +19,13 @@ module OccurAnal (
 
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import CoreSyn
 import CoreFVs
 import CoreUtils        ( exprIsTrivial, isDefaultAlt, isExpandableApp,
                           stripTicksTopE, mkTicks )
+import CoreArity        ( joinRhsArity )
 import Id
 import IdInfo
 import Name( localiseName )
@@ -56,11 +59,12 @@ import Control.Arrow    ( second )
 Here's the externally-callable interface:
 -}
 
-occurAnalysePgm :: Module       -- Used only in debug output
-                -> (Activation -> Bool)
-                -> [CoreRule] -> [CoreVect] -> VarSet
+occurAnalysePgm :: Module         -- Used only in debug output
+                -> (Id -> Bool)         -- Active unfoldings
+                -> (Activation -> Bool) -- Active rules
+                -> [CoreRule]
                 -> CoreProgram -> CoreProgram
-occurAnalysePgm this_mod active_rule imp_rules vects vectVars binds
+occurAnalysePgm this_mod active_unf active_rule imp_rules binds
   | isEmptyDetails final_usage
   = occ_anald_binds
 
@@ -69,7 +73,9 @@ occurAnalysePgm this_mod active_rule imp_rules vects vectVars binds
                    2 (ppr final_usage ) )
     occ_anald_glommed_binds
   where
-    init_env = initOccEnv active_rule
+    init_env = initOccEnv { occ_rule_act = active_rule
+                          , occ_unf_act  = active_unf }
+
     (final_usage, occ_anald_binds) = go init_env binds
     (_, occ_anald_glommed_binds)   = occAnalRecBind init_env TopLevel
                                                     imp_rule_edges
@@ -80,12 +86,8 @@ occurAnalysePgm this_mod active_rule imp_rules vects vectVars binds
           -- we can easily create an infinite loop (Trac #9583 is an example)
 
     initial_uds = addManyOccsSet emptyDetails
-                            (rulesFreeVars imp_rules `unionVarSet`
-                             vectsFreeVars vects `unionVarSet`
-                             vectVars)
-    -- The RULES and VECTORISE declarations keep things alive! (For VECTORISE declarations,
-    -- we only get them *until* the vectoriser runs. Afterwards, these dependencies are
-    -- reflected in 'vectors' — see Note [Vectorisation declarations and occurrences].)
+                            (rulesFreeVars imp_rules)
+    -- The RULES declarations keep things alive!
 
     -- Note [Preventing loops due to imported functions rules]
     imp_rule_edges = foldr (plusVarEnv_C unionVarSet) emptyVarEnv
@@ -118,9 +120,7 @@ occurAnalyseExpr' :: Bool -> CoreExpr -> CoreExpr
 occurAnalyseExpr' enable_binder_swap expr
   = snd (occAnal env expr)
   where
-    env = (initOccEnv all_active_rules) {occ_binder_swap = enable_binder_swap}
-    -- To be conservative, we say that all inlines and rules are active
-    all_active_rules = \_ -> True
+    env = initOccEnv { occ_binder_swap = enable_binder_swap }
 
 {- Note [Plugin rules]
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -170,7 +170,7 @@ we treat it like this (occAnalRecBind):
 
 4. To do so we form a new set of Nodes, with the same details, but
    different edges, the "loop-breaker nodes". The loop-breaker nodes
-   have both more and fewer depedencies than the scope edges
+   have both more and fewer dependencies than the scope edges
    (see Note [Choosing loop breakers])
 
    More edges: if f calls g, and g has an active rule that mentions h
@@ -698,39 +698,6 @@ costs us anything when, for some `j`:
 This appears to be very rare in practice. TODO Perhaps we should gather
 statistics to be sure.
 
-Note [Excess polymorphism and join points]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In principle, if a function would be a join point except that it fails
-the polymorphism rule (see Note [The polymorphism rule of join points] in
-CoreSyn), it can still be made a join point with some effort. This is because
-all tail calls must return the same type (they return to the same context!), and
-thus if the return type depends on an argument, that argument must always be the
-same.
-
-For instance, consider:
-
-  let f :: forall a. a -> Char -> [a]
-      f @a x c = ... f @a x 'a' ...
-  in ... f @Int 1 'b' ... f @Int 2 'c' ...
-
-(where the calls are tail calls). `f` fails the polymorphism rule because its
-return type is [a], where [a] is bound. But since the type argument is always
-'Int', we can rewrite it as:
-
-  let f' :: Int -> Char -> [Int]
-      f' x c = ... f' x 'a' ...
-  in ... f' 1 'b' ... f 2 'c' ...
-
-and now we can make f' a join point:
-
-  join f' :: Int -> Char -> [Int]
-       f' x c = ... jump f' x 'a' ...
-  in ... jump f' 1 'b' ... jump f' 2 'c' ...
-
-It's not clear that this comes up often, however. TODO: Measure how often and
-add this analysis if necessary.
-
 ------------------------------------------------------------
 Note [Adjusting right-hand sides]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -801,7 +768,7 @@ occAnalNonRecBind env lvl imp_rule_edges binder rhs body_usage
   = (body_usage, [])
 
   | otherwise                   -- It's mentioned in the body
-  = (body_usage' +++ rhs_usage', [NonRec tagged_binder rhs'])
+  = (body_usage' `andUDs` rhs_usage', [NonRec tagged_binder rhs'])
   where
     (body_usage', tagged_binder) = tagNonRecBinder lvl body_usage binder
     mb_join_arity = willBeJoinId_maybe tagged_binder
@@ -816,16 +783,17 @@ occAnalNonRecBind env lvl imp_rule_edges binder rhs body_usage
     -- Unfoldings
     -- See Note [Unfoldings and join points]
     rhs_usage2 = case occAnalUnfolding env NonRecursive binder of
-                   Just unf_usage -> rhs_usage1 +++ unf_usage
+                   Just unf_usage -> rhs_usage1 `andUDs` unf_usage
                    Nothing        -> rhs_usage1
 
     -- Rules
     -- See Note [Rules are extra RHSs] and Note [Rule dependency info]
     rules_w_uds = occAnalRules env mb_join_arity NonRecursive tagged_binder
-    rhs_usage3 = rhs_usage2 +++ combineUsageDetailsList
-                                  (map (\(_, l, r) -> l +++ r) rules_w_uds)
-    rhs_usage4 = maybe rhs_usage3 (addManyOccsSet rhs_usage3) $
-                 lookupVarEnv imp_rule_edges binder
+    rule_uds    = map (\(_, l, r) -> l `andUDs` r) rules_w_uds
+    rhs_usage3 = foldr andUDs rhs_usage2 rule_uds
+    rhs_usage4 = case lookupVarEnv imp_rule_edges binder of
+                   Nothing -> rhs_usage3
+                   Just vs -> addManyOccsSet rhs_usage3 vs
        -- See Note [Preventing loops due to imported functions rules]
 
     -- Final adjustment
@@ -835,7 +803,7 @@ occAnalNonRecBind env lvl imp_rule_edges binder rhs body_usage
 occAnalRecBind :: OccEnv -> TopLevelFlag -> ImpRuleEdges -> [(Var,CoreExpr)]
                -> UsageDetails -> (UsageDetails, [CoreBind])
 occAnalRecBind env lvl imp_rule_edges pairs body_usage
-  = foldr (occAnalRec lvl) (body_usage, []) sccs
+  = foldr (occAnalRec env lvl) (body_usage, []) sccs
         -- For a recursive group, we
         --      * occ-analyse all the RHSs
         --      * compute strongly-connected components
@@ -862,20 +830,20 @@ calls for the purpose of finding join points.
 -}
 
 -----------------------------
-occAnalRec :: TopLevelFlag
+occAnalRec :: OccEnv -> TopLevelFlag
            -> SCC Details
            -> (UsageDetails, [CoreBind])
            -> (UsageDetails, [CoreBind])
 
         -- The NonRec case is just like a Let (NonRec ...) above
-occAnalRec lvl (AcyclicSCC (ND { nd_bndr = bndr, nd_rhs = rhs
-                               , nd_uds = rhs_uds, nd_rhs_bndrs = rhs_bndrs }))
+occAnalRec _ lvl (AcyclicSCC (ND { nd_bndr = bndr, nd_rhs = rhs
+                                 , nd_uds = rhs_uds, nd_rhs_bndrs = rhs_bndrs }))
            (body_uds, binds)
   | not (bndr `usedIn` body_uds)
   = (body_uds, binds)           -- See Note [Dead code]
 
   | otherwise                   -- It's mentioned in the body
-  = (body_uds' +++ rhs_uds',
+  = (body_uds' `andUDs` rhs_uds',
      NonRec tagged_bndr rhs : binds)
   where
     (body_uds', tagged_bndr) = tagNonRecBinder lvl body_uds bndr
@@ -885,7 +853,7 @@ occAnalRec lvl (AcyclicSCC (ND { nd_bndr = bndr, nd_rhs = rhs
         -- The Rec case is the interesting one
         -- See Note [Recursive bindings: the grand plan]
         -- See Note [Loop breaking]
-occAnalRec lvl (CyclicSCC details_s) (body_uds, binds)
+occAnalRec env lvl (CyclicSCC details_s) (body_uds, binds)
   | not (any (`usedIn` body_uds) bndrs) -- NB: look at body_uds, not total_uds
   = (body_uds, binds)                   -- See Note [Dead code]
 
@@ -904,7 +872,7 @@ occAnalRec lvl (CyclicSCC details_s) (body_uds, binds)
     final_uds :: UsageDetails
     loop_breaker_nodes :: [LetrecNode]
     (final_uds, loop_breaker_nodes)
-      = mkLoopBreakerNodes lvl bndr_set body_uds details_s
+      = mkLoopBreakerNodes env lvl bndr_set body_uds details_s
 
     ------------------------------
     weak_fvs :: VarSet
@@ -955,7 +923,8 @@ recording inlinings for any Ids which aren't marked as "no-inline" as it goes.
 
 -- Return the bindings sorted into a plausible order, and marked with loop breakers.
 loopBreakNodes depth bndr_set weak_fvs nodes binds
-  = go (stronglyConnCompFromEdgedVerticesUniqR nodes) binds
+  = -- pprTrace "loopBreakNodes" (ppr nodes) $
+    go (stronglyConnCompFromEdgedVerticesUniqR nodes) binds
   where
     go []         binds = binds
     go (scc:sccs) binds = loop_break_scc scc (go sccs binds)
@@ -972,8 +941,8 @@ reOrderNodes :: Int -> VarSet -> VarSet -> [LetrecNode] -> [Binding] -> [Binding
 reOrderNodes _ _ _ []     _     = panic "reOrderNodes"
 reOrderNodes _ _ _ [node] binds = mk_loop_breaker node : binds
 reOrderNodes depth bndr_set weak_fvs (node : nodes) binds
-  = -- pprTrace "reOrderNodes" (text "unchosen" <+> ppr unchosen $$
-    --                           text "chosen" <+> ppr chosen_nodes) $
+  = -- pprTrace "reOrderNodes" (vcat [ text "unchosen" <+> ppr unchosen
+    --                              , text "chosen" <+> ppr chosen_nodes ]) $
     loopBreakNodes new_depth bndr_set weak_fvs unchosen $
     (map mk_loop_breaker chosen_nodes ++ binds)
   where
@@ -1243,11 +1212,11 @@ makeNode env imp_rule_edges bndr_set (bndr, rhs)
     (bndrs, body) = collectBinders rhs
     (rhs_usage1, bndrs', body') = occAnalRecRhs env bndrs body
     rhs' = mkLams bndrs' body'
-    rhs_usage2 = rhs_usage1 +++ all_rule_uds
+    rhs_usage2 = foldr andUDs rhs_usage1 rule_uds
                    -- Note [Rules are extra RHSs]
                    -- Note [Rule dependency info]
     rhs_usage3 = case mb_unf_uds of
-                   Just unf_uds -> rhs_usage2 +++ unf_uds
+                   Just unf_uds -> rhs_usage2 `andUDs` unf_uds
                    Nothing      -> rhs_usage2
     node_fvs = udFreeVars bndr_set rhs_usage3
 
@@ -1263,8 +1232,7 @@ makeNode env imp_rule_edges bndr_set (bndr, rhs)
       -- See Note [Preventing loops due to imported functions rules]
                       [ (ru_act rule, udFreeVars bndr_set rhs_uds)
                       | (rule, _, rhs_uds) <- rules_w_uds ]
-    all_rule_uds = combineUsageDetailsList $
-                     concatMap (\(_, l, r) -> [l, r]) rules_w_uds
+    rule_uds = map (\(_, l, r) -> l `andUDs` r) rules_w_uds
     active_rule_fvs = unionVarSets [fvs | (a,fvs) <- rules_w_rhs_fvs
                                         , is_active a]
 
@@ -1280,7 +1248,7 @@ makeNode env imp_rule_edges bndr_set (bndr, rhs)
                       -- isn't the right thing (it tells about
                       -- RULE activation), so we'd need more plumbing
 
-mkLoopBreakerNodes :: TopLevelFlag
+mkLoopBreakerNodes :: OccEnv -> TopLevelFlag
                    -> VarSet
                    -> UsageDetails   -- for BODY of let
                    -> [Details]
@@ -1293,7 +1261,7 @@ mkLoopBreakerNodes :: TopLevelFlag
 --      the loop-breaker SCC analysis
 --   d) adjust each RHS's usage details according to
 --      the binder's (new) shotness and join-point-hood
-mkLoopBreakerNodes lvl bndr_set body_uds details_s
+mkLoopBreakerNodes env lvl bndr_set body_uds details_s
   = (final_uds, zipWith mk_lb_node details_s bndrs')
   where
     (final_uds, bndrs') = tagRecBinders lvl body_uds
@@ -1309,7 +1277,7 @@ mkLoopBreakerNodes lvl bndr_set body_uds details_s
               -- Note [Deterministic SCC] in Digraph.
       where
         nd'     = nd { nd_bndr = bndr', nd_score = score }
-        score   = nodeScore bndr bndr' rhs lb_deps
+        score   = nodeScore env bndr bndr' rhs lb_deps
         lb_deps = extendFvs_ rule_fv_env inl_fvs
 
     rule_fv_env :: IdEnv IdSet
@@ -1325,18 +1293,22 @@ mkLoopBreakerNodes lvl bndr_set body_uds details_s
 
 
 ------------------------------------------
-nodeScore :: Id        -- Binder has old occ-info (just for loop-breaker-ness)
+nodeScore :: OccEnv
+          -> Id        -- Binder has old occ-info (just for loop-breaker-ness)
           -> Id        -- Binder with new occ-info
           -> CoreExpr  -- RHS
           -> VarSet    -- Loop-breaker dependencies
           -> NodeScore
-nodeScore old_bndr new_bndr bind_rhs lb_deps
+nodeScore env old_bndr new_bndr bind_rhs lb_deps
   | not (isId old_bndr)     -- A type or cercion variable is never a loop breaker
   = (100, 0, False)
 
   | old_bndr `elemVarSet` lb_deps  -- Self-recursive things are great loop breakers
   = (0, 0, True)                   -- See Note [Self-recursion and loop breakers]
 
+  | not (occ_unf_act env old_bndr) -- A binder whose inlining is inactive (e.g. has
+  = (0, 0, True)                   -- a NOINLINE pragam) makes a great loop breaker
+
   | exprIsTrivial rhs
   = mk_score 10  -- Practically certain to be inlined
     -- Used to have also: && not (isExportedId bndr)
@@ -1553,19 +1525,24 @@ occAnalNonRecRhs :: OccEnv
 occAnalNonRecRhs env bndr bndrs body
   = occAnalLamOrRhs rhs_env bndrs body
   where
-    -- See Note [Cascading inlines]
-    env1 | certainly_inline = env
+    env1 | is_join_point    = env  -- See Note [Join point RHSs]
+         | certainly_inline = env  -- See Note [Cascading inlines]
          | otherwise        = rhsCtxt env
 
     -- See Note [Sources of one-shot information]
     rhs_env = env1 { occ_one_shots = argOneShots dmd }
 
     certainly_inline -- See Note [Cascading inlines]
-      = case idOccInfo bndr of
+      = case occ of
           OneOcc { occ_in_lam = in_lam, occ_one_br = one_br }
-                                 -> not in_lam && one_br && active && not_stable
-          _                      -> False
+            -> not in_lam && one_br && active && not_stable
+          _ -> False
+
+    is_join_point = isAlwaysTailCalled occ
+    -- Like (isJoinId bndr) but happens one step earlier
+    --  c.f. willBeJoinId_maybe
 
+    occ        = idOccInfo bndr
     dmd        = idDemandInfo bndr
     active     = isAlwaysActive (idInlineActivation bndr)
     not_stable = not (isStableUnfolding (idUnfolding bndr))
@@ -1591,7 +1568,7 @@ occAnalUnfolding env rec_flag id
       DFunUnfolding { df_bndrs = bndrs, df_args = args }
         -> Just $ zapDetails (delDetailsList usage bndrs)
         where
-          usage = foldr (+++) emptyDetails (map (fst . occAnal env) args)
+          usage = andUDsList (map (fst . occAnal env) args)
 
       _ -> Nothing
 
@@ -1626,7 +1603,18 @@ occAnalRules env mb_expected_join_arity rec_flag id
       = case mb_expected_join_arity of
           Just ar | args `lengthIs` ar -> uds
           _                            -> markAllNonTailCalled uds
-{-
+{- Note [Join point RHSs]
+~~~~~~~~~~~~~~~~~~~~~~~~~
+Consider
+   x = e
+   join j = Just x
+
+We want to inline x into j right away, so we don't want to give
+the join point a RhsCtxt (Trac #14137).  It's not a huge deal, because
+the FloatIn pass knows to float into join point RHSs; and the simplifier
+does not float things out of join point RHSs.  But it's a simple, cheap
+thing to do.  See Trac #14137.
+
 Note [Cascading inlines]
 ~~~~~~~~~~~~~~~~~~~~~~~~
 By default we use an rhsCtxt for the RHS of a binding.  This tells the
@@ -1653,15 +1641,19 @@ definitely inline the next time round, and so we analyse x3's rhs in
 an ordinary context, not rhsCtxt.  Hence the "certainly_inline" stuff.
 
 Annoyingly, we have to approximate SimplUtils.preInlineUnconditionally.
-If we say "yes" when preInlineUnconditionally says "no" the simplifier iterates
-indefinitely:
+If (a) the RHS is expandable (see isExpandableApp in occAnalApp), and
+   (b) certainly_inline says "yes" when preInlineUnconditionally says "no"
+then the simplifier iterates indefinitely:
         x = f y
-        k = Just x
+        k = Just x   -- We decide that k is 'certainly_inline'
+        v = ...k...  -- but preInlineUnconditionally doesn't inline it
 inline ==>
         k = Just (f y)
+        v = ...k...
 float ==>
         x1 = f y
         k = Just x1
+        v = ...k...
 
 This is worse than the slow cascade, so we only want to say "certainly_inline"
 if it really is certain.  Look at the note with preInlineUnconditionally
@@ -1702,11 +1694,17 @@ we can sort them into the right place when doing dependency analysis.
 -}
 
 occAnal env (Tick tickish body)
+  | SourceNote{} <- tickish
+  = (usage, Tick tickish body')
+                  -- SourceNotes are best-effort; so we just proceed as usual.
+                  -- If we drop a tick due to the issues described below it's
+                  -- not the end of the world.
+
   | tickish `tickishScopesLike` SoftScope
   = (markAllNonTailCalled usage, Tick tickish body')
 
   | Breakpoint _ ids <- tickish
-  = (usage_lam +++ foldr addManyOccs emptyDetails ids, Tick tickish body')
+  = (usage_lam `andUDs` foldr addManyOccs emptyDetails ids, Tick tickish body')
     -- never substitute for any of the Ids in a Breakpoint
 
   | otherwise
@@ -1721,16 +1719,17 @@ occAnal env (Tick tickish body)
                   -- Making j a join point may cause the simplifier to drop t
                   -- (if the tick is put into the continuation). So we don't
                   -- count j 1 as a tail call.
+                  -- See #14242.
 
 occAnal env (Cast expr co)
   = case occAnal env expr of { (usage, expr') ->
     let usage1 = zapDetailsIf (isRhsEnv env) usage
+          -- usage1: if we see let x = y `cast` co
+          -- then mark y as 'Many' so that we don't
+          -- immediately inline y again.
         usage2 = addManyOccsSet usage1 (coVarsOfCo co)
-          -- See Note [Gather occurrences of coercion variables]
+          -- usage2: see Note [Gather occurrences of coercion variables]
     in (markAllNonTailCalled usage2, Cast expr' co)
-        -- If we see let x = y `cast` co
-        -- then mark y as 'Many' so that we don't
-        -- immediately inline y again.
     }
 
 occAnal env app@(App _ _)
@@ -1772,30 +1771,13 @@ occAnal env (Case scrut bndr ty alts)
   = case occ_anal_scrut scrut alts     of { (scrut_usage, scrut') ->
     case mapAndUnzip occ_anal_alt alts of { (alts_usage_s, alts')   ->
     let
-        alts_usage  = foldr combineAltsUsageDetails emptyDetails alts_usage_s
-        (alts_usage1, tagged_bndr) = tag_case_bndr alts_usage bndr
-        total_usage = markAllNonTailCalled scrut_usage +++ alts_usage1
+        alts_usage  = foldr orUDs emptyDetails alts_usage_s
+        (alts_usage1, tagged_bndr) = tagLamBinder alts_usage bndr
+        total_usage = markAllNonTailCalled scrut_usage `andUDs` alts_usage1
                         -- Alts can have tail calls, but the scrutinee can't
     in
     total_usage `seq` (total_usage, Case scrut' tagged_bndr ty alts') }}
   where
-        -- Note [Case binder usage]
-        -- ~~~~~~~~~~~~~~~~~~~~~~~~
-        -- The case binder gets a usage of either "many" or "dead", never "one".
-        -- Reason: we like to inline single occurrences, to eliminate a binding,
-        -- but inlining a case binder *doesn't* eliminate a binding.
-        -- We *don't* want to transform
-        --      case x of w { (p,q) -> f w }
-        -- into
-        --      case x of w { (p,q) -> f (p,q) }
-    tag_case_bndr usage bndr
-      = (usage', setIdOccInfo bndr final_occ_info)
-      where
-        occ_info       = lookupDetails usage bndr
-        usage'         = usage `delDetails` bndr
-        final_occ_info = case occ_info of IAmDead -> IAmDead
-                                          _       -> noOccInfo
-
     alt_env = mkAltEnv env scrut bndr
     occ_anal_alt = occAnalAlt alt_env
 
@@ -1834,7 +1816,7 @@ occAnalArgs env (arg:args) one_shots
   = case argCtxt env one_shots           of { (arg_env, one_shots') ->
     case occAnal arg_env arg             of { (uds1, arg') ->
     case occAnalArgs env args one_shots' of { (uds2, args') ->
-    (uds1 +++ uds2, arg':args') }}}
+    (uds1 `andUDs` uds2, arg':args') }}}
 
 {-
 Applications are dealt with specially because we want
@@ -1860,7 +1842,7 @@ occAnalApp env (Var fun, args, ticks)
   | null ticks = (uds, mkApps (Var fun) args')
   | otherwise  = (uds, mkTicks ticks $ mkApps (Var fun) args')
   where
-    uds = fun_uds +++ final_args_uds
+    uds = fun_uds `andUDs` final_args_uds
 
     !(args_uds, args') = occAnalArgs env args one_shots
     !final_args_uds
@@ -1890,7 +1872,7 @@ occAnalApp env (Var fun, args, ticks)
         -- See Note [Sources of one-shot information], bullet point A']
 
 occAnalApp env (fun, args, ticks)
-  = (markAllNonTailCalled (fun_uds +++ args_uds),
+  = (markAllNonTailCalled (fun_uds `andUDs` args_uds),
      mkTicks ticks $ mkApps fun' args')
   where
     !(fun_uds, fun') = occAnal (addAppCtxt env args) fun
@@ -2024,10 +2006,9 @@ occAnalAlt :: (OccEnv, Maybe (Id, CoreExpr))
 occAnalAlt (env, scrut_bind) (con, bndrs, rhs)
   = case occAnal env rhs of { (rhs_usage1, rhs1) ->
     let
-        (alt_usg, tagged_bndrs) = tagLamBinders rhs_usage1 bndrs
-                                  -- See Note [Binders in case alternatives]
-        (alt_usg', rhs2) =
-          wrapAltRHS env scrut_bind alt_usg tagged_bndrs rhs1
+      (alt_usg, tagged_bndrs) = tagLamBinders rhs_usage1 bndrs
+                                -- See Note [Binders in case alternatives]
+      (alt_usg', rhs2) = wrapAltRHS env scrut_bind alt_usg tagged_bndrs rhs1
     in
     (alt_usg', (con, tagged_bndrs, rhs2)) }
 
@@ -2042,15 +2023,19 @@ wrapAltRHS env (Just (scrut_var, let_rhs)) alt_usg bndrs alt_rhs
   , scrut_var `usedIn` alt_usg -- bndrs are not be present in alt_usg so this
                                -- handles condition (a) in Note [Binder swap]
   , not captured               -- See condition (b) in Note [Binder swap]
-  = ( alt_usg' +++ let_rhs_usg
+  = ( alt_usg' `andUDs` let_rhs_usg
     , Let (NonRec tagged_scrut_var let_rhs') alt_rhs )
   where
-    captured = any (`usedIn` let_rhs_usg) bndrs
+    captured = any (`usedIn` let_rhs_usg) bndrs  -- Check condition (b)
+
     -- The rhs of the let may include coercion variables
     -- if the scrutinee was a cast, so we must gather their
     -- usage. See Note [Gather occurrences of coercion variables]
+    -- Moreover, the rhs of the let may mention the case-binder, and
+    -- we want to gather its occ-info as well
     (let_rhs_usg, let_rhs') = occAnal env let_rhs
-    (alt_usg', [tagged_scrut_var]) = tagLamBinders alt_usg [scrut_var]
+
+    (alt_usg', tagged_scrut_var) = tagLamBinder alt_usg scrut_var
 
 wrapAltRHS _ _ alt_usg _ alt_rhs
   = (alt_usg, alt_rhs)
@@ -2067,8 +2052,12 @@ data OccEnv
   = OccEnv { occ_encl       :: !OccEncl      -- Enclosing context information
            , occ_one_shots  :: !OneShots     -- See Note [OneShots]
            , occ_gbl_scrut  :: GlobalScruts
+
+           , occ_unf_act   :: Id -> Bool   -- Which Id unfoldings are active
+
            , occ_rule_act   :: Activation -> Bool   -- Which rules are active
              -- See Note [Finding rule RHS free vars]
+
            , occ_binder_swap :: !Bool -- enable the binder_swap
              -- See CorePrep Note [Dead code in CorePrep]
     }
@@ -2081,7 +2070,7 @@ type GlobalScruts = IdSet   -- See Note [Binder swap on GlobalId scrutinees]
 --      x = (p,q)               -- Don't inline p or q
 --      y = /\a -> (p a, q a)   -- Still don't inline p or q
 --      z = f (p,q)             -- Do inline p,q; it may make a rule fire
--- So OccEncl tells enought about the context to know what to do when
+-- So OccEncl tells enough about the context to know what to do when
 -- we encounter a constructor application or PAP.
 
 data OccEncl
@@ -2097,12 +2086,15 @@ instance Outputable OccEncl where
 -- See note [OneShots]
 type OneShots = [OneShotInfo]
 
-initOccEnv :: (Activation -> Bool) -> OccEnv
-initOccEnv active_rule
+initOccEnv :: OccEnv
+initOccEnv
   = OccEnv { occ_encl      = OccVanilla
            , occ_one_shots = []
            , occ_gbl_scrut = emptyVarSet
-           , occ_rule_act  = active_rule
+                 -- To be conservative, we say that all
+                 -- inlines and rules are active
+           , occ_unf_act   = \_ -> True
+           , occ_rule_act  = \_ -> True
            , occ_binder_swap = True }
 
 vanillaCtxt :: OccEnv -> OccEnv
@@ -2160,7 +2152,12 @@ markJoinOneShots mb_join_arity bndrs
       Just n  -> go n bndrs
  where
    go 0 bndrs  = bndrs
-   go _ []     = WARN( True, ppr mb_join_arity <+> ppr bndrs ) []
+   go _ []     = [] -- This can legitimately happen.
+                    -- e.g.    let j = case ... in j True
+                    -- This will become an arity-1 join point after the
+                    -- simplifier has eta-expanded it; but it may not have
+                    -- enough lambdas /yet/. (Lint checks that JoinIds do
+                    -- have enough lambdas.)
    go n (b:bs) = b' : go (n-1) bs
      where
        b' | isId b    = setOneShotLambda b
@@ -2298,6 +2295,9 @@ Core Lint never expects to find an *occurrence* of an Id marked
 as Dead, so we must zap the OccInfo on cb before making the
 binding x = cb.  See Trac #5028.
 
+NB: the OccInfo on /occurrences/ really doesn't matter much; the simplifier
+doesn't use it. So this is only to satisfy the perhpas-over-picky Lint.
+
 Historical note [no-case-of-case]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 We *used* to suppress the binder-swap in case expressions when
@@ -2361,10 +2361,10 @@ information right.
 -}
 
 mkAltEnv :: OccEnv -> CoreExpr -> Id -> (OccEnv, Maybe (Id, CoreExpr))
--- Does two things: a) makes the occ_one_shots = OccVanilla
---                  b) extends the GlobalScruts if possible
---                  c) returns a proxy mapping, binding the scrutinee
---                     to the case binder, if possible
+-- Does three things: a) makes the occ_one_shots = OccVanilla
+--                    b) extends the GlobalScruts if possible
+--                    c) returns a proxy mapping, binding the scrutinee
+--                       to the case binder, if possible
 mkAltEnv env@(OccEnv { occ_gbl_scrut = pe }) scrut case_bndr
   = case stripTicksTopE (const True) scrut of
       Var v           -> add_scrut v case_bndr'
@@ -2373,15 +2373,19 @@ mkAltEnv env@(OccEnv { occ_gbl_scrut = pe }) scrut case_bndr
       _               -> (env { occ_encl = OccVanilla }, Nothing)
 
   where
-    add_scrut v rhs = ( env { occ_encl = OccVanilla, occ_gbl_scrut = pe `extendVarSet` v }
+    add_scrut v rhs = ( env { occ_encl = OccVanilla
+                            , occ_gbl_scrut = pe `extendVarSet` v }
                       , Just (localise v, rhs) )
 
-    case_bndr' = Var (zapIdOccInfo case_bndr) -- See Note [Zap case binders in proxy bindings]
-    localise scrut_var = mkLocalIdOrCoVar (localiseName (idName scrut_var)) (idType scrut_var)
-        -- Localise the scrut_var before shadowing it; we're making a
-        -- new binding for it, and it might have an External Name, or
-        -- even be a GlobalId; Note [Binder swap on GlobalId scrutinees]
-        -- Also we don't want any INLINE or NOINLINE pragmas!
+    case_bndr' = Var (zapIdOccInfo case_bndr)
+                   -- See Note [Zap case binders in proxy bindings]
+
+    -- Localise the scrut_var before shadowing it; we're making a
+    -- new binding for it, and it might have an External Name, or
+    -- even be a GlobalId; Note [Binder swap on GlobalId scrutinees]
+    -- Also we don't want any INLINE or NOINLINE pragmas!
+    localise scrut_var = mkLocalIdOrCoVar (localiseName (idName scrut_var))
+                                          (idType scrut_var)
 
 {-
 ************************************************************************
@@ -2426,13 +2430,13 @@ instance Outputable UsageDetails where
 -------------------
 -- UsageDetails API
 
-(+++), combineAltsUsageDetails
+andUDs, orUDs
         :: UsageDetails -> UsageDetails -> UsageDetails
-(+++) = combineUsageDetailsWith addOccInfo
-combineAltsUsageDetails = combineUsageDetailsWith orOccInfo
+andUDs = combineUsageDetailsWith addOccInfo
+orUDs  = combineUsageDetailsWith orOccInfo
 
-combineUsageDetailsList :: [UsageDetails] -> UsageDetails
-combineUsageDetailsList = foldl (+++) emptyDetails
+andUDsList :: [UsageDetails] -> UsageDetails
+andUDsList = foldl' andUDs emptyDetails
 
 mkOneOcc :: OccEnv -> Id -> InterestingCxt -> JoinArity -> UsageDetails
 mkOneOcc env id int_cxt arity
@@ -2581,14 +2585,21 @@ tagLamBinders :: UsageDetails          -- Of scope
               -> [Id]                  -- Binders
               -> (UsageDetails,        -- Details with binders removed
                  [IdWithOccInfo])    -- Tagged binders
+tagLamBinders usage binders
+  = usage' `seq` (usage', bndrs')
+  where
+    (usage', bndrs') = mapAccumR tagLamBinder usage binders
+
+tagLamBinder :: UsageDetails       -- Of scope
+             -> Id                 -- Binder
+             -> (UsageDetails,     -- Details with binder removed
+                 IdWithOccInfo)    -- Tagged binders
 -- Used for lambda and case binders
 -- It copes with the fact that lambda bindings can have a
 -- stable unfolding, used for join points
-tagLamBinders usage binders = usage' `seq` (usage', bndrs')
+tagLamBinder usage bndr
+  = (usage2, bndr')
   where
-    (usage', bndrs') = mapAccumR tag_lam usage binders
-    tag_lam usage bndr = (usage2, bndr')
-      where
         occ    = lookupDetails usage bndr
         bndr'  = setBinderOcc (markNonTailCalled occ) bndr
                    -- Don't try to make an argument into a join point
@@ -2633,7 +2644,7 @@ tagRecBinders lvl body_uds triples
 
      -- 1. Determine join-point-hood of whole group, as determined by
      --    the *unadjusted* usage details
-     unadj_uds     = body_uds +++ combineUsageDetailsList rhs_udss
+     unadj_uds     = foldr andUDs body_uds rhs_udss
      will_be_joins = decideJoinPointHood lvl unadj_uds bndrs
 
      -- 2. Adjust usage details of each RHS, taking into account the
@@ -2650,19 +2661,15 @@ tagRecBinders lvl body_uds triples
            , AlwaysTailCalled arity <- tailCallInfo occ
            = Just arity
            | otherwise
-           = ASSERT(not will_be_joins) -- Should be AlwaysTailCalled if we're
-                                       -- making join points!
-             Nothing
+           = ASSERT(not will_be_joins) -- Should be AlwaysTailCalled if
+             Nothing                   -- we are making join points!
 
      -- 3. Compute final usage details from adjusted RHS details
-     adj_uds   = body_uds +++ combineUsageDetailsList rhs_udss'
+     adj_uds   = foldr andUDs body_uds rhs_udss'
 
-     -- 4. Tag each binder with its adjusted details modulo the
-     --    join-point-hood decision
-     occs      = map (lookupDetails adj_uds) bndrs
-     occs'     | will_be_joins = occs
-               | otherwise     = map markNonTailCalled occs
-     bndrs'    = zipWith setBinderOcc occs' bndrs
+     -- 4. Tag each binder with its adjusted details
+     bndrs'    = [ setBinderOcc (lookupDetails adj_uds bndr) bndr
+                 | bndr <- bndrs ]
 
      -- 5. Drop the binders from the adjusted details and return
      usage'    = adj_uds `delDetailsList` bndrs
@@ -2683,10 +2690,15 @@ setBinderOcc occ_info bndr
 
 -- | Decide whether some bindings should be made into join points or not.
 -- Returns `False` if they can't be join points. Note that it's an
--- all-or-nothing decision, as if multiple binders are given, they're assumed to
--- be mutually recursive.
+-- all-or-nothing decision, as if multiple binders are given, they're
+-- assumed to be mutually recursive.
+--
+-- It must, however, be a final decision. If we say "True" for 'f',
+-- and then subsequently decide /not/ make 'f' into a join point, then
+-- the decision about another binding 'g' might be invalidated if (say)
+-- 'f' tail-calls 'g'.
 --
--- See Note [Invariants for join points] in CoreSyn.
+-- See Note [Invariants on join points] in CoreSyn.
 decideJoinPointHood :: TopLevelFlag -> UsageDetails
                     -> [CoreBndr]
                     -> Bool
@@ -2708,11 +2720,18 @@ decideJoinPointHood NotTopLevel usage bndrs
     ok bndr
       | -- Invariant 1: Only tail calls, all same join arity
         AlwaysTailCalled arity <- tailCallInfo (lookupDetails usage bndr)
+
       , -- Invariant 1 as applied to LHSes of rules
         all (ok_rule arity) (idCoreRules bndr)
+
+        -- Invariant 2a: stable unfoldings
+        -- See Note [Join points and INLINE pragmas]
+      , ok_unfolding arity (realIdUnfolding bndr)
+
         -- Invariant 4: Satisfies polymorphism rule
       , isValidJoinPointType arity (idType bndr)
       = True
+
       | otherwise
       = False
 
@@ -2721,14 +2740,52 @@ decideJoinPointHood NotTopLevel usage bndrs
       = args `lengthIs` join_arity
         -- Invariant 1 as applied to LHSes of rules
 
+    -- ok_unfolding returns False if we should /not/ convert a non-join-id
+    -- into a join-id, even though it is AlwaysTailCalled
+    ok_unfolding join_arity (CoreUnfolding { uf_src = src, uf_tmpl = rhs })
+      = not (isStableSource src && join_arity > joinRhsArity rhs)
+    ok_unfolding _ (DFunUnfolding {})
+      = False
+    ok_unfolding _ _
+      = True
+
 willBeJoinId_maybe :: CoreBndr -> Maybe JoinArity
 willBeJoinId_maybe bndr
-  | AlwaysTailCalled arity <- tailCallInfo (idOccInfo bndr)
-  = Just arity
-  | otherwise
-  = isJoinId_maybe bndr
+  = case tailCallInfo (idOccInfo bndr) of
+      AlwaysTailCalled arity -> Just arity
+      _                      -> isJoinId_maybe bndr
+
+
+{- Note [Join points and INLINE pragmas]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Consider
+   f x = let g = \x. not  -- Arity 1
+             {-# INLINE g #-}
+         in case x of
+              A -> g True True
+              B -> g True False
+              C -> blah2
+
+Here 'g' is always tail-called applied to 2 args, but the stable
+unfolding captured by the INLINE pragma has arity 1.  If we try to
+convert g to be a join point, its unfolding will still have arity 1
+(since it is stable, and we don't meddle with stable unfoldings), and
+Lint will complain (see Note [Invariants on join points], (2a), in
+CoreSyn.  Trac #13413.
+
+Moreover, since g is going to be inlined anyway, there is no benefit
+from making it a join point.
+
+If it is recursive, and uselessly marked INLINE, this will stop us
+making it a join point, which is annoying.  But occasionally
+(notably in class methods; see Note [Instances and loop breakers] in
+TcInstDcls) we mark recursive things as INLINE but the recursion
+unravels; so ignoring INLINE pragmas on recursive things isn't good
+either.
+
+See Invariant 2a of Note [Invariants on join points] in CoreSyn
+
 
-{-
 ************************************************************************
 *                                                                      *
 \subsection{Operations over OccInfo}
@@ -2762,10 +2819,11 @@ orOccInfo (OneOcc { occ_in_lam = in_lam1, occ_int_cxt = int_cxt1
                   , occ_tail   = tail1 })
           (OneOcc { occ_in_lam = in_lam2, occ_int_cxt = int_cxt2
                   , occ_tail   = tail2 })
-  = OneOcc { occ_in_lam  = in_lam1 || in_lam2
-           , occ_one_br  = False -- False, because it occurs in both branches
+  = OneOcc { occ_one_br  = False -- False, because it occurs in both branches
+           , occ_in_lam  = in_lam1 || in_lam2
            , occ_int_cxt = int_cxt1 && int_cxt2
            , occ_tail    = tail1 `andTailCallInfo` tail2 }
+
 orOccInfo a1 a2 = ASSERT( not (isDeadOcc a1 || isDeadOcc a2) )
                   ManyOccs { occ_tail = tailCallInfo a1 `andTailCallInfo`
                                         tailCallInfo a2 }
diff --git a/compiler/simplCore/SAT.hs b/compiler/simplCore/SAT.hs
index 923d3a4209..e9a62d530d 100644
--- a/compiler/simplCore/SAT.hs
+++ b/compiler/simplCore/SAT.hs
@@ -51,6 +51,8 @@ essential to make this work well!
 {-# LANGUAGE CPP #-}
 module SAT ( doStaticArgs ) where
 
+import GhcPrelude
+
 import Var
 import CoreSyn
 import CoreUtils
diff --git a/compiler/simplCore/SetLevels.hs b/compiler/simplCore/SetLevels.hs
index 2b533b73bd..b8212c72f3 100644
--- a/compiler/simplCore/SetLevels.hs
+++ b/compiler/simplCore/SetLevels.hs
@@ -62,6 +62,8 @@ module SetLevels (
 
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import CoreSyn
 import CoreMonad        ( FloatOutSwitches(..) )
 import CoreUtils        ( exprType, exprIsHNF
@@ -79,12 +81,14 @@ import Id
 import IdInfo
 import Var
 import VarSet
+import UniqSet          ( nonDetFoldUniqSet )
 import VarEnv
 import Literal          ( litIsTrivial )
 import Demand           ( StrictSig, Demand, isStrictDmd, splitStrictSig, increaseStrictSigArity )
 import Name             ( getOccName, mkSystemVarName )
 import OccName          ( occNameString )
-import Type             ( Type, mkLamTypes, splitTyConApp_maybe )
+import Type             ( Type, mkLamTypes, splitTyConApp_maybe, tyCoVarsOfType )
+import TyCoRep          ( closeOverKindsDSet )
 import BasicTypes       ( Arity, RecFlag(..), isRec )
 import DataCon          ( dataConOrigResTy )
 import TysWiredIn
@@ -120,7 +124,7 @@ data FloatSpec
   = FloatMe Level       -- Float to just inside the binding
                         --    tagged with this level
   | StayPut Level       -- Stay where it is; binding is
-                        --     tagged with tihs level
+                        --     tagged with this level
 
 floatSpecLevel :: FloatSpec -> Level
 floatSpecLevel (FloatMe l) = l
@@ -399,13 +403,13 @@ lvlApp env orig_expr ((_,AnnVar fn), args)
   , Nothing <- isClassOpId_maybe fn
   =  do { rargs' <- mapM (lvlNonTailMFE env False) rargs
         ; lapp'  <- lvlNonTailMFE env False lapp
-        ; return (foldl App lapp' rargs') }
+        ; return (foldl' App lapp' rargs') }
 
   | otherwise
   = do { (_, args') <- mapAccumLM lvl_arg stricts args
             -- Take account of argument strictness; see
             -- Note [Floating to the top]
-       ; return (foldl App (lookupVar env fn) args') }
+       ; return (foldl' App (lookupVar env fn) args') }
   where
     n_val_args = count (isValArg . deAnnotate) args
     arity      = idArity fn
@@ -446,7 +450,7 @@ lvlApp env _ (fun, args)
      -- arguments and the function.
      do { args' <- mapM (lvlNonTailMFE env False) args
         ; fun'  <- lvlNonTailExpr env fun
-        ; return (foldl App fun' args') }
+        ; return (foldl' App fun' args') }
 
 -------------------------------------------
 lvlCase :: LevelEnv             -- Level of in-scope names/tyvars
@@ -457,7 +461,8 @@ lvlCase :: LevelEnv             -- Level of in-scope names/tyvars
         -> LvlM LevelledExpr    -- Result expression
 lvlCase env scrut_fvs scrut' case_bndr ty alts
   | [(con@(DataAlt {}), bs, body)] <- alts
-  , exprOkForSpeculation scrut'   -- See Note [Check the output scrutinee for okForSpec]
+  , exprOkForSpeculation (deTagExpr scrut')
+                                  -- See Note [Check the output scrutinee for okForSpec]
   , not (isTopLvl dest_lvl)       -- Can't have top-level cases
   , not (floatTopLvlOnly env)     -- Can float anywhere
   =     -- See Note [Floating cases]
@@ -528,7 +533,7 @@ okForSpeculation we must be careful to test the *result* scrutinee ('x'
 in this case), not the *input* one 'y'.  The latter *is* ok for
 speculation here, but the former is not -- and indeed we can't float
 the inner case out, at least not unless x is also evaluated at its
-binding site.
+binding site.  See Trac #5453.
 
 That's why we apply exprOkForSpeculation to scrut' and not to scrut.
 -}
@@ -557,7 +562,8 @@ lvlMFE env _ (_, AnnType ty)
 -- and then inline lvl.  Better just to float out the payload.
 lvlMFE env strict_ctxt (_, AnnTick t e)
   = do { e' <- lvlMFE env strict_ctxt e
-       ; return (Tick t e') }
+       ; let t' = substTickish (le_subst env) t
+       ; return (Tick t' e') }
 
 lvlMFE env strict_ctxt (_, AnnCast e (_, co))
   = do  { e' <- lvlMFE env strict_ctxt e
@@ -625,13 +631,14 @@ lvlMFE env strict_ctxt ann_expr
     expr         = deAnnotate ann_expr
     expr_ty      = exprType expr
     fvs          = freeVarsOf ann_expr
+    fvs_ty       = tyCoVarsOfType expr_ty
     is_bot       = isBottomThunk mb_bot_str
     is_function  = isFunction ann_expr
     mb_bot_str   = exprBotStrictness_maybe expr
                            -- See Note [Bottoming floats]
                            -- esp Bottoming floats (2)
     expr_ok_for_spec = exprOkForSpeculation expr
-    dest_lvl     = destLevel env fvs is_function is_bot False
+    dest_lvl     = destLevel env fvs fvs_ty is_function is_bot False
     abs_vars     = abstractVars dest_lvl env fvs
 
     -- float_is_new_lam: the floated thing will be a new value lambda
@@ -1024,7 +1031,7 @@ lvlBind env (AnnNonRec bndr rhs)
   || isCoVar bndr   -- Difficult to fix up CoVar occurrences (see extendPolyLvlEnv)
                     -- so we will ignore this case for now
   || not (profitableFloat env dest_lvl)
-  || (isTopLvl dest_lvl && not (exprIsTopLevelBindable deann_rhs (idType bndr)))
+  || (isTopLvl dest_lvl && not (exprIsTopLevelBindable deann_rhs bndr_ty))
           -- We can't float an unlifted binding to top level (except
           -- literal strings), so we don't float it at all.  It's a
           -- bit brutal, but unlifted bindings aren't expensive either
@@ -1053,10 +1060,12 @@ lvlBind env (AnnNonRec bndr rhs)
        ; return (NonRec (TB bndr2 (FloatMe dest_lvl)) rhs', env') }
 
   where
+    bndr_ty    = idType bndr
+    ty_fvs     = tyCoVarsOfType bndr_ty
     rhs_fvs    = freeVarsOf rhs
     bind_fvs   = rhs_fvs `unionDVarSet` dIdFreeVars bndr
     abs_vars   = abstractVars dest_lvl env bind_fvs
-    dest_lvl   = destLevel env bind_fvs (isFunction rhs) is_bot is_join
+    dest_lvl   = destLevel env bind_fvs ty_fvs (isFunction rhs) is_bot is_join
 
     deann_rhs  = deAnnotate rhs
     mb_bot_str = exprBotStrictness_maybe deann_rhs
@@ -1147,7 +1156,8 @@ lvlBind env (AnnRec pairs)
                `delDVarSetList`
                 bndrs
 
-    dest_lvl = destLevel env bind_fvs is_fun is_bot is_join
+    ty_fvs   = foldr (unionVarSet . tyCoVarsOfType . idType) emptyVarSet bndrs
+    dest_lvl = destLevel env bind_fvs ty_fvs is_fun is_bot is_join
     abs_vars = abstractVars dest_lvl env bind_fvs
 
 profitableFloat :: LevelEnv -> Level -> Bool
@@ -1260,7 +1270,7 @@ substBndrsSL :: RecFlag -> LevelEnv -> [InVar] -> (LevelEnv, [OutVar])
 -- So named only to avoid the name clash with CoreSubst.substBndrs
 substBndrsSL is_rec env@(LE { le_subst = subst, le_env = id_env }) bndrs
   = ( env { le_subst    = subst'
-          , le_env      = foldl add_id  id_env (bndrs `zip` bndrs') }
+          , le_env      = foldl' add_id  id_env (bndrs `zip` bndrs') }
     , bndrs')
   where
     (subst', bndrs') = case is_rec of
@@ -1310,13 +1320,16 @@ stayPut new_lvl bndr = TB bndr (StayPut new_lvl)
 
   -- Destination level is the max Id level of the expression
   -- (We'll abstract the type variables, if any.)
-destLevel :: LevelEnv -> DVarSet
+destLevel :: LevelEnv
+          -> DVarSet    -- Free vars of the term
+          -> TyCoVarSet -- Free in the /type/ of the term
+                        -- (a subset of the previous argument)
           -> Bool   -- True <=> is function
           -> Bool   -- True <=> is bottom
           -> Bool   -- True <=> is a join point
           -> Level
 -- INVARIANT: if is_join=True then result >= join_ceiling
-destLevel env fvs is_function is_bot is_join
+destLevel env fvs fvs_ty is_function is_bot is_join
   | isTopLvl max_fv_id_level  -- Float even joins if they get to top level
                               -- See Note [Floating join point bindings]
   = tOP_LEVEL
@@ -1328,21 +1341,48 @@ destLevel env fvs is_function is_bot is_join
     else max_fv_id_level
 
   | is_bot              -- Send bottoming bindings to the top
-  = tOP_LEVEL           -- regardless; see Note [Bottoming floats]
+  = as_far_as_poss      -- regardless; see Note [Bottoming floats]
                         -- Esp Bottoming floats (1)
 
   | Just n_args <- floatLams env
   , n_args > 0  -- n=0 case handled uniformly by the 'otherwise' case
   , is_function
   , countFreeIds fvs <= n_args
-  = tOP_LEVEL   -- Send functions to top level; see
-                -- the comments with isFunction
+  = as_far_as_poss  -- Send functions to top level; see
+                    -- the comments with isFunction
 
   | otherwise = max_fv_id_level
   where
-    max_fv_id_level = maxFvLevel isId env fvs -- Max over Ids only; the tyvars
-                                              -- will be abstracted
-    join_ceiling = joinCeilingLevel env
+    join_ceiling    = joinCeilingLevel env
+    max_fv_id_level = maxFvLevel isId env fvs -- Max over Ids only; the
+                                              -- tyvars will be abstracted
+
+    as_far_as_poss = maxFvLevel' isId env fvs_ty
+                     -- See Note [Floating and kind casts]
+
+{- Note [Floating and kind casts]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Consider this
+   case x of
+     K (co :: * ~# k) -> let v :: Int |> co
+                             v = e
+                         in blah
+
+Then, even if we are abstracting over Ids, or if e is bottom, we can't
+float v outside the 'co' binding.  Reason: if we did we'd get
+    v' :: forall k. (Int ~# Age) => Int |> co
+and now 'co' isn't in scope in that type. The underlying reason is
+that 'co' is a value-level thing and we can't abstract over that in a
+type (else we'd get a dependent type).  So if v's /type/ mentions 'co'
+we can't float it out beyond the binding site of 'co'.
+
+That's why we have this as_far_as_poss stuff.  Usually as_far_as_poss
+is just tOP_LEVEL; but occasionally a coercion variable (which is an
+Id) mentioned in type prevents this.
+
+Example Trac #14270 comment:15.
+-}
+
 
 isFunction :: CoreExprWithFVs -> Bool
 -- The idea here is that we want to float *functions* to
@@ -1439,7 +1479,7 @@ addLvl :: Level -> VarEnv Level -> OutVar -> VarEnv Level
 addLvl dest_lvl env v' = extendVarEnv env v' dest_lvl
 
 addLvls :: Level -> VarEnv Level -> [OutVar] -> VarEnv Level
-addLvls dest_lvl env vs = foldl (addLvl dest_lvl) env vs
+addLvls dest_lvl env vs = foldl' (addLvl dest_lvl) env vs
 
 floatLams :: LevelEnv -> Maybe Int
 floatLams le = floatOutLambdas (le_switches le)
@@ -1476,14 +1516,20 @@ placeJoinCeiling le@(LE { le_ctxt_lvl = lvl })
     lvl' = asJoinCeilLvl (incMinorLvl lvl)
 
 maxFvLevel :: (Var -> Bool) -> LevelEnv -> DVarSet -> Level
-maxFvLevel max_me (LE { le_lvl_env = lvl_env, le_env = id_env }) var_set
-  = foldDVarSet max_in tOP_LEVEL var_set
+maxFvLevel max_me env var_set
+  = foldDVarSet (maxIn max_me env) tOP_LEVEL var_set
+
+maxFvLevel' :: (Var -> Bool) -> LevelEnv -> TyCoVarSet -> Level
+-- Same but for TyCoVarSet
+maxFvLevel' max_me env var_set
+  = nonDetFoldUniqSet (maxIn max_me env) tOP_LEVEL var_set
+
+maxIn :: (Var -> Bool) -> LevelEnv -> InVar -> Level -> Level
+maxIn max_me (LE { le_lvl_env = lvl_env, le_env = id_env }) in_var lvl
+  = case lookupVarEnv id_env in_var of
+      Just (abs_vars, _) -> foldr max_out lvl abs_vars
+      Nothing            -> max_out in_var lvl
   where
-    max_in in_var lvl
-       = foldr max_out lvl (case lookupVarEnv id_env in_var of
-                                Just (abs_vars, _) -> abs_vars
-                                Nothing            -> [in_var])
-
     max_out out_var lvl
         | max_me out_var = case lookupVarEnv lvl_env out_var of
                                 Just lvl' -> maxLvl lvl' lvl
@@ -1513,17 +1559,14 @@ abstractVars :: Level -> LevelEnv -> DVarSet -> [OutVar]
         -- Uniques are not deterministic.
 abstractVars dest_lvl (LE { le_subst = subst, le_lvl_env = lvl_env }) in_fvs
   =  -- NB: sortQuantVars might not put duplicates next to each other
-    map zap $ sortQuantVars $ uniq
-    [out_var | out_fv  <- dVarSetElems (substDVarSet subst in_fvs)
-             , out_var <- dVarSetElems (close out_fv)
-             , abstract_me out_var ]
+    map zap $ sortQuantVars $
+    filter abstract_me      $
+    dVarSetElems            $
+    closeOverKindsDSet      $
+    substDVarSet subst in_fvs
         -- NB: it's important to call abstract_me only on the OutIds the
         -- come from substDVarSet (not on fv, which is an InId)
   where
-    uniq :: [Var] -> [Var]
-        -- Remove duplicates, preserving order
-    uniq = dVarSetElems . mkDVarSet
-
     abstract_me v = case lookupVarEnv lvl_env v of
                         Just lvl -> dest_lvl `ltLvl` lvl
                         Nothing  -> False
@@ -1536,12 +1579,6 @@ abstractVars dest_lvl (LE { le_subst = subst, le_lvl_env = lvl_env }) in_fvs
                      setIdInfo v vanillaIdInfo
           | otherwise = v
 
-    close :: Var -> DVarSet  -- Close over variables free in the type
-                             -- Result includes the input variable itself
-    close v = foldDVarSet (unionDVarSet . close)
-                          (unitDVarSet v)
-                          (fvDVarSet $ varTypeTyCoFVs v)
-
 type LvlM result = UniqSM result
 
 initLvl :: UniqSupply -> UniqSM a -> a
@@ -1559,8 +1596,8 @@ newPolyBndrs dest_lvl
       ; let new_bndrs = zipWith mk_poly_bndr bndrs uniqs
             bndr_prs  = bndrs `zip` new_bndrs
             env' = env { le_lvl_env = addLvls dest_lvl lvl_env new_bndrs
-                       , le_subst   = foldl add_subst subst   bndr_prs
-                       , le_env     = foldl add_id    id_env  bndr_prs }
+                       , le_subst   = foldl' add_subst subst   bndr_prs
+                       , le_env     = foldl' add_id    id_env  bndr_prs }
       ; return (env', new_bndrs) }
   where
     add_subst env (v, v') = extendIdSubst env v (mkVarApps (Var v') abs_vars)
@@ -1603,7 +1640,7 @@ newLvlVar lvld_rhs join_arity_maybe is_mk_static
       = mkExportedVanillaId (mkSystemVarName uniq (mkFastString "static_ptr"))
                             rhs_ty
       | otherwise
-      = mkLocalIdOrCoVar (mkSystemVarName uniq (mkFastString "lvl")) rhs_ty
+      = mkSysLocalOrCoVar (mkFastString "lvl") uniq rhs_ty
 
 cloneCaseBndrs :: LevelEnv -> Level -> [Var] -> LvlM (LevelEnv, [Var])
 cloneCaseBndrs env@(LE { le_subst = subst, le_lvl_env = lvl_env, le_env = id_env })
@@ -1614,7 +1651,7 @@ cloneCaseBndrs env@(LE { le_subst = subst, le_lvl_env = lvl_env, le_env = id_env
                         , le_join_ceil = new_lvl
                         , le_lvl_env   = addLvls new_lvl lvl_env vs'
                         , le_subst     = subst'
-                        , le_env       = foldl add_id id_env (vs `zip` vs') }
+                        , le_env       = foldl' add_id id_env (vs `zip` vs') }
 
        ; return (env', vs') }
 
@@ -1636,7 +1673,7 @@ cloneLetVars is_rec
              prs  = vs `zip` vs2
              env' = env { le_lvl_env = addLvls dest_lvl lvl_env vs2
                         , le_subst   = subst'
-                        , le_env     = foldl add_id id_env prs }
+                        , le_env     = foldl' add_id id_env prs }
 
        ; return (env', vs2) }
   where
diff --git a/compiler/simplCore/SimplCore.hs b/compiler/simplCore/SimplCore.hs
index c1513b8af6..168ece971c 100644
--- a/compiler/simplCore/SimplCore.hs
+++ b/compiler/simplCore/SimplCore.hs
@@ -10,12 +10,15 @@ module SimplCore ( core2core, simplifyExpr ) where
 
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import DynFlags
 import CoreSyn
 import HscTypes
 import CSE              ( cseProgram )
 import Rules            ( mkRuleBase, unionRuleBase,
-                          extendRuleBaseList, ruleCheckProgram, addRuleInfo, )
+                          extendRuleBaseList, ruleCheckProgram, addRuleInfo,
+                          getRules )
 import PprCore          ( pprCoreBindings, pprCoreExpr )
 import OccurAnal        ( occurAnalysePgm, occurAnalyseExpr )
 import IdInfo
@@ -24,7 +27,7 @@ import CoreUtils        ( mkTicks, stripTicksTop )
 import CoreLint         ( endPass, lintPassResult, dumpPassResult,
                           lintAnnots )
 import Simplify         ( simplTopBinds, simplExpr, simplRules )
-import SimplUtils       ( simplEnvForGHCi, activeRule )
+import SimplUtils       ( simplEnvForGHCi, activeRule, activeUnfolding )
 import SimplEnv
 import SimplMonad
 import CoreMonad
@@ -34,7 +37,7 @@ import FloatOut         ( floatOutwards )
 import FamInstEnv
 import Id
 import ErrUtils         ( withTiming )
-import BasicTypes       ( CompilerPhase(..), isDefaultInlinePragma )
+import BasicTypes       ( CompilerPhase(..), isDefaultInlinePragma, defaultInlinePragma )
 import VarSet
 import VarEnv
 import LiberateCase     ( liberateCase )
@@ -43,26 +46,19 @@ import Specialise       ( specProgram)
 import SpecConstr       ( specConstrProgram)
 import DmdAnal          ( dmdAnalProgram )
 import CallArity        ( callArityAnalProgram )
+import Exitify          ( exitifyProgram )
 import WorkWrap         ( wwTopBinds )
-import Vectorise        ( vectorise )
 import SrcLoc
 import Util
 import Module
+import Plugins          ( withPlugins, installCoreToDos )
+import DynamicLoading  -- ( initializePlugins )
 
-import Maybes
 import UniqSupply       ( UniqSupply, mkSplitUniqSupply, splitUniqSupply )
 import UniqFM
 import Outputable
 import Control.Monad
 import qualified GHC.LanguageExtensions as LangExt
-
-#if defined(GHCI)
-import DynamicLoading   ( loadPlugins )
-import Plugins          ( installCoreToDos )
-#else
-import DynamicLoading   ( pluginError )
-#endif
-
 {-
 ************************************************************************
 *                                                                      *
@@ -84,7 +80,12 @@ core2core hsc_env guts@(ModGuts { mg_module  = mod
        ;
        ; (guts2, stats) <- runCoreM hsc_env hpt_rule_base us mod
                                     orph_mods print_unqual loc $
-                           do { all_passes <- addPluginPasses builtin_passes
+                           do { hsc_env' <- getHscEnv
+                              ; dflags' <- liftIO $ initializePlugins hsc_env'
+                                                      (hsc_dflags hsc_env')
+                              ; all_passes <- withPlugins dflags'
+                                                installCoreToDos
+                                                builtin_passes
                               ; runCorePasses all_passes guts }
 
        ; Err.dumpIfSet_dyn dflags Opt_D_dump_simpl_stats
@@ -120,6 +121,7 @@ getCoreToDo dflags
     max_iter      = maxSimplIterations dflags
     rule_check    = ruleCheck          dflags
     call_arity    = gopt Opt_CallArity                    dflags
+    exitification = gopt Opt_Exitification                dflags
     strictness    = gopt Opt_Strictness                   dflags
     full_laziness = gopt Opt_FullLaziness                 dflags
     do_specialise = gopt Opt_Specialise                   dflags
@@ -128,11 +130,11 @@ getCoreToDo dflags
     spec_constr   = gopt Opt_SpecConstr                   dflags
     liberate_case = gopt Opt_LiberateCase                 dflags
     late_dmd_anal = gopt Opt_LateDmdAnal                  dflags
+    late_specialise = gopt Opt_LateSpecialise             dflags
     static_args   = gopt Opt_StaticArgumentTransformation dflags
     rules_on      = gopt Opt_EnableRewriteRules           dflags
     eta_expand_on = gopt Opt_DoLambdaEtaExpansion         dflags
     ww_on         = gopt Opt_WorkerWrapper                dflags
-    vectorise_on  = gopt Opt_Vectorise                    dflags
     static_ptrs   = xopt LangExt.StaticPointers           dflags
 
     maybe_rule_check phase = runMaybe rule_check (CoreDoRuleCheck phase)
@@ -142,6 +144,7 @@ getCoreToDo dflags
 
     base_mode = SimplMode { sm_phase      = panic "base_mode"
                           , sm_names      = []
+                          , sm_dflags     = dflags
                           , sm_rules      = rules_on
                           , sm_eta_expand = eta_expand_on
                           , sm_inline     = True
@@ -156,30 +159,6 @@ getCoreToDo dflags
 
           , maybe_rule_check (Phase phase) ]
 
-          -- Vectorisation can introduce a fair few common sub expressions involving
-          --  DPH primitives. For example, see the Reverse test from dph-examples.
-          --  We need to eliminate these common sub expressions before their definitions
-          --  are inlined in phase 2. The CSE introduces lots of  v1 = v2 bindings,
-          --  so we also run simpl_gently to inline them.
-      ++  (if vectorise_on && phase == 3
-            then [CoreCSE, simpl_gently]
-            else [])
-
-    vectorisation
-      = runWhen vectorise_on $
-          CoreDoPasses [ simpl_gently, CoreDoVectorisation ]
-
-                -- By default, we have 2 phases before phase 0.
-
-                -- Want to run with inline phase 2 after the specialiser to give
-                -- maximum chance for fusion to work before we inline build/augment
-                -- in phase 1.  This made a difference in 'ansi' where an
-                -- overloaded function wasn't inlined till too late.
-
-                -- Need phase 1 so that build/augment get
-                -- inlined.  I found that spectral/hartel/genfft lost some useful
-                -- strictness in the function sumcode' if augment is not inlined
-                -- before strictness analysis runs
     simpl_phases = CoreDoPasses [ simpl_phase phase ["main"] max_iter
                                 | phase <- [phases, phases-1 .. 1] ]
 
@@ -189,7 +168,7 @@ getCoreToDo dflags
                        (base_mode { sm_phase = InitialPhase
                                   , sm_names = ["Gentle"]
                                   , sm_rules = rules_on   -- Note [RULEs enabled in SimplGently]
-                                  , sm_inline = not vectorise_on
+                                  , sm_inline = True
                                               -- See Note [Inline in InitialPhase]
                                   , sm_case_case = False })
                           -- Don't do case-of-case transformations.
@@ -222,8 +201,7 @@ getCoreToDo dflags
 
     core_todo =
      if opt_level == 0 then
-       [ vectorisation,
-         static_ptrs_float_outwards,
+       [ static_ptrs_float_outwards,
          CoreDoSimplify max_iter
              (base_mode { sm_phase = Phase 0
                         , sm_names = ["Non-opt simplification"] })
@@ -237,10 +215,6 @@ getCoreToDo dflags
     -- after this before anything else
         runWhen static_args (CoreDoPasses [ simpl_gently, CoreDoStaticArgs ]),
 
-        -- We run vectorisation here for now, but we might also try to run
-        -- it later
-        vectorisation,
-
         -- initial simplify: mk specialiser happy: minimum effort please
         simpl_gently,
 
@@ -305,6 +279,9 @@ getCoreToDo dflags
 
         runWhen strictness demand_analyser,
 
+        runWhen exitification CoreDoExitify,
+            -- See note [Placement of the exitification pass]
+
         runWhen full_laziness $
            CoreDoFloatOutwards FloatOutSwitches {
                                  floatOutLambdas     = floatLamArgs dflags,
@@ -340,6 +317,16 @@ getCoreToDo dflags
 
         maybe_rule_check (Phase 0),
 
+        runWhen late_specialise
+          (CoreDoPasses [ CoreDoSpecialising
+                        , simpl_phase 0 ["post-late-spec"] max_iter]),
+
+        -- LiberateCase can yield new CSE opportunities because it peels
+        -- off one layer of a recursive function (concretely, I saw this
+        -- in wheel-sieve1), and I'm guessing that SpecConstr can too
+        -- And CSE is a very cheap pass. So it seems worth doing here.
+        runWhen ((liberate_case || spec_constr) && cse) CoreCSE,
+
         -- Final clean-up simplification:
         simpl_phase 0 ["final"] max_iter,
 
@@ -365,24 +352,6 @@ getCoreToDo dflags
       flatten_todos passes ++ flatten_todos rest
     flatten_todos (todo : rest) = todo : flatten_todos rest
 
--- Loading plugins
-
-addPluginPasses :: [CoreToDo] -> CoreM [CoreToDo]
-#if !defined(GHCI)
-addPluginPasses builtin_passes
-  = do { dflags <- getDynFlags
-       ; let pluginMods = pluginModNames dflags
-       ; unless (null pluginMods) (pluginError pluginMods)
-       ; return builtin_passes }
-#else
-addPluginPasses builtin_passes
-  = do { hsc_env <- getHscEnv
-       ; named_plugins <- liftIO (loadPlugins hsc_env)
-       ; foldM query_plug builtin_passes named_plugins }
-  where
-    query_plug todos (_, plug, options) = installCoreToDos plug options todos
-#endif
-
 {- Note [Inline in InitialPhase]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 In GHC 8 and earlier we did not inline anything in the InitialPhase. But that is
@@ -473,6 +442,9 @@ doCorePass CoreDoStaticArgs          = {-# SCC "StaticArgs" #-}
 doCorePass CoreDoCallArity           = {-# SCC "CallArity" #-}
                                        doPassD callArityAnalProgram
 
+doCorePass CoreDoExitify             = {-# SCC "Exitify" #-}
+                                       doPass exitifyProgram
+
 doCorePass CoreDoStrictness          = {-# SCC "NewStranal" #-}
                                        doPassDFM dmdAnalProgram
 
@@ -485,9 +457,6 @@ doCorePass CoreDoSpecialising        = {-# SCC "Specialise" #-}
 doCorePass CoreDoSpecConstr          = {-# SCC "SpecConstr" #-}
                                        specConstrProgram
 
-doCorePass CoreDoVectorisation       = {-# SCC "Vectorise" #-}
-                                       vectorise
-
 doCorePass CoreDoPrintCore              = observe   printCore
 doCorePass (CoreDoRuleCheck phase pat)  = ruleCheckPass phase pat
 doCorePass CoreDoNothing                = return
@@ -495,9 +464,15 @@ doCorePass (CoreDoPasses passes)        = runCorePasses passes
 
 #if defined(GHCI)
 doCorePass (CoreDoPluginPass _ pass) = {-# SCC "Plugin" #-} pass
+#else
+doCorePass pass@CoreDoPluginPass {}  = pprPanic "doCorePass" (ppr pass)
 #endif
 
-doCorePass pass = pprPanic "doCorePass" (ppr pass)
+doCorePass pass@CoreDesugar          = pprPanic "doCorePass" (ppr pass)
+doCorePass pass@CoreDesugarOpt       = pprPanic "doCorePass" (ppr pass)
+doCorePass pass@CoreTidy             = pprPanic "doCorePass" (ppr pass)
+doCorePass pass@CorePrep             = pprPanic "doCorePass" (ppr pass)
+doCorePass pass@CoreOccurAnal        = pprPanic "doCorePass" (ppr pass)
 
 {-
 ************************************************************************
@@ -519,10 +494,12 @@ ruleCheckPass current_phase pat guts =
     { rb <- getRuleBase
     ; dflags <- getDynFlags
     ; vis_orphs <- getVisibleOrphanMods
+    ; let rule_fn fn = getRules (RuleEnv rb vis_orphs) fn
+                        ++ (mg_rules guts)
     ; liftIO $ putLogMsg dflags NoReason Err.SevDump noSrcSpan
                    (defaultDumpStyle dflags)
                    (ruleCheckProgram current_phase pat
-                      (RuleEnv rb vis_orphs) (mg_binds guts))
+                      rule_fn (mg_binds guts))
     ; return guts }
 
 doPassDUM :: (DynFlags -> UniqSupply -> CoreProgram -> IO CoreProgram) -> ModGuts -> CoreM ModGuts
@@ -619,7 +596,7 @@ simplExprGently :: SimplEnv -> CoreExpr -> SimplM CoreExpr
 --      (b) the LHS and RHS of a RULE
 --      (c) Template Haskell splices
 --
--- The name 'Gently' suggests that the SimplifierMode is SimplGently,
+-- The name 'Gently' suggests that the SimplMode is SimplGently,
 -- and in fact that is so.... but the 'Gently' in simplExprGently doesn't
 -- enforce that; it just simplifies the expression twice
 
@@ -679,7 +656,8 @@ simplifyPgmIO pass@(CoreDoSimplify max_iterations mode)
     dflags       = hsc_dflags hsc_env
     print_unqual = mkPrintUnqualified dflags rdr_env
     simpl_env    = mkSimplEnv mode
-    active_rule  = activeRule simpl_env
+    active_rule  = activeRule mode
+    active_unf   = activeUnfolding mode
 
     do_iteration :: UniqSupply
                  -> Int          -- Counts iterations
@@ -711,30 +689,9 @@ simplifyPgmIO pass@(CoreDoSimplify max_iterations mode)
       , () <- sz `seq` ()     -- Force it
       = do {
                 -- Occurrence analysis
-           let {   -- Note [Vectorisation declarations and occurrences]
-                   -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-                   -- During the 'InitialPhase' (i.e., before vectorisation), we need to make sure
-                   -- that the right-hand sides of vectorisation declarations are taken into
-                   -- account during occurrence analysis. After the 'InitialPhase', we need to ensure
-                   -- that the binders representing variable vectorisation declarations are kept alive.
-                   -- (In contrast to automatically vectorised variables, their unvectorised versions
-                   -- don't depend on them.)
-                 vectVars = mkVarSet $
-                              catMaybes [ fmap snd $ lookupDVarEnv (vectInfoVar (mg_vect_info guts)) bndr
-                                        | Vect bndr _ <- mg_vect_decls guts]
-                              ++
-                              catMaybes [ fmap snd $ lookupDVarEnv (vectInfoVar (mg_vect_info guts)) bndr
-                                        | bndr <- bindersOfBinds binds]
-                                        -- FIXME: This second comprehensions is only needed as long as we
-                                        --        have vectorised bindings where we get "Could NOT call
-                                        --        vectorised from original version".
-              ;  (maybeVects, maybeVectVars)
-                   = case sm_phase mode of
-                       InitialPhase -> (mg_vect_decls guts, vectVars)
-                       _            -> ([], vectVars)
-               ; tagged_binds = {-# SCC "OccAnal" #-}
-                     occurAnalysePgm this_mod active_rule rules
-                                     maybeVects maybeVectVars binds
+           let { tagged_binds = {-# SCC "OccAnal" #-}
+                     occurAnalysePgm this_mod active_unf active_rule rules
+                                     binds
                } ;
            Err.dumpIfSet_dyn dflags Opt_D_dump_occur_anal "Occurrence analysis"
                      (pprCoreBindings tagged_binds);
@@ -754,18 +711,19 @@ simplifyPgmIO pass@(CoreDoSimplify max_iterations mode)
                 -- Simplify the program
            ((binds1, rules1), counts1) <-
              initSmpl dflags (mkRuleEnv rule_base2 vis_orphs) fam_envs us1 sz $
-               do { env1 <- {-# SCC "SimplTopBinds" #-}
-                            simplTopBinds simpl_env tagged_binds
+               do { (floats, env1) <- {-# SCC "SimplTopBinds" #-}
+                                      simplTopBinds simpl_env tagged_binds
 
                       -- Apply the substitution to rules defined in this module
                       -- for imported Ids.  Eg  RULE map my_f = blah
                       -- If we have a substitution my_f :-> other_f, we'd better
                       -- apply it to the rule to, or it'll never match
-                  ; rules1 <- simplRules env1 Nothing rules
+                  ; rules1 <- simplRules env1 Nothing rules Nothing
 
-                  ; return (getFloatBinds env1, rules1) } ;
+                  ; return (getTopFloatBinds floats, rules1) } ;
 
                 -- Stop if nothing happened; don't dump output
+                -- See Note [Which transformations are innocuous] in CoreMonad
            if isZeroSimplCount counts1 then
                 return ( "Simplifier reached fixed point", iteration_no
                        , totalise (counts1 : counts_so_far)  -- Include "free" ticks
@@ -838,16 +796,6 @@ save a gratuitous jump (from \tr{x_exported} to \tr{x_local}), and
 makes strictness information propagate better.  This used to happen in
 the final phase, but it's tidier to do it here.
 
-Note [Transferring IdInfo]
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-We want to propagage any useful IdInfo on x_local to x_exported.
-
-STRICTNESS: if we have done strictness analysis, we want the strictness info on
-x_local to transfer to x_exported.  Hence the copyIdInfo call.
-
-RULES: we want to *add* any RULES for x_local to x_exported.
-
-
 Note [Messing up the exported Id's RULES]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 We must be careful about discarding (obviously) or even merging the
@@ -941,7 +889,6 @@ unfolding for something.
 
 Note [Indirection zapping and ticks]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 Unfortunately this is another place where we need a special case for
 ticks. The following happens quite regularly:
 
@@ -981,12 +928,18 @@ shortOutIndirections binds
     zap (Rec pairs)       = [Rec (concatMap zapPair pairs)]
 
     zapPair (bndr, rhs)
-        | bndr `elemVarSet` exp_id_set = []
+        | bndr `elemVarSet` exp_id_set
+        = []   -- Kill the exported-id binding
+
         | Just (exp_id, ticks) <- lookupVarEnv ind_env bndr
-                                       = [(transferIdInfo exp_id bndr,
-                                           mkTicks ticks rhs),
-                                          (bndr, Var exp_id)]
-        | otherwise                    = [(bndr,rhs)]
+        , (exp_id', lcl_id') <- transferIdInfo exp_id bndr
+        =      -- Turn a local-id binding into two bindings
+               --    exp_id = rhs; lcl_id = exp_id
+          [ (exp_id', mkTicks ticks rhs),
+            (lcl_id', Var exp_id') ]
+
+        | otherwise
+        = [(bndr,rhs)]
 
 makeIndEnv :: [CoreBind] -> IndEnv
 makeIndEnv binds
@@ -1039,16 +992,32 @@ hasShortableIdInfo id
      info = idInfo id
 
 -----------------
-transferIdInfo :: Id -> Id -> Id
+{- Note [Transferring IdInfo]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If we have
+     lcl_id = e; exp_id = lcl_id
+
+and lcl_id has useful IdInfo, we don't want to discard it by going
+     gbl_id = e; lcl_id = gbl_id
+
+Instead, transfer IdInfo from lcl_id to exp_id, specifically
+* (Stable) unfolding
+* Strictness
+* Rules
+* Inline pragma
+
+Overwriting, rather than merging, seems to work ok.
+
+We also zap the InlinePragma on the lcl_id. It might originally
+have had a NOINLINE, which we have now transferred; and we really
+want the lcl_id to inline now that its RHS is trivial!
+-}
+
+transferIdInfo :: Id -> Id -> (Id, Id)
 -- See Note [Transferring IdInfo]
--- If we have
---      lcl_id = e; exp_id = lcl_id
--- and lcl_id has useful IdInfo, we don't want to discard it by going
---      gbl_id = e; lcl_id = gbl_id
--- Instead, transfer IdInfo from lcl_id to exp_id
--- Overwriting, rather than merging, seems to work ok.
 transferIdInfo exported_id local_id
-  = modifyIdInfo transfer exported_id
+  = ( modifyIdInfo transfer exported_id
+    , local_id `setInlinePragma` defaultInlinePragma )
   where
     local_info = idInfo local_id
     transfer exp_info = exp_info `setStrictnessInfo`    strictnessInfo local_info
diff --git a/compiler/simplCore/SimplEnv.hs b/compiler/simplCore/SimplEnv.hs
index 9316ec08af..1d55f359fa 100644
--- a/compiler/simplCore/SimplEnv.hs
+++ b/compiler/simplCore/SimplEnv.hs
@@ -8,14 +8,14 @@
 
 module SimplEnv (
         -- * The simplifier mode
-        setMode, getMode, updMode,
+        setMode, getMode, updMode, seDynFlags,
 
         -- * Environments
-        SimplEnv(..), StaticEnv, pprSimplEnv,   -- Temp not abstract
+        SimplEnv(..), pprSimplEnv,   -- Temp not abstract
         mkSimplEnv, extendIdSubst,
         SimplEnv.extendTvSubst, SimplEnv.extendCvSubst,
         zapSubstEnv, setSubstEnv,
-        getInScope, setInScopeAndZapFloats,
+        getInScope, setInScopeFromE, setInScopeFromF,
         setInScopeSet, modifyInScope, addNewInScopeIds,
         getSimplRules,
 
@@ -29,19 +29,26 @@ module SimplEnv (
         substCo, substCoVar,
 
         -- * Floats
-        Floats, emptyFloats, isEmptyFloats,
-        addNonRec, addLetFloats, addFloats, extendFloats, addFlts,
-        wrapFloats, setFloats, zapFloats, addRecFloats, mapFloats,
-        doFloatFromRhs, getFloatBinds,
-
-        JoinFloat, JoinFloats, emptyJoinFloats, isEmptyJoinFloats,
-        wrapJoinFloats, wrapJoinFloatsX, zapJoinFloats, addJoinFloats
+        SimplFloats(..), emptyFloats, mkRecFloats,
+        mkFloatBind, addLetFloats, addJoinFloats, addFloats,
+        extendFloats, wrapFloats,
+        doFloatFromRhs, getTopFloatBinds,
+
+        -- * LetFloats
+        LetFloats, letFloatBinds, emptyLetFloats, unitLetFloat,
+        addLetFlts,  mapLetFloats,
+
+        -- * JoinFloats
+        JoinFloat, JoinFloats, emptyJoinFloats,
+        wrapJoinFloats, wrapJoinFloatsX, unitJoinFloat, addJoinFlts
     ) where
 
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import SimplMonad
-import CoreMonad                ( SimplifierMode(..) )
+import CoreMonad                ( SimplMode(..) )
 import CoreSyn
 import CoreUtils
 import Var
@@ -50,6 +57,7 @@ import VarSet
 import OrdList
 import Id
 import MkCore                   ( mkWildValBinder )
+import DynFlags                 ( DynFlags )
 import TysWiredIn
 import qualified Type
 import Type hiding              ( substTy, substTyVar, substTyVarBndr )
@@ -77,12 +85,12 @@ data SimplEnv
      -- Static in the sense of lexically scoped,
      -- wrt the original expression
 
-        seMode      :: SimplifierMode,
+        seMode      :: SimplMode
 
         -- The current substitution
-        seTvSubst   :: TvSubstEnv,      -- InTyVar |--> OutType
-        seCvSubst   :: CvSubstEnv,      -- InCoVar |--> OutCoercion
-        seIdSubst   :: SimplIdSubst,    -- InId    |--> OutExpr
+      , seTvSubst   :: TvSubstEnv      -- InTyVar |--> OutType
+      , seCvSubst   :: CvSubstEnv      -- InCoVar |--> OutCoercion
+      , seIdSubst   :: SimplIdSubst    -- InId    |--> OutExpr
 
      ----------- Dynamic part of the environment -----------
      -- Dynamic in the sense of describing the setup where
@@ -90,23 +98,40 @@ data SimplEnv
 
         -- The current set of in-scope variables
         -- They are all OutVars, and all bound in this module
-        seInScope   :: InScopeSet,      -- OutVars only
-                -- Includes all variables bound
-                -- by seLetFloats and seJoinFloats
+      , seInScope   :: InScopeSet       -- OutVars only
+    }
 
-        -- Ordinary bindings
-        seLetFloats  :: Floats,
-                -- See Note [Simplifier floats]
+data SimplFloats
+  = SimplFloats
+      { -- Ordinary let bindings
+        sfLetFloats  :: LetFloats
+                -- See Note [LetFloats]
 
         -- Join points
-        seJoinFloats :: JoinFloats
+      , sfJoinFloats :: JoinFloats
                 -- Handled separately; they don't go very far
-                -- We consider these to be /inside/ seLetFloats
+                -- We consider these to be /inside/ sfLetFloats
                 -- because join points can refer to ordinary bindings,
                 -- but not vice versa
-    }
 
-type StaticEnv = SimplEnv       -- Just the static part is relevant
+        -- Includes all variables bound by sfLetFloats and
+        -- sfJoinFloats, plus at least whatever is in scope where
+        -- these bindings land up.
+      , sfInScope :: InScopeSet  -- All OutVars
+      }
+
+instance Outputable SimplFloats where
+  ppr (SimplFloats { sfLetFloats = lf, sfJoinFloats = jf, sfInScope = is })
+    = text "SimplFloats"
+      <+> braces (vcat [ text "lets: " <+> ppr lf
+                       , text "joins:" <+> ppr jf
+                       , text "in_scope:" <+> ppr is ])
+
+emptyFloats :: SimplEnv -> SimplFloats
+emptyFloats env
+  = SimplFloats { sfLetFloats  = emptyLetFloats
+                , sfJoinFloats = emptyJoinFloats
+                , sfInScope    = seInScope env }
 
 pprSimplEnv :: SimplEnv -> SDoc
 -- Used for debugging; selective
@@ -241,12 +266,10 @@ need to know at the occurrence site that the variable is a join point
 so that we know to drop the context. Thus we remember which join
 points we're substituting. -}
 
-mkSimplEnv :: SimplifierMode -> SimplEnv
+mkSimplEnv :: SimplMode -> SimplEnv
 mkSimplEnv mode
   = SimplEnv { seMode = mode
              , seInScope = init_in_scope
-             , seLetFloats = emptyFloats
-             , seJoinFloats = emptyJoinFloats
              , seTvSubst = emptyVarEnv
              , seCvSubst = emptyVarEnv
              , seIdSubst = emptyVarEnv }
@@ -276,13 +299,16 @@ wild-ids before doing much else.
 It's a very dark corner of GHC.  Maybe it should be cleaned up.
 -}
 
-getMode :: SimplEnv -> SimplifierMode
+getMode :: SimplEnv -> SimplMode
 getMode env = seMode env
 
-setMode :: SimplifierMode -> SimplEnv -> SimplEnv
+seDynFlags :: SimplEnv -> DynFlags
+seDynFlags env = sm_dflags (seMode env)
+
+setMode :: SimplMode -> SimplEnv -> SimplEnv
 setMode mode env = env { seMode = mode }
 
-updMode :: (SimplifierMode -> SimplifierMode) -> SimplEnv -> SimplEnv
+updMode :: (SimplMode -> SimplMode) -> SimplEnv -> SimplEnv
 updMode upd env = env { seMode = upd (seMode env) }
 
 ---------------------
@@ -293,7 +319,7 @@ extendIdSubst env@(SimplEnv {seIdSubst = subst}) var res
 
 extendTvSubst :: SimplEnv -> TyVar -> Type -> SimplEnv
 extendTvSubst env@(SimplEnv {seTvSubst = tsubst}) var res
-  = ASSERT( isTyVar var )
+  = ASSERT2( isTyVar var, ppr var $$ ppr res )
     env {seTvSubst = extendVarEnv tsubst var res}
 
 extendCvSubst :: SimplEnv -> CoVar -> Coercion -> SimplEnv
@@ -308,19 +334,12 @@ getInScope env = seInScope env
 setInScopeSet :: SimplEnv -> InScopeSet -> SimplEnv
 setInScopeSet env in_scope = env {seInScope = in_scope}
 
-setInScopeAndZapFloats :: SimplEnv -> SimplEnv -> SimplEnv
--- Set the in-scope set, and *zap* the floats
-setInScopeAndZapFloats env env_with_scope
-  = env { seInScope    = seInScope env_with_scope,
-          seLetFloats  = emptyFloats,
-          seJoinFloats = emptyJoinFloats }
+setInScopeFromE :: SimplEnv -> SimplEnv -> SimplEnv
+-- See Note [Setting the right in-scope set]
+setInScopeFromE rhs_env here_env = rhs_env { seInScope = seInScope here_env }
 
-setFloats :: SimplEnv -> SimplEnv -> SimplEnv
--- Set the in-scope set *and* the floats
-setFloats env env_with_floats
-  = env { seInScope    = seInScope env_with_floats,
-          seLetFloats  = seLetFloats  env_with_floats,
-          seJoinFloats = seJoinFloats env_with_floats }
+setInScopeFromF :: SimplEnv -> SimplFloats -> SimplEnv
+setInScopeFromF env floats = env { seInScope = sfInScope floats }
 
 addNewInScopeIds :: SimplEnv -> [CoreBndr] -> SimplEnv
         -- The new Ids are guaranteed to be freshly allocated
@@ -340,6 +359,30 @@ modifyInScope :: SimplEnv -> CoreBndr -> SimplEnv
 modifyInScope env@(SimplEnv {seInScope = in_scope}) v
   = env {seInScope = extendInScopeSet in_scope v}
 
+{- Note [Setting the right in-scope set]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Consider
+  \x. (let x = e in b) arg[x]
+where the let shadows the lambda.  Really this means something like
+  \x1. (let x2 = e in b) arg[x1]
+
+- When we capture the 'arg' in an ApplyToVal continuation, we capture
+  the environment, which says what 'x' is bound to, namely x1
+
+- Then that continuation gets pushed under the let
+
+- Finally we simplify 'arg'.  We want
+     - the static, lexical environment bindig x :-> x1
+     - the in-scopeset from "here", under the 'let' which includes
+       both x1 and x2
+
+It's important to have the right in-scope set, else we may rename a
+variable to one that is already in scope.  So we must pick up the
+in-scope set from "here", but otherwise use the environment we
+captured along with 'arg'.  This transfer of in-scope set is done by
+setInScopeFromE.
+-}
+
 ---------------------
 zapSubstEnv :: SimplEnv -> SimplEnv
 zapSubstEnv env = env {seTvSubst = emptyVarEnv, seCvSubst = emptyVarEnv, seIdSubst = emptyVarEnv}
@@ -353,13 +396,13 @@ mkContEx (SimplEnv { seTvSubst = tvs, seCvSubst = cvs, seIdSubst = ids }) e = Co
 {-
 ************************************************************************
 *                                                                      *
-\subsection{Floats}
+\subsection{LetFloats}
 *                                                                      *
 ************************************************************************
 
-Note [Simplifier floats]
-~~~~~~~~~~~~~~~~~~~~~~~~~
-The Floats is a bunch of bindings, classified by a FloatFlag.
+Note [LetFloats]
+~~~~~~~~~~~~~~~~
+The LetFloats is a bunch of bindings, classified by a FloatFlag.
 
 * All of them satisfy the let/app invariant
 
@@ -378,8 +421,8 @@ Can't happen:
   NonRec x# (f y)       -- Might diverge; does not satisfy let/app
 -}
 
-data Floats = Floats (OrdList OutBind) FloatFlag
-        -- See Note [Simplifier floats]
+data LetFloats = LetFloats (OrdList OutBind) FloatFlag
+                 -- See Note [LetFloats]
 
 type JoinFloat  = OutBind
 type JoinFloats = OrdList JoinFloat
@@ -401,12 +444,12 @@ data FloatFlag
                 --      and not guaranteed cheap
                 --      Do not float these bindings out of a lazy let
 
-instance Outputable Floats where
-  ppr (Floats binds ff) = ppr ff $$ ppr (fromOL binds)
+instance Outputable LetFloats where
+  ppr (LetFloats binds ff) = ppr ff $$ ppr (fromOL binds)
 
 instance Outputable FloatFlag where
-  ppr FltLifted = text "FltLifted"
-  ppr FltOkSpec = text "FltOkSpec"
+  ppr FltLifted  = text "FltLifted"
+  ppr FltOkSpec  = text "FltOkSpec"
   ppr FltCareful = text "FltCareful"
 
 andFF :: FloatFlag -> FloatFlag -> FloatFlag
@@ -415,9 +458,9 @@ andFF FltOkSpec  FltCareful = FltCareful
 andFF FltOkSpec  _          = FltOkSpec
 andFF FltLifted  flt        = flt
 
-doFloatFromRhs :: TopLevelFlag -> RecFlag -> Bool -> OutExpr -> SimplEnv -> Bool
+doFloatFromRhs :: TopLevelFlag -> RecFlag -> Bool -> SimplFloats -> OutExpr -> Bool
 -- If you change this function look also at FloatIn.noFloatFromRhs
-doFloatFromRhs lvl rec str rhs (SimplEnv {seLetFloats = Floats fs ff})
+doFloatFromRhs lvl rec str (SimplFloats { sfLetFloats = LetFloats fs ff }) rhs
   =  not (isNilOL fs) && want_to_float && can_float
   where
      want_to_float = isTopLevel lvl || exprIsCheap rhs || exprIsExpandable rhs
@@ -439,23 +482,23 @@ But there are
 so we must take the 'or' of the two.
 -}
 
-emptyFloats :: Floats
-emptyFloats = Floats nilOL FltLifted
+emptyLetFloats :: LetFloats
+emptyLetFloats = LetFloats nilOL FltLifted
 
 emptyJoinFloats :: JoinFloats
 emptyJoinFloats = nilOL
 
-unitFloat :: OutBind -> Floats
+unitLetFloat :: OutBind -> LetFloats
 -- This key function constructs a singleton float with the right form
-unitFloat bind = ASSERT(all (not . isJoinId) (bindersOf bind))
-                 Floats (unitOL bind) (flag bind)
+unitLetFloat bind = ASSERT(all (not . isJoinId) (bindersOf bind))
+                    LetFloats (unitOL bind) (flag bind)
   where
     flag (Rec {})                = FltLifted
     flag (NonRec bndr rhs)
       | not (isStrictId bndr)    = FltLifted
-      | exprIsLiteralString rhs  = FltLifted
+      | exprIsTickedString rhs   = FltLifted
           -- String literals can be floated freely.
-          -- See Note [CoreSyn top-level string ltierals] in CoreSyn.
+          -- See Note [CoreSyn top-level string literals] in CoreSyn.
       | exprOkForSpeculation rhs = FltOkSpec  -- Unlifted, and lifted but ok-for-spec (eg HNF)
       | otherwise                = ASSERT2( not (isUnliftedType (idType bndr)), ppr bndr )
                                    FltCareful
@@ -465,138 +508,132 @@ unitJoinFloat :: OutBind -> JoinFloats
 unitJoinFloat bind = ASSERT(all isJoinId (bindersOf bind))
                      unitOL bind
 
-addNonRec :: SimplEnv -> OutId -> OutExpr -> SimplEnv
--- Add a non-recursive binding and extend the in-scope set
--- The latter is important; the binder may already be in the
--- in-scope set (although it might also have been created with newId)
--- but it may now have more IdInfo
-addNonRec env@(SimplEnv { seLetFloats  = floats
-                        , seJoinFloats = jfloats
-                        , seInScope = in_scope })
-          id rhs
-  | isJoinId id  -- This test incidentally forces the Id, and hence
-                 -- its IdInfo, and hence any inner substitutions
-  = env { seInScope    = in_scope'
-        , seLetFloats  = floats
-        , seJoinFloats = jfloats' }
-  | otherwise
-  = env { seInScope    = in_scope'
-        , seLetFloats  = floats'
-        , seJoinFloats = jfloats }
+mkFloatBind :: SimplEnv -> OutBind -> (SimplFloats, SimplEnv)
+-- Make a singleton SimplFloats, and
+-- extend the incoming SimplEnv's in-scope set with its binders
+-- These binders may already be in the in-scope set,
+-- but may have by now been augmented with more IdInfo
+mkFloatBind env bind
+  = (floats, env { seInScope = in_scope' })
   where
-    bind      = NonRec id rhs
-    in_scope' = extendInScopeSet in_scope id
-    floats'   = floats  `addFlts`     unitFloat     bind
-    jfloats'  = jfloats `addJoinFlts` unitJoinFloat bind
-
-extendFloats :: SimplEnv -> OutBind -> SimplEnv
+    floats
+      | isJoinBind bind
+      = SimplFloats { sfLetFloats  = emptyLetFloats
+                    , sfJoinFloats = unitJoinFloat bind
+                    , sfInScope    = in_scope' }
+      | otherwise
+      = SimplFloats { sfLetFloats  = unitLetFloat bind
+                    , sfJoinFloats = emptyJoinFloats
+                    , sfInScope    = in_scope' }
+
+    in_scope' = seInScope env `extendInScopeSetBind` bind
+
+extendFloats :: SimplFloats -> OutBind -> SimplFloats
 -- Add this binding to the floats, and extend the in-scope env too
-extendFloats env@(SimplEnv { seLetFloats  = floats
-                           , seJoinFloats = jfloats
-                           , seInScope = in_scope })
+extendFloats (SimplFloats { sfLetFloats  = floats
+                          , sfJoinFloats = jfloats
+                          , sfInScope    = in_scope })
              bind
   | isJoinBind bind
-  = env { seInScope    = in_scope'
-        , seLetFloats  = floats
-        , seJoinFloats = jfloats' }
+  = SimplFloats { sfInScope    = in_scope'
+                , sfLetFloats  = floats
+                , sfJoinFloats = jfloats' }
   | otherwise
-  = env { seInScope    = in_scope'
-        , seLetFloats  = floats'
-        , seJoinFloats = jfloats }
+  = SimplFloats { sfInScope    = in_scope'
+                , sfLetFloats  = floats'
+                , sfJoinFloats = jfloats }
   where
-    bndrs = bindersOf bind
-
-    in_scope' = extendInScopeSetList in_scope bndrs
-    floats'   = floats  `addFlts`     unitFloat bind
+    in_scope' = in_scope `extendInScopeSetBind` bind
+    floats'   = floats  `addLetFlts`  unitLetFloat bind
     jfloats'  = jfloats `addJoinFlts` unitJoinFloat bind
 
-addLetFloats :: SimplEnv -> SimplEnv -> SimplEnv
+addLetFloats :: SimplFloats -> LetFloats -> SimplFloats
 -- Add the let-floats for env2 to env1;
 -- *plus* the in-scope set for env2, which is bigger
 -- than that for env1
-addLetFloats env1 env2
-  = env1 { seLetFloats = seLetFloats env1 `addFlts` seLetFloats env2
-         , seInScope   = seInScope env2 }
-
-addFloats :: SimplEnv -> SimplEnv -> SimplEnv
+addLetFloats floats let_floats@(LetFloats binds _)
+  = floats { sfLetFloats = sfLetFloats floats `addLetFlts` let_floats
+           , sfInScope   = foldlOL extendInScopeSetBind
+                                   (sfInScope floats) binds }
+
+addJoinFloats :: SimplFloats -> JoinFloats -> SimplFloats
+addJoinFloats floats join_floats
+  = floats { sfJoinFloats = sfJoinFloats floats `addJoinFlts` join_floats
+           , sfInScope    = foldlOL extendInScopeSetBind
+                                    (sfInScope floats) join_floats }
+
+extendInScopeSetBind :: InScopeSet -> CoreBind -> InScopeSet
+extendInScopeSetBind in_scope bind
+  = extendInScopeSetList in_scope (bindersOf bind)
+
+addFloats :: SimplFloats -> SimplFloats -> SimplFloats
 -- Add both let-floats and join-floats for env2 to env1;
 -- *plus* the in-scope set for env2, which is bigger
 -- than that for env1
-addFloats env1 env2
-  = env1 { seLetFloats  = seLetFloats env1 `addFlts` seLetFloats env2
-         , seJoinFloats = seJoinFloats env1 `addJoinFlts` seJoinFloats env2
-         , seInScope    = seInScope env2 }
+addFloats (SimplFloats { sfLetFloats = lf1, sfJoinFloats = jf1 })
+          (SimplFloats { sfLetFloats = lf2, sfJoinFloats = jf2, sfInScope = in_scope })
+  = SimplFloats { sfLetFloats  = lf1 `addLetFlts` lf2
+                , sfJoinFloats = jf1 `addJoinFlts` jf2
+                , sfInScope    = in_scope }
 
-addFlts :: Floats -> Floats -> Floats
-addFlts (Floats bs1 l1) (Floats bs2 l2)
-  = Floats (bs1 `appOL` bs2) (l1 `andFF` l2)
+addLetFlts :: LetFloats -> LetFloats -> LetFloats
+addLetFlts (LetFloats bs1 l1) (LetFloats bs2 l2)
+  = LetFloats (bs1 `appOL` bs2) (l1 `andFF` l2)
+
+letFloatBinds :: LetFloats -> [CoreBind]
+letFloatBinds (LetFloats bs _) = fromOL bs
 
 addJoinFlts :: JoinFloats -> JoinFloats -> JoinFloats
 addJoinFlts = appOL
 
-zapFloats :: SimplEnv -> SimplEnv
-zapFloats env = env { seLetFloats  = emptyFloats
-                    , seJoinFloats = emptyJoinFloats }
-
-zapJoinFloats :: SimplEnv -> SimplEnv
-zapJoinFloats env = env { seJoinFloats = emptyJoinFloats }
-
-addJoinFloats :: SimplEnv -> JoinFloats -> SimplEnv
-addJoinFloats env@(SimplEnv { seJoinFloats = fb1 }) fb2
-  = env { seJoinFloats = fb1 `addJoinFlts` fb2 }
-
-addRecFloats :: SimplEnv -> SimplEnv -> SimplEnv
+mkRecFloats :: SimplFloats -> SimplFloats
 -- Flattens the floats from env2 into a single Rec group,
--- prepends the floats from env1, and puts the result back in env2
--- This is all very specific to the way recursive bindings are
--- handled; see Simplify.simplRecBind
-addRecFloats env1 env2@(SimplEnv {seLetFloats  = Floats bs ff
-                                 ,seJoinFloats = jbs })
+-- They must either all be lifted LetFloats or all JoinFloats
+mkRecFloats floats@(SimplFloats { sfLetFloats  = LetFloats bs ff
+                                , sfJoinFloats = jbs
+                                , sfInScope    = in_scope })
   = ASSERT2( case ff of { FltLifted -> True; _ -> False }, ppr (fromOL bs) )
-    env2 {seLetFloats = seLetFloats env1 `addFlts` floats'
-         ,seJoinFloats = seJoinFloats env1 `addJoinFlts` jfloats'}
+    ASSERT2( isNilOL bs || isNilOL jbs, ppr floats )
+    SimplFloats { sfLetFloats  = floats'
+                , sfJoinFloats = jfloats'
+                , sfInScope    = in_scope }
   where
-    floats'  | isNilOL bs  = emptyFloats
-             | otherwise   = unitFloat (Rec (flattenBinds (fromOL bs)))
+    floats'  | isNilOL bs  = emptyLetFloats
+             | otherwise   = unitLetFloat (Rec (flattenBinds (fromOL bs)))
     jfloats' | isNilOL jbs = emptyJoinFloats
              | otherwise   = unitJoinFloat (Rec (flattenBinds (fromOL jbs)))
 
-wrapFloats :: SimplEnv -> OutExpr -> OutExpr
+wrapFloats :: SimplFloats -> OutExpr -> OutExpr
 -- Wrap the floats around the expression; they should all
 -- satisfy the let/app invariant, so mkLets should do the job just fine
-wrapFloats (SimplEnv { seLetFloats  = Floats bs _
-                     , seJoinFloats = jbs }) body
+wrapFloats (SimplFloats { sfLetFloats  = LetFloats bs _
+                        , sfJoinFloats = jbs }) body
   = foldrOL Let (wrapJoinFloats jbs body) bs
      -- Note: Always safe to put the joins on the inside
      -- since the values can't refer to them
 
-wrapJoinFloatsX :: SimplEnv -> OutExpr -> (SimplEnv, OutExpr)
--- Wrap the seJoinFloats of the env around the expression,
+wrapJoinFloatsX :: SimplFloats -> OutExpr -> (SimplFloats, OutExpr)
+-- Wrap the sfJoinFloats of the env around the expression,
 -- and take them out of the SimplEnv
-wrapJoinFloatsX env@(SimplEnv { seJoinFloats = jbs }) body
-  = (zapJoinFloats env, wrapJoinFloats jbs body)
+wrapJoinFloatsX floats body
+  = ( floats { sfJoinFloats = emptyJoinFloats }
+    , wrapJoinFloats (sfJoinFloats floats) body )
 
 wrapJoinFloats :: JoinFloats -> OutExpr -> OutExpr
--- Wrap the seJoinFloats of the env around the expression,
+-- Wrap the sfJoinFloats of the env around the expression,
 -- and take them out of the SimplEnv
 wrapJoinFloats join_floats body
   = foldrOL Let body join_floats
 
-getFloatBinds :: SimplEnv -> [CoreBind]
-getFloatBinds (SimplEnv {seLetFloats = Floats bs _, seJoinFloats = jbs})
-  = fromOL bs ++ fromOL jbs
-
-isEmptyFloats :: SimplEnv -> Bool
-isEmptyFloats env@(SimplEnv {seLetFloats = Floats bs _})
-  = isNilOL bs && isEmptyJoinFloats env
-
-isEmptyJoinFloats :: SimplEnv -> Bool
-isEmptyJoinFloats (SimplEnv {seJoinFloats = jbs})
-  = isNilOL jbs
+getTopFloatBinds :: SimplFloats -> [CoreBind]
+getTopFloatBinds (SimplFloats { sfLetFloats  = lbs
+                              , sfJoinFloats = jbs})
+  = ASSERT( isNilOL jbs )  -- Can't be any top-level join bindings
+    letFloatBinds lbs
 
-mapFloats :: Floats -> ((Id,CoreExpr) -> (Id,CoreExpr)) -> Floats
-mapFloats (Floats fs ff) fun
-   = Floats (mapOL app fs) ff
+mapLetFloats :: LetFloats -> ((Id,CoreExpr) -> (Id,CoreExpr)) -> LetFloats
+mapLetFloats (LetFloats fs ff) fun
+   = LetFloats (mapOL app fs) ff
    where
     app (NonRec b e) = case fun (b,e) of (b',e') -> NonRec b' e'
     app (Rec bs)     = Rec (map fun bs)
@@ -657,6 +694,34 @@ lookupRecBndr (SimplEnv { seInScope = in_scope, seIdSubst = ids }) v
 
 
 These functions are in the monad only so that they can be made strict via seq.
+
+Note [Return type for join points]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Consider
+
+   (join j :: Char -> Int -> Int) 77
+   (     j x = \y. y + ord x    )
+   (in case v of                )
+   (     A -> j 'x'             )
+   (     B -> j 'y'             )
+   (     C -> <blah>            )
+
+The simplifier pushes the "apply to 77" continuation inwards to give
+
+   join j :: Char -> Int
+        j x = (\y. y + ord x) 77
+   in case v of
+        A -> j 'x'
+        B -> j 'y'
+        C -> <blah> 77
+
+Notice that the "apply to 77" continuation went into the RHS of the
+join point.  And that meant that the return type of the join point
+changed!!
+
+That's why we pass res_ty into simplNonRecJoinBndr, and substIdBndr
+takes a (Just res_ty) argument so that it knows to do the type-changing
+thing.
 -}
 
 simplBinders :: SimplEnv -> [InBndr] -> SimplM (SimplEnv, [OutBndr])
@@ -685,8 +750,9 @@ simplNonRecBndr env id
 ---------------
 simplNonRecJoinBndr :: SimplEnv -> OutType -> InBndr
                     -> SimplM (SimplEnv, OutBndr)
--- A non-recursive let binder for a join point; context being pushed inward may
--- change the type
+-- A non-recursive let binder for a join point;
+-- context being pushed inward may change the type
+-- See Note [Return type for join points]
 simplNonRecJoinBndr env res_ty id
   = do  { let (env1, id1) = substIdBndr (Just res_ty) env id
         ; seqId id1 `seq` return (env1, id1) }
@@ -701,8 +767,9 @@ simplRecBndrs env@(SimplEnv {}) ids
 
 ---------------
 simplRecJoinBndrs :: SimplEnv -> OutType -> [InBndr] -> SimplM SimplEnv
--- Recursive let binders for join points; context being pushed inward may
--- change types
+-- Recursive let binders for join points;
+-- context being pushed inward may change types
+-- See Note [Return type for join points]
 simplRecJoinBndrs env@(SimplEnv {}) res_ty ids
   = ASSERT(all isJoinId ids)
     do  { let (env1, ids1) = mapAccumL (substIdBndr (Just res_ty)) env ids
@@ -718,6 +785,7 @@ substIdBndr new_res_ty env bndr
 ---------------
 substNonCoVarIdBndr
    :: Maybe OutType -- New result type, if a join binder
+                    -- See Note [Return type for join points]
    -> SimplEnv
    -> InBndr    -- Env and binder to transform
    -> (SimplEnv, OutBndr)
@@ -748,10 +816,13 @@ substNonCoVarIdBndr new_res_ty
   where
     id1    = uniqAway in_scope old_id
     id2    = substIdType env id1
+
     id3    | Just res_ty <- new_res_ty
            = id2 `setIdType` setJoinResTy (idJoinArity id2) res_ty (idType id2)
+                             -- See Note [Return type for join points]
            | otherwise
            = id2
+
     new_id = zapFragileIdInfo id3       -- Zaps rules, worker-info, unfolding
                                         -- and fragile OccInfo
 
diff --git a/compiler/simplCore/SimplMonad.hs b/compiler/simplCore/SimplMonad.hs
index 015ee5c786..915c89ee91 100644
--- a/compiler/simplCore/SimplMonad.hs
+++ b/compiler/simplCore/SimplMonad.hs
@@ -19,6 +19,8 @@ module SimplMonad (
         plusSimplCount, isZeroSimplCount
     ) where
 
+import GhcPrelude
+
 import Var              ( Var, isTyVar, mkLocalVar )
 import Name             ( mkSystemVarName )
 import Id               ( Id, mkSysLocalOrCoVar )
diff --git a/compiler/simplCore/SimplUtils.hs b/compiler/simplCore/SimplUtils.hs
index e6e660b91f..ca1b9bd23d 100644
--- a/compiler/simplCore/SimplUtils.hs
+++ b/compiler/simplCore/SimplUtils.hs
@@ -17,8 +17,8 @@ module SimplUtils (
         simplEnvForGHCi, updModeForStableUnfoldings, updModeForRules,
 
         -- The continuation type
-        SimplCont(..), DupFlag(..),
-        isSimplified,
+        SimplCont(..), DupFlag(..), StaticEnv,
+        isSimplified, contIsStop,
         contIsDupable, contResultType, contHoleType,
         contIsTrivial, contArgs,
         countArgs,
@@ -30,13 +30,18 @@ module SimplUtils (
         addValArgTo, addCastTo, addTyArgTo,
         argInfoExpr, argInfoAppArgs, pushSimplifiedArgs,
 
-        abstractFloats
+        abstractFloats,
+
+        -- Utilities
+        isExitJoinId
     ) where
 
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import SimplEnv
-import CoreMonad        ( SimplifierMode(..), Tick(..) )
+import CoreMonad        ( SimplMode(..), Tick(..) )
 import DynFlags
 import CoreSyn
 import qualified CoreSubst
@@ -57,6 +62,7 @@ import DataCon          ( dataConWorkId, isNullaryRepDataCon )
 import VarSet
 import BasicTypes
 import Util
+import OrdList          ( isNilOL )
 import MonadUtils
 import Outputable
 import Pair
@@ -114,7 +120,7 @@ data SimplCont
   | ApplyToVal         -- (ApplyToVal arg K)[e] = K[ e arg ]
       { sc_dup  :: DupFlag      -- See Note [DupFlag invariants]
       , sc_arg  :: InExpr       -- The argument,
-      , sc_env  :: StaticEnv    --     and its static env
+      , sc_env  :: StaticEnv    -- see Note [StaticEnv invariant]
       , sc_cont :: SimplCont }
 
   | ApplyToTy          -- (ApplyToTy ty K)[e] = K[ e ty ]
@@ -127,7 +133,7 @@ data SimplCont
       { sc_dup  :: DupFlag        -- See Note [DupFlag invariants]
       , sc_bndr :: InId           -- case binder
       , sc_alts :: [InAlt]        -- Alternatives
-      , sc_env  :: StaticEnv      --   and their static environment
+      , sc_env  :: StaticEnv      -- See Note [StaticEnv invariant]
       , sc_cont :: SimplCont }
 
   -- The two strict forms have no DupFlag, because we never duplicate them
@@ -137,7 +143,7 @@ data SimplCont
       , sc_bndr  :: InId
       , sc_bndrs :: [InBndr]
       , sc_body  :: InExpr
-      , sc_env   :: StaticEnv
+      , sc_env   :: StaticEnv      -- See Note [StaticEnv invariant]
       , sc_cont  :: SimplCont }
 
   | StrictArg           -- (StrictArg (f e1 ..en) K)[e] = K[ f e1 .. en e ]
@@ -151,6 +157,8 @@ data SimplCont
         (Tickish Id)    -- Tick tickish <hole>
         SimplCont
 
+type StaticEnv = SimplEnv       -- Just the static part is relevant
+
 data DupFlag = NoDup       -- Unsimplified, might be big
              | Simplified  -- Simplified
              | OkToDup     -- Simplified and small
@@ -164,7 +172,25 @@ perhapsSubstTy dup env ty
   | isSimplified dup = ty
   | otherwise        = substTy env ty
 
-{-
+{- Note [StaticEnv invariant]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We pair up an InExpr or InAlts with a StaticEnv, which establishes the
+lexical scope for that InExpr.  When we simplify that InExpr/InAlts, we
+use
+  - Its captured StaticEnv
+  - Overriding its InScopeSet with the larger one at the
+    simplification point.
+
+Why override the InScopeSet?  Example:
+      (let y = ey in f) ex
+By the time we simplify ex, 'y' will be in scope.
+
+However the InScopeSet in the StaticEnv is not irrelevant: it should
+include all the free vars of applying the substitution to the InExpr.
+Reason: contHoleType uses perhapsSubstTy to apply the substitution to
+the expression, and that (rightly) gives ASSERT failures if the InScopeSet
+isn't big enough.
+
 Note [DupFlag invariants]
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 In both (ApplyToVal dup _ env k)
@@ -196,7 +222,7 @@ instance Outputable SimplCont where
     = (text "StrictArg" <+> ppr (ai_fun ai)) $$ ppr cont
   ppr (Select { sc_dup = dup, sc_bndr = bndr, sc_alts = alts, sc_env = se, sc_cont = cont })
     = (text "Select" <+> ppr dup <+> ppr bndr) $$
-       ifPprDebug (nest 2 $ vcat [ppr (seTvSubst se), ppr alts]) $$ ppr cont
+       whenPprDebug (nest 2 $ vcat [ppr (seTvSubst se), ppr alts]) $$ ppr cont
 
 
 {- Note [The hole type in ApplyToTy]
@@ -345,6 +371,10 @@ contIsRhs (Stop _ RhsCtxt) = True
 contIsRhs _                = False
 
 -------------------
+contIsStop :: SimplCont -> Bool
+contIsStop (Stop {}) = True
+contIsStop _         = False
+
 contIsDupable :: SimplCont -> Bool
 contIsDupable (Stop {})                         = True
 contIsDupable (ApplyToTy  { sc_cont = k })      = contIsDupable k
@@ -419,23 +449,25 @@ contArgs cont
 
 
 -------------------
-mkArgInfo :: Id
+mkArgInfo :: SimplEnv
+          -> Id
           -> [CoreRule] -- Rules for function
           -> Int        -- Number of value args
           -> SimplCont  -- Context of the call
           -> ArgInfo
 
-mkArgInfo fun rules n_val_args call_cont
+mkArgInfo env fun rules n_val_args call_cont
   | n_val_args < idArity fun            -- Note [Unsaturated functions]
   = ArgInfo { ai_fun = fun, ai_args = [], ai_type = fun_ty
-            , ai_rules = fun_rules, ai_encl = False
+            , ai_rules = fun_rules
+            , ai_encl = False
             , ai_strs = vanilla_stricts
             , ai_discs = vanilla_discounts }
   | otherwise
   = ArgInfo { ai_fun = fun, ai_args = [], ai_type = fun_ty
             , ai_rules = fun_rules
-            , ai_encl = interestingArgContext rules call_cont
-            , ai_strs  = add_type_str fun_ty arg_stricts
+            , ai_encl  = interestingArgContext rules call_cont
+            , ai_strs  = arg_stricts
             , ai_discs = arg_discounts }
   where
     fun_ty = idType fun
@@ -453,7 +485,11 @@ mkArgInfo fun rules n_val_args call_cont
     vanilla_stricts  = repeat False
 
     arg_stricts
-      = case splitStrictSig (idStrictness fun) of
+      | not (sm_inline (seMode env))
+      = vanilla_stricts -- See Note [Do not expose strictness if sm_inline=False]
+      | otherwise
+      = add_type_str fun_ty $
+        case splitStrictSig (idStrictness fun) of
           (demands, result_info)
                 | not (demands `lengthExceeds` n_val_args)
                 ->      -- Enough args, use the strictness given.
@@ -475,26 +511,25 @@ mkArgInfo fun rules n_val_args call_cont
     add_type_str :: Type -> [Bool] -> [Bool]
     -- If the function arg types are strict, record that in the 'strictness bits'
     -- No need to instantiate because unboxed types (which dominate the strict
-    -- types) can't instantiate type variables.
-    -- add_type_str is done repeatedly (for each call); might be better
-    -- once-for-all in the function
+    --   types) can't instantiate type variables.
+    -- add_type_str is done repeatedly (for each call);
+    --   might be better once-for-all in the function
     -- But beware primops/datacons with no strictness
 
-    add_type_str
-      = go
-      where
-        go _ [] = []
-        go fun_ty strs            -- Look through foralls
-            | Just (_, fun_ty') <- splitForAllTy_maybe fun_ty       -- Includes coercions
-            = go fun_ty' strs
-        go fun_ty (str:strs)      -- Add strict-type info
-            | Just (arg_ty, fun_ty') <- splitFunTy_maybe fun_ty
-            = (str || Just False == isLiftedType_maybe arg_ty) : go fun_ty' strs
-               -- If the type is levity-polymorphic, we can't know whether it's
-               -- strict. isLiftedType_maybe will return Just False only when
-               -- we're sure the type is unlifted.
-        go _ strs
-            = strs
+    add_type_str _ [] = []
+    add_type_str fun_ty all_strs@(str:strs)
+      | Just (arg_ty, fun_ty') <- splitFunTy_maybe fun_ty        -- Add strict-type info
+      = (str || Just False == isLiftedType_maybe arg_ty)
+        : add_type_str fun_ty' strs
+          -- If the type is levity-polymorphic, we can't know whether it's
+          -- strict. isLiftedType_maybe will return Just False only when
+          -- we're sure the type is unlifted.
+
+      | Just (_, fun_ty') <- splitForAllTy_maybe fun_ty
+      = add_type_str fun_ty' all_strs     -- Look through foralls
+
+      | otherwise
+      = all_strs
 
 {- Note [Unsaturated functions]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -504,6 +539,28 @@ Consider (test eyeball/inline4)
 where f has arity 2.  Then we do not want to inline 'x', because
 it'll just be floated out again.  Even if f has lots of discounts
 on its first argument -- it must be saturated for these to kick in
+
+Note [Do not expose strictness if sm_inline=False]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Trac #15163 showed a case in which we had
+
+  {-# INLINE [1] zip #-}
+  zip = undefined
+
+  {-# RULES "foo" forall as bs. stream (zip as bs) = ..blah... #-}
+
+If we expose zip's bottoming nature when simplifing the LHS of the
+RULE we get
+  {-# RULES "foo" forall as bs.
+                   stream (case zip of {}) = ..blah... #-}
+discarding the arguments to zip.  Usually this is fine, but on the
+LHS of a rule it's not, because 'as' and 'bs' are now not bound on
+the LHS.
+
+This is a pretty pathalogical example, so I'm not losing sleep over
+it, but the simplest solution was to check sm_inline; if it is False,
+which it is on the LHS of a rule (see updModeForRules), then don't
+make use of the strictness info for the function.
 -}
 
 
@@ -546,14 +603,31 @@ since we can just eliminate this case instead (x is in WHNF).  Similar
 applies when x is bound to a lambda expression.  Hence
 contIsInteresting looks for case expressions with just a single
 default case.
+
+Note [No case of case is boring]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If we see
+   case f x of <alts>
+
+we'd usually treat the context as interesting, to encourage 'f' to
+inline.  But if case-of-case is off, it's really not so interesting
+after all, because we are unlikely to be able to push the case
+expression into the branches of any case in f's unfolding.  So, to
+reduce unnecessary code expansion, we just make the context look boring.
+This made a small compile-time perf improvement in perf/compiler/T6048,
+and it looks plausible to me.
 -}
 
-interestingCallContext :: SimplCont -> CallCtxt
+interestingCallContext :: SimplEnv -> SimplCont -> CallCtxt
 -- See Note [Interesting call context]
-interestingCallContext cont
+interestingCallContext env cont
   = interesting cont
   where
-    interesting (Select {})     = CaseCtxt
+    interesting (Select {})
+       | sm_case_case (getMode env) = CaseCtxt
+       | otherwise                  = BoringCtxt
+       -- See Note [No case of case is boring]
+
     interesting (ApplyToVal {}) = ValAppCtxt
         -- Can happen if we have (f Int |> co) y
         -- If f has an INLINE prag we need to give it some
@@ -694,11 +768,11 @@ interestingArg env e = go env 0 e
 {-
 ************************************************************************
 *                                                                      *
-                  SimplifierMode
+                  SimplMode
 *                                                                      *
 ************************************************************************
 
-The SimplifierMode controls several switches; see its definition in
+The SimplMode controls several switches; see its definition in
 CoreMonad
         sm_rules      :: Bool     -- Whether RULES are enabled
         sm_inline     :: Bool     -- Whether inlining is enabled
@@ -708,19 +782,20 @@ CoreMonad
 
 simplEnvForGHCi :: DynFlags -> SimplEnv
 simplEnvForGHCi dflags
-  = mkSimplEnv $ SimplMode { sm_names = ["GHCi"]
-                           , sm_phase = InitialPhase
-                           , sm_rules = rules_on
+  = mkSimplEnv $ SimplMode { sm_names  = ["GHCi"]
+                           , sm_phase  = InitialPhase
+                           , sm_dflags = dflags
+                           , sm_rules  = rules_on
                            , sm_inline = False
                            , sm_eta_expand = eta_expand_on
-                           , sm_case_case = True }
+                           , sm_case_case  = True }
   where
     rules_on      = gopt Opt_EnableRewriteRules   dflags
     eta_expand_on = gopt Opt_DoLambdaEtaExpansion dflags
    -- Do not do any inlining, in case we expose some unboxed
    -- tuple stuff that confuses the bytecode interpreter
 
-updModeForStableUnfoldings :: Activation -> SimplifierMode -> SimplifierMode
+updModeForStableUnfoldings :: Activation -> SimplMode -> SimplMode
 -- See Note [Simplifying inside stable unfoldings]
 updModeForStableUnfoldings inline_rule_act current_mode
   = current_mode { sm_phase      = phaseFromActivation inline_rule_act
@@ -733,12 +808,12 @@ updModeForStableUnfoldings inline_rule_act current_mode
     phaseFromActivation (ActiveAfter _ n) = Phase n
     phaseFromActivation _                 = InitialPhase
 
-updModeForRules :: SimplifierMode -> SimplifierMode
+updModeForRules :: SimplMode -> SimplMode
 -- See Note [Simplifying rules]
 updModeForRules current_mode
-  = current_mode { sm_phase  = InitialPhase
-                 , sm_inline = False
-                 , sm_rules  = False
+  = current_mode { sm_phase      = InitialPhase
+                 , sm_inline     = False  -- See Note [Do not expose strictness if sm_inline=False]
+                 , sm_rules      = False
                  , sm_eta_expand = False }
 
 {- Note [Simplifying rules]
@@ -840,7 +915,7 @@ f when it is inlined.  So our conservative plan (implemented by
 updModeForStableUnfoldings) is this:
 
   -------------------------------------------------------------
-  When simplifying the RHS of an stable unfolding, set the phase
+  When simplifying the RHS of a stable unfolding, set the phase
   to the phase in which the stable unfolding first becomes active
   -------------------------------------------------------------
 
@@ -890,8 +965,8 @@ mark it 'demanded', so when the RHS is simplified, it'll get an ArgOf
 continuation.
 -}
 
-activeUnfolding :: SimplEnv -> Id -> Bool
-activeUnfolding env id
+activeUnfolding :: SimplMode -> Id -> Bool
+activeUnfolding mode id
   | isCompulsoryUnfolding (realIdUnfolding id)
   = True   -- Even sm_inline can't override compulsory unfoldings
   | otherwise
@@ -902,8 +977,6 @@ activeUnfolding env id
       --  (a) they are active
       --  (b) sm_inline says so, except that for stable unfoldings
       --                         (ie pragmas) we inline anyway
-  where
-    mode = getMode env
 
 getUnfoldingInRuleMatch :: SimplEnv -> InScopeEnv
 -- When matching in RULE, we want to "look through" an unfolding
@@ -928,13 +1001,11 @@ getUnfoldingInRuleMatch env
      | otherwise           = isActive (sm_phase mode) (idInlineActivation id)
 
 ----------------------
-activeRule :: SimplEnv -> Activation -> Bool
+activeRule :: SimplMode -> Activation -> Bool
 -- Nothing => No rules at all
-activeRule env
+activeRule mode
   | not (sm_rules mode) = \_ -> False     -- Rewriting is off
   | otherwise           = isActive (sm_phase mode)
-  where
-    mode = getMode env
 
 {-
 ************************************************************************
@@ -1017,7 +1088,7 @@ spectral/mandel/Mandel.hs, where the mandelset function gets a useful
 let-float if you inline windowToViewport
 
 However, as usual for Gentle mode, do not inline things that are
-inactive in the intial stages.  See Note [Gentle mode].
+inactive in the initial stages.  See Note [Gentle mode].
 
 Note [Stable unfoldings and preInlineUnconditionally]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1041,6 +1112,11 @@ want PreInlineUnconditionally to second-guess it.  A live example is
 Trac #3736.
     c.f. Note [Stable unfoldings and postInlineUnconditionally]
 
+NB: if the pragama is INLINEABLE, then we don't want to behave int
+this special way -- an INLINEABLE pragam just says to GHC "inline this
+if you like".  But if there is a unique occurrence, we want to inline
+the stable unfolding, not the RHS.
+
 Note [Top-level bottoming Ids]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Don't inline top-level Ids that are bottoming, even if they are used just
@@ -1054,31 +1130,45 @@ is a term (not a coercion) so we can't necessarily inline the latter in
 the former.
 -}
 
-preInlineUnconditionally :: DynFlags -> SimplEnv -> TopLevelFlag -> InId -> InExpr -> Bool
+preInlineUnconditionally
+    :: SimplEnv -> TopLevelFlag -> InId
+    -> InExpr -> StaticEnv  -- These two go together
+    -> Maybe SimplEnv       -- Returned env has extended substitution
 -- Precondition: rhs satisfies the let/app invariant
 -- See Note [CoreSyn let/app invariant] in CoreSyn
 -- Reason: we don't want to inline single uses, or discard dead bindings,
 --         for unlifted, side-effect-ful bindings
-preInlineUnconditionally dflags env top_lvl bndr rhs
-  | not active                               = False
-  | isStableUnfolding (idUnfolding bndr)     = False -- Note [Stable unfoldings and preInlineUnconditionally]
-  | isTopLevel top_lvl && isBottomingId bndr = False -- Note [Top-level bottoming Ids]
-  | not (gopt Opt_SimplPreInlining dflags)   = False
-  | isCoVar bndr                             = False -- Note [Do not inline CoVars unconditionally]
-  | otherwise = case idOccInfo bndr of
-                  IAmDead                    -> True -- Happens in ((\x.1) v)
-                  occ@OneOcc { occ_one_br = True }
-                                             -> try_once (occ_in_lam occ)
-                                                         (occ_int_cxt occ)
-                  _                          -> False
+preInlineUnconditionally env top_lvl bndr rhs rhs_env
+  | not pre_inline_unconditionally           = Nothing
+  | not active                               = Nothing
+  | isTopLevel top_lvl && isBottomingId bndr = Nothing -- Note [Top-level bottoming Ids]
+  | isCoVar bndr                             = Nothing -- Note [Do not inline CoVars unconditionally]
+  | isExitJoinId bndr                        = Nothing -- Note [Do not inline exit join points]
+                                                       -- in module Exitify
+  | not (one_occ (idOccInfo bndr))           = Nothing
+  | not (isStableUnfolding unf)              = Just (extend_subst_with rhs)
+
+  -- Note [Stable unfoldings and preInlineUnconditionally]
+  | isInlinablePragma inline_prag
+  , Just inl <- maybeUnfoldingTemplate unf   = Just (extend_subst_with inl)
+  | otherwise                                = Nothing
   where
-    mode = getMode env
-    active = isActive (sm_phase mode) act
-             -- See Note [pre/postInlineUnconditionally in gentle mode]
-    act = idInlineActivation bndr
-    try_once in_lam int_cxt     -- There's one textual occurrence
+    unf = idUnfolding bndr
+    extend_subst_with inl_rhs = extendIdSubst env bndr (mkContEx rhs_env inl_rhs)
+
+    one_occ IAmDead = True -- Happens in ((\x.1) v)
+    one_occ (OneOcc { occ_one_br = True      -- One textual occurrence
+                    , occ_in_lam = in_lam
+                    , occ_int_cxt = int_cxt })
         | not in_lam = isNotTopLevel top_lvl || early_phase
         | otherwise  = int_cxt && canInlineInLam rhs
+    one_occ _        = False
+
+    pre_inline_unconditionally = gopt Opt_SimplPreInlining (seDynFlags env)
+    mode   = getMode env
+    active = isActive (sm_phase mode) (inlinePragmaActivation inline_prag)
+             -- See Note [pre/postInlineUnconditionally in gentle mode]
+    inline_prag = idInlinePragma bndr
 
 -- Be very careful before inlining inside a lambda, because (a) we must not
 -- invalidate occurrence information, and (b) we want to avoid pushing a
@@ -1163,18 +1253,16 @@ story for now.
 -}
 
 postInlineUnconditionally
-    :: DynFlags -> SimplEnv -> TopLevelFlag
-    -> OutId            -- The binder (an InId would be fine too)
-                        --            (*not* a CoVar)
+    :: SimplEnv -> TopLevelFlag
+    -> OutId            -- The binder (*not* a CoVar), including its unfolding
     -> OccInfo          -- From the InId
     -> OutExpr
-    -> Unfolding
     -> Bool
 -- Precondition: rhs satisfies the let/app invariant
 -- See Note [CoreSyn let/app invariant] in CoreSyn
 -- Reason: we don't want to inline single uses, or discard dead bindings,
 --         for unlifted, side-effect-ful bindings
-postInlineUnconditionally dflags env top_lvl bndr occ_info rhs unfolding
+postInlineUnconditionally env top_lvl bndr occ_info rhs
   | not active                  = False
   | isWeakLoopBreaker occ_info  = False -- If it's a loop-breaker of any kind, don't inline
                                         -- because it might be referred to "earlier"
@@ -1242,7 +1330,9 @@ postInlineUnconditionally dflags env top_lvl bndr occ_info rhs unfolding
 -- Alas!
 
   where
-    active = isActive (sm_phase (getMode env)) (idInlineActivation bndr)
+    unfolding = idUnfolding bndr
+    dflags    = seDynFlags env
+    active    = isActive (sm_phase (getMode env)) (idInlineActivation bndr)
         -- See Note [pre/postInlineUnconditionally in gentle mode]
 
 {-
@@ -1278,7 +1368,7 @@ ones that are trivial):
 
 Note [Stable unfoldings and postInlineUnconditionally]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Do not do postInlineUnconditionally if the Id has an stable unfolding,
+Do not do postInlineUnconditionally if the Id has a stable unfolding,
 otherwise we lose the unfolding.  Example
 
      -- f has stable unfolding with rhs (e |> co)
@@ -1414,40 +1504,49 @@ because the latter is not well-kinded.
 ************************************************************************
 -}
 
-tryEtaExpandRhs :: SimplEnv -> RecFlag -> OutId -> OutExpr
-                -> SimplM (Arity, OutExpr)
+tryEtaExpandRhs :: SimplMode -> OutId -> OutExpr
+                -> SimplM (Arity, Bool, OutExpr)
 -- See Note [Eta-expanding at let bindings]
-tryEtaExpandRhs env is_rec bndr rhs
-  = do { dflags <- getDynFlags
-       ; (new_arity, new_rhs) <- try_expand dflags
+-- If tryEtaExpandRhs rhs = (n, is_bot, rhs') then
+--   (a) rhs' has manifest arity
+--   (b) if is_bot is True then rhs' applied to n args is guaranteed bottom
+tryEtaExpandRhs mode bndr rhs
+  | Just join_arity <- isJoinId_maybe bndr
+  = do { let (join_bndrs, join_body) = collectNBinders join_arity rhs
+       ; return (count isId join_bndrs, exprIsBottom join_body, rhs) }
+         -- Note [Do not eta-expand join points]
+         -- But do return the correct arity and bottom-ness, because
+         -- these are used to set the bndr's IdInfo (Trac #15517)
+
+  | otherwise
+  = do { (new_arity, is_bot, new_rhs) <- try_expand
 
        ; WARN( new_arity < old_id_arity,
                (text "Arity decrease:" <+> (ppr bndr <+> ppr old_id_arity
                 <+> ppr old_arity <+> ppr new_arity) $$ ppr new_rhs) )
                         -- Note [Arity decrease] in Simplify
-         return (new_arity, new_rhs) }
+         return (new_arity, is_bot, new_rhs) }
   where
-    try_expand dflags
+    try_expand
       | exprIsTrivial rhs
-      = return (exprArity rhs, rhs)
-
-      | sm_eta_expand (getMode env)      -- Provided eta-expansion is on
-      , let new_arity1 = findRhsArity dflags bndr rhs old_arity
-            new_arity2 = idCallArity bndr
-            new_arity  = max new_arity1 new_arity2
-      , new_arity > old_arity      -- And the current manifest arity isn't enough
-      = if is_rec == Recursive && isJoinId bndr
-           then WARN(True, text "Can't eta-expand recursive join point:" <+>
-                             ppr bndr)
-                return (old_arity, rhs)
-           else do { tick (EtaExpansion bndr)
-                   ; return (new_arity, etaExpand new_arity rhs) }
+      = return (exprArity rhs, False, rhs)
+
+      | sm_eta_expand mode      -- Provided eta-expansion is on
+      , new_arity > old_arity   -- And the current manifest arity isn't enough
+      = do { tick (EtaExpansion bndr)
+           ; return (new_arity, is_bot, etaExpand new_arity rhs) }
+
       | otherwise
-      = return (old_arity, rhs)
+      = return (old_arity, is_bot && new_arity == old_arity, rhs)
 
+    dflags       = sm_dflags mode
     old_arity    = exprArity rhs -- See Note [Do not expand eta-expand PAPs]
     old_id_arity = idArity bndr
 
+    (new_arity1, is_bot) = findRhsArity dflags bndr rhs old_arity
+    new_arity2 = idCallArity bndr
+    new_arity  = max new_arity1 new_arity2
+
 {-
 Note [Eta-expanding at let bindings]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1473,6 +1572,44 @@ because then 'genMap' will inline, and it really shouldn't: at least
 as far as the programmer is concerned, it's not applied to two
 arguments!
 
+Note [Do not eta-expand join points]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Similarly to CPR (see Note [Don't CPR join points] in WorkWrap), a join point
+stands well to gain from its outer binding's eta-expansion, and eta-expanding a
+join point is fraught with issues like how to deal with a cast:
+
+    let join $j1 :: IO ()
+             $j1 = ...
+             $j2 :: Int -> IO ()
+             $j2 n = if n > 0 then $j1
+                              else ...
+
+    =>
+
+    let join $j1 :: IO ()
+             $j1 = (\eta -> ...)
+                     `cast` N:IO :: State# RealWorld -> (# State# RealWorld, ())
+                                 ~  IO ()
+             $j2 :: Int -> IO ()
+             $j2 n = (\eta -> if n > 0 then $j1
+                                       else ...)
+                     `cast` N:IO :: State# RealWorld -> (# State# RealWorld, ())
+                                 ~  IO ()
+
+The cast here can't be pushed inside the lambda (since it's not casting to a
+function type), so the lambda has to stay, but it can't because it contains a
+reference to a join point. In fact, $j2 can't be eta-expanded at all. Rather
+than try and detect this situation (and whatever other situations crop up!), we
+don't bother; again, any surrounding eta-expansion will improve these join
+points anyway, since an outer cast can *always* be pushed inside. By the time
+CorePrep comes around, the code is very likely to look more like this:
+
+    let join $j1 :: State# RealWorld -> (# State# RealWorld, ())
+             $j1 = (...) eta
+             $j2 :: Int -> State# RealWorld -> (# State# RealWorld, ())
+             $j2 = if n > 0 then $j1
+                            else (...) eta
+
 Note [Do not eta-expand PAPs]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 We used to have old_arity = manifestArity rhs, which meant that we
@@ -1603,22 +1740,25 @@ new binding is abstracted.  Note that
     which is obviously bogus.
 -}
 
-abstractFloats :: [OutTyVar] -> SimplEnv -> OutExpr -> SimplM ([OutBind], OutExpr)
-abstractFloats main_tvs body_env body
+abstractFloats :: DynFlags -> TopLevelFlag -> [OutTyVar] -> SimplFloats
+              -> OutExpr -> SimplM ([OutBind], OutExpr)
+abstractFloats dflags top_lvl main_tvs floats body
   = ASSERT( notNull body_floats )
+    ASSERT( isNilOL (sfJoinFloats floats) )
     do  { (subst, float_binds) <- mapAccumLM abstract empty_subst body_floats
         ; return (float_binds, CoreSubst.substExpr (text "abstract_floats1") subst body) }
   where
+    is_top_lvl  = isTopLevel top_lvl
     main_tv_set = mkVarSet main_tvs
-    body_floats = getFloatBinds body_env
-    empty_subst = CoreSubst.mkEmptySubst (seInScope body_env)
+    body_floats = letFloatBinds (sfLetFloats floats)
+    empty_subst = CoreSubst.mkEmptySubst (sfInScope floats)
 
     abstract :: CoreSubst.Subst -> OutBind -> SimplM (CoreSubst.Subst, OutBind)
     abstract subst (NonRec id rhs)
-      = do { (poly_id, poly_app) <- mk_poly tvs_here id
-           ; let poly_rhs = mkLams tvs_here rhs'
-                 subst'   = CoreSubst.extendIdSubst subst id poly_app
-           ; return (subst', (NonRec poly_id poly_rhs)) }
+      = do { (poly_id1, poly_app) <- mk_poly1 tvs_here id
+           ; let (poly_id2, poly_rhs) = mk_poly2 poly_id1 tvs_here rhs'
+                 subst' = CoreSubst.extendIdSubst subst id poly_app
+           ; return (subst', NonRec poly_id2 poly_rhs) }
       where
         rhs' = CoreSubst.substExpr (text "abstract_floats2") subst rhs
 
@@ -1629,11 +1769,13 @@ abstractFloats main_tvs body_env body
                    exprSomeFreeVarsList isTyVar rhs'
 
     abstract subst (Rec prs)
-       = do { (poly_ids, poly_apps) <- mapAndUnzipM (mk_poly tvs_here) ids
+       = do { (poly_ids, poly_apps) <- mapAndUnzipM (mk_poly1 tvs_here) ids
             ; let subst' = CoreSubst.extendSubstList subst (ids `zip` poly_apps)
-                  poly_rhss = [mkLams tvs_here (CoreSubst.substExpr (text "abstract_floats3") subst' rhs)
-                              | rhs <- rhss]
-            ; return (subst', Rec (poly_ids `zip` poly_rhss)) }
+                  poly_pairs = [ mk_poly2 poly_id tvs_here rhs'
+                               | (poly_id, rhs) <- poly_ids `zip` rhss
+                               , let rhs' = CoreSubst.substExpr (text "abstract_floats")
+                                                                subst' rhs ]
+            ; return (subst', Rec poly_pairs) }
        where
          (ids,rhss) = unzip prs
                 -- For a recursive group, it's a bit of a pain to work out the minimal
@@ -1651,7 +1793,8 @@ abstractFloats main_tvs body_env body
                 -- Here, we must abstract 'x' over 'a'.
          tvs_here = toposortTyVars main_tvs
 
-    mk_poly tvs_here var
+    mk_poly1 :: [TyVar] -> Id -> SimplM (Id, CoreExpr)
+    mk_poly1 tvs_here var
       = do { uniq <- getUniqueM
            ; let  poly_name = setNameUnique (idName var) uniq           -- Keep same name
                   poly_ty   = mkInvForAllTys tvs_here (idType var) -- But new type of course
@@ -1671,6 +1814,21 @@ abstractFloats main_tvs body_env body
                 -- the occurrences of x' will be just the occurrences originally
                 -- pinned on x.
 
+    mk_poly2 :: Id -> [TyVar] -> CoreExpr -> (Id, CoreExpr)
+    mk_poly2 poly_id tvs_here rhs
+      = (poly_id `setIdUnfolding` unf, poly_rhs)
+      where
+        poly_rhs = mkLams tvs_here rhs
+        unf = mkUnfolding dflags InlineRhs is_top_lvl False poly_rhs
+
+        -- We want the unfolding.  Consider
+        --      let
+        --            x = /\a. let y = ... in Just y
+        --      in body
+        -- Then we float the y-binding out (via abstractFloats and addPolyBind)
+        -- but 'x' may well then be inlined in 'body' in which case we'd like the
+        -- opportunity to inline 'y' too.
+
 {-
 Note [Abstract over coercions]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1785,7 +1943,7 @@ prepareAlts scrut case_bndr' alts
 mkCase tries these things
 
 * Note [Nerge nested cases]
-* Note [Elimiante identity case]
+* Note [Eliminate identity case]
 * Note [Scrutinee constant folding]
 
 Note [Merge Nested Cases]
@@ -1985,13 +2143,18 @@ mkCase1 dflags scrut bndr alts_ty alts = mkCase2 dflags scrut bndr alts_ty alts
 
 mkCase2 dflags scrut bndr alts_ty alts
   | -- See Note [Scrutinee Constant Folding]
-    case alts of  -- Not if there is just a DEFAULT alterantive
+    case alts of  -- Not if there is just a DEFAULT alternative
       [(DEFAULT,_,_)] -> False
       _               -> True
   , gopt Opt_CaseFolding dflags
   , Just (scrut', tx_con, mk_orig) <- caseRules dflags scrut
   = do { bndr' <- newId (fsLit "lwild") (exprType scrut')
-       ; alts' <- mapM  (tx_alt tx_con mk_orig bndr') alts
+
+       ; alts' <- mapMaybeM (tx_alt tx_con mk_orig bndr') alts
+                  -- mapMaybeM: discard unreachable alternatives
+                  -- See Note [Unreachable caseRules alternatives]
+                  -- in PrelRules
+
        ; mkCase3 dflags scrut' bndr' alts_ty $
          add_default (re_sort alts')
        }
@@ -2015,19 +2178,14 @@ mkCase2 dflags scrut bndr alts_ty alts
     -- to construct an expression equivalent to the original one, for use
     -- in the DEFAULT case
 
+    tx_alt :: (AltCon -> Maybe AltCon) -> (Id -> CoreExpr) -> Id
+           -> CoreAlt -> SimplM (Maybe CoreAlt)
     tx_alt tx_con mk_orig new_bndr (con, bs, rhs)
-      | DataAlt dc <- con', not (isNullaryRepDataCon dc)
-      = -- For non-nullary data cons we must invent some fake binders
-        -- See Note [caseRules for dataToTag] in PrelRules
-        do { us <- getUniquesM
-           ; let (ex_tvs, arg_ids) = dataConRepInstPat us dc
-                                         (tyConAppArgs (idType new_bndr))
-           ; return (con', ex_tvs ++ arg_ids, rhs') }
-      | otherwise
-      = return (con', [], rhs')
+      = case tx_con con of
+          Nothing   -> return Nothing
+          Just con' -> do { bs' <- mk_new_bndrs new_bndr con'
+                          ; return (Just (con', bs', rhs')) }
       where
-        con' = tx_con con
-
         rhs' | isDeadBinder bndr = rhs
              | otherwise         = bindNonRec bndr orig_val rhs
 
@@ -2036,23 +2194,61 @@ mkCase2 dflags scrut bndr alts_ty alts
                       LitAlt l   -> Lit l
                       DataAlt dc -> mkConApp2 dc (tyConAppArgs (idType bndr)) bs
 
+    mk_new_bndrs new_bndr (DataAlt dc)
+      | not (isNullaryRepDataCon dc)
+      = -- For non-nullary data cons we must invent some fake binders
+        -- See Note [caseRules for dataToTag] in PrelRules
+        do { us <- getUniquesM
+           ; let (ex_tvs, arg_ids) = dataConRepInstPat us dc
+                                        (tyConAppArgs (idType new_bndr))
+           ; return (ex_tvs ++ arg_ids) }
+    mk_new_bndrs _ _ = return []
 
     re_sort :: [CoreAlt] -> [CoreAlt]  -- Re-sort the alternatives to
     re_sort alts = sortBy cmpAlt alts  -- preserve the #case_invariants#
 
     add_default :: [CoreAlt] -> [CoreAlt]
-    -- TagToEnum may change a boolean True/False set of alternatives
-    -- to LitAlt 0#/1# alterantives.  But literal alternatives always
-    -- have a DEFAULT (I think).  So add it.
+    -- See Note [Literal cases]
     add_default ((LitAlt {}, bs, rhs) : alts) = (DEFAULT, bs, rhs) : alts
     add_default alts                          = alts
 
+{- Note [Literal cases]
+~~~~~~~~~~~~~~~~~~~~~~~
+If we have
+  case tagToEnum (a ># b) of
+     False -> e1
+     True  -> e2
+
+then caseRules for TagToEnum will turn it into
+  case tagToEnum (a ># b) of
+     0# -> e1
+     1# -> e2
+
+Since the case is exhaustive (all cases are) we can convert it to
+  case tagToEnum (a ># b) of
+     DEFAULT -> e1
+     1#      -> e2
+
+This may generate sligthtly better code (although it should not, since
+all cases are exhaustive) and/or optimise better.  I'm not certain that
+it's necessary, but currenty we do make this change.  We do it here,
+NOT in the TagToEnum rules (see "Beware" in Note [caseRules for tagToEnum]
+in PrelRules)
+-}
+
 --------------------------------------------------
 --      Catch-all
 --------------------------------------------------
 mkCase3 _dflags scrut bndr alts_ty alts
   = return (Case scrut bndr alts_ty alts)
 
+-- See Note [Exitification] and Note [Do not inline exit join points] in Exitify.hs
+-- This lives here (and not in Id) because occurrence info is only valid on
+-- InIds, so it's crucial that isExitJoinId is only called on freshly
+-- occ-analysed code. It's not a generic function you can call anywhere.
+isExitJoinId :: Var -> Bool
+isExitJoinId id = isJoinId id && isOneOcc (idOccInfo id) && occ_in_lam (idOccInfo id)
+
 {-
 Note [Dead binders]
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/compiler/simplCore/Simplify.hs b/compiler/simplCore/Simplify.hs
index 91ed644057..872973925f 100644
--- a/compiler/simplCore/Simplify.hs
+++ b/compiler/simplCore/Simplify.hs
@@ -10,6 +10,8 @@ module Simplify ( simplTopBinds, simplExpr, simplRules ) where
 
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import DynFlags
 import SimplMonad
 import Type hiding      ( substTy, substTyVar, extendTvSubst, extendCvSubst )
@@ -22,29 +24,28 @@ import Id
 import MkId             ( seqId )
 import MkCore           ( mkImpossibleExpr, castBottomExpr )
 import IdInfo
-import Name             ( Name, mkSystemVarName, isExternalName, getOccFS )
+import Name             ( mkSystemVarName, isExternalName, getOccFS )
 import Coercion hiding  ( substCo, substCoVar )
 import OptCoercion      ( optCoercion )
 import FamInstEnv       ( topNormaliseType_maybe )
-import DataCon          ( DataCon, dataConWorkId, dataConRepStrictness, dataConRepArgTys )
---import TyCon            ( isEnumerationTyCon ) -- temporalily commented out. See #8326
-import CoreMonad        ( Tick(..), SimplifierMode(..) )
+import DataCon          ( DataCon, dataConWorkId, dataConRepStrictness
+                        , dataConRepArgTys, isUnboxedTupleCon
+                        , StrictnessMark (..) )
+import CoreMonad        ( Tick(..), SimplMode(..) )
 import CoreSyn
 import Demand           ( StrictSig(..), dmdTypeDepth, isStrictDmd )
 import PprCore          ( pprCoreExpr )
 import CoreUnfold
 import CoreUtils
-import CoreArity
 import CoreOpt          ( pushCoTyArg, pushCoValArg
                         , joinPointBinding_maybe, joinPointBindings_maybe )
---import PrimOp           ( tagToEnumKey ) -- temporalily commented out. See #8326
 import Rules            ( mkRuleInfo, lookupRule, getRules )
---import TysPrim          ( intPrimTy ) -- temporalily commented out. See #8326
+import Demand           ( mkClosedStrictSig, topDmd, exnRes )
 import BasicTypes       ( TopLevelFlag(..), isNotTopLevel, isTopLevel,
-                          RecFlag(..) )
-import MonadUtils       ( foldlM, mapAccumLM, liftIO )
-import Maybes           ( isJust, fromJust, orElse, catMaybes )
---import Unique           ( hasKey ) -- temporalily commented out. See #8326
+                          RecFlag(..), Arity )
+import MonadUtils       ( mapAccumLM, liftIO )
+import Var              ( isTyCoVar )
+import Maybes           (  orElse )
 import Control.Monad
 import Outputable
 import FastString
@@ -52,147 +53,57 @@ import Pair
 import Util
 import ErrUtils
 import Module          ( moduleName, pprModuleName )
+import PrimOp          ( PrimOp (SeqOp) )
+
 
 {-
 The guts of the simplifier is in this module, but the driver loop for
 the simplifier is in SimplCore.hs.
 
+Note [The big picture]
+~~~~~~~~~~~~~~~~~~~~~~
+The general shape of the simplifier is this:
 
------------------------------------------
-        *** IMPORTANT NOTE ***
------------------------------------------
-The simplifier used to guarantee that the output had no shadowing, but
-it does not do so any more.   (Actually, it never did!)  The reason is
-documented with simplifyArgs.
-
-
------------------------------------------
-        *** IMPORTANT NOTE ***
------------------------------------------
-Many parts of the simplifier return a bunch of "floats" as well as an
-expression. This is wrapped as a datatype SimplUtils.FloatsWith.
-
-All "floats" are let-binds, not case-binds, but some non-rec lets may
-be unlifted (with RHS ok-for-speculation).
-
-
-
------------------------------------------
-        ORGANISATION OF FUNCTIONS
------------------------------------------
-simplTopBinds
-  - simplify all top-level binders
-  - for NonRec, call simplRecOrTopPair
-  - for Rec,    call simplRecBind
-
-
-        ------------------------------
-simplExpr (applied lambda)      ==> simplNonRecBind
-simplExpr (Let (NonRec ...) ..) ==> simplNonRecBind
-simplExpr (Let (Rec ...)    ..) ==> simplify binders; simplRecBind
-
-        ------------------------------
-simplRecBind    [binders already simplfied]
-  - use simplRecOrTopPair on each pair in turn
-
-simplRecOrTopPair [binder already simplified]
-  Used for: recursive bindings (top level and nested)
-            top-level non-recursive bindings
-  Returns:
-  - check for PreInlineUnconditionally
-  - simplLazyBind
-
-simplNonRecBind
-  Used for: non-top-level non-recursive bindings
-            beta reductions (which amount to the same thing)
-  Because it can deal with strict arts, it takes a
-        "thing-inside" and returns an expression
-
-  - check for PreInlineUnconditionally
-  - simplify binder, including its IdInfo
-  - if strict binding
-        simplStrictArg
-        mkAtomicArgs
-        completeNonRecX
-    else
-        simplLazyBind
-        addFloats
-
-simplNonRecX:   [given a *simplified* RHS, but an *unsimplified* binder]
-  Used for: binding case-binder and constr args in a known-constructor case
-  - check for PreInLineUnconditionally
-  - simplify binder
-  - completeNonRecX
-
-        ------------------------------
-simplLazyBind:  [binder already simplified, RHS not]
-  Used for: recursive bindings (top level and nested)
-            top-level non-recursive bindings
-            non-top-level, but *lazy* non-recursive bindings
-        [must not be strict or unboxed]
-  Returns floats + an augmented environment, not an expression
-  - substituteIdInfo and add result to in-scope
-        [so that rules are available in rec rhs]
-  - simplify rhs
-  - mkAtomicArgs
-  - float if exposes constructor or PAP
-  - completeBind
-
-
-completeNonRecX:        [binder and rhs both simplified]
-  - if the the thing needs case binding (unlifted and not ok-for-spec)
-        build a Case
-   else
-        completeBind
-        addFloats
-
-completeBind:   [given a simplified RHS]
-        [used for both rec and non-rec bindings, top level and not]
-  - try PostInlineUnconditionally
-  - add unfolding [this is the only place we add an unfolding]
-  - add arity
-
-
-
-Right hand sides and arguments
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-In many ways we want to treat
-        (a) the right hand side of a let(rec), and
-        (b) a function argument
-in the same way.  But not always!  In particular, we would
-like to leave these arguments exactly as they are, so they
-will match a RULE more easily.
-
-        f (g x, h x)
-        g (+ x)
-
-It's harder to make the rule match if we ANF-ise the constructor,
-or eta-expand the PAP:
+  simplExpr :: SimplEnv -> InExpr -> SimplCont -> SimplM (SimplFloats, OutExpr)
+  simplBind :: SimplEnv -> InBind -> SimplM (SimplFloats, SimplEnv)
 
-        f (let { a = g x; b = h x } in (a,b))
-        g (\y. + x y)
+ * SimplEnv contains
+     - Simplifier mode (which includes DynFlags for convenience)
+     - Ambient substitution
+     - InScopeSet
 
-On the other hand if we see the let-defns
+ * SimplFloats contains
+     - Let-floats (which includes ok-for-spec case-floats)
+     - Join floats
+     - InScopeSet (including all the floats)
 
-        p = (g x, h x)
-        q = + x
+ * Expressions
+      simplExpr :: SimplEnv -> InExpr -> SimplCont
+                -> SimplM (SimplFloats, OutExpr)
+   The result of simplifying an /expression/ is (floats, expr)
+      - A bunch of floats (let bindings, join bindings)
+      - A simplified expression.
+   The overall result is effectively (let floats in expr)
 
-then we *do* want to ANF-ise and eta-expand, so that p and q
-can be safely inlined.
+ * Bindings
+      simplBind :: SimplEnv -> InBind -> SimplM (SimplFloats, SimplEnv)
+   The result of simplifying a binding is
+     - A bunch of floats, the last of which is the simplified binding
+       There may be auxiliary bindings too; see prepareRhs
+     - An environment suitable for simplifying the scope of the binding
 
-Even floating lets out is a bit dubious.  For let RHS's we float lets
-out if that exposes a value, so that the value can be inlined more vigorously.
-For example
+   The floats may also be empty, if the binding is inlined unconditionally;
+   in that case the returned SimplEnv will have an augmented substitution.
 
-        r = let x = e in (x,x)
+   The returned floats and env both have an in-scope set, and they are
+   guaranteed to be the same.
 
-Here, if we float the let out we'll expose a nice constructor. We did experiments
-that showed this to be a generally good thing.  But it was a bad thing to float
-lets out unconditionally, because that meant they got allocated more often.
 
-For function arguments, there's less reason to expose a constructor (it won't
-get inlined).  Just possibly it might make a rule match, but I'm pretty skeptical.
-So for the moment we don't float lets out of function arguments either.
+Note [Shadowing]
+~~~~~~~~~~~~~~~~
+The simplifier used to guarantee that the output had no shadowing, but
+it does not do so any more.   (Actually, it never did!)  The reason is
+documented with simplifyArgs.
 
 
 Eta expansion
@@ -206,36 +117,6 @@ lambdas together.  And in general that's a good thing to do.  Perhaps
 we should eta expand wherever we find a (value) lambda?  Then the eta
 expansion at a let RHS can concentrate solely on the PAP case.
 
-
-Case-of-case and join points
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When we perform the case-of-case transform (or otherwise push continuations
-inward), we want to treat join points specially. Since they're always
-tail-called and we want to maintain this invariant, we can do this (for any
-evaluation context E):
-
-  E[join j = e
-    in case ... of
-         A -> jump j 1
-         B -> jump j 2
-         C -> f 3]
-
-    -->
-
-  join j = E[e]
-  in case ... of
-       A -> jump j 1
-       B -> jump j 2
-       C -> E[f 3]
-
-As is evident from the example, there are two components to this behavior:
-
-  1. When entering the RHS of a join point, copy the context inside.
-  2. When a join point is invoked, discard the outer context.
-
-Clearly we need to be very careful here to remain consistent---neither part is
-optional!
-
 ************************************************************************
 *                                                                      *
 \subsection{Bindings}
@@ -243,38 +124,39 @@ optional!
 ************************************************************************
 -}
 
-simplTopBinds :: SimplEnv -> [InBind] -> SimplM SimplEnv
-
+simplTopBinds :: SimplEnv -> [InBind] -> SimplM (SimplFloats, SimplEnv)
+-- See Note [The big picture]
 simplTopBinds env0 binds0
   = do  {       -- Put all the top-level binders into scope at the start
                 -- so that if a transformation rule has unexpectedly brought
                 -- anything into scope, then we don't get a complaint about that.
                 -- It's rather as if the top-level binders were imported.
                 -- See note [Glomming] in OccurAnal.
-        ; env1 <- simplRecBndrs env0 (bindersOfBinds binds0)
-        ; env2 <- simpl_binds env1 binds0
+        ; env1 <- {-#SCC "simplTopBinds-simplRecBndrs" #-} simplRecBndrs env0 (bindersOfBinds binds0)
+        ; (floats, env2) <- {-#SCC "simplTopBinds-simpl_binds" #-} simpl_binds env1 binds0
         ; freeTick SimplifierDone
-        ; return env2 }
+        ; return (floats, env2) }
   where
         -- We need to track the zapped top-level binders, because
         -- they should have their fragile IdInfo zapped (notably occurrence info)
         -- That's why we run down binds and bndrs' simultaneously.
         --
-    simpl_binds :: SimplEnv -> [InBind] -> SimplM SimplEnv
-    simpl_binds env []           = return env
-    simpl_binds env (bind:binds) = do { env' <- simpl_bind env bind
-                                      ; simpl_binds env' binds }
-
-    simpl_bind env (Rec pairs)  = simplRecBind env TopLevel Nothing pairs
-    simpl_bind env (NonRec b r) = do { (env', b') <- addBndrRules env b (lookupRecBndr env b)
-                                     ; simplRecOrTopPair env' TopLevel
-                                                         NonRecursive Nothing
-                                                         b b' r }
+    simpl_binds :: SimplEnv -> [InBind] -> SimplM (SimplFloats, SimplEnv)
+    simpl_binds env []           = return (emptyFloats env, env)
+    simpl_binds env (bind:binds) = do { (float,  env1) <- simpl_bind env bind
+                                      ; (floats, env2) <- simpl_binds env1 binds
+                                      ; return (float `addFloats` floats, env2) }
+
+    simpl_bind env (Rec pairs)
+      = simplRecBind env TopLevel Nothing pairs
+    simpl_bind env (NonRec b r)
+      = do { (env', b') <- addBndrRules env b (lookupRecBndr env b) Nothing
+           ; simplRecOrTopPair env' TopLevel NonRecursive Nothing b b' r }
 
 {-
 ************************************************************************
 *                                                                      *
-\subsection{Lazy bindings}
+        Lazy bindings
 *                                                                      *
 ************************************************************************
 
@@ -282,28 +164,27 @@ simplRecBind is used for
         * recursive bindings only
 -}
 
-simplRecBind :: SimplEnv -> TopLevelFlag -> Maybe SimplCont
+simplRecBind :: SimplEnv -> TopLevelFlag -> MaybeJoinCont
              -> [(InId, InExpr)]
-             -> SimplM SimplEnv
+             -> SimplM (SimplFloats, SimplEnv)
 simplRecBind env0 top_lvl mb_cont pairs0
   = do  { (env_with_info, triples) <- mapAccumLM add_rules env0 pairs0
-        ; env1 <- go (zapFloats env_with_info) triples
-        ; return (env0 `addRecFloats` env1) }
-        -- addRecFloats adds the floats from env1,
-        -- _and_ updates env0 with the in-scope set from env1
+        ; (rec_floats, env1) <- go env_with_info triples
+        ; return (mkRecFloats rec_floats, env1) }
   where
     add_rules :: SimplEnv -> (InBndr,InExpr) -> SimplM (SimplEnv, (InBndr, OutBndr, InExpr))
         -- Add the (substituted) rules to the binder
     add_rules env (bndr, rhs)
-        = do { (env', bndr') <- addBndrRules env bndr (lookupRecBndr env bndr)
+        = do { (env', bndr') <- addBndrRules env bndr (lookupRecBndr env bndr) mb_cont
              ; return (env', (bndr, bndr', rhs)) }
 
-    go env [] = return env
+    go env [] = return (emptyFloats env, env)
 
     go env ((old_bndr, new_bndr, rhs) : pairs)
-        = do { env' <- simplRecOrTopPair env top_lvl Recursive mb_cont
-                                         old_bndr new_bndr rhs
-             ; go env' pairs }
+        = do { (float, env1) <- simplRecOrTopPair env top_lvl Recursive mb_cont
+                                                  old_bndr new_bndr rhs
+             ; (floats, env2) <- go env1 pairs
+             ; return (float `addFloats` floats, env2) }
 
 {-
 simplOrTopPair is used for
@@ -314,59 +195,40 @@ It assumes the binder has already been simplified, but not its IdInfo.
 -}
 
 simplRecOrTopPair :: SimplEnv
-                  -> TopLevelFlag -> RecFlag -> Maybe SimplCont
+                  -> TopLevelFlag -> RecFlag -> MaybeJoinCont
                   -> InId -> OutBndr -> InExpr  -- Binder and rhs
-                  -> SimplM SimplEnv    -- Returns an env that includes the binding
+                  -> SimplM (SimplFloats, SimplEnv)
 
 simplRecOrTopPair env top_lvl is_rec mb_cont old_bndr new_bndr rhs
-  = do { dflags <- getDynFlags
-       ; trace_bind dflags $
-           if preInlineUnconditionally dflags env top_lvl old_bndr rhs
-                    -- Check for unconditional inline
-           then do tick (PreInlineUnconditionally old_bndr)
-                   return (extendIdSubst env old_bndr (mkContEx env rhs))
-           else simplBind env top_lvl is_rec mb_cont old_bndr new_bndr rhs env }
+  | Just env' <- preInlineUnconditionally env top_lvl old_bndr rhs env
+  = {-#SCC "simplRecOrTopPair-pre-inline-uncond" #-}
+    trace_bind "pre-inline-uncond" $
+    do { tick (PreInlineUnconditionally old_bndr)
+       ; return ( emptyFloats env, env' ) }
+
+  | Just cont <- mb_cont
+  = {-#SCC "simplRecOrTopPair-join" #-}
+    ASSERT( isNotTopLevel top_lvl && isJoinId new_bndr )
+    trace_bind "join" $
+    simplJoinBind env cont old_bndr new_bndr rhs env
+
+  | otherwise
+  = {-#SCC "simplRecOrTopPair-normal" #-}
+    trace_bind "normal" $
+    simplLazyBind env top_lvl is_rec old_bndr new_bndr rhs env
+
   where
-    trace_bind dflags thing_inside
+    dflags = seDynFlags env
+
+    -- trace_bind emits a trace for each top-level binding, which
+    -- helps to locate the tracing for inlining and rule firing
+    trace_bind what thing_inside
       | not (dopt Opt_D_verbose_core2core dflags)
       = thing_inside
       | otherwise
-      = pprTrace "SimplBind" (ppr old_bndr) thing_inside
-        -- trace_bind emits a trace for each top-level binding, which
-        -- helps to locate the tracing for inlining and rule firing
-
-{-
-simplBind is used for
-  * [simplRecOrTopPair] recursive bindings (whether top level or not)
-  * [simplRecOrTopPair] top-level non-recursive bindings
-  * [simplNonRecE]      non-top-level *lazy* non-recursive bindings
-
-Nota bene:
-    1. It assumes that the binder is *already* simplified,
-       and is in scope, and its IdInfo too, except unfolding
-
-    2. It assumes that the binder type is lifted.
-
-    3. It does not check for pre-inline-unconditionally;
-       that should have been done already.
--}
-
-simplBind :: SimplEnv
-          -> TopLevelFlag -> RecFlag -> Maybe SimplCont
-          -> InId -> OutId      -- Binder, both pre-and post simpl
-                                -- Can be a JoinId
-                                -- The OutId has IdInfo, except arity, unfolding
-                                -- Ids only, no TyVars
-          -> InExpr -> SimplEnv -- The RHS and its environment
-          -> SimplM SimplEnv
-simplBind env top_lvl is_rec mb_cont bndr bndr1 rhs rhs_se
-  | ASSERT( isId bndr1 )
-    isJoinId bndr1
-  = ASSERT(isNotTopLevel top_lvl && isJust mb_cont)
-    simplJoinBind env is_rec (fromJust mb_cont) bndr bndr1 rhs rhs_se
-  | otherwise
-  = simplLazyBind env top_lvl is_rec bndr bndr1 rhs rhs_se
+      = pprTrace ("SimplBind " ++ what) (ppr old_bndr) thing_inside
 
+--------------------------
 simplLazyBind :: SimplEnv
               -> TopLevelFlag -> RecFlag
               -> InId -> OutId          -- Binder, both pre-and post simpl
@@ -374,7 +236,7 @@ simplLazyBind :: SimplEnv
                                         -- The OutId has IdInfo, except arity, unfolding
                                         -- Ids only, no TyVars
               -> InExpr -> SimplEnv     -- The RHS and its environment
-              -> SimplM SimplEnv
+              -> SimplM (SimplFloats, SimplEnv)
 -- Precondition: not a JoinId
 -- Precondition: rhs obeys the let/app invariant
 -- NOT used for JoinIds
@@ -382,7 +244,7 @@ simplLazyBind env top_lvl is_rec bndr bndr1 rhs rhs_se
   = ASSERT( isId bndr )
     ASSERT2( not (isJoinId bndr), ppr bndr )
     -- pprTrace "simplLazyBind" ((ppr bndr <+> ppr bndr1) $$ ppr rhs $$ ppr (seIdSubst rhs_se)) $
-    do  { let   rhs_env     = rhs_se `setInScopeAndZapFloats` env
+    do  { let   rhs_env     = rhs_se `setInScopeFromE` env
                 (tvs, body) = case collectTyAndValBinders rhs of
                                 (tvs, [], body)
                                   | surely_not_lam body -> (tvs, body)
@@ -399,151 +261,120 @@ simplLazyBind env top_lvl is_rec bndr bndr1 rhs rhs_se
                         -- should eta-reduce.
 
 
-        ; (body_env, tvs') <- simplBinders rhs_env tvs
+        ; (body_env, tvs') <- {-#SCC "simplBinders" #-} simplBinders rhs_env tvs
                 -- See Note [Floating and type abstraction] in SimplUtils
 
         -- Simplify the RHS
         ; let rhs_cont = mkRhsStop (substTy body_env (exprType body))
-        ; (body_env0, body0) <- simplExprF body_env body rhs_cont
-        ; let (body_env1, body1) = wrapJoinFloatsX body_env0 body0
-
-        -- ANF-ise a constructor or PAP rhs
-        ; (body_env2, body2) <- prepareRhs top_lvl body_env1 bndr1 body1
+        ; (body_floats0, body0) <- {-#SCC "simplExprF" #-} simplExprF body_env body rhs_cont
 
-        -- We need body_env2 for its let-floats (only);
-        -- we've dealt with its join-floats, which are now empty
-        ; (env', rhs')
-            <-  if not (doFloatFromRhs top_lvl is_rec False body2 body_env2)
-                then                            -- No floating, revert to body1
-                     do { rhs' <- mkLam env tvs' (wrapFloats body_env1 body1) rhs_cont
-                        ; return (env, rhs') }
+              -- Never float join-floats out of a non-join let-binding
+              -- So wrap the body in the join-floats right now
+              -- Henc: body_floats1 consists only of let-floats
+        ; let (body_floats1, body1) = wrapJoinFloatsX body_floats0 body0
 
-                else if null tvs then           -- Simple floating
+        -- ANF-ise a constructor or PAP rhs
+        -- We get at most one float per argument here
+        ; (let_floats, body2) <- {-#SCC "prepareRhs" #-} prepareRhs (getMode env) top_lvl
+                                            (getOccFS bndr1) (idInfo bndr1) body1
+        ; let body_floats2 = body_floats1 `addLetFloats` let_floats
+
+        ; (rhs_floats, rhs')
+            <-  if not (doFloatFromRhs top_lvl is_rec False body_floats2 body2)
+                then                    -- No floating, revert to body1
+                     {-#SCC "simplLazyBind-no-floating" #-}
+                     do { rhs' <- mkLam env tvs' (wrapFloats body_floats2 body1) rhs_cont
+                        ; return (emptyFloats env, rhs') }
+
+                else if null tvs then   -- Simple floating
+                     {-#SCC "simplLazyBind-simple-floating" #-}
                      do { tick LetFloatFromLet
-                        ; return (addLetFloats env body_env2, body2) }
+                        ; return (body_floats2, body2) }
 
-                else                            -- Do type-abstraction first
+                else                    -- Do type-abstraction first
+                     {-#SCC "simplLazyBind-type-abstraction-first" #-}
                      do { tick LetFloatFromLet
-                        ; (poly_binds, body3) <- abstractFloats tvs' body_env2 body2
+                        ; (poly_binds, body3) <- abstractFloats (seDynFlags env) top_lvl
+                                                                tvs' body_floats2 body2
+                        ; let floats = foldl' extendFloats (emptyFloats env) poly_binds
                         ; rhs' <- mkLam env tvs' body3 rhs_cont
-                        ; env' <- foldlM (addPolyBind top_lvl) env poly_binds
-                        ; return (env', rhs') }
+                        ; return (floats, rhs') }
 
-        ; completeBind env' top_lvl is_rec Nothing bndr bndr1 rhs' }
+        ; (bind_float, env2) <- completeBind (env `setInScopeFromF` rhs_floats)
+                                             top_lvl Nothing bndr bndr1 rhs'
+        ; return (rhs_floats `addFloats` bind_float, env2) }
 
+--------------------------
 simplJoinBind :: SimplEnv
-              -> RecFlag
               -> SimplCont
               -> InId -> OutId          -- Binder, both pre-and post simpl
                                         -- The OutId has IdInfo, except arity,
                                         --   unfolding
-              -> InExpr -> SimplEnv     -- The RHS and its environment
-              -> SimplM SimplEnv
-simplJoinBind env is_rec cont bndr bndr1 rhs rhs_se
-  = -- pprTrace "simplLazyBind" ((ppr bndr <+> ppr bndr1) $$
-    --                           ppr rhs $$ ppr (seIdSubst rhs_se)) $
-    do  { let rhs_env = rhs_se `setInScopeAndZapFloats` env
-        ; rhs' <- simplJoinRhs rhs_env bndr rhs cont
-        ; completeBind env NotTopLevel is_rec (Just cont) bndr bndr1 rhs' }
-
-{-
-A specialised variant of simplNonRec used when the RHS is already simplified,
-notably in knownCon.  It uses case-binding where necessary.
--}
-
+              -> InExpr -> SimplEnv     -- The right hand side and its env
+              -> SimplM (SimplFloats, SimplEnv)
+simplJoinBind env cont old_bndr new_bndr rhs rhs_se
+  = do  { let rhs_env = rhs_se `setInScopeFromE` env
+        ; rhs' <- simplJoinRhs rhs_env old_bndr rhs cont
+        ; completeBind env NotTopLevel (Just cont) old_bndr new_bndr rhs' }
+
+--------------------------
 simplNonRecX :: SimplEnv
              -> InId            -- Old binder; not a JoinId
              -> OutExpr         -- Simplified RHS
-             -> SimplM SimplEnv
+             -> SimplM (SimplFloats, SimplEnv)
+-- A specialised variant of simplNonRec used when the RHS is already
+-- simplified, notably in knownCon.  It uses case-binding where necessary.
+--
 -- Precondition: rhs satisfies the let/app invariant
+
 simplNonRecX env bndr new_rhs
   | ASSERT2( not (isJoinId bndr), ppr bndr )
     isDeadBinder bndr   -- Not uncommon; e.g. case (a,b) of c { (p,q) -> p }
-  = return env    --  Here c is dead, and we avoid creating
-                  --   the binding c = (a,b)
+  = return (emptyFloats env, env)    --  Here c is dead, and we avoid
+                                         --  creating the binding c = (a,b)
 
   | Coercion co <- new_rhs
-  = return (extendCvSubst env bndr co)
+  = return (emptyFloats env, extendCvSubst env bndr co)
 
   | otherwise
   = do  { (env', bndr') <- simplBinder env bndr
         ; completeNonRecX NotTopLevel env' (isStrictId bndr) bndr bndr' new_rhs }
                 -- simplNonRecX is only used for NotTopLevel things
 
+--------------------------
 completeNonRecX :: TopLevelFlag -> SimplEnv
                 -> Bool
                 -> InId                 -- Old binder; not a JoinId
                 -> OutId                -- New binder
                 -> OutExpr              -- Simplified RHS
-                -> SimplM SimplEnv      -- The new binding extends the seLetFloats
-                                        -- of the resulting SimpleEnv
+                -> SimplM (SimplFloats, SimplEnv)    -- The new binding is in the floats
 -- Precondition: rhs satisfies the let/app invariant
 --               See Note [CoreSyn let/app invariant] in CoreSyn
 
 completeNonRecX top_lvl env is_strict old_bndr new_bndr new_rhs
   = ASSERT2( not (isJoinId new_bndr), ppr new_bndr )
-    do  { (env1, rhs1) <- prepareRhs top_lvl (zapFloats env) new_bndr new_rhs
-        ; (env2, rhs2) <-
-                if doFloatFromRhs NotTopLevel NonRecursive is_strict rhs1 env1
-                then do { tick LetFloatFromLet
-                        ; return (addLetFloats env env1, rhs1) }   -- Add the floats to the main env
-                else return (env, wrapFloats env1 rhs1)         -- Wrap the floats around the RHS
-        ; completeBind env2 NotTopLevel NonRecursive Nothing
-                       old_bndr new_bndr rhs2 }
-
-{- No, no, no!  Do not try preInlineUnconditionally in completeNonRecX
-   Doing so risks exponential behaviour, because new_rhs has been simplified once already
-   In the cases described by the following comment, postInlineUnconditionally will
-   catch many of the relevant cases.
-        -- This happens; for example, the case_bndr during case of
-        -- known constructor:  case (a,b) of x { (p,q) -> ... }
-        -- Here x isn't mentioned in the RHS, so we don't want to
-        -- create the (dead) let-binding  let x = (a,b) in ...
-        --
-        -- Similarly, single occurrences can be inlined vigourously
-        -- e.g.  case (f x, g y) of (a,b) -> ....
-        -- If a,b occur once we can avoid constructing the let binding for them.
-
-   Furthermore in the case-binding case preInlineUnconditionally risks extra thunks
-        -- Consider     case I# (quotInt# x y) of
-        --                I# v -> let w = J# v in ...
-        -- If we gaily inline (quotInt# x y) for v, we end up building an
-        -- extra thunk:
-        --                let w = J# (quotInt# x y) in ...
-        -- because quotInt# can fail.
-
-  | preInlineUnconditionally env NotTopLevel bndr new_rhs
-  = thing_inside (extendIdSubst env bndr (DoneEx new_rhs))
--}
-
-----------------------------------
-{- Note [Avoiding exponential behaviour]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-One way in which we can get exponential behaviour is if we simplify a
-big expression, and the re-simplify it -- and then this happens in a
-deeply-nested way.  So we must be jolly careful about re-simplifying
-an expression.  That is why completeNonRecX does not try
-preInlineUnconditionally.
-
-Example:
-  f BIG, where f has a RULE
-Then
- * We simplify BIG before trying the rule; but the rule does not fire
- * We inline f = \x. x True
- * So if we did preInlineUnconditionally we'd re-simplify (BIG True)
+    do  { (prepd_floats, rhs1) <- prepareRhs (getMode env) top_lvl (getOccFS new_bndr)
+                                             (idInfo new_bndr) new_rhs
+        ; let floats = emptyFloats env `addLetFloats` prepd_floats
+        ; (rhs_floats, rhs2) <-
+                if doFloatFromRhs NotTopLevel NonRecursive is_strict floats rhs1
+                then    -- Add the floats to the main env
+                     do { tick LetFloatFromLet
+                        ; return (floats, rhs1) }
+                else    -- Do not float; wrap the floats around the RHS
+                     return (emptyFloats env, wrapFloats floats rhs1)
 
-However, if BIG has /not/ already been simplified, we'd /like/ to
-simplify BIG True; maybe good things happen.  That is why
+        ; (bind_float, env2) <- completeBind (env `setInScopeFromF` rhs_floats)
+                                             NotTopLevel Nothing
+                                             old_bndr new_bndr rhs2
+        ; return (rhs_floats `addFloats` bind_float, env2) }
 
-* simplLam has
-    - a case for (isSimplified dup), which goes via simplNonRecX, and
-    - a case for the un-simplified case, which goes via simplNonRecE
 
-* We go to some efforts to avoid unnecessarily simplifying ApplyToVal,
-  in at least two places
-    - In simplCast/addCoerce, where we check for isReflCo
-    - In rebuildCall we avoid simplifying arguments before we have to
-      (see Note [Trying rewrite rules])
+{- *********************************************************************
+*                                                                      *
+           prepareRhs, makeTrivial
+*                                                                      *
+************************************************************************
 
 Note [prepareRhs]
 ~~~~~~~~~~~~~~~~~
@@ -563,71 +394,68 @@ Here we want to make e1,e2 trivial and get
 That's what the 'go' loop in prepareRhs does
 -}
 
-prepareRhs :: TopLevelFlag -> SimplEnv -> OutId -> OutExpr -> SimplM (SimplEnv, OutExpr)
--- See Note [prepareRhs]
--- Adds new floats to the env iff that allows us to return a good RHS
+prepareRhs :: SimplMode -> TopLevelFlag
+           -> FastString   -- Base for any new variables
+           -> IdInfo       -- IdInfo for the LHS of this binding
+           -> OutExpr
+           -> SimplM (LetFloats, OutExpr)
+-- Transforms a RHS into a better RHS by adding floats
+-- e.g        x = Just e
+-- becomes    a = e
+--            x = Just a
 -- See Note [prepareRhs]
-prepareRhs top_lvl env id (Cast rhs co)    -- Note [Float coercions]
-  | Pair ty1 _ty2 <- coercionKind co       -- Do *not* do this if rhs has an unlifted type
-  , not (isUnliftedType ty1)            -- see Note [Float coercions (unlifted)]
-  = do  { (env', rhs') <- makeTrivialWithInfo top_lvl env (getOccFS id) sanitised_info rhs
-        ; return (env', Cast rhs' co) }
+prepareRhs mode top_lvl occ info (Cast rhs co)  -- Note [Float coercions]
+  | Pair ty1 _ty2 <- coercionKind co         -- Do *not* do this if rhs has an unlifted type
+  , not (isUnliftedType ty1)                 -- see Note [Float coercions (unlifted)]
+  = do  { (floats, rhs') <- makeTrivialWithInfo mode top_lvl occ sanitised_info rhs
+        ; return (floats, Cast rhs' co) }
   where
     sanitised_info = vanillaIdInfo `setStrictnessInfo` strictnessInfo info
-                                   `setDemandInfo` demandInfo info
-    info = idInfo id
+                                   `setDemandInfo`     demandInfo info
 
-prepareRhs top_lvl env0 id rhs0
-  = do  { (_is_exp, env1, rhs1) <- go 0 env0 rhs0
-        ; return (env1, rhs1) }
+prepareRhs mode top_lvl occ _ rhs0
+  = do  { (_is_exp, floats, rhs1) <- go 0 rhs0
+        ; return (floats, rhs1) }
   where
-    go n_val_args env (Cast rhs co)
-        = do { (is_exp, env', rhs') <- go n_val_args env rhs
-             ; return (is_exp, env', Cast rhs' co) }
-    go n_val_args env (App fun (Type ty))
-        = do { (is_exp, env', rhs') <- go n_val_args env fun
-             ; return (is_exp, env', App rhs' (Type ty)) }
-    go n_val_args env (App fun arg)
-        = do { (is_exp, env', fun') <- go (n_val_args+1) env fun
+    go :: Int -> OutExpr -> SimplM (Bool, LetFloats, OutExpr)
+    go n_val_args (Cast rhs co)
+        = do { (is_exp, floats, rhs') <- go n_val_args rhs
+             ; return (is_exp, floats, Cast rhs' co) }
+    go n_val_args (App fun (Type ty))
+        = do { (is_exp, floats, rhs') <- go n_val_args fun
+             ; return (is_exp, floats, App rhs' (Type ty)) }
+    go n_val_args (App fun arg)
+        = do { (is_exp, floats1, fun') <- go (n_val_args+1) fun
              ; case is_exp of
-                True -> do { (env'', arg') <- makeTrivial top_lvl env' (getOccFS id) arg
-                           ; return (True, env'', App fun' arg') }
-                False -> return (False, env, App fun arg) }
-    go n_val_args env (Var fun)
-        = return (is_exp, env, Var fun)
+                False -> return (False, emptyLetFloats, App fun arg)
+                True  -> do { (floats2, arg') <- makeTrivial mode top_lvl occ arg
+                            ; return (True, floats1 `addLetFlts` floats2, App fun' arg') } }
+    go n_val_args (Var fun)
+        = return (is_exp, emptyLetFloats, Var fun)
         where
           is_exp = isExpandableApp fun n_val_args   -- The fun a constructor or PAP
                         -- See Note [CONLIKE pragma] in BasicTypes
                         -- The definition of is_exp should match that in
                         -- OccurAnal.occAnalApp
 
-    go n_val_args env (Tick t rhs)
+    go n_val_args (Tick t rhs)
         -- We want to be able to float bindings past this
         -- tick. Non-scoping ticks don't care.
         | tickishScoped t == NoScope
-        = do { (is_exp, env', rhs') <- go n_val_args env rhs
-             ; return (is_exp, env', Tick t rhs') }
+        = do { (is_exp, floats, rhs') <- go n_val_args rhs
+             ; return (is_exp, floats, Tick t rhs') }
 
         -- On the other hand, for scoping ticks we need to be able to
         -- copy them on the floats, which in turn is only allowed if
         -- we can obtain non-counting ticks.
         | (not (tickishCounts t) || tickishCanSplit t)
-        = do { (is_exp, env', rhs') <- go n_val_args (zapFloats env) rhs
-                    -- env' has the extra let-bindings from
-                    -- the makeTrivial calls in 'go'; no join floats
-             ; let tickIt (id, expr)
-                       -- we have to take care not to tick top-level literal
-                       -- strings. See Note [CoreSyn top-level string literals].
-                     | isTopLevel top_lvl && exprIsLiteralString expr
-                     = (id, expr)
-                     | otherwise
-                     = (id, mkTick (mkNoCount t) expr)
-                   floats' = seLetFloats env `addFlts`
-                             mapFloats (seLetFloats env') tickIt
-             ; return (is_exp, env' { seLetFloats = floats' }, Tick t rhs') }
-
-    go _ env other
-        = return (False, env, other)
+        = do { (is_exp, floats, rhs') <- go n_val_args rhs
+             ; let tickIt (id, expr) = (id, mkTick (mkNoCount t) expr)
+                   floats' = mapLetFloats floats tickIt
+             ; return (is_exp, floats', Tick t rhs') }
+
+    go _ other
+        = return (False, emptyLetFloats, other)
 
 {-
 Note [Float coercions]
@@ -680,50 +508,55 @@ These strange casts can happen as a result of case-of-case
                 (# p,q #) -> p+q
 -}
 
-makeTrivialArg :: SimplEnv -> ArgSpec -> SimplM (SimplEnv, ArgSpec)
-makeTrivialArg env (ValArg e) = do
-    { (env', e') <- makeTrivial NotTopLevel env (fsLit "arg") e
-    ; return (env', ValArg e') }
-makeTrivialArg env arg        = return (env, arg)  -- CastBy, TyArg
-
-makeTrivial :: TopLevelFlag -> SimplEnv
-            -> FastString  -- ^ a "friendly name" to build the new binder from
-            -> OutExpr -> SimplM (SimplEnv, OutExpr)
+makeTrivialArg :: SimplMode -> ArgSpec -> SimplM (LetFloats, ArgSpec)
+makeTrivialArg mode (ValArg e)
+  = do { (floats, e') <- makeTrivial mode NotTopLevel (fsLit "arg") e
+       ; return (floats, ValArg e') }
+makeTrivialArg _ arg
+  = return (emptyLetFloats, arg)  -- CastBy, TyArg
+
+makeTrivial :: SimplMode -> TopLevelFlag
+            -> FastString  -- ^ A "friendly name" to build the new binder from
+            -> OutExpr     -- ^ This expression satisfies the let/app invariant
+            -> SimplM (LetFloats, OutExpr)
 -- Binds the expression to a variable, if it's not trivial, returning the variable
-makeTrivial top_lvl env context expr =
-    makeTrivialWithInfo top_lvl env context vanillaIdInfo expr
-
-makeTrivialWithInfo :: TopLevelFlag -> SimplEnv
-                    -> FastString
-                    -- ^ a "friendly name" to build the new binder from
-                    -> IdInfo -> OutExpr -> SimplM (SimplEnv, OutExpr)
+makeTrivial mode top_lvl context expr
+ = makeTrivialWithInfo mode top_lvl context vanillaIdInfo expr
+
+makeTrivialWithInfo :: SimplMode -> TopLevelFlag
+                    -> FastString  -- ^ a "friendly name" to build the new binder from
+                    -> IdInfo
+                    -> OutExpr     -- ^ This expression satisfies the let/app invariant
+                    -> SimplM (LetFloats, OutExpr)
 -- Propagate strictness and demand info to the new binder
 -- Note [Preserve strictness when floating coercions]
 -- Returned SimplEnv has same substitution as incoming one
-makeTrivialWithInfo top_lvl env context info expr
+makeTrivialWithInfo mode top_lvl occ_fs info expr
   | exprIsTrivial expr                          -- Already trivial
   || not (bindingOk top_lvl expr expr_ty)       -- Cannot trivialise
                                                 --   See Note [Cannot trivialise]
-  = return (env, expr)
-
-  | otherwise           -- See Note [Take care] below
-  = do  { uniq <- getUniqueM
-        ; let name = mkSystemVarName uniq context
-              var = mkLocalIdOrCoVarWithInfo name expr_ty info
-        ; env'  <- completeNonRecX top_lvl env False var var expr
-        ; expr' <- simplVar env' var
-        ; return (env', expr') }
-        -- The simplVar is needed because we're constructing a new binding
-        --     a = rhs
-        -- And if rhs is of form (rhs1 |> co), then we might get
-        --     a1 = rhs1
-        --     a = a1 |> co
-        -- and now a's RHS is trivial and can be substituted out, and that
-        -- is what completeNonRecX will do
-        -- To put it another way, it's as if we'd simplified
-        --    let var = e in var
-  where
-    expr_ty = exprType expr
+  = return (emptyLetFloats, expr)
+
+  | otherwise
+  = do  { (floats, expr1) <- prepareRhs mode top_lvl occ_fs info expr
+        ; if   exprIsTrivial expr1  -- See Note [Trivial after prepareRhs]
+          then return (floats, expr1)
+          else do
+        { uniq <- getUniqueM
+        ; let name = mkSystemVarName uniq occ_fs
+              var  = mkLocalIdOrCoVarWithInfo name expr_ty info
+
+        -- Now something very like completeBind,
+        -- but without the postInlineUnconditinoally part
+        ; (arity, is_bot, expr2) <- tryEtaExpandRhs mode var expr1
+        ; unf <- mkLetUnfolding (sm_dflags mode) top_lvl InlineRhs var expr2
+
+        ; let final_id = addLetBndrInfo var arity is_bot unf
+              bind     = NonRec final_id expr2
+
+        ; return ( floats `addLetFlts` unitLetFloat bind, Var final_id ) }}
+   where
+     expr_ty = exprType expr
 
 bindingOk :: TopLevelFlag -> CoreExpr -> Type -> Bool
 -- True iff we can have a binding of this expression at this level
@@ -732,10 +565,16 @@ bindingOk top_lvl expr expr_ty
   | isTopLevel top_lvl = exprIsTopLevelBindable expr expr_ty
   | otherwise          = True
 
-{-
+{- Note [Trivial after prepareRhs]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If we call makeTrival on (e |> co), the recursive use of prepareRhs
+may leave us with
+   { a1 = e }  and   (a1 |> co)
+Now the latter is trivial, so we don't want to let-bind it.
+
 Note [Cannot trivialise]
 ~~~~~~~~~~~~~~~~~~~~~~~~
-Consider tih
+Consider:
    f :: Int -> Addr#
 
    foo :: Bar
@@ -761,7 +600,7 @@ See Note [CoreSyn top-level string literals] in CoreSyn.
 
 ************************************************************************
 *                                                                      *
-\subsection{Completing a lazy binding}
+          Completing a lazy binding
 *                                                                      *
 ************************************************************************
 
@@ -787,22 +626,21 @@ Nor does it do the atomic-argument thing
 
 completeBind :: SimplEnv
              -> TopLevelFlag            -- Flag stuck into unfolding
-             -> RecFlag                 -- Recursive binding?
-             -> Maybe SimplCont         -- Required only for join point
+             -> MaybeJoinCont           -- Required only for join point
              -> InId                    -- Old binder
              -> OutId -> OutExpr        -- New binder and RHS
-             -> SimplM SimplEnv
+             -> SimplM (SimplFloats, SimplEnv)
 -- completeBind may choose to do its work
 --      * by extending the substitution (e.g. let x = y in ...)
 --      * or by adding to the floats in the envt
 --
 -- Binder /can/ be a JoinId
 -- Precondition: rhs obeys the let/app invariant
-completeBind env top_lvl is_rec mb_cont old_bndr new_bndr new_rhs
+completeBind env top_lvl mb_cont old_bndr new_bndr new_rhs
  | isCoVar old_bndr
  = case new_rhs of
-     Coercion co -> return (extendCvSubst env old_bndr co)
-     _           -> return (addNonRec env new_bndr new_rhs)
+     Coercion co -> return (emptyFloats env, extendCvSubst env old_bndr co)
+     _           -> return (mkFloatBind env (NonRec new_bndr new_rhs))
 
  | otherwise
  = ASSERT( isId new_bndr )
@@ -810,87 +648,61 @@ completeBind env top_lvl is_rec mb_cont old_bndr new_bndr new_rhs
             old_unf  = unfoldingInfo old_info
             occ_info = occInfo old_info
 
-        -- Do eta-expansion on the RHS of the binding
-        -- See Note [Eta-expanding at let bindings] in SimplUtils
-      ; (new_arity, final_rhs) <- if isJoinId new_bndr
-                                    then return (manifestArity new_rhs, new_rhs)
-                                         -- Note [Don't eta-expand join points]
-                                    else tryEtaExpandRhs env is_rec
-                                                         new_bndr new_rhs
+         -- Do eta-expansion on the RHS of the binding
+         -- See Note [Eta-expanding at let bindings] in SimplUtils
+      ; (new_arity, is_bot, final_rhs) <- tryEtaExpandRhs (getMode env)
+                                                          new_bndr new_rhs
 
         -- Simplify the unfolding
       ; new_unfolding <- simplLetUnfolding env top_lvl mb_cont old_bndr
-                                           final_rhs old_unf
+                                           final_rhs (idType new_bndr) old_unf
+
+      ; let final_bndr = addLetBndrInfo new_bndr new_arity is_bot new_unfolding
 
-      ; dflags <- getDynFlags
-      ; if postInlineUnconditionally dflags env top_lvl new_bndr occ_info
-                                     final_rhs new_unfolding
+      ; if postInlineUnconditionally env top_lvl final_bndr occ_info final_rhs
 
-                        -- Inline and discard the binding
-        then do  { tick (PostInlineUnconditionally old_bndr)
-                 ; return (extendIdSubst env old_bndr
-                              (DoneEx final_rhs (isJoinId_maybe new_bndr))) }
+        then -- Inline and discard the binding
+             do  { tick (PostInlineUnconditionally old_bndr)
+                 ; return ( emptyFloats env
+                          , extendIdSubst env old_bndr $
+                            DoneEx final_rhs (isJoinId_maybe new_bndr)) }
                 -- Use the substitution to make quite, quite sure that the
                 -- substitution will happen, since we are going to discard the binding
-        else
-   do { let info1 = idInfo new_bndr `setArityInfo` new_arity
-
-              -- Unfolding info: Note [Setting the new unfolding]
-            info2 = info1 `setUnfoldingInfo` new_unfolding
-
-              -- Demand info: Note [Setting the demand info]
-              --
-              -- We also have to nuke demand info if for some reason
-              -- eta-expansion *reduces* the arity of the binding to less
-              -- than that of the strictness sig. This can happen: see Note [Arity decrease].
-            info3 | isEvaldUnfolding new_unfolding
-                    || (case strictnessInfo info2 of
-                          StrictSig dmd_ty -> new_arity < dmdTypeDepth dmd_ty)
-                  = zapDemandInfo info2 `orElse` info2
-                  | otherwise
-                  = info2
-
-              -- Zap call arity info. We have used it by now (via
-              -- `tryEtaExpandRhs`), and the simplifier can invalidate this
-              -- information, leading to broken code later (e.g. #13479)
-            info4 = zapCallArityInfo info3
-
-            final_id = new_bndr `setIdInfo` info4
-
-      ; -- pprTrace "Binding" (ppr final_id <+> ppr new_unfolding) $
-        return (addNonRec env final_id final_rhs) } }
-                -- The addNonRec adds it to the in-scope set too
-
-------------------------------
-addPolyBind :: TopLevelFlag -> SimplEnv -> OutBind -> SimplM SimplEnv
--- Add a new binding to the environment, complete with its unfolding
--- but *do not* do postInlineUnconditionally, because we have already
--- processed some of the scope of the binding
--- We still want the unfolding though.  Consider
---      let
---            x = /\a. let y = ... in Just y
---      in body
--- Then we float the y-binding out (via abstractFloats and addPolyBind)
--- but 'x' may well then be inlined in 'body' in which case we'd like the
--- opportunity to inline 'y' too.
---
--- INVARIANT: the arity is correct on the incoming binders
 
-addPolyBind top_lvl env (NonRec poly_id rhs)
-  = do  { unfolding <- simplLetUnfolding env top_lvl Nothing poly_id rhs
-                                         noUnfolding
-                        -- Assumes that poly_id did not have an INLINE prag
-                        -- which is perhaps wrong.  ToDo: think about this
-        ; let final_id = setIdInfo poly_id $
-                         idInfo poly_id `setUnfoldingInfo` unfolding
+        else -- Keep the binding
+             -- pprTrace "Binding" (ppr final_bndr <+> ppr new_unfolding) $
+             return (mkFloatBind env (NonRec final_bndr final_rhs)) }
 
-        ; return (addNonRec env final_id rhs) }
+addLetBndrInfo :: OutId -> Arity -> Bool -> Unfolding -> OutId
+addLetBndrInfo new_bndr new_arity is_bot new_unf
+  = new_bndr `setIdInfo` info5
+  where
+    info1 = idInfo new_bndr `setArityInfo` new_arity
+
+    -- Unfolding info: Note [Setting the new unfolding]
+    info2 = info1 `setUnfoldingInfo` new_unf
+
+    -- Demand info: Note [Setting the demand info]
+    -- We also have to nuke demand info if for some reason
+    -- eta-expansion *reduces* the arity of the binding to less
+    -- than that of the strictness sig. This can happen: see Note [Arity decrease].
+    info3 | isEvaldUnfolding new_unf
+            || (case strictnessInfo info2 of
+                  StrictSig dmd_ty -> new_arity < dmdTypeDepth dmd_ty)
+          = zapDemandInfo info2 `orElse` info2
+          | otherwise
+          = info2
+
+    -- Bottoming bindings: see Note [Bottoming bindings]
+    info4 | is_bot    = info3 `setStrictnessInfo`
+                        mkClosedStrictSig (replicate new_arity topDmd) exnRes
+          | otherwise = info3
+
+     -- Zap call arity info. We have used it by now (via
+     -- `tryEtaExpandRhs`), and the simplifier can invalidate this
+     -- information, leading to broken code later (e.g. #13479)
+    info5 = zapCallArityInfo info4
 
-addPolyBind _ env bind@(Rec _)
-  = return (extendFloats env bind)
-        -- Hack: letrecs are more awkward, so we extend "by steam"
-        -- without adding unfoldings etc.  At worst this leads to
-        -- more simplifier iterations
 
 {- Note [Arity decrease]
 ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -915,6 +727,26 @@ Here opInt has arity 1; but when we apply the rule its arity drops to 0.
 That's why Specialise goes to a little trouble to pin the right arity
 on specialised functions too.
 
+Note [Bottoming bindings]
+~~~~~~~~~~~~~~~~~~~~~~~~~
+Suppose we have
+   let x = error "urk"
+   in ...(case x of <alts>)...
+or
+   let f = \x. error (x ++ "urk")
+   in ...(case f "foo" of <alts>)...
+
+Then we'd like to drop the dead <alts> immediately.  So it's good to
+propagate the info that x's RHS is bottom to x's IdInfo as rapidly as
+possible.
+
+We use tryEtaExpandRhs on every binding, and it turns ou that the
+arity computation it performs (via CoreArity.findRhsArity) already
+does a simple bottoming-expression analysis.  So all we need to do
+is propagate that info to the binder's IdInfo.
+
+This showed up in Trac #12150; see comment:16.
+
 Note [Setting the demand info]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 If the unfolding is a value, the demand info may
@@ -931,44 +763,6 @@ After inlining f at some of its call sites the original binding may
 (for example) be no longer strictly demanded.
 The solution here is a bit ad hoc...
 
-Note [Don't eta-expand join points]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Similarly to CPR (see Note [Don't CPR join points] in WorkWrap), a join point
-stands well to gain from its outer binding's eta-expansion, and eta-expanding a
-join point is fraught with issues like how to deal with a cast:
-
-    let join $j1 :: IO ()
-             $j1 = ...
-             $j2 :: Int -> IO ()
-             $j2 n = if n > 0 then $j1
-                              else ...
-
-    =>
-
-    let join $j1 :: IO ()
-             $j1 = (\eta -> ...)
-                     `cast` N:IO :: State# RealWorld -> (# State# RealWorld, ())
-                                 ~  IO ()
-             $j2 :: Int -> IO ()
-             $j2 n = (\eta -> if n > 0 then $j1
-                                       else ...)
-                     `cast` N:IO :: State# RealWorld -> (# State# RealWorld, ())
-                                 ~  IO ()
-
-The cast here can't be pushed inside the lambda (since it's not casting to a
-function type), so the lambda has to stay, but it can't because it contains a
-reference to a join point. In fact, $j2 can't be eta-expanded at all. Rather
-than try and detect this situation (and whatever other situations crop up!), we
-don't bother; again, any surrounding eta-expansion will improve these join
-points anyway, since an outer cast can *always* be pushed inside. By the time
-CorePrep comes around, the code is very likely to look more like this:
-
-    let join $j1 :: State# RealWorld -> (# State# RealWorld, ())
-             $j1 = (...) eta
-             $j2 :: Int -> State# RealWorld -> (# State# RealWorld, ())
-             $j2 = if n > 0 then $j1
-                            else (...) eta
 
 ************************************************************************
 *                                                                      *
@@ -1034,17 +828,17 @@ simplExprC :: SimplEnv
         -- Simplify an expression, given a continuation
 simplExprC env expr cont
   = -- pprTrace "simplExprC" (ppr expr $$ ppr cont {- $$ ppr (seIdSubst env) -} $$ ppr (seLetFloats env) ) $
-    do  { (env', expr') <- simplExprF (zapFloats env) expr cont
+    do  { (floats, expr') <- simplExprF env expr cont
         ; -- pprTrace "simplExprC ret" (ppr expr $$ ppr expr') $
           -- pprTrace "simplExprC ret3" (ppr (seInScope env')) $
           -- pprTrace "simplExprC ret4" (ppr (seLetFloats env')) $
-          return (wrapFloats env' expr') }
+          return (wrapFloats floats expr') }
 
 --------------------------------------------------
 simplExprF :: SimplEnv
            -> InExpr     -- A term-valued expression, never (Type ty)
            -> SimplCont
-           -> SimplM (SimplEnv, OutExpr)
+           -> SimplM (SimplFloats, OutExpr)
 
 simplExprF env e cont
   = {- pprTrace "simplExprF" (vcat
@@ -1054,12 +848,11 @@ simplExprF env e cont
       , text "tvsubst =" <+> ppr (seTvSubst env)
       , text "idsubst =" <+> ppr (seIdSubst env)
       , text "cvsubst =" <+> ppr (seCvSubst env)
-      {- , ppr (seLetFloats env) -}
       ]) $ -}
     simplExprF1 env e cont
 
 simplExprF1 :: SimplEnv -> InExpr -> SimplCont
-            -> SimplM (SimplEnv, OutExpr)
+            -> SimplM (SimplFloats, OutExpr)
 
 simplExprF1 _ (Type ty) _
   = pprPanic "simplExprF: type" (ppr ty)
@@ -1067,14 +860,14 @@ simplExprF1 _ (Type ty) _
     -- The (Type ty) case is handled separately by simplExpr
     -- and by the other callers of simplExprF
 
-simplExprF1 env (Var v)        cont = simplIdF env v cont
-simplExprF1 env (Lit lit)      cont = rebuild env (Lit lit) cont
-simplExprF1 env (Tick t expr)  cont = simplTick env t expr cont
-simplExprF1 env (Cast body co) cont = simplCast env body co cont
-simplExprF1 env (Coercion co)  cont = simplCoercionF env co cont
+simplExprF1 env (Var v)        cont = {-#SCC "simplIdF" #-} simplIdF env v cont
+simplExprF1 env (Lit lit)      cont = {-#SCC "rebuild" #-} rebuild env (Lit lit) cont
+simplExprF1 env (Tick t expr)  cont = {-#SCC "simplTick" #-} simplTick env t expr cont
+simplExprF1 env (Cast body co) cont = {-#SCC "simplCast" #-} simplCast env body co cont
+simplExprF1 env (Coercion co)  cont = {-#SCC "simplCoercionF" #-} simplCoercionF env co cont
 
 simplExprF1 env (App fun arg) cont
-  = case arg of
+  = {-#SCC "simplExprF1-App" #-} case arg of
       Type ty -> do { -- The argument type will (almost) certainly be used
                       -- in the output program, so just force it now.
                       -- See Note [Avoiding space leaks in OutType]
@@ -1094,7 +887,8 @@ simplExprF1 env (App fun arg) cont
                             , sc_dup = NoDup, sc_cont = cont }
 
 simplExprF1 env expr@(Lam {}) cont
-  = simplLam env zapped_bndrs body cont
+  = {-#SCC "simplExprF1-Lam" #-}
+    simplLam env zapped_bndrs body cont
         -- The main issue here is under-saturated lambdas
         --   (\x1. \x2. e) arg1
         -- Here x1 might have "occurs-once" occ-info, because occ-info
@@ -1116,28 +910,30 @@ simplExprF1 env expr@(Lam {}) cont
           | otherwise = zapLamIdInfo b
 
 simplExprF1 env (Case scrut bndr _ alts) cont
-  | sm_case_case (getMode env)
-  = simplExprF env scrut (Select { sc_dup = NoDup, sc_bndr = bndr
+  = {-#SCC "simplExprF1-Case" #-}
+    simplExprF env scrut (Select { sc_dup = NoDup, sc_bndr = bndr
                                  , sc_alts = alts
                                  , sc_env = env, sc_cont = cont })
-  | otherwise
-  = do { (env', scrut') <- simplExprF (zapFloats env) scrut $
-                           mkBoringStop (substTy env (idType bndr))
-       ; let scrut'' = wrapJoinFloats (seJoinFloats env') scrut'
-             env''   = env `addLetFloats` env'
-       ; rebuildCase env'' scrut'' bndr alts cont }
 
 simplExprF1 env (Let (Rec pairs) body) cont
-  = simplRecE env pairs body cont
+  | Just pairs' <- joinPointBindings_maybe pairs
+  = {-#SCC "simplRecJoinPoin" #-} simplRecJoinPoint env pairs' body cont
+
+  | otherwise
+  = {-#SCC "simplRecE" #-} simplRecE env pairs body cont
 
 simplExprF1 env (Let (NonRec bndr rhs) body) cont
   | Type ty <- rhs    -- First deal with type lets (let a = Type ty in e)
-  = ASSERT( isTyVar bndr )
+  = {-#SCC "simplExprF1-NonRecLet-Type" #-}
+    ASSERT( isTyVar bndr )
     do { ty' <- simplType env ty
        ; simplExprF (extendTvSubst env bndr ty') body cont }
 
+  | Just (bndr', rhs') <- joinPointBinding_maybe bndr rhs
+  = {-#SCC "simplNonRecJoinPoint" #-} simplNonRecJoinPoint env bndr' rhs' body cont
+
   | otherwise
-  = simplNonRecE env bndr (rhs, env) ([], body) cont
+  = {-#SCC "simplNonRecE" #-} simplNonRecE env bndr (rhs, env) ([], body) cont
 
 {- Note [Avoiding space leaks in OutType]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1213,15 +1009,16 @@ simplType env ty
 
 ---------------------------------
 simplCoercionF :: SimplEnv -> InCoercion -> SimplCont
-               -> SimplM (SimplEnv, OutExpr)
+               -> SimplM (SimplFloats, OutExpr)
 simplCoercionF env co cont
   = do { co' <- simplCoercion env co
        ; rebuild env (Coercion co') cont }
 
 simplCoercion :: SimplEnv -> InCoercion -> SimplM OutCoercion
 simplCoercion env co
-  = let opt_co = optCoercion (getTCvSubst env) co
-    in seqCo opt_co `seq` return opt_co
+  = do { dflags <- getDynFlags
+       ; let opt_co = optCoercion dflags (getTCvSubst env) co
+       ; seqCo opt_co `seq` return opt_co }
 
 -----------------------------------
 -- | Push a TickIt context outwards past applications and cases, as
@@ -1229,7 +1026,7 @@ simplCoercion env co
 -- optimisations apply.
 
 simplTick :: SimplEnv -> Tickish Id -> InExpr -> SimplCont
-          -> SimplM (SimplEnv, OutExpr)
+          -> SimplM (SimplFloats, OutExpr)
 simplTick env tickish expr cont
   -- A scoped tick turns into a continuation, so that we can spot
   -- (scc t (\x . e)) in simplLam and eliminate the scc.  If we didn't do
@@ -1256,8 +1053,8 @@ simplTick env tickish expr cont
   -- application context, allowing the normal case and application
   -- optimisations to fire.
   | tickish `tickishScopesLike` SoftScope
-  = do { (env', expr') <- simplExprF env expr cont
-       ; return (env', mkTick tickish expr')
+  = do { (floats, expr') <- simplExprF env expr cont
+       ; return (floats, mkTick tickish expr')
        }
 
   -- Push tick inside if the context looks like this will allow us to
@@ -1295,8 +1092,8 @@ simplTick env tickish expr cont
 
   no_floating_past_tick =
     do { let (inc,outc) = splitCont cont
-       ; (env1, expr1) <- simplExprF (zapFloats env) expr inc
-       ; let expr2    = wrapFloats env1 expr1
+       ; (floats, expr1) <- simplExprF env expr inc
+       ; let expr2    = wrapFloats floats expr1
              tickish' = simplTickish env tickish
        ; rebuild env (mkTick tickish' expr2) outc
        }
@@ -1378,27 +1175,28 @@ simplTick env tickish expr cont
 ************************************************************************
 -}
 
-rebuild :: SimplEnv -> OutExpr -> SimplCont -> SimplM (SimplEnv, OutExpr)
--- At this point the substitution in the SimplEnv should be irrelevant
--- only the in-scope set and floats should matter
+rebuild :: SimplEnv -> OutExpr -> SimplCont -> SimplM (SimplFloats, OutExpr)
+-- At this point the substitution in the SimplEnv should be irrelevant;
+-- only the in-scope set matters
 rebuild env expr cont
   = case cont of
-      Stop {}          -> return (env, expr)
+      Stop {}          -> return (emptyFloats env, expr)
       TickIt t cont    -> rebuild env (mkTick t expr) cont
       CastIt co cont   -> rebuild env (mkCast expr co) cont
                        -- NB: mkCast implements the (Coercion co |> g) optimisation
 
       Select { sc_bndr = bndr, sc_alts = alts, sc_env = se, sc_cont = cont }
-        -> rebuildCase (se `setFloats` env) expr bndr alts cont
+        -> rebuildCase (se `setInScopeFromE` env) expr bndr alts cont
 
       StrictArg { sc_fun = fun, sc_cont = cont }
         -> rebuildCall env (fun `addValArgTo` expr) cont
       StrictBind { sc_bndr = b, sc_bndrs = bs, sc_body = body
                  , sc_env = se, sc_cont = cont }
-        -> do { env' <- simplNonRecX (se `setFloats` env) b expr
-                             -- expr satisfies let/app since it started life
-                             -- in a call to simplNonRecE
-              ; simplLam env' bs body cont }
+        -> do { (floats1, env') <- simplNonRecX (se `setInScopeFromE` env) b expr
+                                  -- expr satisfies let/app since it started life
+                                  -- in a call to simplNonRecE
+              ; (floats2, expr') <- simplLam env' bs body cont
+              ; return (floats1 `addFloats` floats2, expr') }
 
       ApplyToTy  { sc_arg_ty = ty, sc_cont = cont}
         -> rebuild env (App expr (Type ty)) cont
@@ -1416,54 +1214,94 @@ rebuild env expr cont
 ************************************************************************
 -}
 
+{- Note [Optimising reflexivity]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+It's important (for compiler performance) to get rid of reflexivity as soon
+as it appears.  See Trac #11735, #14737, and #15019.
+
+In particular, we want to behave well on
+
+ *  e |> co1 |> co2
+    where the two happen to cancel out entirely. That is quite common;
+    e.g. a newtype wrapping and unwrapping cancel.
+
+
+ * (f |> co) @t1 @t2 ... @tn x1 .. xm
+   Here we wil use pushCoTyArg and pushCoValArg successively, which
+   build up NthCo stacks.  Silly to do that if co is reflexive.
+
+However, we don't want to call isReflexiveCo too much, because it uses
+type equality which is expensive on big types (Trac #14737 comment:7).
+
+A good compromise (determined experimentally) seems to be to call
+isReflexiveCo
+ * when composing casts, and
+ * at the end
+
+In investigating this I saw missed opportunities for on-the-fly
+coercion shrinkage. See Trac #15090.
+-}
+
+
 simplCast :: SimplEnv -> InExpr -> Coercion -> SimplCont
-          -> SimplM (SimplEnv, OutExpr)
+          -> SimplM (SimplFloats, OutExpr)
 simplCast env body co0 cont0
-  = do  { co1   <- simplCoercion env co0
-        ; cont1 <- addCoerce co1 cont0
-        ; simplExprF env body cont1 }
+  = do  { co1   <- {-#SCC "simplCast-simplCoercion" #-} simplCoercion env co0
+        ; cont1 <- {-#SCC "simplCast-addCoerce" #-}
+                   if isReflCo co1
+                   then return cont0  -- See Note [Optimising reflexivity]
+                   else addCoerce co1 cont0
+        ; {-#SCC "simplCast-simplExprF" #-} simplExprF env body cont1 }
   where
-       addCoerce :: OutCoercion -> SimplCont -> SimplM SimplCont
-       addCoerce co1 (CastIt co2 cont)
-         = addCoerce (mkTransCo co1 co2) cont
-
-       addCoerce co cont@(ApplyToTy { sc_arg_ty = arg_ty, sc_cont = tail })
-         | Just (arg_ty', co') <- pushCoTyArg co arg_ty
-         = do { tail' <- addCoerce co' tail
-              ; return (cont { sc_arg_ty = arg_ty', sc_cont = tail' }) }
-
-       addCoerce co cont@(ApplyToVal { sc_arg = arg, sc_env = arg_se
-                                , sc_dup = dup, sc_cont = tail })
-         | Just (co1, co2) <- pushCoValArg co
-         , Pair _ new_ty <- coercionKind co1
-         , not (isTypeLevPoly new_ty)  -- without this check, we get a lev-poly arg
-                                       -- See Note [Levity polymorphism invariants] in CoreSyn
-                                       -- test: typecheck/should_run/EtaExpandLevPoly
-         = do { tail' <- addCoerce co2 tail
-              ; if isReflCo co1
-                then return (cont { sc_cont = tail' })
-                     -- Avoid simplifying if possible;
-                     -- See Note [Avoiding exponential behaviour]
-                else do
-              { (dup', arg_se', arg') <- simplArg env dup arg_se arg
-                   -- When we build the ApplyTo we can't mix the OutCoercion
-                   -- 'co' with the InExpr 'arg', so we simplify
-                   -- to make it all consistent.  It's a bit messy.
-                   -- But it isn't a common case.
-                   -- Example of use: Trac #995
-              ; return (ApplyToVal { sc_arg  = mkCast arg' co1
-                                   , sc_env  = arg_se'
-                                   , sc_dup  = dup'
-                                   , sc_cont = tail' }) } }
-
-       addCoerce co cont
-         | isReflexiveCo co = return cont
-         | otherwise        = return (CastIt co cont)
-                -- It's worth checking isReflexiveCo.
-                -- For example, in the initial form of a worker
-                -- we may find  (coerce T (coerce S (\x.e))) y
-                -- and we'd like it to simplify to e[y/x] in one round
-                -- of simplification
+        -- If the first parameter is MRefl, then simplifying revealed a
+        -- reflexive coercion. Omit.
+        addCoerceM :: MOutCoercion -> SimplCont -> SimplM SimplCont
+        addCoerceM MRefl   cont = return cont
+        addCoerceM (MCo co) cont = addCoerce co cont
+
+        addCoerce :: OutCoercion -> SimplCont -> SimplM SimplCont
+        addCoerce co1 (CastIt co2 cont)  -- See Note [Optimising reflexivity]
+          | isReflexiveCo co' = return cont
+          | otherwise         = addCoerce co' cont
+          where
+            co' = mkTransCo co1 co2
+
+        addCoerce co cont@(ApplyToTy { sc_arg_ty = arg_ty, sc_cont = tail })
+          | Just (arg_ty', m_co') <- pushCoTyArg co arg_ty
+          = {-#SCC "addCoerce-pushCoTyArg" #-}
+            do { tail' <- addCoerceM m_co' tail
+               ; return (cont { sc_arg_ty = arg_ty', sc_cont = tail' }) }
+
+        addCoerce co cont@(ApplyToVal { sc_arg = arg, sc_env = arg_se
+                                      , sc_dup = dup, sc_cont = tail })
+          | Just (co1, m_co2) <- pushCoValArg co
+          , Pair _ new_ty <- coercionKind co1
+          , not (isTypeLevPoly new_ty)  -- Without this check, we get a lev-poly arg
+                                        -- See Note [Levity polymorphism invariants] in CoreSyn
+                                        -- test: typecheck/should_run/EtaExpandLevPoly
+          = {-#SCC "addCoerce-pushCoValArg" #-}
+            do { tail' <- addCoerceM m_co2 tail
+               ; if isReflCo co1
+                 then return (cont { sc_cont = tail' })
+                      -- Avoid simplifying if possible;
+                      -- See Note [Avoiding exponential behaviour]
+                 else do
+               { (dup', arg_se', arg') <- simplArg env dup arg_se arg
+                    -- When we build the ApplyTo we can't mix the OutCoercion
+                    -- 'co' with the InExpr 'arg', so we simplify
+                    -- to make it all consistent.  It's a bit messy.
+                    -- But it isn't a common case.
+                    -- Example of use: Trac #995
+               ; return (ApplyToVal { sc_arg  = mkCast arg' co1
+                                    , sc_env  = arg_se'
+                                    , sc_dup  = dup'
+                                    , sc_cont = tail' }) } }
+
+        addCoerce co cont
+          | isReflexiveCo co = return cont  -- Having this at the end makes a huge
+                                            -- difference in T12227, for some reason
+                                            -- See Note [Optimising reflexivity]
+          | otherwise        = return (CastIt co cont)
 
 simplArg :: SimplEnv -> DupFlag -> StaticEnv -> CoreExpr
          -> SimplM (DupFlag, StaticEnv, OutExpr)
@@ -1471,7 +1309,7 @@ simplArg env dup_flag arg_env arg
   | isSimplified dup_flag
   = return (dup_flag, arg_env, arg)
   | otherwise
-  = do { arg' <- simplExpr (arg_env `setInScopeAndZapFloats` env) arg
+  = do { arg' <- simplExpr (arg_env `setInScopeFromE` env) arg
        ; return (Simplified, zapSubstEnv arg_env, arg') }
 
 {-
@@ -1480,27 +1318,13 @@ simplArg env dup_flag arg_env arg
 \subsection{Lambdas}
 *                                                                      *
 ************************************************************************
-
-Note [Zap unfolding when beta-reducing]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Lambda-bound variables can have stable unfoldings, such as
-   $j = \x. \b{Unf=Just x}. e
-See Note [Case binders and join points] below; the unfolding for lets
-us optimise e better.  However when we beta-reduce it we want to
-revert to using the actual value, otherwise we can end up in the
-stupid situation of
-          let x = blah in
-          let b{Unf=Just x} = y
-          in ...b...
-Here it'd be far better to drop the unfolding and use the actual RHS.
 -}
 
 simplLam :: SimplEnv -> [InId] -> InExpr -> SimplCont
-         -> SimplM (SimplEnv, OutExpr)
+         -> SimplM (SimplFloats, OutExpr)
 
-simplLam env [] body cont = simplExprF env body cont
-
-        -- Beta reduction
+simplLam env [] body cont
+  = simplExprF env body cont
 
 simplLam env (bndr:bndrs) body (ApplyToTy { sc_arg_ty = arg_ty, sc_cont = cont })
   = do { tick (BetaReduction bndr)
@@ -1511,8 +1335,9 @@ simplLam env (bndr:bndrs) body (ApplyToVal { sc_arg = arg, sc_env = arg_se
   | isSimplified dup  -- Don't re-simplify if we've simplified it once
                       -- See Note [Avoiding exponential behaviour]
   = do  { tick (BetaReduction bndr)
-        ; env' <- simplNonRecX env zapped_bndr arg
-        ; simplLam env' bndrs body cont }
+        ; (floats1, env') <- simplNonRecX env zapped_bndr arg
+        ; (floats2, expr') <- simplLam env' bndrs body cont
+        ; return (floats1 `addFloats` floats2, expr') }
 
   | otherwise
   = do  { tick (BetaReduction bndr)
@@ -1522,7 +1347,7 @@ simplLam env (bndr:bndrs) body (ApplyToVal { sc_arg = arg, sc_env = arg_se
       | isId bndr = zapStableUnfolding bndr
       | otherwise = bndr
 
-      -- discard a non-counting tick on a lambda.  This may change the
+      -- Discard a non-counting tick on a lambda.  This may change the
       -- cost attribution slightly (moving the allocation of the
       -- lambda elsewhere), but we don't care: optimisation changes
       -- cost attribution all the time.
@@ -1537,9 +1362,6 @@ simplLam env bndrs body cont
         ; new_lam <- mkLam env bndrs' body' cont
         ; rebuild env' new_lam cont }
 
-simplLamBndrs :: SimplEnv -> [InBndr] -> SimplM (SimplEnv, [OutBndr])
-simplLamBndrs env bndrs = mapAccumLM simplLamBndr env bndrs
-
 -------------
 simplLamBndr :: SimplEnv -> InBndr -> SimplM (SimplEnv, OutBndr)
 -- Used for lambda binders.  These sometimes have unfoldings added by
@@ -1551,7 +1373,8 @@ simplLamBndr :: SimplEnv -> InBndr -> SimplM (SimplEnv, OutBndr)
 simplLamBndr env bndr
   | isId bndr && isFragileUnfolding old_unf   -- Special case
   = do { (env1, bndr1) <- simplBinder env bndr
-       ; unf'          <- simplUnfolding env1 NotTopLevel Nothing bndr old_unf
+       ; unf'          <- simplStableUnfolding env1 NotTopLevel Nothing bndr
+                                               old_unf (idType bndr1)
        ; let bndr2 = bndr1 `setIdUnfolding` unf'
        ; return (modifyInScope env1 bndr2, bndr2) }
 
@@ -1560,18 +1383,21 @@ simplLamBndr env bndr
   where
     old_unf = idUnfolding bndr
 
+simplLamBndrs :: SimplEnv -> [InBndr] -> SimplM (SimplEnv, [OutBndr])
+simplLamBndrs env bndrs = mapAccumLM simplLamBndr env bndrs
+
 ------------------
 simplNonRecE :: SimplEnv
              -> InId                    -- The binder, always an Id
-                                        -- Can be a join point
+                                        -- Never a join point
              -> (InExpr, SimplEnv)      -- Rhs of binding (or arg of lambda)
              -> ([InBndr], InExpr)      -- Body of the let/lambda
                                         --      \xs.e
              -> SimplCont
-             -> SimplM (SimplEnv, OutExpr)
+             -> SimplM (SimplFloats, OutExpr)
 
 -- simplNonRecE is used for
---  * non-top-level non-recursive lets in expressions
+--  * non-top-level non-recursive non-join-point lets in expressions
 --  * beta reduction
 --
 -- simplNonRec env b (rhs, rhs_se) (bs, body) k
@@ -1590,74 +1416,276 @@ simplNonRecE :: SimplEnv
 --       the call to simplLam in simplExprF (Lam ...)
 
 simplNonRecE env bndr (rhs, rhs_se) (bndrs, body) cont
-  = ASSERT( isId bndr )
-    do dflags <- getDynFlags
-       case () of
-         _ | preInlineUnconditionally dflags env NotTopLevel bndr rhs
-           -> do { tick (PreInlineUnconditionally bndr)
-                 ; -- pprTrace "preInlineUncond" (ppr bndr <+> ppr rhs) $
-                  simplLam (extendIdSubst env bndr (mkContEx rhs_se rhs)) bndrs body cont }
-
-           -- Deal with join points
-           | Just (bndr', rhs') <- joinPointBinding_maybe bndr rhs
-           -> ASSERT( null bndrs )  -- Must be a let-binding;
-                                    -- join points are never lambda-bound
-              do { (env1, cont') <- prepareJoinCont env cont
-
-                   -- We push cont_dup into the join RHS and the body;
-                   -- and wrap cont_nodup around the whole thing
-                 ; let res_ty = contResultType cont'
-                 ; (env2, bndr1) <- simplNonRecJoinBndr env1 res_ty bndr'
-                 ; (env3, bndr2) <- addBndrRules env2 bndr' bndr1
-                 ; env4 <- simplJoinBind env3 NonRecursive cont'
-                                         bndr' bndr2 rhs' rhs_se
-                 ; simplExprF env4 body cont' }
-
-           -- Deal with strict bindings
-           | isStrictId bndr          -- Includes coercions
-           , sm_case_case (getMode env)
-           -> simplExprF (rhs_se `setFloats` env) rhs
-                         (StrictBind { sc_bndr = bndr, sc_bndrs = bndrs, sc_body = body
-                                     , sc_env = env, sc_cont = cont, sc_dup = NoDup })
-
-           -- Deal with lazy bindings
-           | otherwise
-           -> ASSERT( not (isTyVar bndr) )
-              do { (env1, bndr1) <- simplNonRecBndr env bndr
-                 ; (env2, bndr2) <- addBndrRules env1 bndr bndr1
-                 ; env3 <- simplLazyBind env2 NotTopLevel NonRecursive bndr bndr2 rhs rhs_se
-                 ; simplLam env3 bndrs body cont }
+  | ASSERT( isId bndr && not (isJoinId bndr) ) True
+  , Just env' <- preInlineUnconditionally env NotTopLevel bndr rhs rhs_se
+  = do { tick (PreInlineUnconditionally bndr)
+       ; -- pprTrace "preInlineUncond" (ppr bndr <+> ppr rhs) $
+         simplLam env' bndrs body cont }
+
+  -- Deal with strict bindings
+  | isStrictId bndr          -- Includes coercions
+  , sm_case_case (getMode env)
+  = simplExprF (rhs_se `setInScopeFromE` env) rhs
+               (StrictBind { sc_bndr = bndr, sc_bndrs = bndrs, sc_body = body
+                           , sc_env = env, sc_cont = cont, sc_dup = NoDup })
+
+  -- Deal with lazy bindings
+  | otherwise
+  = ASSERT( not (isTyVar bndr) )
+    do { (env1, bndr1) <- simplNonRecBndr env bndr
+       ; (env2, bndr2) <- addBndrRules env1 bndr bndr1 Nothing
+       ; (floats1, env3) <- simplLazyBind env2 NotTopLevel NonRecursive bndr bndr2 rhs rhs_se
+       ; (floats2, expr') <- simplLam env3 bndrs body cont
+       ; return (floats1 `addFloats` floats2, expr') }
 
 ------------------
 simplRecE :: SimplEnv
           -> [(InId, InExpr)]
           -> InExpr
           -> SimplCont
-          -> SimplM (SimplEnv, OutExpr)
+          -> SimplM (SimplFloats, OutExpr)
 
 -- simplRecE is used for
 --  * non-top-level recursive lets in expressions
 simplRecE env pairs body cont
-  | Just pairs' <- joinPointBindings_maybe pairs
-  = do  { (env1, cont') <- prepareJoinCont env cont
-        ; let bndrs' = map fst pairs'
-              res_ty = contResultType cont
-        ; env2 <- simplRecJoinBndrs env1 res_ty bndrs'
-                -- NB: bndrs' don't have unfoldings or rules
-                -- We add them as we go down
-        ; env3 <- simplRecBind env2 NotTopLevel (Just cont') pairs'
-        ; simplExprF env3 body cont' }
-
-  | otherwise
   = do  { let bndrs = map fst pairs
         ; MASSERT(all (not . isJoinId) bndrs)
         ; env1 <- simplRecBndrs env bndrs
                 -- NB: bndrs' don't have unfoldings or rules
                 -- We add them as we go down
-        ; env2 <- simplRecBind env1 NotTopLevel Nothing pairs
-        ; simplExprF env2 body cont }
+        ; (floats1, env2) <- simplRecBind env1 NotTopLevel Nothing pairs
+        ; (floats2, expr') <- simplExprF env2 body cont
+        ; return (floats1 `addFloats` floats2, expr') }
+
+{- Note [Avoiding exponential behaviour]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+One way in which we can get exponential behaviour is if we simplify a
+big expression, and the re-simplify it -- and then this happens in a
+deeply-nested way.  So we must be jolly careful about re-simplifying
+an expression.  That is why completeNonRecX does not try
+preInlineUnconditionally.
+
+Example:
+  f BIG, where f has a RULE
+Then
+ * We simplify BIG before trying the rule; but the rule does not fire
+ * We inline f = \x. x True
+ * So if we did preInlineUnconditionally we'd re-simplify (BIG True)
+
+However, if BIG has /not/ already been simplified, we'd /like/ to
+simplify BIG True; maybe good things happen.  That is why
+
+* simplLam has
+    - a case for (isSimplified dup), which goes via simplNonRecX, and
+    - a case for the un-simplified case, which goes via simplNonRecE
+
+* We go to some efforts to avoid unnecessarily simplifying ApplyToVal,
+  in at least two places
+    - In simplCast/addCoerce, where we check for isReflCo
+    - In rebuildCall we avoid simplifying arguments before we have to
+      (see Note [Trying rewrite rules])
+
+
+Note [Zap unfolding when beta-reducing]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Lambda-bound variables can have stable unfoldings, such as
+   $j = \x. \b{Unf=Just x}. e
+See Note [Case binders and join points] below; the unfolding for lets
+us optimise e better.  However when we beta-reduce it we want to
+revert to using the actual value, otherwise we can end up in the
+stupid situation of
+          let x = blah in
+          let b{Unf=Just x} = y
+          in ...b...
+Here it'd be far better to drop the unfolding and use the actual RHS.
+
+************************************************************************
+*                                                                      *
+                     Join points
+*                                                                      *
+********************************************************************* -}
+
+{- Note [Rules and unfolding for join points]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Suppose we have
+
+   simplExpr (join j x = rhs                         ) cont
+             (      {- RULE j (p:ps) = blah -}       )
+             (      {- StableUnfolding j = blah -}   )
+             (in blah                                )
+
+Then we will push 'cont' into the rhs of 'j'.  But we should *also* push
+'cont' into the RHS of
+  * Any RULEs for j, e.g. generated by SpecConstr
+  * Any stable unfolding for j, e.g. the result of an INLINE pragma
+
+Simplifying rules and stable-unfoldings happens a bit after
+simplifying the right-hand side, so we remember whether or not it
+is a join point, and what 'cont' is, in a value of type MaybeJoinCont
+
+Trac #13900 wsa caused by forgetting to push 'cont' into the RHS
+of a SpecConstr-generated RULE for a join point.
+-}
+
+type MaybeJoinCont = Maybe SimplCont
+  -- Nothing => Not a join point
+  -- Just k  => This is a join binding with continuation k
+  -- See Note [Rules and unfolding for join points]
+
+simplNonRecJoinPoint :: SimplEnv -> InId -> InExpr
+                     -> InExpr -> SimplCont
+                     -> SimplM (SimplFloats, OutExpr)
+simplNonRecJoinPoint env bndr rhs body cont
+  | ASSERT( isJoinId bndr ) True
+  , Just env' <- preInlineUnconditionally env NotTopLevel bndr rhs env
+  = do { tick (PreInlineUnconditionally bndr)
+       ; simplExprF env' body cont }
+
+   | otherwise
+   = wrapJoinCont env cont $ \ env cont ->
+     do { -- We push join_cont into the join RHS and the body;
+          -- and wrap wrap_cont around the whole thing
+        ; let res_ty = contResultType cont
+        ; (env1, bndr1)    <- simplNonRecJoinBndr env res_ty bndr
+        ; (env2, bndr2)    <- addBndrRules env1 bndr bndr1 (Just cont)
+        ; (floats1, env3)  <- simplJoinBind env2 cont bndr bndr2 rhs env
+        ; (floats2, body') <- simplExprF env3 body cont
+        ; return (floats1 `addFloats` floats2, body') }
+
+
+------------------
+simplRecJoinPoint :: SimplEnv -> [(InId, InExpr)]
+                  -> InExpr -> SimplCont
+                  -> SimplM (SimplFloats, OutExpr)
+simplRecJoinPoint env pairs body cont
+  = wrapJoinCont env cont $ \ env cont ->
+    do { let bndrs = map fst pairs
+             res_ty = contResultType cont
+       ; env1 <- simplRecJoinBndrs env res_ty bndrs
+               -- NB: bndrs' don't have unfoldings or rules
+               -- We add them as we go down
+       ; (floats1, env2)  <- simplRecBind env1 NotTopLevel (Just cont) pairs
+       ; (floats2, body') <- simplExprF env2 body cont
+       ; return (floats1 `addFloats` floats2, body') }
+
+--------------------
+wrapJoinCont :: SimplEnv -> SimplCont
+             -> (SimplEnv -> SimplCont -> SimplM (SimplFloats, OutExpr))
+             -> SimplM (SimplFloats, OutExpr)
+-- Deal with making the continuation duplicable if necessary,
+-- and with the no-case-of-case situation.
+wrapJoinCont env cont thing_inside
+  | contIsStop cont        -- Common case; no need for fancy footwork
+  = thing_inside env cont
+
+  | not (sm_case_case (getMode env))
+    -- See Note [Join points wih -fno-case-of-case]
+  = do { (floats1, expr1) <- thing_inside env (mkBoringStop (contHoleType cont))
+       ; let (floats2, expr2) = wrapJoinFloatsX floats1 expr1
+       ; (floats3, expr3) <- rebuild (env `setInScopeFromF` floats2) expr2 cont
+       ; return (floats2 `addFloats` floats3, expr3) }
+
+  | otherwise
+    -- Normal case; see Note [Join points and case-of-case]
+  = do { (floats1, cont')  <- mkDupableCont env cont
+       ; (floats2, result) <- thing_inside (env `setInScopeFromF` floats1) cont'
+       ; return (floats1 `addFloats` floats2, result) }
+
+
+--------------------
+trimJoinCont :: Id -> Maybe JoinArity -> SimplCont -> SimplCont
+-- Drop outer context from join point invocation (jump)
+-- See Note [Join points and case-of-case]
+
+trimJoinCont _ Nothing cont
+  = cont -- Not a jump
+trimJoinCont var (Just arity) cont
+  = trim arity cont
+  where
+    trim 0 cont@(Stop {})
+      = cont
+    trim 0 cont
+      = mkBoringStop (contResultType cont)
+    trim n cont@(ApplyToVal { sc_cont = k })
+      = cont { sc_cont = trim (n-1) k }
+    trim n cont@(ApplyToTy { sc_cont = k })
+      = cont { sc_cont = trim (n-1) k } -- join arity counts types!
+    trim _ cont
+      = pprPanic "completeCall" $ ppr var $$ ppr cont
+
+
+{- Note [Join points and case-of-case]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When we perform the case-of-case transform (or otherwise push continuations
+inward), we want to treat join points specially. Since they're always
+tail-called and we want to maintain this invariant, we can do this (for any
+evaluation context E):
+
+  E[join j = e
+    in case ... of
+         A -> jump j 1
+         B -> jump j 2
+         C -> f 3]
+
+    -->
+
+  join j = E[e]
+  in case ... of
+       A -> jump j 1
+       B -> jump j 2
+       C -> E[f 3]
+
+As is evident from the example, there are two components to this behavior:
+
+  1. When entering the RHS of a join point, copy the context inside.
+  2. When a join point is invoked, discard the outer context.
+
+We need to be very careful here to remain consistent---neither part is
+optional!
+
+We need do make the continuation E duplicable (since we are duplicating it)
+with mkDuableCont.
+
+
+Note [Join points wih -fno-case-of-case]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Supose case-of-case is switched off, and we are simplifying
+
+    case (join j x = <j-rhs> in
+          case y of
+             A -> j 1
+             B -> j 2
+             C -> e) of <outer-alts>
+
+Usually, we'd push the outer continuation (case . of <outer-alts>) into
+both the RHS and the body of the join point j.  But since we aren't doing
+case-of-case we may then end up with this totally bogus result
+
+    join x = case <j-rhs> of <outer-alts> in
+    case (case y of
+             A -> j 1
+             B -> j 2
+             C -> e) of <outer-alts>
+
+This would be OK in the language of the paper, but not in GHC: j is no longer
+a join point.  We can only do the "push contination into the RHS of the
+join point j" if we also push the contination right down to the /jumps/ to
+j, so that it can evaporate there.  If we are doing case-of-case, we'll get to
+
+    join x = case <j-rhs> of <outer-alts> in
+    case y of
+      A -> j 1
+      B -> j 2
+      C -> case e of <outer-alts>
+
+which is great.
+
+Bottom line: if case-of-case is off, we must stop pushing the continuation
+inwards altogether at any join point.  Instead simplify the (join ... in ...)
+with a Stop continuation, and wrap the original continuation around the
+outside.  Surprisingly tricky!
+
 
-{-
 ************************************************************************
 *                                                                      *
                      Variables
@@ -1676,67 +1704,53 @@ simplVar env var
         DoneId var1          -> return (Var var1)
         DoneEx e _           -> return e
 
-simplIdF :: SimplEnv -> InId -> SimplCont -> SimplM (SimplEnv, OutExpr)
+simplIdF :: SimplEnv -> InId -> SimplCont -> SimplM (SimplFloats, OutExpr)
 simplIdF env var cont
   = case substId env var of
-        ContEx tvs cvs ids e -> simplExprF (setSubstEnv env tvs cvs ids) e cont
-                                  -- Don't trim; haven't already simplified e,
-                                  -- so the cont is not embodied in e
-
-        DoneId var1      -> completeCall env var1 (trim_cont (isJoinId_maybe var1))
-        DoneEx e mb_join -> simplExprF (zapSubstEnv env) e (trim_cont mb_join)
-                -- Note [zapSubstEnv]
-                -- The template is already simplified, so don't re-substitute.
-                -- This is VITAL.  Consider
-                --      let x = e in
-                --      let y = \z -> ...x... in
-                --      \ x -> ...y...
-                -- We'll clone the inner \x, adding x->x' in the id_subst
-                -- Then when we inline y, we must *not* replace x by x' in
-                -- the inlined copy!!
-  where
-    trim_cont (Just arity) = trim arity cont
-    trim_cont Nothing      = cont
-
-    -- Drop outer context from join point invocation
-    -- Note [Case-of-case and join points]
-    trim 0 cont@(Stop {})
-      = cont
-    trim 0 cont
-      = mkBoringStop (contResultType cont)
-    trim n cont@(ApplyToVal { sc_cont = k })
-      = cont { sc_cont = trim (n-1) k }
-    trim n cont@(ApplyToTy { sc_cont = k })
-      = cont { sc_cont = trim (n-1) k } -- join arity counts types!
-    trim _ cont
-      = pprPanic "completeCall" $ ppr var $$ ppr cont
+      ContEx tvs cvs ids e -> simplExprF (setSubstEnv env tvs cvs ids) e cont
+                                -- Don't trim; haven't already simplified e,
+                                -- so the cont is not embodied in e
+
+      DoneId var1 -> completeCall env var1 (trimJoinCont var (isJoinId_maybe var1) cont)
+
+      DoneEx e mb_join -> simplExprF (zapSubstEnv env) e (trimJoinCont var mb_join cont)
+              -- Note [zapSubstEnv]
+              -- The template is already simplified, so don't re-substitute.
+              -- This is VITAL.  Consider
+              --      let x = e in
+              --      let y = \z -> ...x... in
+              --      \ x -> ...y...
+              -- We'll clone the inner \x, adding x->x' in the id_subst
+              -- Then when we inline y, we must *not* replace x by x' in
+              -- the inlined copy!!
 
 ---------------------------------------------------------
 --      Dealing with a call site
 
-completeCall :: SimplEnv -> OutId -> SimplCont -> SimplM (SimplEnv, OutExpr)
+completeCall :: SimplEnv -> OutId -> SimplCont -> SimplM (SimplFloats, OutExpr)
 completeCall env var cont
-  = do  {   ------------- Try inlining ----------------
-          dflags <- getDynFlags
-        ; let  (lone_variable, arg_infos, call_cont) = contArgs cont
-               n_val_args = length arg_infos
-               interesting_cont = interestingCallContext call_cont
-               unfolding    = activeUnfolding env var
-               maybe_inline = callSiteInline dflags var unfolding
-                                             lone_variable arg_infos interesting_cont
-        ; case maybe_inline of
-            Just expr      -- There is an inlining!
-              ->  do { checkedTick (UnfoldingDone var)
-                     ; dump_inline dflags expr cont
-                     ; simplExprF (zapSubstEnv env) expr cont }
-
-            ; Nothing -> do { rule_base <- getSimplRules
-                            ; let info = mkArgInfo var (getRules rule_base var)
-                                                   n_val_args call_cont
-                            ; rebuildCall env info cont }
-    }
+  | Just expr <- callSiteInline dflags var active_unf
+                                lone_variable arg_infos interesting_cont
+  -- Inline the variable's RHS
+  = do { checkedTick (UnfoldingDone var)
+       ; dump_inline expr cont
+       ; simplExprF (zapSubstEnv env) expr cont }
+
+  | otherwise
+  -- Don't inline; instead rebuild the call
+  = do { rule_base <- getSimplRules
+       ; let info = mkArgInfo env var (getRules rule_base var)
+                              n_val_args call_cont
+       ; rebuildCall env info cont }
+
   where
-    dump_inline dflags unfolding cont
+    dflags = seDynFlags env
+    (lone_variable, arg_infos, call_cont) = contArgs cont
+    n_val_args       = length arg_infos
+    interesting_cont = interestingCallContext env call_cont
+    active_unf       = activeUnfolding (getMode env) var
+
+    dump_inline unfolding cont
       | not (dopt Opt_D_dump_inlinings dflags) = return ()
       | not (dopt Opt_D_verbose_core2core dflags)
       = when (isExternalName (idName var)) $
@@ -1751,7 +1765,7 @@ completeCall env var cont
 rebuildCall :: SimplEnv
             -> ArgInfo
             -> SimplCont
-            -> SimplM (SimplEnv, OutExpr)
+            -> SimplM (SimplFloats, OutExpr)
 -- We decided not to inline, so
 --    - simplify the arguments
 --    - try rewrite rules
@@ -1773,7 +1787,7 @@ rebuildCall env (ArgInfo { ai_fun = fun, ai_args = rev_args, ai_strs = [] }) con
                                  -- continuation to discard, else we do it
                                  -- again and again!
   = seqType cont_ty `seq`        -- See Note [Avoiding space leaks in OutType]
-    return (env, castBottomExpr res cont_ty)
+    return (emptyFloats env, castBottomExpr res cont_ty)
   where
     res     = argInfoExpr fun rev_args
     cont_ty = contResultType cont
@@ -1812,10 +1826,10 @@ rebuildCall env info@(ArgInfo { ai_encl = encl_rules, ai_type = fun_ty
   | isSimplified dup_flag     -- See Note [Avoid redundant simplification]
   = rebuildCall env (addValArgTo info' arg) cont
 
-  | str                 -- Strict argument
+  | str         -- Strict argument
   , sm_case_case (getMode env)
   = -- pprTrace "Strict Arg" (ppr arg $$ ppr (seIdSubst env) $$ ppr (seInScope env)) $
-    simplExprF (arg_se `setFloats` env) arg
+    simplExprF (arg_se `setInScopeFromE` env) arg
                (StrictArg { sc_fun = info', sc_cci = cci_strict
                           , sc_dup = Simplified, sc_cont = cont })
                 -- Note [Shadowing]
@@ -1825,7 +1839,7 @@ rebuildCall env info@(ArgInfo { ai_encl = encl_rules, ai_type = fun_ty
         -- There is no benefit (unlike in a let-binding), and we'd
         -- have to be very careful about bogus strictness through
         -- floating a demanded let.
-  = do  { arg' <- simplExprC (arg_se `setInScopeAndZapFloats` env) arg
+  = do  { arg' <- simplExprC (arg_se `setInScopeFromE` env) arg
                              (mkLazyArgStop arg_ty cci_lazy)
         ; rebuildCall env (addValArgTo info' arg') cont }
   where
@@ -1936,13 +1950,13 @@ tryRules :: SimplEnv -> [CoreRule]
 tryRules env rules fn args call_cont
   | null rules
   = return Nothing
+
 {- Disabled until we fix #8326
   | fn `hasKey` tagToEnumKey   -- See Note [Optimising tagToEnum#]
   , [_type_arg, val_arg] <- args
   , Select dup bndr ((_,[],rhs1) : rest_alts) se cont <- call_cont
   , isDeadBinder bndr
-  = do { dflags <- getDynFlags
-       ; let enum_to_tag :: CoreAlt -> CoreAlt
+  = do { let enum_to_tag :: CoreAlt -> CoreAlt
                 -- Takes   K -> e  into   tagK# -> e
                 -- where tagK# is the tag of constructor K
              enum_to_tag (DataAlt con, [], rhs)
@@ -1957,35 +1971,39 @@ tryRules env rules fn args call_cont
                  -- The binder is dead, but should have the right type
       ; return (Just (val_arg, Select dup new_bndr new_alts se cont)) }
 -}
-  | otherwise
-  = do { dflags <- getDynFlags
-       ; case lookupRule dflags (getUnfoldingInRuleMatch env) (activeRule env)
-                         fn (argInfoAppArgs args) rules of {
-           Nothing ->
-             do { nodump dflags  -- This ensures that an empty file is written
-                ; return Nothing } ;  -- No rule matches
-           Just (rule, rule_rhs) ->
-             do { checkedTick (RuleFired (ruleName rule))
-                ; let cont' = pushSimplifiedArgs zapped_env
-                                                 (drop (ruleArity rule) args)
-                                                 call_cont
-                              -- (ruleArity rule) says how
-                              -- many args the rule consumed
-
-                      occ_anald_rhs = occurAnalyseExpr rule_rhs
-                          -- See Note [Occurrence-analyse after rule firing]
-                ; dump dflags rule rule_rhs
-                ; return (Just (zapped_env, occ_anald_rhs, cont')) }}}
-                     -- The occ_anald_rhs and cont' are all Out things
-                     -- hence zapping the environment
+
+  | Just (rule, rule_rhs) <- lookupRule dflags (getUnfoldingInRuleMatch env)
+                                        (activeRule (getMode env)) fn
+                                        (argInfoAppArgs args) rules
+  -- Fire a rule for the function
+  = do { checkedTick (RuleFired (ruleName rule))
+       ; let cont' = pushSimplifiedArgs zapped_env
+                                        (drop (ruleArity rule) args)
+                                        call_cont
+                     -- (ruleArity rule) says how
+                     -- many args the rule consumed
+
+             occ_anald_rhs = occurAnalyseExpr rule_rhs
+                 -- See Note [Occurrence-analyse after rule firing]
+       ; dump rule rule_rhs
+       ; return (Just (zapped_env, occ_anald_rhs, cont')) }
+            -- The occ_anald_rhs and cont' are all Out things
+            -- hence zapping the environment
+
+  | otherwise  -- No rule fires
+  = do { nodump  -- This ensures that an empty file is written
+       ; return Nothing }
+
   where
+    dflags     = seDynFlags env
     zapped_env = zapSubstEnv env  -- See Note [zapSubstEnv]
 
-    printRuleModule rule =
-      parens
-        (maybe (text "BUILTIN") (pprModuleName . moduleName) (ruleModule rule))
+    printRuleModule rule
+      = parens (maybe (text "BUILTIN")
+                      (pprModuleName . moduleName)
+                      (ruleModule rule))
 
-    dump dflags rule rule_rhs
+    dump rule rule_rhs
       | dopt Opt_D_dump_rule_rewrites dflags
       = log_rule dflags Opt_D_dump_rule_rewrites "Rule fired" $ vcat
           [ text "Rule:" <+> ftext (ruleName rule)
@@ -2002,7 +2020,7 @@ tryRules env rules fn args call_cont
       | otherwise
       = return ()
 
-    nodump dflags
+    nodump
       | dopt Opt_D_dump_rule_rewrites dflags
       = liftIO $ dumpSDoc dflags alwaysQualify Opt_D_dump_rule_rewrites "" empty
 
@@ -2195,49 +2213,62 @@ to just
 This particular example shows up in default methods for
 comparison operations (e.g. in (>=) for Int.Int32)
 
-Note [Case elimination: lifted case]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-If a case over a lifted type has a single alternative, and is being used
-as a strict 'let' (all isDeadBinder bndrs), we may want to do this
-transformation:
+Note [Case to let transformation]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If a case over a lifted type has a single alternative, and is being
+used as a strict 'let' (all isDeadBinder bndrs), we may want to do
+this transformation:
 
     case e of r       ===>   let r = e in ...r...
       _ -> ...r...
 
-        (a) 'e' is already evaluated (it may so if e is a variable)
-            Specifically we check (exprIsHNF e).  In this case
-            we can just allocate the WHNF directly with a let.
-or
-        (b) 'x' is not used at all and e is ok-for-speculation
-             The ok-for-spec bit checks that we don't lose any
-             exceptions or divergence.
+We treat the unlifted and lifted cases separately:
+
+* Unlifted case: 'e' satisfies exprOkForSpeculation
+  (ok-for-spec is needed to satisfy the let/app invariant).
+  This turns     case a +# b of r -> ...r...
+  into           let r = a +# b in ...r...
+  and thence     .....(a +# b)....
+
+  However, if we have
+      case indexArray# a i of r -> ...r...
+  we might like to do the same, and inline the (indexArray# a i).
+  But indexArray# is not okForSpeculation, so we don't build a let
+  in rebuildCase (lest it get floated *out*), so the inlining doesn't
+  happen either.  Annoying.
+
+* Lifted case: we need to be sure that the expression is already
+  evaluated (exprIsHNF).  If it's not already evaluated
+      - we risk losing exceptions, divergence or
+        user-specified thunk-forcing
+      - even if 'e' is guaranteed to converge, we don't want to
+        create a thunk (call by need) instead of evaluating it
+        right away (call by value)
+
+  However, we can turn the case into a /strict/ let if the 'r' is
+  used strictly in the body.  Then we won't lose divergence; and
+  we won't build a thunk because the let is strict.
+  See also Note [Case-to-let for strictly-used binders]
+
+  NB: absentError satisfies exprIsHNF: see Note [aBSENT_ERROR_ID] in MkCore.
+  We want to turn
+     case (absentError "foo") of r -> ...MkT r...
+  into
+     let r = absentError "foo" in ...MkT r...
+
+
+Note [Case-to-let for strictly-used binders]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If we have this:
+   case <scrut> of r { _ -> ..r.. }
 
-             NB: it'd be *sound* to switch from case to let if the
-             scrutinee was not yet WHNF but was guaranteed to
-             converge; but sticking with case means we won't build a
-             thunk
+where 'r' is used strictly in (..r..), we can safely transform to
+   let r = <scrut> in ...r...
 
-or
-        (c) 'x' is used strictly in the body, and 'e' is a variable
-            Then we can just substitute 'e' for 'x' in the body.
-            See Note [Eliminating redundant seqs]
-
-For (b), the "not used at all" test is important.  Consider
-   case (case a ># b of { True -> (p,q); False -> (q,p) }) of
-     r -> blah
-The scrutinee is ok-for-speculation (it looks inside cases), but we do
-not want to transform to
-   let r = case a ># b of { True -> (p,q); False -> (q,p) }
-   in blah
-because that builds an unnecessary thunk.
-
-Note [Eliminating redundant seqs]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-If we have this:
-   case x of r { _ -> ..r.. }
-where 'r' is used strictly in (..r..), the case is effectively a 'seq'
-on 'x', but since 'r' is used strictly anyway, we can safely transform to
-   (...x...)
+This is a Good Thing, because 'r' might be dead (if the body just
+calls error), or might be used just once (in which case it can be
+inlined); or we might be able to float the let-binding up or down.
+E.g. Trac #15631 has an example.
 
 Note that this can change the error behaviour.  For example, we might
 transform
@@ -2253,7 +2284,24 @@ transformation bit us in practice.
 
 See also Note [Empty case alternatives] in CoreSyn.
 
-Just for reference, the original code (added Jan 13) looked like this:
+Historical notes
+
+There have been various earlier versions of this patch:
+
+* By Sept 18 the code looked like this:
+     || scrut_is_demanded_var scrut
+
+    scrut_is_demanded_var :: CoreExpr -> Bool
+    scrut_is_demanded_var (Cast s _) = scrut_is_demanded_var s
+    scrut_is_demanded_var (Var _)    = isStrictDmd (idDemandInfo case_bndr)
+    scrut_is_demanded_var _          = False
+
+  This only fired if the scrutinee was a /variable/, which seems
+  an unnecessary restriction. So in Trac #15631 I relaxed it to allow
+  arbitrary scrutinees.  Less code, less to explain -- but the change
+  had 0.00% effect on nofib.
+
+* Previously, in Jan 13 the code looked like this:
      || case_bndr_evald_next rhs
 
     case_bndr_evald_next :: CoreExpr -> Bool
@@ -2264,25 +2312,8 @@ Just for reference, the original code (added Jan 13) looked like this:
     case_bndr_evald_next (Case e _ _ _)  = case_bndr_evald_next e
     case_bndr_evald_next _               = False
 
-(This came up when fixing Trac #7542. See also Note [Eta reduction of
-an eval'd function] in CoreUtils.)
-
-
-Note [Case elimination: unlifted case]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Consider
-   case a +# b of r -> ...r...
-Then we do case-elimination (to make a let) followed by inlining,
-to get
-        .....(a +# b)....
-If we have
-   case indexArray# a i of r -> ...r...
-we might like to do the same, and inline the (indexArray# a i).
-But indexArray# is not okForSpeculation, so we don't build a let
-in rebuildCase (lest it get floated *out*), so the inlining doesn't
-happen either.
-
-This really isn't a big deal I think. The let can be
+  This patch was part of fixing Trac #7542. See also
+  Note [Eta reduction of an eval'd function] in CoreUtils.)
 
 
 Further notes about case elimination
@@ -2334,7 +2365,7 @@ rebuildCase, reallyRebuildCase
    -> InId             -- Case binder
    -> [InAlt]          -- Alternatives (inceasing order)
    -> SimplCont
-   -> SimplM (SimplEnv, OutExpr)
+   -> SimplM (SimplFloats, OutExpr)
 
 --------------------------------------------------
 --      1. Eliminate the case if there's a known constructor
@@ -2361,10 +2392,11 @@ rebuildCase env scrut case_bndr alts cont
         }
   where
     simple_rhs bs rhs = ASSERT( null bs )
-                        do { env' <- simplNonRecX env case_bndr scrut
+                        do { (floats1, env') <- simplNonRecX env case_bndr scrut
                                -- scrut is a constructor application,
                                -- hence satisfies let/app invariant
-                           ; simplExprF env' rhs cont }
+                           ; (floats2, expr') <- simplExprF env' rhs cont
+                           ; return (floats1 `addFloats` floats2, expr') }
 
 
 --------------------------------------------------
@@ -2392,14 +2424,13 @@ rebuildCase env scrut case_bndr alts@[(_, bndrs, rhs)] cont
   --      a) it binds only the case-binder
   --      b) unlifted case: the scrutinee is ok-for-speculation
   --           lifted case: the scrutinee is in HNF (or will later be demanded)
+  -- See Note [Case to let transformation]
   | all_dead_bndrs
-  , if is_unlifted
-    then exprOkForSpeculation scrut  -- See Note [Case elimination: unlifted case]
-    else exprIsHNF scrut             -- See Note [Case elimination: lifted case]
-      || scrut_is_demanded_var scrut
+  , doCaseToLet scrut case_bndr
   = do { tick (CaseElim case_bndr)
-       ; env' <- simplNonRecX env case_bndr scrut
-       ; simplExprF env' rhs cont }
+       ; (floats1, env') <- simplNonRecX env case_bndr scrut
+       ; (floats2, expr') <- simplExprF env' rhs cont
+       ; return (floats1 `addFloats` floats2, expr') }
 
   -- 2c. Try the seq rules if
   --     a) it binds only the case binder
@@ -2411,42 +2442,45 @@ rebuildCase env scrut case_bndr alts@[(_, bndrs, rhs)] cont
            Just (env', rule_rhs, cont') -> simplExprF env' rule_rhs cont'
            Nothing                      -> reallyRebuildCase env scrut case_bndr alts cont }
   where
-    is_unlifted        = isUnliftedType (idType case_bndr)
-    all_dead_bndrs     = all isDeadBinder bndrs       -- bndrs are [InId]
-    is_plain_seq       = all_dead_bndrs && isDeadBinder case_bndr -- Evaluation *only* for effect
-
-    scrut_is_demanded_var :: CoreExpr -> Bool
-            -- See Note [Eliminating redundant seqs]
-    scrut_is_demanded_var (Cast s _) = scrut_is_demanded_var s
-    scrut_is_demanded_var (Var _)    = isStrictDmd (idDemandInfo case_bndr)
-    scrut_is_demanded_var _          = False
-
+    all_dead_bndrs = all isDeadBinder bndrs       -- bndrs are [InId]
+    is_plain_seq   = all_dead_bndrs && isDeadBinder case_bndr -- Evaluation *only* for effect
 
 rebuildCase env scrut case_bndr alts cont
   = reallyRebuildCase env scrut case_bndr alts cont
 
+
+doCaseToLet :: OutExpr          -- Scrutinee
+            -> InId             -- Case binder
+            -> Bool
+-- The situation is         case scrut of b { DEFAULT -> body }
+-- Can we transform thus?   let { b = scrut } in body
+doCaseToLet scrut case_bndr
+  | isTyCoVar case_bndr    -- Respect CoreSyn
+  = isTyCoArg scrut        -- Note [CoreSyn type and coercion invariant]
+
+  | isUnliftedType (idType case_bndr)
+  = exprOkForSpeculation scrut
+
+  | otherwise  -- Scrut has a lifted type
+  = exprIsHNF scrut
+    || isStrictDmd (idDemandInfo case_bndr)
+    -- See Note [Case-to-let for strictly-used binders]
+
 --------------------------------------------------
 --      3. Catch-all case
 --------------------------------------------------
 
 reallyRebuildCase env scrut case_bndr alts cont
-  = do  {       -- Prepare the continuation;
-                -- The new subst_env is in place
-          (env, alt_cont, wrap_cont) <- prepareCaseCont env alts cont
-
-        -- Simplify the alternatives
-        ; (scrut', case_bndr', alts') <- simplAlts env scrut case_bndr alts alt_cont
+  | not (sm_case_case (getMode env))
+  = do { case_expr <- simplAlts env scrut case_bndr alts
+                                (mkBoringStop (contHoleType cont))
+       ; rebuild env case_expr cont }
 
-        ; dflags <- getDynFlags
-        ; let alts_ty' = contResultType alt_cont
-        -- See Note [Avoiding space leaks in OutType]
-        ; case_expr <- seqType alts_ty' `seq`
-                       mkCase dflags scrut' case_bndr' alts_ty' alts'
-
-        -- Notice that rebuild gets the in-scope set from env', not alt_env
-        -- (which in any case is only build in simplAlts)
-        -- The case binder *not* scope over the whole returned case-expression
-        ; rebuild env case_expr wrap_cont }
+  | otherwise
+  = do { (floats, cont') <- mkDupableCaseCont env alts cont
+       ; case_expr <- simplAlts (env `setInScopeFromF` floats)
+                                scrut case_bndr alts cont'
+       ; return (floats, case_expr) }
 
 {-
 simplCaseBinder checks whether the scrutinee is a variable, v.  If so,
@@ -2528,18 +2562,16 @@ robust here.  (Otherwise, there's a danger that we'll simply drop the
 -}
 
 simplAlts :: SimplEnv
-          -> OutExpr
-          -> InId                       -- Case binder
-          -> [InAlt]                    -- Non-empty
+          -> OutExpr         -- Scrutinee
+          -> InId            -- Case binder
+          -> [InAlt]         -- Non-empty
           -> SimplCont
-          -> SimplM (OutExpr, OutId, [OutAlt])  -- Includes the continuation
--- Like simplExpr, this just returns the simplified alternatives;
--- it does not return an environment
--- The returned alternatives can be empty, none are possible
-
-simplAlts env scrut case_bndr alts cont'
-  = do  { let env0 = zapFloats env
+          -> SimplM OutExpr  -- Returns the complete simplified case expression
 
+simplAlts env0 scrut case_bndr alts cont'
+  = do  { traceSmpl "simplAlts" (vcat [ ppr case_bndr
+                                      , text "cont':" <+> ppr cont'
+                                      , text "in_scope" <+> ppr (seInScope env0) ])
         ; (env1, case_bndr1) <- simplBinder env0 case_bndr
         ; let case_bndr2 = case_bndr1 `setIdUnfolding` evaldUnfolding
               env2       = modifyInScope env1 case_bndr2
@@ -2555,7 +2587,11 @@ simplAlts env scrut case_bndr alts cont'
 
         ; alts' <- mapM (simplAlt alt_env' (Just scrut') imposs_deflt_cons case_bndr' cont') in_alts
         ; -- pprTrace "simplAlts" (ppr case_bndr $$ ppr alts_ty $$ ppr alts_ty' $$ ppr alts $$ ppr cont') $
-          return (scrut', case_bndr', alts') }
+
+        ; let alts_ty' = contResultType cont'
+        -- See Note [Avoiding space leaks in OutType]
+        ; seqType alts_ty' `seq`
+          mkCase (seDynFlags env0) scrut' case_bndr' alts_ty' alts' }
 
 
 ------------------------------------
@@ -2599,11 +2635,8 @@ simplAlt env scrut' _ case_bndr' cont' (LitAlt lit, bndrs, rhs)
         ; return (LitAlt lit, [], rhs') }
 
 simplAlt env scrut' _ case_bndr' cont' (DataAlt con, vs, rhs)
-  = do  {       -- Deal with the pattern-bound variables
-                -- Mark the ones that are in ! positions in the
-                -- data constructor as certainly-evaluated.
-                -- NB: simplLamBinders preserves this eval info
-        ; let vs_with_evals = add_evals (dataConRepStrictness con)
+  = do  { -- See Note [Adding evaluatedness info to pattern-bound variables]
+          let vs_with_evals = addEvals scrut' con vs
         ; (env', vs') <- simplLamBndrs env vs_with_evals
 
                 -- Bind the case-binder to (con args)
@@ -2614,53 +2647,92 @@ simplAlt env scrut' _ case_bndr' cont' (DataAlt con, vs, rhs)
         ; env'' <- addAltUnfoldings env' scrut' case_bndr' con_app
         ; rhs' <- simplExprC env'' rhs cont'
         ; return (DataAlt con, vs', rhs') }
-  where
-        -- add_evals records the evaluated-ness of the bound variables of
-        -- a case pattern.  This is *important*.  Consider
-        --      data T = T !Int !Int
-        --
-        --      case x of { T a b -> T (a+1) b }
-        --
-        -- We really must record that b is already evaluated so that we don't
-        -- go and re-evaluate it when constructing the result.
-        -- See Note [Data-con worker strictness] in MkId.hs
-    add_evals the_strs
-        = go vs the_strs
+
+{- Note [Adding evaluatedness info to pattern-bound variables]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+addEvals records the evaluated-ness of the bound variables of
+a case pattern.  This is *important*.  Consider
+
+     data T = T !Int !Int
+
+     case x of { T a b -> T (a+1) b }
+
+We really must record that b is already evaluated so that we don't
+go and re-evaluate it when constructing the result.
+See Note [Data-con worker strictness] in MkId.hs
+
+NB: simplLamBinders preserves this eval info
+
+In addition to handling data constructor fields with !s, addEvals
+also records the fact that the result of seq# is always in WHNF.
+See Note [seq# magic] in PrelRules.  Example (Trac #15226):
+
+  case seq# v s of
+    (# s', v' #) -> E
+
+we want the compiler to be aware that v' is in WHNF in E.
+
+Open problem: we don't record that v itself is in WHNF (and we can't
+do it here).  The right thing is to do some kind of binder-swap;
+see Trac #15226 for discussion.
+-}
+
+addEvals :: Maybe OutExpr -> DataCon -> [Id] -> [Id]
+-- See Note [Adding evaluatedness info to pattern-bound variables]
+addEvals scrut con vs
+  -- Deal with seq# applications
+  | Just scr <- scrut
+  , isUnboxedTupleCon con
+  , [s,x] <- vs
+    -- Use stripNArgs rather than collectArgsTicks to avoid building
+    -- a list of arguments only to throw it away immediately.
+  , Just (Var f) <- stripNArgs 4 scr
+  , Just SeqOp <- isPrimOpId_maybe f
+  , let x' = zapIdOccInfoAndSetEvald MarkedStrict x
+  = [s, x']
+
+  -- Deal with banged datacon fields
+addEvals _scrut con vs = go vs the_strs
+    where
+      the_strs = dataConRepStrictness con
+
+      go [] [] = []
+      go (v:vs') strs | isTyVar v = v : go vs' strs
+      go (v:vs') (str:strs) = zapIdOccInfoAndSetEvald str v : go vs' strs
+      go _ _ = pprPanic "Simplify.addEvals"
+                (ppr con $$
+                 ppr vs  $$
+                 ppr_with_length (map strdisp the_strs) $$
+                 ppr_with_length (dataConRepArgTys con) $$
+                 ppr_with_length (dataConRepStrictness con))
         where
-          go [] [] = []
-          go (v:vs') strs | isTyVar v = v : go vs' strs
-          go (v:vs') (str:strs) = zap str v : go vs' strs
-          go _ _ = pprPanic "cat_evals"
-                    (ppr con $$
-                     ppr vs  $$
-                     ppr_with_length the_strs $$
-                     ppr_with_length (dataConRepArgTys con) $$
-                     ppr_with_length (dataConRepStrictness con))
-            where
-              ppr_with_length list
-                = ppr list <+> parens (text "length =" <+> ppr (length list))
-                                    -- NB: If this panic triggers, note that
-                                    -- NoStrictnessMark doesn't print!
-
-          zap str v = setCaseBndrEvald str $ -- Add eval'dness info
-                      zapIdOccInfo v         -- And kill occ info;
-                                             -- see Note [Case alternative occ info]
+          ppr_with_length list
+            = ppr list <+> parens (text "length =" <+> ppr (length list))
+          strdisp MarkedStrict = "MarkedStrict"
+          strdisp NotMarkedStrict = "NotMarkedStrict"
+
+zapIdOccInfoAndSetEvald :: StrictnessMark -> Id -> Id
+zapIdOccInfoAndSetEvald str v =
+  setCaseBndrEvald str $ -- Add eval'dness info
+  zapIdOccInfo v         -- And kill occ info;
+                         -- see Note [Case alternative occ info]
 
 addAltUnfoldings :: SimplEnv -> Maybe OutExpr -> OutId -> OutExpr -> SimplM SimplEnv
 addAltUnfoldings env scrut case_bndr con_app
-  = do { dflags <- getDynFlags
-       ; let con_app_unf = mkSimpleUnfolding dflags con_app
+  = do { let con_app_unf = mk_simple_unf con_app
              env1 = addBinderUnfolding env case_bndr con_app_unf
 
              -- See Note [Add unfolding for scrutinee]
              env2 = case scrut of
                       Just (Var v)           -> addBinderUnfolding env1 v con_app_unf
                       Just (Cast (Var v) co) -> addBinderUnfolding env1 v $
-                                                mkSimpleUnfolding dflags (Cast con_app (mkSymCo co))
+                                                mk_simple_unf (Cast con_app (mkSymCo co))
                       _                      -> env1
 
        ; traceSmpl "addAltUnf" (vcat [ppr case_bndr <+> ppr scrut, ppr con_app])
        ; return env2 }
+  where
+    mk_simple_unf = mkSimpleUnfolding (seDynFlags env)
 
 addBinderUnfolding :: SimplEnv -> Id -> Unfolding -> SimplEnv
 addBinderUnfolding env bndr unf
@@ -2700,7 +2772,7 @@ Note [Add unfolding for scrutinee]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 In general it's unlikely that a variable scrutinee will appear
 in the case alternatives   case x of { ...x unlikely to appear... }
-because the binder-swap in OccAnal has got rid of all such occcurrences
+because the binder-swap in OccAnal has got rid of all such occurrences
 See Note [Binder swap] in OccAnal.
 
 BUT it is still VERY IMPORTANT to add a suitable unfolding for a
@@ -2756,17 +2828,18 @@ knownCon :: SimplEnv
          -> DataCon -> [OutType] -> [OutExpr]   -- The scrutinee (in pieces)
          -> InId -> [InBndr] -> InExpr          -- The alternative
          -> SimplCont
-         -> SimplM (SimplEnv, OutExpr)
+         -> SimplM (SimplFloats, OutExpr)
 
 knownCon env scrut dc dc_ty_args dc_args bndr bs rhs cont
-  = do  { env'  <- bind_args env bs dc_args
-        ; env'' <- bind_case_bndr env'
-        ; simplExprF env'' rhs cont }
+  = do  { (floats1, env1)  <- bind_args env bs dc_args
+        ; (floats2, env2) <- bind_case_bndr env1
+        ; (floats3, expr') <- simplExprF env2 rhs cont
+        ; return (floats1 `addFloats` floats2 `addFloats` floats3, expr') }
   where
     zap_occ = zapBndrOccInfo (isDeadBinder bndr)    -- bndr is an InId
 
                   -- Ugh!
-    bind_args env' [] _  = return env'
+    bind_args env' [] _  = return (emptyFloats env', env')
 
     bind_args env' (b:bs') (Type ty : args)
       = ASSERT( isTyVar b )
@@ -2784,8 +2857,9 @@ knownCon env scrut dc dc_ty_args dc_args bndr bs rhs cont
              -- it via postInlineUnconditionally.
              -- Nevertheless we must keep it if the case-binder is alive,
              -- because it may be used in the con_app.  See Note [knownCon occ info]
-           ; env'' <- simplNonRecX env' b' arg  -- arg satisfies let/app invariant
-           ; bind_args env'' bs' args }
+           ; (floats1, env2) <- simplNonRecX env' b' arg  -- arg satisfies let/app invariant
+           ; (floats2, env3)  <- bind_args env2 bs' args
+           ; return (floats1 `addFloats` floats2, env3) }
 
     bind_args _ _ _ =
       pprPanic "bind_args" $ ppr dc $$ ppr bs $$ ppr dc_args $$
@@ -2799,8 +2873,9 @@ knownCon env scrut dc dc_ty_args dc_args bndr bs rhs cont
        -- about duplicating the arg redexes; in that case, make
        -- a new con-app from the args
     bind_case_bndr env
-      | isDeadBinder bndr   = return env
-      | exprIsTrivial scrut = return (extendIdSubst env bndr (DoneEx scrut Nothing))
+      | isDeadBinder bndr   = return (emptyFloats env, env)
+      | exprIsTrivial scrut = return (emptyFloats env
+                                     , extendIdSubst env bndr (DoneEx scrut Nothing))
       | otherwise           = do { dc_args <- mapM (simplVar env) bs
                                          -- dc_ty_args are aready OutTypes,
                                          -- but bs are InBndrs
@@ -2810,7 +2885,8 @@ knownCon env scrut dc dc_ty_args dc_args bndr bs rhs cont
                                  ; simplNonRecX env bndr con_app }
 
 -------------------
-missingAlt :: SimplEnv -> Id -> [InAlt] -> SimplCont -> SimplM (SimplEnv, OutExpr)
+missingAlt :: SimplEnv -> Id -> [InAlt] -> SimplCont
+           -> SimplM (SimplFloats, OutExpr)
                 -- This isn't strictly an error, although it is unusual.
                 -- It's possible that the simplifier might "see" that
                 -- an inner case has no accessible alternatives before
@@ -2820,7 +2896,8 @@ missingAlt env case_bndr _ cont
   = WARN( True, text "missingAlt" <+> ppr case_bndr )
     -- See Note [Avoiding space leaks in OutType]
     let cont_ty = contResultType cont
-    in seqType cont_ty `seq` return (env, mkImpossibleExpr cont_ty)
+    in seqType cont_ty `seq`
+       return (emptyFloats env, mkImpossibleExpr cont_ty)
 
 {-
 ************************************************************************
@@ -2840,7 +2917,7 @@ and will split it into
    join floats:  $j1 = e1, $j2 = e2
    non_dupable:  let x* = [] in b; stop
 
-Putting this back togeher would give
+Putting this back together would give
    let x* = let { $j1 = e1; $j2 = e2 } in
             case e of { True -> $j1; False -> $j2 }
    in b
@@ -2850,57 +2927,23 @@ inner expression, and not around the whole thing.
 
 In contrast, any let-bindings introduced by mkDupableCont can wrap
 around the entire thing.
--}
-
 
-prepareCaseCont :: SimplEnv -> [InAlt] -> SimplCont
-                -> SimplM (SimplEnv,
-                           SimplCont,  -- For the alternatives
-                           SimplCont)  -- Wraps the entire case
--- We are considering
---     K[ case _ of { p1 -> r1; ...; pn -> rn } ]
--- where K is some enclosing continuation for the case
--- Goal: split K into two pieces Kdup,Knodup so that
---       a) Kdup can be duplicated
---       b) Knodup[Kdup[e]] = K[e]
--- The idea is that we'll transform thus:
---          Knodup[ (case _ of { p1 -> Kdup[r1]; ...; pn -> Kdup[rn] }
---
--- We may also return some extra value bindings in SimplEnv (that scope over
--- the entire continuation) as well as some join points (thus must *not* float
--- past the continuation!).
--- Hence, the full story is this:
---     K[case _ of { p1 -> r1; ...; pn -> rn }] ==>
---     F_v[Knodup[F_j[ (case _ of { p1 -> Kdup[r1]; ...; pn -> Kdup[rn] }) ]]]
--- Here F_v represents some values that got floated out and F_j represents some
--- join points that got floated out.
---
--- When case-of-case is off, just make the entire continuation non-dupable
-
-prepareCaseCont env alts cont
-  | not (altsWouldDup alts)
-  = return (env, cont, mkBoringStop (contResultType cont))
-  | otherwise
-  = do { (env', cont') <- mkDupableCont env cont
-       ; return (env', cont', mkBoringStop (contResultType cont)) }
-
-prepareJoinCont :: SimplEnv -> SimplCont -> SimplM (SimplEnv, SimplCont)
-
--- Similar to prepareCaseCont, only for
---     K[let { j1 = r1; ...; jn -> rn } in _]
--- If the js are join points, this will turn into
---     Knodup[join { j1 = Kdup[r1]; ...; jn = Kdup[rn] } in Kdup[_]].
---
--- When case-of-case is off and it's a join binding, just make the entire
--- continuation non-dupable. This is necessary because otherwise
---     case (join j = ... in case e of { A -> jump j 1; ... }) of { B -> ... }
--- becomes
---     join j = case ... of { B -> ... } in
---     case (case e of { A -> jump j 1; ... }) of { B -> ... },
--- and the reference to j is invalid.
+Note [Bottom alternatives]
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+When we have
+     case (case x of { A -> error .. ; B -> e; C -> error ..)
+       of alts
+then we can just duplicate those alts because the A and C cases
+will disappear immediately.  This is more direct than creating
+join points and inlining them away.  See Trac #4930.
+-}
 
-prepareJoinCont env cont
-  = mkDupableCont env cont
+--------------------
+mkDupableCaseCont :: SimplEnv -> [InAlt] -> SimplCont
+                  -> SimplM (SimplFloats, SimplCont)
+mkDupableCaseCont env alts cont
+  | altsWouldDup alts = mkDupableCont env cont
+  | otherwise         = return (emptyFloats env, cont)
 
 altsWouldDup :: [InAlt] -> Bool -- True iff strictly > 1 non-bottom alternative
 altsWouldDup []  = False        -- See Note [Bottom alternatives]
@@ -2911,115 +2954,109 @@ altsWouldDup (alt:alts)
   where
     is_bot_alt (_,_,rhs) = exprIsBottom rhs
 
-{-
-Note [Bottom alternatives]
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-When we have
-     case (case x of { A -> error .. ; B -> e; C -> error ..)
-       of alts
-then we can just duplicate those alts because the A and C cases
-will disappear immediately.  This is more direct than creating
-join points and inlining them away.  See Trac #4930.
--}
-
 -------------------------
 mkDupableCont :: SimplEnv -> SimplCont
-              -> SimplM ( SimplEnv    -- Incoming SimplEnv augmented with
-                                      --   extra let/join-floats and in-scope variables
-                        , SimplCont)  -- dup_cont: duplicable continuation
-mkDupableCont env cont
-  = mk_dupable_cont env cont
+              -> SimplM ( SimplFloats  -- Incoming SimplEnv augmented with
+                                       --   extra let/join-floats and in-scope variables
+                        , SimplCont)   -- dup_cont: duplicable continuation
 
--------------------------
-mk_dupable_cont :: SimplEnv -> SimplCont
-                -> SimplM (SimplEnv, SimplCont)
-mk_dupable_cont env cont
+mkDupableCont env cont
   | contIsDupable cont
-  = return (env, cont)
+  = return (emptyFloats env, cont)
 
-mk_dupable_cont _ (Stop {}) = panic "mkDupableCont"     -- Handled by previous eqn
+mkDupableCont _ (Stop {}) = panic "mkDupableCont"     -- Handled by previous eqn
 
-mk_dupable_cont env (CastIt ty cont)
-  = do  { (env', cont') <- mk_dupable_cont env cont
-        ; return (env', CastIt ty cont') }
+mkDupableCont env (CastIt ty cont)
+  = do  { (floats, cont') <- mkDupableCont env cont
+        ; return (floats, CastIt ty cont') }
 
 -- Duplicating ticks for now, not sure if this is good or not
-mk_dupable_cont env (TickIt t cont)
-  = do  { (env', cont') <- mk_dupable_cont env cont
-        ; return (env', TickIt t cont') }
+mkDupableCont env (TickIt t cont)
+  = do  { (floats, cont') <- mkDupableCont env cont
+        ; return (floats, TickIt t cont') }
 
-mk_dupable_cont env (StrictBind { sc_bndr = bndr, sc_bndrs = bndrs
-                                , sc_body = body, sc_env = se, sc_cont = cont})
+mkDupableCont env (StrictBind { sc_bndr = bndr, sc_bndrs = bndrs
+                              , sc_body = body, sc_env = se, sc_cont = cont})
   -- See Note [Duplicating StrictBind]
-  = do { let sb_env = se `setInScopeAndZapFloats` env
+  = do { let sb_env = se `setInScopeFromE` env
        ; (sb_env1, bndr') <- simplBinder sb_env bndr
-       ; (sb_env', join_inner) <- simplLam sb_env1 bndrs body cont
-          -- No need to use mk_dupable_cont before simplLam; we
+       ; (floats1, join_inner) <- simplLam sb_env1 bndrs body cont
+          -- No need to use mkDupableCont before simplLam; we
           -- use cont once here, and then share the result if necessary
-       ; let join_body = wrapFloats sb_env' join_inner
+
+       ; let join_body = wrapFloats floats1 join_inner
              res_ty    = contResultType cont
-       ; dflags <- getDynFlags
-       ; (env2, body2)
-            <- if exprIsDupable dflags join_body
-               then return (env, join_body)
+
+       ; (floats2, body2)
+            <- if exprIsDupable (seDynFlags env) join_body
+               then return (emptyFloats env, join_body)
                else do { join_bndr <- newJoinId [bndr'] res_ty
                        ; let join_call = App (Var join_bndr) (Var bndr')
                              join_rhs  = Lam (setOneShotLambda bndr') join_body
-                       ; return (addNonRec env join_bndr join_rhs, join_call) }
-       ; return ( env2
+                             join_bind = NonRec join_bndr join_rhs
+                             floats    = emptyFloats env `extendFloats` join_bind
+                       ; return (floats, join_call) }
+       ; return ( floats2
                 , StrictBind { sc_bndr = bndr', sc_bndrs = []
                              , sc_body = body2
-                             , sc_env  = zapSubstEnv se
+                             , sc_env  = zapSubstEnv se `setInScopeFromF` floats2
+                                         -- See Note [StaticEnv invariant] in SimplUtils
                              , sc_dup  = OkToDup
                              , sc_cont = mkBoringStop res_ty } ) }
 
-mk_dupable_cont env (StrictArg { sc_fun = info, sc_cci = cci, sc_cont = cont })
+mkDupableCont env (StrictArg { sc_fun = info, sc_cci = cci, sc_cont = cont })
         -- See Note [Duplicating StrictArg]
         -- NB: sc_dup /= OkToDup; that is caught earlier by contIsDupable
-  = do { (env', cont')  <- mk_dupable_cont env cont
-       ; (env'', args') <- mapAccumLM makeTrivialArg env' (ai_args info)
-       ; return (env'', StrictArg { sc_fun = info { ai_args = args' }
-                                  , sc_cci = cci
-                                  , sc_cont = cont'
-                                  , sc_dup = OkToDup} ) }
-
-mk_dupable_cont env (ApplyToTy { sc_cont = cont
-                               , sc_arg_ty = arg_ty, sc_hole_ty = hole_ty })
-  = do  { (env', cont') <- mk_dupable_cont env cont
-        ; return (env', ApplyToTy { sc_cont = cont'
-                                  , sc_arg_ty = arg_ty, sc_hole_ty = hole_ty }) }
-
-mk_dupable_cont env (ApplyToVal { sc_arg = arg, sc_dup = dup
-                                , sc_env = se, sc_cont = cont })
+  = do { (floats1, cont') <- mkDupableCont env cont
+       ; (floats_s, args') <- mapAndUnzipM (makeTrivialArg (getMode env))
+                                           (ai_args info)
+       ; return ( foldl' addLetFloats floats1 floats_s
+                , StrictArg { sc_fun = info { ai_args = args' }
+                            , sc_cci = cci
+                            , sc_cont = cont'
+                            , sc_dup = OkToDup} ) }
+
+mkDupableCont env (ApplyToTy { sc_cont = cont
+                             , sc_arg_ty = arg_ty, sc_hole_ty = hole_ty })
+  = do  { (floats, cont') <- mkDupableCont env cont
+        ; return (floats, ApplyToTy { sc_cont = cont'
+                                    , sc_arg_ty = arg_ty, sc_hole_ty = hole_ty }) }
+
+mkDupableCont env (ApplyToVal { sc_arg = arg, sc_dup = dup
+                              , sc_env = se, sc_cont = cont })
   =     -- e.g.         [...hole...] (...arg...)
         --      ==>
         --              let a = ...arg...
         --              in [...hole...] a
         -- NB: sc_dup /= OkToDup; that is caught earlier by contIsDupable
-    do  { (env', cont') <- mk_dupable_cont env cont
+    do  { (floats1, cont') <- mkDupableCont env cont
+        ; let env' = env `setInScopeFromF` floats1
         ; (_, se', arg') <- simplArg env' dup se arg
-        ; (env'', arg'') <- makeTrivial NotTopLevel env' (fsLit "karg") arg'
-        ; return (env'', ApplyToVal { sc_arg = arg'', sc_env = se'
-                                    , sc_dup = OkToDup, sc_cont = cont' }) }
-
-mk_dupable_cont env (Select { sc_bndr = case_bndr, sc_alts = alts
-                            , sc_env = se, sc_cont = cont })
+        ; (let_floats2, arg'') <- makeTrivial (getMode env) NotTopLevel (fsLit "karg") arg'
+        ; let all_floats = floats1 `addLetFloats` let_floats2
+        ; return ( all_floats
+                 , ApplyToVal { sc_arg = arg''
+                              , sc_env = se' `setInScopeFromF` all_floats
+                                         -- Ensure that sc_env includes the free vars of
+                                         -- arg'' in its in-scope set, even if makeTrivial
+                                         -- has turned arg'' into a fresh variable
+                                         -- See Note [StaticEnv invariant] in SimplUtils
+                              , sc_dup = OkToDup, sc_cont = cont' }) }
+
+mkDupableCont env (Select { sc_bndr = case_bndr, sc_alts = alts
+                          , sc_env = se, sc_cont = cont })
   =     -- e.g.         (case [...hole...] of { pi -> ei })
         --      ===>
         --              let ji = \xij -> ei
         --              in case [...hole...] of { pi -> ji xij }
         -- NB: sc_dup /= OkToDup; that is caught earlier by contIsDupable
     do  { tick (CaseOfCase case_bndr)
-        ; (env', alt_cont, wrap_cont) <- prepareCaseCont env alts cont
-                -- NB: We call prepareCaseCont here.  If there is only one
-                -- alternative, then dup_cont may be big, but that's ok
-                -- because we push it into the single alternative, and then
-                -- use mkDupableAlt to turn that simplified alternative into
-                -- a join point if it's too big to duplicate.
+        ; (floats, alt_cont) <- mkDupableCaseCont env alts cont
+                -- NB: We call mkDupableCaseCont here to make cont duplicable
+                --     (if necessary, depending on the number of alts)
                 -- And this is important: see Note [Fusing case continuations]
 
-        ; let alt_env = se `setInScopeAndZapFloats` env'
-
+        ; let alt_env = se `setInScopeFromF` floats
         ; (alt_env', case_bndr') <- simplBinder alt_env case_bndr
         ; alts' <- mapM (simplAlt alt_env' Nothing [] case_bndr' alt_cont) alts
         -- Safe to say that there are no handled-cons for the DEFAULT case
@@ -3034,27 +3071,25 @@ mk_dupable_cont env (Select { sc_bndr = case_bndr, sc_alts = alts
         -- NB: we don't use alt_env further; it has the substEnv for
         --     the alternatives, and we don't want that
 
-        ; (join_binds, alts'') <- mkDupableAlts case_bndr' alts'
-        ; let env'' = foldl (\env (j,r) -> addNonRec env j r) env' join_binds
-
-        ; return (env'',  -- Note [Duplicated env]
-                  Select { sc_dup  = OkToDup
-                         , sc_bndr = case_bndr', sc_alts = alts''
-                         , sc_env  = zapSubstEnv env''
-                         , sc_cont = wrap_cont } ) }
-
-mkDupableAlts :: OutId -> [OutAlt] -> SimplM ([(JoinId, OutExpr)], [OutAlt])
-mkDupableAlts case_bndr' the_alts
-  = do { dflags <- getDynFlags
-       ; (mb_join_floats, dup_alts)
-             <- mapAndUnzipM (mkDupableAlt dflags case_bndr') the_alts
-       ; return (catMaybes mb_join_floats, dup_alts) }
-
-mkDupableAlt :: DynFlags -> OutId -> OutAlt
-             -> SimplM (Maybe (JoinId,OutExpr), OutAlt)
-mkDupableAlt dflags case_bndr (con, bndrs', rhs')
+        ; (join_floats, alts'') <- mapAccumLM (mkDupableAlt (seDynFlags env) case_bndr')
+                                              emptyJoinFloats alts'
+
+        ; let all_floats = floats `addJoinFloats` join_floats
+                           -- Note [Duplicated env]
+        ; return (all_floats
+                 , Select { sc_dup  = OkToDup
+                          , sc_bndr = case_bndr'
+                          , sc_alts = alts''
+                          , sc_env  = zapSubstEnv se `setInScopeFromF` all_floats
+                                      -- See Note [StaticEnv invariant] in SimplUtils
+                          , sc_cont = mkBoringStop (contResultType cont) } ) }
+
+mkDupableAlt :: DynFlags -> OutId
+             -> JoinFloats -> OutAlt
+             -> SimplM (JoinFloats, OutAlt)
+mkDupableAlt dflags case_bndr jfloats (con, bndrs', rhs')
   | exprIsDupable dflags rhs'  -- Note [Small alternative rhs]
-  = return (Nothing, (con, bndrs', rhs'))
+  = return (jfloats, (con, bndrs', rhs'))
 
   | otherwise
   = do  { let rhs_ty'  = exprType rhs'
@@ -3099,7 +3134,8 @@ mkDupableAlt dflags case_bndr (con, bndrs', rhs')
         ; let join_call = mkApps (Var join_bndr) final_args
               alt'      = (con, bndrs', join_call)
 
-        ; return (Just (join_bndr, join_rhs), alt') }
+        ; return ( jfloats `addJoinFlts` unitJoinFloat (NonRec join_bndr join_rhs)
+                 , alt') }
                 -- See Note [Duplicated env]
 
 {-
@@ -3178,7 +3214,7 @@ and c is unused.
 Note [Duplicated env]
 ~~~~~~~~~~~~~~~~~~~~~
 Some of the alternatives are simplified, but have not been turned into a join point
-So they *must* have an zapped subst-env.  So we can't use completeNonRecX to
+So they *must* have a zapped subst-env.  So we can't use completeNonRecX to
 bind the join point, because it might to do PostInlineUnconditionally, and
 we'd lose that when zapping the subst-env.  We could have a per-alt subst-env,
 but zapping it (as we do in mkDupableCont, the Select case) is safe, and
@@ -3347,17 +3383,24 @@ because we don't know its usage in each RHS separately
 -}
 
 simplLetUnfolding :: SimplEnv-> TopLevelFlag
-                  -> Maybe SimplCont
+                  -> MaybeJoinCont
                   -> InId
-                  -> OutExpr
+                  -> OutExpr -> OutType
                   -> Unfolding -> SimplM Unfolding
-simplLetUnfolding env top_lvl cont_mb id new_rhs unf
+simplLetUnfolding env top_lvl cont_mb id new_rhs rhs_ty unf
   | isStableUnfolding unf
-  = simplUnfolding env top_lvl cont_mb id unf
+  = simplStableUnfolding env top_lvl cont_mb id unf rhs_ty
+  | isExitJoinId id
+  = return noUnfolding -- See Note [Do not inline exit join points] in Exitify
   | otherwise
+  = mkLetUnfolding (seDynFlags env) top_lvl InlineRhs id new_rhs
+
+-------------------
+mkLetUnfolding :: DynFlags -> TopLevelFlag -> UnfoldingSource
+               -> InId -> OutExpr -> SimplM Unfolding
+mkLetUnfolding dflags top_lvl src id new_rhs
   = is_bottoming `seq`  -- See Note [Force bottoming field]
-    do { dflags <- getDynFlags
-       ; return (mkUnfolding dflags InlineRhs is_top_lvl is_bottoming new_rhs) }
+    return (mkUnfolding dflags src is_top_lvl is_bottoming new_rhs)
             -- We make an  unfolding *even for loop-breakers*.
             -- Reason: (a) It might be useful to know that they are WHNF
             --         (b) In TidyPgm we currently assume that, if we want to
@@ -3368,53 +3411,62 @@ simplLetUnfolding env top_lvl cont_mb id new_rhs unf
     is_top_lvl   = isTopLevel top_lvl
     is_bottoming = isBottomingId id
 
-simplUnfolding :: SimplEnv -> TopLevelFlag
-               -> Maybe SimplCont  -- Just k => a join point with continuation k
-               -> InId
-               -> Unfolding -> SimplM Unfolding
+-------------------
+simplStableUnfolding :: SimplEnv -> TopLevelFlag
+                     -> MaybeJoinCont  -- Just k => a join point with continuation k
+                     -> InId
+                     -> Unfolding -> OutType -> SimplM Unfolding
 -- Note [Setting the new unfolding]
-simplUnfolding env top_lvl mb_cont id unf
+simplStableUnfolding env top_lvl mb_cont id unf rhs_ty
   = case unf of
-      NoUnfolding -> return unf
+      NoUnfolding   -> return unf
       BootUnfolding -> return unf
-      OtherCon {} -> return unf
+      OtherCon {}   -> return unf
 
       DFunUnfolding { df_bndrs = bndrs, df_con = con, df_args = args }
-        -> do { (env', bndrs') <- simplBinders rule_env bndrs
+        -> do { (env', bndrs') <- simplBinders unf_env bndrs
               ; args' <- mapM (simplExpr env') args
               ; return (mkDFunUnfolding bndrs' con args') }
 
       CoreUnfolding { uf_tmpl = expr, uf_src = src, uf_guidance = guide }
         | isStableSource src
-        -> do { expr' <- case mb_cont of
-                           Just cont -> simplJoinRhs rule_env id expr cont
-                           Nothing   -> simplExpr rule_env expr
+        -> do { expr' <- case mb_cont of -- See Note [Rules and unfolding for join points]
+                           Just cont -> simplJoinRhs unf_env id expr cont
+                           Nothing   -> simplExprC unf_env expr (mkBoringStop rhs_ty)
               ; case guide of
-                  UnfWhen { ug_arity = arity, ug_unsat_ok = sat_ok }  -- Happens for INLINE things
-                     -> let guide' = UnfWhen { ug_arity = arity, ug_unsat_ok = sat_ok
-                                             , ug_boring_ok = inlineBoringOk expr' }
+                  UnfWhen { ug_arity = arity
+                          , ug_unsat_ok = sat_ok
+                          , ug_boring_ok = boring_ok
+                          }
+                          -- Happens for INLINE things
+                     -> let guide' =
+                              UnfWhen { ug_arity = arity
+                                      , ug_unsat_ok = sat_ok
+                                      , ug_boring_ok =
+                                          boring_ok || inlineBoringOk expr'
+                                      }
                         -- Refresh the boring-ok flag, in case expr'
                         -- has got small. This happens, notably in the inlinings
                         -- for dfuns for single-method classes; see
                         -- Note [Single-method classes] in TcInstDcls.
                         -- A test case is Trac #4138
+                        -- But retain a previous boring_ok of True; e.g. see
+                        -- the way it is set in calcUnfoldingGuidanceWithArity
                         in return (mkCoreUnfolding src is_top_lvl expr' guide')
                             -- See Note [Top-level flag on inline rules] in CoreUnfold
 
                   _other              -- Happens for INLINABLE things
-                     -> is_bottoming `seq` -- See Note [Force bottoming field]
-                        do { dflags <- getDynFlags
-                           ; return (mkUnfolding dflags src is_top_lvl is_bottoming expr') } }
+                     -> mkLetUnfolding dflags top_lvl src id expr' }
                 -- If the guidance is UnfIfGoodArgs, this is an INLINABLE
                 -- unfolding, and we need to make sure the guidance is kept up
                 -- to date with respect to any changes in the unfolding.
 
         | otherwise -> return noUnfolding   -- Discard unstable unfoldings
   where
-    is_top_lvl   = isTopLevel top_lvl
-    is_bottoming = isBottomingId id
-    act          = idInlineActivation id
-    rule_env     = updMode (updModeForStableUnfoldings act) env
+    dflags     = seDynFlags env
+    is_top_lvl = isTopLevel top_lvl
+    act        = idInlineActivation id
+    unf_env    = updMode (updModeForStableUnfoldings act) env
          -- See Note [Simplifying inside stable unfoldings] in SimplUtils
 
 {-
@@ -3435,7 +3487,7 @@ Note [Setting the new unfolding]
   important: if exprIsConApp says 'yes' for a recursive thing, then we
   can get into an infinite loop
 
-If there's an stable unfolding on a loop breaker (which happens for
+If there's a stable unfolding on a loop breaker (which happens for
 INLINABLE), we hang on to the inlining.  It's pretty dodgy, but the
 user did say 'INLINE'.  May need to revisit this choice.
 
@@ -3456,20 +3508,24 @@ to apply in that function's own right-hand side.
 See Note [Forming Rec groups] in OccurAnal
 -}
 
-addBndrRules :: SimplEnv -> InBndr -> OutBndr -> SimplM (SimplEnv, OutBndr)
+addBndrRules :: SimplEnv -> InBndr -> OutBndr
+             -> MaybeJoinCont   -- Just k for a join point binder
+                                -- Nothing otherwise
+             -> SimplM (SimplEnv, OutBndr)
 -- Rules are added back into the bin
-addBndrRules env in_id out_id
+addBndrRules env in_id out_id mb_cont
   | null old_rules
   = return (env, out_id)
   | otherwise
-  = do { new_rules <- simplRules env (Just (idName out_id)) old_rules
+  = do { new_rules <- simplRules env (Just out_id) old_rules mb_cont
        ; let final_id  = out_id `setIdSpecialisation` mkRuleInfo new_rules
        ; return (modifyInScope env final_id, final_id) }
   where
     old_rules = ruleInfoRules (idSpecialisation in_id)
 
-simplRules :: SimplEnv -> Maybe Name -> [CoreRule] -> SimplM [CoreRule]
-simplRules env mb_new_nm rules
+simplRules :: SimplEnv -> Maybe OutId -> [CoreRule]
+           -> MaybeJoinCont -> SimplM [CoreRule]
+simplRules env mb_new_id rules mb_cont
   = mapM simpl_rule rules
   where
     simpl_rule rule@(BuiltinRule {})
@@ -3479,11 +3535,29 @@ simplRules env mb_new_nm rules
                           , ru_fn = fn_name, ru_rhs = rhs })
       = do { (env', bndrs') <- simplBinders env bndrs
            ; let rhs_ty = substTy env' (exprType rhs)
-                 rule_cont = mkBoringStop rhs_ty
-                 rule_env  = updMode updModeForRules env'
+                 rhs_cont = case mb_cont of  -- See Note [Rules and unfolding for join points]
+                                Nothing   -> mkBoringStop rhs_ty
+                                Just cont -> ASSERT2( join_ok, bad_join_msg )
+                                             cont
+                 rule_env = updMode updModeForRules env'
+                 fn_name' = case mb_new_id of
+                              Just id -> idName id
+                              Nothing -> fn_name
+
+                 -- join_ok is an assertion check that the join-arity of the
+                 -- binder matches that of the rule, so that pushing the
+                 -- continuation into the RHS makes sense
+                 join_ok = case mb_new_id of
+                             Just id | Just join_arity <- isJoinId_maybe id
+                                     -> length args == join_arity
+                             _ -> False
+                 bad_join_msg = vcat [ ppr mb_new_id, ppr rule
+                                     , ppr (fmap isJoinId_maybe mb_new_id) ]
+
            ; args' <- mapM (simplExpr rule_env) args
-           ; rhs'  <- simplExprC rule_env rhs rule_cont
+           ; rhs'  <- simplExprC rule_env rhs rhs_cont
            ; return (rule { ru_bndrs = bndrs'
-                          , ru_fn    = mb_new_nm `orElse` fn_name
+                          , ru_fn    = fn_name'
                           , ru_args  = args'
                           , ru_rhs   = rhs' }) }
+