Better eta-expansion (again) and don't specilise DFunswip/T18223

This patch fixes #18223, which made GHC generate an exponential amount of code. There are three quite separate changes in here 1. Re-engineer eta-expansion (again). The eta-expander was generating lots of intermediate stuff, which could be optimised away, but which choked the simplifier meanwhile. Relatively easy to kill it off at source. See Note [The EtaInfo mechanism] in GHC.Core.Opt.Arity. The main new thing is the use of pushCoArg in getArg_maybe. 2. Stop Specialise specalising DFuns. This is the cause of a huge (and utterly unnecessary) blowup in program size in #18223. See Note [Do not specialise DFuns] in GHC.Core.Opt.Specialise. I also refactored the Specialise monad a bit... it was silly, because it passed on unchanging values as if they were mutable state. 3. Do an extra Simplifer run, after SpecConstra and before late-Specialise. I found (investigating perf/compiler/T16473) that failing to do this was crippling *both* SpecConstr *and* Specialise. See Note [Simplify after SpecConstr] in GHC.Core.Opt.Pipeline. This change does mean an extra run of the Simplifier, but only with -O2, and I think that's acceptable. T16473 allocates *three* times less with this change. (I changed it to check runtime rather than compile time.) Some smaller consequences * I moved pushCoercion, pushCoArg and friends from SimpleOpt to Arity, because it was needed by the new etaInfoApp. And pushCoValArg now returns a MCoercion rather than Coercion for the argument Coercion. * A minor, incidental improvement to Core pretty-printing This does fix #18223, (which was otherwise uncompilable. Hooray. But there is still a big intermediate because there are some very deeply nested types in that program. Modest reductions in compile-time allocation on a couple of benchmarks T12425 -2.0% T13253 -10.3% Metric increase with -O2, due to extra simplifier run T9233 +5.8% T12227 +1.8% T15630 +5.0% There is a spurious apparent increase on heap residency on T9630, on some architectures at least. I tried it with -G1 and the residency is essentially unchanged. Metric Increase T9233 T12227 T9630 Metric Decrease T12425 T13253
author: Simon Peyton Jones <simonpj@microsoft.com> 2020-08-24 14:36:57 +0100
committer: Simon Peyton Jones <simonpj@microsoft.com> 2020-09-21 16:15:14 +0100
commit: ff5a843e2003abed15f99d10eb1195cf9d572e06 (patch)
tree: 74e015a51530a05adb25daab808f97dd5b050a54
parent: 9df77fed8918bb335874a584a829ee32325cefb5 (diff)
download: haskell-wip/T18223.tar.gz
16 files changed, 870 insertions, 698 deletions
diff --git a/compiler/GHC/Core/Coercion.hs b/compiler/GHC/Core/Coercion.hs
index 6e19cbdd7a..47e6d40173 100644
--- a/compiler/GHC/Core/Coercion.hs
+++ b/compiler/GHC/Core/Coercion.hs
@@ -31,7 +31,7 @@ module GHC.Core.Coercion (
         mkAxInstRHS, mkUnbranchedAxInstRHS,
         mkAxInstLHS, mkUnbranchedAxInstLHS,
         mkPiCo, mkPiCos, mkCoCast,
-        mkSymCo, mkTransCo, mkTransMCo,
+        mkSymCo, mkTransCo,
         mkNthCo, nthCoRole, mkLRCo,
         mkInstCo, mkAppCo, mkAppCos, mkTyConAppCo, mkFunCo,
         mkForAllCo, mkForAllCos, mkHomoForAllCos,
@@ -65,7 +65,8 @@ module GHC.Core.Coercion (
         pickLR,
 
         isGReflCo, isReflCo, isReflCo_maybe, isGReflCo_maybe, isReflexiveCo, isReflexiveCo_maybe,
-        isReflCoVar_maybe, isGReflMCo, coToMCo,
+        isReflCoVar_maybe, isGReflMCo,
+        coToMCo, mkTransMCo, mkTransMCoL,
 
         -- ** Coercion variables
         mkCoVar, isCoVar, coVarName, setCoVarName, setCoVarUnique,
@@ -288,6 +289,44 @@ tidyCoAxBndrsForUser init_env tcvs
                        ('_' : rest) -> all isDigit rest
                        _            -> False
 
+
+{- *********************************************************************
+*                                                                      *
+              MCoercion
+*                                                                      *
+********************************************************************* -}
+
+coToMCo :: Coercion -> MCoercion
+-- Convert a coercion to a MCoercion,
+-- It's not clear whether or not isReflexiveCo would be better here
+coToMCo co | isReflCo co = MRefl
+           | otherwise   = MCo co
+
+-- | Tests if this MCoercion is obviously generalized reflexive
+-- Guaranteed to work very quickly.
+isGReflMCo :: MCoercion -> Bool
+isGReflMCo MRefl = True
+isGReflMCo (MCo co) | isGReflCo co = True
+isGReflMCo _ = False
+
+-- | Make a generalized reflexive coercion
+mkGReflCo :: Role -> Type -> MCoercionN -> Coercion
+mkGReflCo r ty mco
+  | isGReflMCo mco = if r == Nominal then Refl ty
+                     else GRefl r ty MRefl
+  | otherwise    = GRefl r ty mco
+
+-- | Compose two MCoercions via transitivity
+mkTransMCo :: MCoercion -> MCoercion -> MCoercion
+mkTransMCo MRefl     co2       = co2
+mkTransMCo co1       MRefl     = co1
+mkTransMCo (MCo co1) (MCo co2) = MCo (mkTransCo co1 co2)
+
+mkTransMCoL :: MCoercion -> Coercion -> MCoercion
+mkTransMCoL MRefl     co2 = MCo co2
+mkTransMCoL (MCo co1) co2 = MCo (mkTransCo co1 co2)
+
+
 {-
 %************************************************************************
 %*                                                                      *
@@ -556,13 +595,6 @@ isGReflCo (GRefl{}) = True
 isGReflCo (Refl{})  = True -- Refl ty == GRefl N ty MRefl
 isGReflCo _         = False
 
--- | Tests if this MCoercion is obviously generalized reflexive
--- Guaranteed to work very quickly.
-isGReflMCo :: MCoercion -> Bool
-isGReflMCo MRefl = True
-isGReflMCo (MCo co) | isGReflCo co = True
-isGReflMCo _ = False
-
 -- | Tests if this coercion is obviously reflexive. Guaranteed to work
 -- very quickly. Sometimes a coercion can be reflexive, but not obviously
 -- so. c.f. 'isReflexiveCo'
@@ -603,10 +635,6 @@ isReflexiveCo_maybe co
   = Nothing
   where (Pair ty1 ty2, r) = coercionKindRole co
 
-coToMCo :: Coercion -> MCoercion
-coToMCo c = if isReflCo c
-  then MRefl
-  else MCo c
 
 {-
 %************************************************************************
@@ -669,13 +697,6 @@ role is bizarre and a caller should have to ask for this behavior explicitly.
 
 -}
 
--- | Make a generalized reflexive coercion
-mkGReflCo :: Role -> Type -> MCoercionN -> Coercion
-mkGReflCo r ty mco
-  | isGReflMCo mco = if r == Nominal then Refl ty
-                     else GRefl r ty MRefl
-  | otherwise    = GRefl r ty mco
-
 -- | Make a reflexive coercion
 mkReflCo :: Role -> Type -> Coercion
 mkReflCo Nominal ty = Refl ty
@@ -990,12 +1011,6 @@ mkTransCo (GRefl r t1 (MCo co1)) (GRefl _ _ (MCo co2))
   = GRefl r t1 (MCo $ mkTransCo co1 co2)
 mkTransCo co1 co2                 = TransCo co1 co2
 
--- | Compose two MCoercions via transitivity
-mkTransMCo :: MCoercion -> MCoercion -> MCoercion
-mkTransMCo MRefl     co2       = co2
-mkTransMCo co1       MRefl     = co1
-mkTransMCo (MCo co1) (MCo co2) = MCo (mkTransCo co1 co2)
-
 mkNthCo :: HasDebugCallStack
         => Role  -- The role of the coercion you're creating
         -> Int   -- Zero-indexed
diff --git a/compiler/GHC/Core/Opt/Arity.hs b/compiler/GHC/Core/Opt/Arity.hs
index b55d91767e..2471470814 100644
--- a/compiler/GHC/Core/Opt/Arity.hs
+++ b/compiler/GHC/Core/Opt/Arity.hs
@@ -15,10 +15,18 @@ module GHC.Core.Opt.Arity
    ( manifestArity, joinRhsArity, exprArity, typeArity
    , exprEtaExpandArity, findRhsArity
    , etaExpand, etaExpandAT
-   , etaExpandToJoinPoint, etaExpandToJoinPointRule
    , exprBotStrictness_maybe
+
+        -- ** ArityType
    , ArityType(..), expandableArityType, arityTypeArity
    , maxWithArity, isBotArityType, idArityType
+
+        -- ** Join points
+   , etaExpandToJoinPoint, etaExpandToJoinPointRule
+
+        -- ** Coercions and casts
+   , pushCoArg, pushCoArgs, pushCoValArg, pushCoTyArg
+   , pushCoercionIntoLambda, pushCoDataCon, collectBindersPushingCo
    )
 where
 
@@ -31,15 +39,21 @@ import GHC.Driver.Ppr
 import GHC.Core
 import GHC.Core.FVs
 import GHC.Core.Utils
-import GHC.Core.Subst
 import GHC.Types.Demand
 import GHC.Types.Var
 import GHC.Types.Var.Env
 import GHC.Types.Id
-import GHC.Core.Type as Type
-import GHC.Core.TyCon     ( initRecTc, checkRecTc )
+
+-- We have two sorts of substitution:
+--   GHC.Core.Subst.Subst, and GHC.Core.TyCo.TCvSubst
+-- Both have substTy, substCo  Hence need for qualification
+import GHC.Core.Subst    as Core
+import GHC.Core.Type     as Type
+import GHC.Core.Coercion as Type
+
+import GHC.Core.DataCon
+import GHC.Core.TyCon     ( initRecTc, checkRecTc, tyConArity )
 import GHC.Core.Predicate ( isDictTy )
-import GHC.Core.Coercion as Coercion
 import GHC.Core.Multiplicity
 import GHC.Types.Var.Set
 import GHC.Types.Basic
@@ -48,7 +62,8 @@ import GHC.Driver.Session ( DynFlags, GeneralFlag(..), gopt )
 import GHC.Utils.Outputable
 import GHC.Utils.Panic
 import GHC.Data.FastString
-import GHC.Utils.Misc     ( lengthAtLeast )
+import GHC.Data.Pair
+import GHC.Utils.Misc
 
 {-
 ************************************************************************
@@ -1076,12 +1091,11 @@ eta_expand one_shots orig_expr
     go oss (Cast expr co) = Cast (go oss expr) co
 
     go oss expr
-      = -- pprTrace "ee" (vcat [ppr orig_expr, ppr expr, ppr etas]) $
-        retick $ etaInfoAbs etas (etaInfoApp subst' sexpr etas)
+      = -- pprTrace "ee" (vcat [ppr orig_expr, ppr expr, pprEtaInfos etas]) $
+        retick $ etaInfoAbs etas (etaInfoApp in_scope' sexpr etas)
       where
           in_scope = mkInScopeSet (exprFreeVars expr)
           (in_scope', etas) = mkEtaWW oss (ppr orig_expr) in_scope (exprType expr)
-          subst' = mkEmptySubst in_scope'
 
           -- Find ticks behind type apps.
           -- See Note [Eta expansion and source notes]
@@ -1090,76 +1104,197 @@ eta_expand one_shots orig_expr
           sexpr = foldl' App expr'' args
           retick expr = foldr mkTick expr ticks
 
-                                -- Abstraction    Application
+{- *********************************************************************
+*                                                                      *
+              The EtaInfo mechanism
+          mkEtaWW, etaInfoAbs, etaInfoApp
+*                                                                      *
+********************************************************************* -}
+
+{- Note [The EtaInfo mechanism]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Suppose we have (e :: ty) and we want to eta-expand it to arity N.
+This what eta_expand does.  We do it in two steps:
+
+1.  mkEtaWW: from 'ty' and 'N' build a [EtaInfo] which describes
+    the shape of the expansion necessary to expand to arity N.
+
+2.  Build the term
+       \ v1..vn.  e v1 .. vn
+    where those abstractions and applications are described by
+    the same [EtaInfo].  Specifically we build the term
+
+       etaInfoAbs etas (etaInfoApp in_scope e etas)
+
+   where etas :: [EtaInfo]#
+         etaInfoAbs builds the lambdas
+         etaInfoApp builds the applictions
+
+   Note that the /same/ [EtaInfo] drives both etaInfoAbs and etaInfoApp
+
+To a first approximation [EtaInfo] is just [Var].  But
+casts complicate the question.  If we have
+   newtype N a = MkN (S -> a)
+and
+   ty = N (N Int)
+then the eta-expansion must look like
+        (\x (\y. ((e |> co1) x) |> co2) y)
+           |> sym co2)
+        |> sym co1
+where
+  co1 :: N (N Int) ~ S -> N Int
+  co2 :: N Int     ~ S -> Int
+
+Blimey!  Look at all those casts.  Moreover, if the type
+is very deeply nested (as happens in #18223), the repetition
+of types can make the overall term very large.  So there is a big
+payoff in cancelling out casts aggressively wherever possible.
+(See also Note [No crap in eta-expanded code].)
+
+This matters a lot in etaEInfoApp, where we
+* Do beta-reduction on the fly
+* Use getARg_mabye to get a cast out of the way,
+  so that we can do beta reduction
+Together this makes a big difference.  Consider when e is
+   case x of
+      True  -> (\x -> e1) |> c1
+      False -> (\p -> e2) |> c2
+
+When we eta-expand this to arity 1, say, etaInfoAbs will wrap
+a (\eta) around the outside and use etaInfoApp to apply each
+alternative to 'eta'.  We want to beta-reduce all that junk
+away.
+
+#18223 was a dramtic example in which the intermediate term was
+grotesquely huge, even though the next Simplifier iteration squashed
+it.  Better to kill it at birth.
+-}
+
 --------------
-data EtaInfo = EtaVar Var       -- /\a. []        [] a
-                                -- \x.  []        [] x
-             | EtaCo Coercion   -- [] |> sym co   [] |> co
+data EtaInfo            -- Abstraction      Application
+  = EtaVar Var          -- /\a. []         [] a
+                        -- (\x. [])        [] x
+  | EtaCo CoercionR     -- [] |> sym co    [] |> co
 
 instance Outputable EtaInfo where
-   ppr (EtaVar v) = text "EtaVar" <+> ppr v
-   ppr (EtaCo co) = text "EtaCo"  <+> ppr co
+   ppr (EtaVar v) = text "EtaVar" <+> ppr v  <+> dcolon <+> ppr (idType v)
+   ppr (EtaCo co) = text "EtaCo"  <+> hang (ppr co) 2 (dcolon <+> ppr (coercionType co))
+
+-- Used in debug-printing
+-- pprEtaInfos :: [EtaInfo] -> SDoc
+-- pprEtaInfos eis = brackets $ vcat $ punctuate comma $ map ppr eis
 
 pushCoercion :: Coercion -> [EtaInfo] -> [EtaInfo]
+-- Puts a EtaCo on the front of a [EtaInfo], but combining
+-- with an existing EtaCo if possible
+-- A minor improvement
 pushCoercion co1 (EtaCo co2 : eis)
   | isReflCo co = eis
   | otherwise   = EtaCo co : eis
   where
     co = co1 `mkTransCo` co2
 
-pushCoercion co eis = EtaCo co : eis
+pushCoercion co eis
+  = EtaCo co : eis
+
+getArg_maybe :: [EtaInfo] -> Maybe (CoreArg, [EtaInfo])
+-- Get an argument to the front of the [EtaInfo], if possible,
+-- by pushing any EtaCo through the argument
+getArg_maybe eis = go MRefl eis
+  where
+    go :: MCoercion -> [EtaInfo] -> Maybe (CoreArg, [EtaInfo])
+    go _         []                = Nothing
+    go mco       (EtaCo co2 : eis) = go (mkTransMCoL mco co2) eis
+    go MRefl     (EtaVar v : eis)  = Just (varToCoreExpr v, eis)
+    go (MCo co)  (EtaVar v : eis)
+      | Just (arg, mco) <- pushCoArg co (varToCoreExpr v)
+      = case mco of
+           MRefl  -> Just (arg, eis)
+           MCo co -> Just (arg, pushCoercion co eis)
+      | otherwise
+      = Nothing
+
+mkCastMCo :: CoreExpr -> MCoercionR -> CoreExpr
+mkCastMCo e MRefl    = e
+mkCastMCo e (MCo co) = Cast e co
+  -- We are careful to use (MCo co) only when co is not reflexive
+  -- Hence (Cast e co) rather than (mkCast e co)
+
+mkPiMCo :: Var -> MCoercionR -> MCoercionR
+mkPiMCo _  MRefl   = MRefl
+mkPiMCo v (MCo co) = MCo (mkPiCo Representational v co)
 
 --------------
 etaInfoAbs :: [EtaInfo] -> CoreExpr -> CoreExpr
-etaInfoAbs []               expr = expr
-etaInfoAbs (EtaVar v : eis) expr = Lam v (etaInfoAbs eis expr)
-etaInfoAbs (EtaCo co : eis) expr = Cast (etaInfoAbs eis expr) (mkSymCo co)
+-- See Note [The EtaInfo mechanism]
+etaInfoAbs eis expr
+  | null eis  = expr
+  | otherwise = case final_mco of
+                   MRefl  -> expr'
+                   MCo co -> mkCast expr' co
+  where
+     (expr', final_mco) = foldr do_one (split_cast expr) eis
+
+     do_one :: EtaInfo -> (CoreExpr, MCoercion) -> (CoreExpr, MCoercion)
+     -- Implements the "Abstraction" column in the comments for data EtaInfo
+     -- In both argument and result the pair (e,mco) denotes (e |> mco)
+     do_one (EtaVar v) (expr, mco) = (Lam v expr, mkPiMCo v mco)
+     do_one (EtaCo co) (expr, mco) = (expr, mco `mkTransMCoL` mkSymCo co)
+
+     split_cast :: CoreExpr -> (CoreExpr, MCoercion)
+     split_cast (Cast e co) = (e, MCo co)
+     split_cast e           = (e, MRefl)
+     -- We could look in the body of lets, and the branches of a case
+     -- But then we would have to worry about whether the cast mentioned
+     -- any of the bound variables, which is tiresome. Later maybe.
+     -- Result: we may end up with
+     --     (\(x::Int). case x of { DEFAULT -> e1 |> co }) |> sym (<Int>->co)
+     -- and fail to optimise it away
 
 --------------
-etaInfoApp :: Subst -> CoreExpr -> [EtaInfo] -> CoreExpr
+etaInfoApp :: InScopeSet -> CoreExpr -> [EtaInfo] -> CoreExpr
 -- (etaInfoApp s e eis) returns something equivalent to
---             ((substExpr s e) `appliedto` eis)
-
-etaInfoApp subst (Lam v1 e) (EtaVar v2 : eis)
-  = etaInfoApp (GHC.Core.Subst.extendSubstWithVar subst v1 v2) e eis
-
-etaInfoApp subst (Cast e co1) eis
-  = etaInfoApp subst e (pushCoercion co' eis)
-  where
-    co' = GHC.Core.Subst.substCo subst co1
+--             (substExpr s e `appliedto` eis)
+-- See Note [The EtaInfo mechanism]
 
-etaInfoApp subst (Case e b ty alts) eis
-  = Case (subst_expr subst e) b1 ty' alts'
+etaInfoApp in_scope expr eis
+  = go (mkEmptySubst in_scope) expr eis
   where
-    (subst1, b1) = substBndr subst b
-    alts' = map subst_alt alts
-    ty'   = etaInfoAppTy (GHC.Core.Subst.substTy subst ty) eis
-    subst_alt (con, bs, rhs) = (con, bs', etaInfoApp subst2 rhs eis)
-              where
-                 (subst2,bs') = substBndrs subst1 bs
-
-etaInfoApp subst (Let b e) eis
-  | not (isJoinBind b)
-    -- See Note [Eta expansion for join points]
-  = Let b' (etaInfoApp subst' e eis)
-  where
-    (subst', b') = substBindSC subst b
+    go :: Subst -> CoreExpr -> [EtaInfo] -> CoreExpr
+    -- 'go' pushed down the eta-infos into the branch of a case
+    -- and the body of a let; and does beta-reduction if possible
+    go subst (Tick t e) eis
+      = Tick (substTickish subst t) (go subst e eis)
+    go subst (Cast e co) eis
+      = go subst e (pushCoercion (Core.substCo subst co) eis)
+    go subst (Case e b ty alts) eis
+      = Case (Core.substExprSC subst e) b1 ty' alts'
+      where
+        (subst1, b1) = Core.substBndr subst b
+        alts' = map subst_alt alts
+        ty'   = etaInfoAppTy (Core.substTy subst ty) eis
+        subst_alt (con, bs, rhs) = (con, bs', go subst2 rhs eis)
+                 where
+                    (subst2,bs') = Core.substBndrs subst1 bs
+    go subst (Let b e) eis
+      | not (isJoinBind b) -- See Note [Eta expansion for join points]
+      = Let b' (go subst' e eis)
+      where
+        (subst', b') = Core.substBindSC subst b
 
-etaInfoApp subst (Tick t e) eis
-  = Tick (substTickish subst t) (etaInfoApp subst e eis)
+    -- Beta-reduction if possible, using getArg_maybe to push
+    -- any intervening casts past the argument
+    -- See Note [The EtaInfo mechansim]
+    go subst (Lam v e) eis
+      | Just (arg, eis') <- getArg_maybe eis
+      = go (Core.extendSubst subst v arg) e eis'
 
-etaInfoApp subst expr _
-  | (Var fun, _) <- collectArgs expr
-  , Var fun' <- lookupIdSubst subst fun
-  , isJoinId fun'
-  = subst_expr subst expr
+    -- Stop pushing down; just wrap the expression up
+    go subst e eis = wrap (Core.substExprSC subst e) eis
 
-etaInfoApp subst e eis
-  = go (subst_expr subst e) eis
-  where
-    go e []                  = e
-    go e (EtaVar v    : eis) = go (App e (varToCoreExpr v)) eis
-    go e (EtaCo co    : eis) = go (Cast e co) eis
+    wrap e []               = e
+    wrap e (EtaVar v : eis) = wrap (App e (varToCoreExpr v)) eis
+    wrap e (EtaCo co : eis) = wrap (Cast e co) eis
 
 
 --------------
@@ -1235,7 +1370,7 @@ mkEtaWW orig_oss ppr_orig_expr in_scope orig_ty
        -- We want to get
        --      coerce T (\x::[T] -> (coerce ([T]->Int) e) x)
        | Just (co, ty') <- topNormaliseNewType_maybe ty
-       , let co' = Coercion.substCo subst co
+       , let co' = Type.substCo subst co
              -- Remember to apply the substitution to co (#16979)
              -- (or we could have applied to ty, but then
              --  we'd have had to zap it for the recursive call)
@@ -1253,21 +1388,290 @@ mkEtaWW orig_oss ppr_orig_expr in_scope orig_ty
         -- with an explicit lambda having a non-function type
 
 
+{- *********************************************************************
+*                                                                      *
+              The "push rules"
+*                                                                      *
+************************************************************************
 
-------------
-subst_expr :: Subst -> CoreExpr -> CoreExpr
--- Apply a substitution to an expression.  We use substExpr
--- not substExprSC (short-cutting substitution) because
--- we may be changing the types of join points, so applying
--- the in-scope set is necessary.
+Here we implement the "push rules" from FC papers:
+
+* The push-argument rules, where we can move a coercion past an argument.
+  We have
+      (fun |> co) arg
+  and we want to transform it to
+    (fun arg') |> co'
+  for some suitable co' and transformed arg'.
+
+* The PushK rule for data constructors.  We have
+       (K e1 .. en) |> co
+  and we want to transform to
+       (K e1' .. en')
+  by pushing the coercion into the arguments
+-}
+
+pushCoArgs :: CoercionR -> [CoreArg] -> Maybe ([CoreArg], MCoercion)
+pushCoArgs co []         = return ([], MCo co)
+pushCoArgs co (arg:args) = do { (arg',  m_co1) <- pushCoArg  co  arg
+                              ; case m_co1 of
+                                  MCo co1 -> do { (args', m_co2) <- pushCoArgs co1 args
+                                                 ; return (arg':args', m_co2) }
+                                  MRefl  -> return (arg':args, MRefl) }
+
+pushCoArg :: CoercionR -> CoreArg -> Maybe (CoreArg, MCoercion)
+-- We have (fun |> co) arg, and we want to transform it to
+--         (fun arg) |> co
+-- This may fail, e.g. if (fun :: N) where N is a newtype
+-- C.f. simplCast in GHC.Core.Opt.Simplify
+-- 'co' is always Representational
+-- If the returned coercion is Nothing, then it would have been reflexive
+pushCoArg co (Type ty) = do { (ty', m_co') <- pushCoTyArg co ty
+                            ; return (Type ty', m_co') }
+pushCoArg co val_arg   = do { (arg_co, m_co') <- pushCoValArg co
+                            ; return (val_arg `mkCastMCo` arg_co, m_co') }
+
+pushCoTyArg :: CoercionR -> Type -> Maybe (Type, MCoercionR)
+-- We have (fun |> co) @ty
+-- Push the coercion through to return
+--         (fun @ty') |> co'
+-- 'co' is always Representational
+-- If the returned coercion is Nothing, then it would have been reflexive;
+-- it's faster not to compute it, though.
+pushCoTyArg co ty
+  -- The following is inefficient - don't do `eqType` here, the coercion
+  -- optimizer will take care of it. See #14737.
+  -- -- | tyL `eqType` tyR
+  -- -- = Just (ty, Nothing)
+
+  | isReflCo co
+  = Just (ty, MRefl)
+
+  | isForAllTy_ty tyL
+  = ASSERT2( isForAllTy_ty tyR, ppr co $$ ppr ty )
+    Just (ty `mkCastTy` co1, MCo co2)
+
+  | otherwise
+  = Nothing
+  where
+    Pair tyL tyR = coercionKind co
+       -- co :: tyL ~ tyR
+       -- tyL = forall (a1 :: k1). ty1
+       -- tyR = forall (a2 :: k2). ty2
+
+    co1 = mkSymCo (mkNthCo Nominal 0 co)
+       -- co1 :: k2 ~N k1
+       -- Note that NthCo can extract a Nominal equality between the
+       -- kinds of the types related by a coercion between forall-types.
+       -- See the NthCo case in GHC.Core.Lint.
+
+    co2 = mkInstCo co (mkGReflLeftCo Nominal ty co1)
+        -- co2 :: ty1[ (ty|>co1)/a1 ] ~ ty2[ ty/a2 ]
+        -- Arg of mkInstCo is always nominal, hence mkNomReflCo
+
+pushCoValArg :: CoercionR -> Maybe (MCoercionR, MCoercionR)
+-- We have (fun |> co) arg
+-- Push the coercion through to return
+--         (fun (arg |> co_arg)) |> co_res
+-- 'co' is always Representational
+-- If the second returned Coercion is actually Nothing, then no cast is necessary;
+-- the returned coercion would have been reflexive.
+pushCoValArg co
+  -- The following is inefficient - don't do `eqType` here, the coercion
+  -- optimizer will take care of it. See #14737.
+  -- -- | tyL `eqType` tyR
+  -- -- = Just (mkRepReflCo arg, Nothing)
+
+  | isReflCo co
+  = Just (MRefl, MRefl)
+
+  | isFunTy tyL
+  , (co_mult, co1, co2) <- decomposeFunCo Representational co
+  , isReflexiveCo co_mult
+    -- We can't push the coercion in the case where co_mult isn't reflexivity:
+    -- it could be an unsafe axiom, and losing this information could yield
+    -- ill-typed terms. For instance (fun x ::(1) Int -> (fun _ -> () |> co) x)
+    -- with co :: (Int -> ()) ~ (Int #-> ()), would reduce to (fun x ::(1) Int
+    -- -> (fun _ ::(Many) Int -> ()) x) which is ill-typed
+
+              -- If   co  :: (tyL1 -> tyL2) ~ (tyR1 -> tyR2)
+              -- then co1 :: tyL1 ~ tyR1
+              --      co2 :: tyL2 ~ tyR2
+  = ASSERT2( isFunTy tyR, ppr co $$ ppr arg )
+    Just (coToMCo (mkSymCo co1), coToMCo co2)
+    -- Critically, coToMCo to checks for ReflCo; the whole coercion may not
+    -- be reflexive, but either of its components might be
+    -- We could use isReflexiveCo, but it's not clear if the benefit
+    -- is worth the cost, and it makes no difference in #18223
+
+  | otherwise
+  = Nothing
+  where
+    arg = funArgTy tyR
+    Pair tyL tyR = coercionKind co
+
+pushCoercionIntoLambda
+    :: InScopeSet -> Var -> CoreExpr -> CoercionR -> Maybe (Var, CoreExpr)
+-- This implements the Push rule from the paper on coercions
+--    (\x. e) |> co
+-- ===>
+--    (\x'. e |> co')
+pushCoercionIntoLambda in_scope x e co
+    | ASSERT(not (isTyVar x) && not (isCoVar x)) True
+    , Pair s1s2 t1t2 <- coercionKind co
+    , Just (_, _s1,_s2) <- splitFunTy_maybe s1s2
+    , Just (w1, t1,_t2) <- splitFunTy_maybe t1t2
+    , (co_mult, co1, co2) <- decomposeFunCo Representational co
+    , isReflexiveCo co_mult
+      -- We can't push the coercion in the case where co_mult isn't
+      -- reflexivity. See pushCoValArg for more details.
+    = let
+          -- Should we optimize the coercions here?
+          -- Otherwise they might not match too well
+          x' = x `setIdType` t1 `setIdMult` w1
+          in_scope' = in_scope `extendInScopeSet` x'
+          subst = extendIdSubst (mkEmptySubst in_scope')
+                                x
+                                (mkCast (Var x') co1)
+      in Just (x', substExpr subst e `mkCast` co2)
+    | otherwise
+    = pprTrace "exprIsLambda_maybe: Unexpected lambda in case" (ppr (Lam x e))
+      Nothing
+
+pushCoDataCon :: DataCon -> [CoreExpr] -> Coercion
+              -> Maybe (DataCon
+                       , [Type]      -- Universal type args
+                       , [CoreExpr]) -- All other args incl existentials
+-- Implement the KPush reduction rule as described in "Down with kinds"
+-- The transformation applies iff we have
+--      (C e1 ... en) `cast` co
+-- where co :: (T t1 .. tn) ~ to_ty
+-- The left-hand one must be a T, because exprIsConApp returned True
+-- but the right-hand one might not be.  (Though it usually will.)
+pushCoDataCon dc dc_args co
+  | isReflCo co || from_ty `eqType` to_ty  -- try cheap test first
+  , let (univ_ty_args, rest_args) = splitAtList (dataConUnivTyVars dc) dc_args
+  = Just (dc, map exprToType univ_ty_args, rest_args)
+
+  | Just (to_tc, to_tc_arg_tys) <- splitTyConApp_maybe to_ty
+  , to_tc == dataConTyCon dc
+        -- These two tests can fail; we might see
+        --      (C x y) `cast` (g :: T a ~ S [a]),
+        -- where S is a type function.  In fact, exprIsConApp
+        -- will probably not be called in such circumstances,
+        -- but there's nothing wrong with it
+
+  = let
+        tc_arity       = tyConArity to_tc
+        dc_univ_tyvars = dataConUnivTyVars dc
+        dc_ex_tcvars   = dataConExTyCoVars dc
+        arg_tys        = dataConRepArgTys dc
+
+        non_univ_args  = dropList dc_univ_tyvars dc_args
+        (ex_args, val_args) = splitAtList dc_ex_tcvars non_univ_args
+
+        -- Make the "Psi" from the paper
+        omegas = decomposeCo tc_arity co (tyConRolesRepresentational to_tc)
+        (psi_subst, to_ex_arg_tys)
+          = liftCoSubstWithEx Representational
+                              dc_univ_tyvars
+                              omegas
+                              dc_ex_tcvars
+                              (map exprToType ex_args)
+
+          -- Cast the value arguments (which include dictionaries)
+        new_val_args = zipWith cast_arg (map scaledThing arg_tys) val_args
+        cast_arg arg_ty arg = mkCast arg (psi_subst arg_ty)
+
+        to_ex_args = map Type to_ex_arg_tys
+
+        dump_doc = vcat [ppr dc,      ppr dc_univ_tyvars, ppr dc_ex_tcvars,
+                         ppr arg_tys, ppr dc_args,
+                         ppr ex_args, ppr val_args, ppr co, ppr from_ty, ppr to_ty, ppr to_tc
+                         , ppr $ mkTyConApp to_tc (map exprToType $ takeList dc_univ_tyvars dc_args) ]
+    in
+    ASSERT2( eqType from_ty (mkTyConApp to_tc (map exprToType $ takeList dc_univ_tyvars dc_args)), dump_doc )
+    ASSERT2( equalLength val_args arg_tys, dump_doc )
+    Just (dc, to_tc_arg_tys, to_ex_args ++ new_val_args)
+
+  | otherwise
+  = Nothing
+
+  where
+    Pair from_ty to_ty = coercionKind co
+
+collectBindersPushingCo :: CoreExpr -> ([Var], CoreExpr)
+-- Collect lambda binders, pushing coercions inside if possible
+-- E.g.   (\x.e) |> g         g :: <Int> -> blah
+--        = (\x. e |> Nth 1 g)
+--
+-- That is,
 --
--- ToDo: we could instead check if we actually *are*
--- changing any join points' types, and if not use substExprSC.
-subst_expr = substExpr
+-- collectBindersPushingCo ((\x.e) |> g) === ([x], e |> Nth 1 g)
+collectBindersPushingCo e
+  = go [] e
+  where
+    -- Peel off lambdas until we hit a cast.
+    go :: [Var] -> CoreExpr -> ([Var], CoreExpr)
+    -- The accumulator is in reverse order
+    go bs (Lam b e)   = go (b:bs) e
+    go bs (Cast e co) = go_c bs e co
+    go bs e           = (reverse bs, e)
+
+    -- We are in a cast; peel off casts until we hit a lambda.
+    go_c :: [Var] -> CoreExpr -> CoercionR -> ([Var], CoreExpr)
+    -- (go_c bs e c) is same as (go bs e (e |> c))
+    go_c bs (Cast e co1) co2 = go_c bs e (co1 `mkTransCo` co2)
+    go_c bs (Lam b e)    co  = go_lam bs b e co
+    go_c bs e            co  = (reverse bs, mkCast e co)
+
+    -- We are in a lambda under a cast; peel off lambdas and build a
+    -- new coercion for the body.
+    go_lam :: [Var] -> Var -> CoreExpr -> CoercionR -> ([Var], CoreExpr)
+    -- (go_lam bs b e c) is same as (go_c bs (\b.e) c)
+    go_lam bs b e co
+      | isTyVar b
+      , let Pair tyL tyR = coercionKind co
+      , ASSERT( isForAllTy_ty tyL )
+        isForAllTy_ty tyR
+      , isReflCo (mkNthCo Nominal 0 co)  -- See Note [collectBindersPushingCo]
+      = go_c (b:bs) e (mkInstCo co (mkNomReflCo (mkTyVarTy b)))
+
+      | isCoVar b
+      , let Pair tyL tyR = coercionKind co
+      , ASSERT( isForAllTy_co tyL )
+        isForAllTy_co tyR
+      , isReflCo (mkNthCo Nominal 0 co)  -- See Note [collectBindersPushingCo]
+      , let cov = mkCoVarCo b
+      = go_c (b:bs) e (mkInstCo co (mkNomReflCo (mkCoercionTy cov)))
+
+      | isId b
+      , let Pair tyL tyR = coercionKind co
+      , ASSERT( isFunTy tyL) isFunTy tyR
+      , (co_mult, co_arg, co_res) <- decomposeFunCo Representational co
+      , isReflCo co_mult -- See Note [collectBindersPushingCo]
+      , isReflCo co_arg  -- See Note [collectBindersPushingCo]
+      = go_c (b:bs) e co_res
+
+      | otherwise = (reverse bs, mkCast (Lam b e) co)
 
+{-
 
---------------
+Note [collectBindersPushingCo]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We just look for coercions of form
+   <type> # w -> blah
+(and similarly for foralls) to keep this function simple.  We could do
+more elaborate stuff, but it'd involve substitution etc.
+
+-}
+
+{- *********************************************************************
+*                                                                      *
+                Join points
+*                                                                      *
+********************************************************************* -}
 
+-------------------
 -- | Split an expression into the given number of binders and a body,
 -- eta-expanding if necessary. Counts value *and* type binders.
 etaExpandToJoinPoint :: JoinArity -> CoreExpr -> ([CoreBndr], CoreExpr)
@@ -1307,7 +1711,7 @@ etaBodyForJoinPoint need_args body
       = (reverse rev_bs, e)
     go n ty subst rev_bs e
       | Just (tv, res_ty) <- splitForAllTy_maybe ty
-      , let (subst', tv') = Type.substVarBndr subst tv
+      , let (subst', tv') = substVarBndr subst tv
       = go (n-1) res_ty subst' (tv' : rev_bs) (e `App` varToCoreExpr tv')
       | Just (mult, arg_ty, res_ty) <- splitFunTy_maybe ty
       , let (subst', b) = freshEtaId n subst (Scaled mult arg_ty)
@@ -1318,6 +1722,8 @@ etaBodyForJoinPoint need_args body
 
     init_subst e = mkEmptyTCvSubst (mkInScopeSet (exprFreeVars e))
 
+
+
 --------------
 freshEtaId :: Int -> TCvSubst -> Scaled Type -> (TCvSubst, Id)
 -- Make a fresh Id, with specified type (after applying substitution)
@@ -1336,3 +1742,4 @@ freshEtaId n subst ty
                   -- "OrCoVar" since this can be used to eta-expand
                   -- coercion abstractions
         subst'  = extendTCvInScope subst eta_id'
+
diff --git a/compiler/GHC/Core/Opt/Pipeline.hs b/compiler/GHC/Core/Opt/Pipeline.hs
index c3f2fc9f85..6d0712e634 100644
--- a/compiler/GHC/Core/Opt/Pipeline.hs
+++ b/compiler/GHC/Core/Opt/Pipeline.hs
@@ -315,33 +315,38 @@ getCoreToDo dflags
 
         runWhen do_float_in CoreDoFloatInwards,
 
+        simplify "final",  -- Final tidy-up
+
         maybe_rule_check FinalPhase,
 
+        --------  After this we have -O2 passes -----------------
+        -- None of them run with -O
+
                 -- Case-liberation for -O2.  This should be after
                 -- strictness analysis and the simplification which follows it.
-        runWhen liberate_case (CoreDoPasses [
-            CoreLiberateCase,
-            simplify "post-liberate-case"
-            ]),         -- Run the simplifier after LiberateCase to vastly
-                        -- reduce the possibility of shadowing
-                        -- Reason: see Note [Shadowing] in GHC.Core.Opt.SpecConstr
+        runWhen liberate_case $ CoreDoPasses
+           [ CoreLiberateCase, simplify "post-liberate-case" ],
+           -- Run the simplifier after LiberateCase to vastly
+           -- reduce the possibility of shadowing
+           -- Reason: see Note [Shadowing] in GHC.Core.Opt.SpecConstr
 
-        runWhen spec_constr CoreDoSpecConstr,
+        runWhen spec_constr $ CoreDoPasses
+           [ CoreDoSpecConstr, simplify "post-spec-constr"],
+           -- See Note [Simplify after SpecConstr]
 
         maybe_rule_check FinalPhase,
 
-        runWhen late_specialise
-          (CoreDoPasses [ CoreDoSpecialising
-                        , simplify "post-late-spec"]),
+        runWhen late_specialise $ CoreDoPasses
+           [ CoreDoSpecialising, simplify "post-late-spec"],
 
         -- LiberateCase can yield new CSE opportunities because it peels
         -- off one layer of a recursive function (concretely, I saw this
         -- in wheel-sieve1), and I'm guessing that SpecConstr can too
         -- And CSE is a very cheap pass. So it seems worth doing here.
-        runWhen ((liberate_case || spec_constr) && cse) CoreCSE,
+        runWhen ((liberate_case || spec_constr) && cse) $ CoreDoPasses
+           [ CoreCSE, simplify "post-final-cse" ],
 
-        -- Final clean-up simplification:
-        simplify "final",
+        ---------  End of -O2 passes --------------
 
         runWhen late_dmd_anal $ CoreDoPasses (
             dmd_cpr_ww ++ [simplify "post-late-ww"]
@@ -410,6 +415,27 @@ or with -O0.  Two reasons:
 But watch out: list fusion can prevent floating.  So use phase control
 to switch off those rules until after floating.
 
+Note [Simplify after SpecConstr]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We want to run the simplifier after SpecConstr, and before late-Specialise,
+for two reasons, both shown up in test perf/compiler/T16473,
+with -O2 -flate-specialise
+
+1.  I found that running late-Specialise after SpecConstr, with no
+    simplification in between meant that the carefullly constructed
+    SpecConstr rule never got to fire.  (It was something like
+          lvl = f a   -- Arity 1
+          ....g lvl....
+    SpecConstr specialised g for argument lvl; but Specialise then
+    specialised lvl = f a to lvl = $sf, and inlined. Or something like
+    that.)
+
+2.  Specialise relies on unfoldings being available for top-level dictionary
+    bindings; but SpecConstr kills them all!  The Simplifer restores them.
+
+This extra run of the simplifier has a cost, but this is only with -O2.
+
+
 ************************************************************************
 *                                                                      *
                   The CoreToDo interpreter
diff --git a/compiler/GHC/Core/Opt/Simplify.hs b/compiler/GHC/Core/Opt/Simplify.hs
index e219a0dba9..6c207766bd 100644
--- a/compiler/GHC/Core/Opt/Simplify.hs
+++ b/compiler/GHC/Core/Opt/Simplify.hs
@@ -51,9 +51,9 @@ import GHC.Core.Unfold
 import GHC.Core.Unfold.Make
 import GHC.Core.Utils
 import GHC.Core.Opt.Arity ( ArityType(..), arityTypeArity, isBotArityType
+                          , pushCoTyArg, pushCoValArg
                           , idArityType, etaExpandAT )
-import GHC.Core.SimpleOpt ( pushCoTyArg, pushCoValArg
-                          , joinPointBinding_maybe, joinPointBindings_maybe )
+import GHC.Core.SimpleOpt ( joinPointBinding_maybe, joinPointBindings_maybe )
 import GHC.Core.FVs     ( mkRuleInfo )
 import GHC.Core.Rules   ( lookupRule, getRules, initRuleOpts )
 import GHC.Types.Basic
@@ -318,7 +318,7 @@ simplLazyBind env top_lvl is_rec bndr bndr1 rhs rhs_se
         ; let rhs_cont = mkRhsStop (substTy body_env (exprType body))
         ; (body_floats0, body0) <- {-#SCC "simplExprF" #-} simplExprF body_env body rhs_cont
 
-              -- Never float join-floats out of a non-join let-binding
+              -- Never float join-floats out of a non-join let-binding (which this is)
               -- So wrap the body in the join-floats right now
               -- Hence: body_floats1 consists only of let-floats
         ; let (body_floats1, body1) = wrapJoinFloatsX body_floats0 body0
@@ -1414,25 +1414,23 @@ simplCast env body co0 cont0
                                         -- type of the hole changes (#16312)
 
         -- (f |> co) e   ===>   (f (e |> co1)) |> co2
-        -- where   co :: (s1->s2) ~ (t1~t2)
+        -- where   co :: (s1->s2) ~ (t1->t2)
         --         co1 :: t1 ~ s1
         --         co2 :: s2 ~ t2
         addCoerce co cont@(ApplyToVal { sc_arg = arg, sc_env = arg_se
                                       , sc_dup = dup, sc_cont = tail })
-          | Just (co1, m_co2) <- pushCoValArg co
-          , let new_ty = coercionRKind co1
-          , not (isTypeLevPoly new_ty)  -- Without this check, we get a lev-poly arg
-                                        -- See Note [Levity polymorphism invariants] in GHC.Core
-                                        -- test: typecheck/should_run/EtaExpandLevPoly
+          | Just (m_co1, m_co2) <- pushCoValArg co
+          , levity_ok m_co1
           = {-#SCC "addCoerce-pushCoValArg" #-}
             do { tail' <- addCoerceM m_co2 tail
-               ; if isReflCo co1
-                 then return (cont { sc_cont = tail'
-                                   , sc_hole_ty = coercionLKind co })
+               ; case m_co1 of {
+                   MRefl -> return (cont { sc_cont = tail'
+                                         , sc_hole_ty = coercionLKind co }) ;
                       -- Avoid simplifying if possible;
                       -- See Note [Avoiding exponential behaviour]
-                 else do
-               { (dup', arg_se', arg') <- simplArg env dup arg_se arg
+
+                   MCo co1 ->
+            do { (dup', arg_se', arg') <- simplArg env dup arg_se arg
                     -- When we build the ApplyTo we can't mix the OutCoercion
                     -- 'co' with the InExpr 'arg', so we simplify
                     -- to make it all consistent.  It's a bit messy.
@@ -1442,7 +1440,7 @@ simplCast env body co0 cont0
                                     , sc_env  = arg_se'
                                     , sc_dup  = dup'
                                     , sc_cont = tail'
-                                    , sc_hole_ty = coercionLKind co }) } }
+                                    , sc_hole_ty = coercionLKind co }) } } }
 
         addCoerce co cont
           | isReflexiveCo co = return cont  -- Having this at the end makes a huge
@@ -1450,6 +1448,13 @@ simplCast env body co0 cont0
                                             -- See Note [Optimising reflexivity]
           | otherwise        = return (CastIt co cont)
 
+        levity_ok :: MCoercionR -> Bool
+        levity_ok MRefl = True
+        levity_ok (MCo co) = not $ isTypeLevPoly $ coercionRKind co
+          -- Without this check, we get a lev-poly arg
+          -- See Note [Levity polymorphism invariants] in GHC.Core
+          -- test: typecheck/should_run/EtaExpandLevPoly
+
 simplArg :: SimplEnv -> DupFlag -> StaticEnv -> CoreExpr
          -> SimplM (DupFlag, StaticEnv, OutExpr)
 simplArg env dup_flag arg_env arg
@@ -3114,7 +3119,7 @@ knownCon :: SimplEnv
 
 knownCon env scrut dc_floats dc dc_ty_args dc_args bndr bs rhs cont
   = do  { (floats1, env1)  <- bind_args env bs dc_args
-        ; (floats2, env2) <- bind_case_bndr env1
+        ; (floats2, env2)  <- bind_case_bndr env1
         ; (floats3, expr') <- simplExprF env2 rhs cont
         ; case dc_floats of
             [] ->
@@ -3240,6 +3245,7 @@ altsWouldDup [_] = False
 altsWouldDup (alt:alts)
   | is_bot_alt alt = altsWouldDup alts
   | otherwise      = not (all is_bot_alt alts)
+    -- otherwise case: first alt is non-bot, so all the rest must be bot
   where
     is_bot_alt (_,_,rhs) = exprIsDeadEnd rhs
 
diff --git a/compiler/GHC/Core/Opt/Simplify/Env.hs b/compiler/GHC/Core/Opt/Simplify/Env.hs
index 4ceaf637ed..7049e3e578 100644
--- a/compiler/GHC/Core/Opt/Simplify/Env.hs
+++ b/compiler/GHC/Core/Opt/Simplify/Env.hs
@@ -596,7 +596,7 @@ addJoinFlts :: JoinFloats -> JoinFloats -> JoinFloats
 addJoinFlts = appOL
 
 mkRecFloats :: SimplFloats -> SimplFloats
--- Flattens the floats from env2 into a single Rec group,
+-- Flattens the floats into a single Rec group,
 -- They must either all be lifted LetFloats or all JoinFloats
 mkRecFloats floats@(SimplFloats { sfLetFloats  = LetFloats bs ff
                                 , sfJoinFloats = jbs
diff --git a/compiler/GHC/Core/Opt/Specialise.hs b/compiler/GHC/Core/Opt/Specialise.hs
index 8e9e35d208..ba5679778b 100644
--- a/compiler/GHC/Core/Opt/Specialise.hs
+++ b/compiler/GHC/Core/Opt/Specialise.hs
@@ -21,22 +21,21 @@ import GHC.Tc.Utils.TcType hiding( substTy )
 import GHC.Core.Type  hiding( substTy, extendTvSubstList )
 import GHC.Core.Multiplicity
 import GHC.Core.Predicate
-import GHC.Unit.Module( Module, HasModule(..) )
+import GHC.Unit.Module( Module )
 import GHC.Core.Coercion( Coercion )
 import GHC.Core.Opt.Monad
 import qualified GHC.Core.Subst as Core
-import GHC.Core.Unfold
 import GHC.Core.Unfold.Make
 import GHC.Types.Var      ( isLocalVar )
 import GHC.Types.Var.Set
 import GHC.Types.Var.Env
 import GHC.Core
 import GHC.Core.Rules
-import GHC.Core.SimpleOpt ( collectBindersPushingCo )
 import GHC.Core.Utils     ( exprIsTrivial, getIdFromTrivialExpr_maybe
                           , mkCast, exprType )
 import GHC.Core.FVs
-import GHC.Core.Opt.Arity     ( etaExpandToJoinPointRule )
+import GHC.Core.Opt.Arity     ( collectBindersPushingCo
+                              , etaExpandToJoinPointRule )
 import GHC.Types.Unique.Supply
 import GHC.Types.Name
 import GHC.Types.Id.Make  ( voidArgId, voidPrimId )
@@ -53,12 +52,9 @@ import GHC.Utils.Misc
 import GHC.Utils.Outputable
 import GHC.Utils.Panic
 import GHC.Data.FastString
-import GHC.Utils.Monad.State
 import GHC.Types.Unique.DFM
 import GHC.Core.TyCo.Rep (TyCoBinder (..))
 
-import Control.Monad
-
 {-
 ************************************************************************
 *                                                                      *
@@ -592,28 +588,29 @@ specProgram guts@(ModGuts { mg_module = this_mod
                           , mg_binds = binds })
   = do { dflags <- getDynFlags
 
+              -- We need to start with a Subst that knows all the things
+              -- that are in scope, so that the substitution engine doesn't
+              -- accidentally re-use a unique that's already in use
+              -- Easiest thing is to do it all at once, as if all the top-level
+              -- decls were mutually recursive
+       ; let top_env = SE { se_subst = Core.mkEmptySubst $ mkInScopeSet $ mkVarSet $
+                                       bindersOfBinds binds
+                          , se_interesting = emptyVarSet
+                          , se_module = this_mod
+                          , se_dflags = dflags }
+
+             go []           = return ([], emptyUDs)
+             go (bind:binds) = do (binds', uds) <- go binds
+                                  (bind', uds') <- specBind top_env bind uds
+                                  return (bind' ++ binds', uds')
+
              -- Specialise the bindings of this module
-       ; (binds', uds) <- runSpecM dflags this_mod (go binds)
+       ; (binds', uds) <- runSpecM (go binds)
 
-       ; (spec_rules, spec_binds) <- specImports dflags this_mod top_env
-                                                 local_rules uds
+       ; (spec_rules, spec_binds) <- specImports top_env local_rules uds
 
        ; return (guts { mg_binds = spec_binds ++ binds'
                       , mg_rules = spec_rules ++ local_rules }) }
-  where
-        -- We need to start with a Subst that knows all the things
-        -- that are in scope, so that the substitution engine doesn't
-        -- accidentally re-use a unique that's already in use
-        -- Easiest thing is to do it all at once, as if all the top-level
-        -- decls were mutually recursive
-    top_env = SE { se_subst = Core.mkEmptySubst $ mkInScopeSet $ mkVarSet $
-                              bindersOfBinds binds
-                 , se_interesting = emptyVarSet }
-
-    go []           = return ([], emptyUDs)
-    go (bind:binds) = do (binds', uds) <- go binds
-                         (bind', uds') <- specBind top_env bind uds
-                         return (bind' ++ binds', uds')
 
 {-
 Note [Wrap bindings returned by specImports]
@@ -643,13 +640,13 @@ See #10491
 *                                                                      *
 ********************************************************************* -}
 
-specImports :: DynFlags -> Module -> SpecEnv
+specImports :: SpecEnv
             -> [CoreRule]
             -> UsageDetails
             -> CoreM ([CoreRule], [CoreBind])
-specImports dflags this_mod top_env local_rules
+specImports top_env local_rules
             (MkUD { ud_binds = dict_binds, ud_calls = calls })
-  | not $ gopt Opt_CrossModuleSpecialise dflags
+  | not $ gopt Opt_CrossModuleSpecialise (se_dflags top_env)
     -- See Note [Disabling cross-module specialisation]
   = return ([], wrapDictBinds dict_binds [])
 
@@ -657,8 +654,7 @@ specImports dflags this_mod top_env local_rules
   = do { hpt_rules <- getRuleBase
        ; let rule_base = extendRuleBaseList hpt_rules local_rules
 
-       ; (spec_rules, spec_binds) <- spec_imports dflags this_mod top_env
-                                                  [] rule_base
+       ; (spec_rules, spec_binds) <- spec_imports top_env [] rule_base
                                                   dict_binds calls
 
              -- Don't forget to wrap the specialized bindings with
@@ -674,9 +670,7 @@ specImports dflags this_mod top_env local_rules
     }
 
 -- | Specialise a set of calls to imported bindings
-spec_imports :: DynFlags
-             -> Module
-             -> SpecEnv          -- Passed in so that all top-level Ids are in scope
+spec_imports :: SpecEnv          -- Passed in so that all top-level Ids are in scope
              -> [Id]             -- Stack of imported functions being specialised
                                  -- See Note [specImport call stack]
              -> RuleBase         -- Rules from this module and the home package
@@ -686,8 +680,7 @@ spec_imports :: DynFlags
              -> CallDetails      -- Calls for imported things
              -> CoreM ( [CoreRule]   -- New rules
                       , [CoreBind] ) -- Specialised bindings
-spec_imports dflags this_mod top_env
-             callers rule_base dict_binds calls
+spec_imports top_env callers rule_base dict_binds calls
   = do { let import_calls = dVarEnvElts calls
        -- ; debugTraceMsg (text "specImports {" <+>
        --                  vcat [ text "calls:" <+> ppr import_calls
@@ -701,16 +694,13 @@ spec_imports dflags this_mod top_env
     go _ [] = return ([], [])
     go rb (cis : other_calls)
       = do { -- debugTraceMsg (text "specImport {" <+> ppr cis)
-           ; (rules1, spec_binds1) <- spec_import dflags this_mod top_env
-                                                  callers rb dict_binds cis
+           ; (rules1, spec_binds1) <- spec_import top_env callers rb dict_binds cis
            -- ; debugTraceMsg (text "specImport }" <+> ppr cis)
 
            ; (rules2, spec_binds2) <- go (extendRuleBaseList rb rules1) other_calls
            ; return (rules1 ++ rules2, spec_binds1 ++ spec_binds2) }
 
-spec_import :: DynFlags
-            -> Module
-            -> SpecEnv               -- Passed in so that all top-level Ids are in scope
+spec_import :: SpecEnv               -- Passed in so that all top-level Ids are in scope
             -> [Id]                  -- Stack of imported functions being specialised
                                      -- See Note [specImport call stack]
             -> RuleBase              -- Rules from this module
@@ -719,8 +709,7 @@ spec_import :: DynFlags
             -> CallInfoSet           -- Imported function and calls for it
             -> CoreM ( [CoreRule]    -- New rules
                      , [CoreBind] )  -- Specialised bindings
-spec_import dflags this_mod top_env callers
-            rb dict_binds cis@(CIS fn _)
+spec_import top_env callers rb dict_binds cis@(CIS fn _)
   | isIn "specImport" fn callers
   = return ([], [])     -- No warning.  This actually happens all the time
                         -- when specialising a recursive function, because
@@ -731,8 +720,7 @@ spec_import dflags this_mod top_env callers
   = do { -- debugTraceMsg (text "specImport:no valid calls")
        ; return ([], []) }
 
-  | wantSpecImport dflags unfolding
-  , Just rhs <- maybeUnfoldingTemplate unfolding
+  | Just rhs <- canSpecImport dflags fn
   = do {     -- Get rules from the external package state
              -- We keep doing this in case we "page-fault in"
              -- more rules as we go along
@@ -744,8 +732,8 @@ spec_import dflags this_mod top_env callers
 
        ; (rules1, spec_pairs, MkUD { ud_binds = dict_binds1, ud_calls = new_calls })
              <- do { -- debugTraceMsg (text "specImport1" <+> vcat [ppr fn, ppr good_calls, ppr rhs])
-                   ; runSpecM dflags this_mod $
-                     specCalls (Just this_mod) top_env rules_for_fn good_calls fn rhs }
+                   ; runSpecM $
+                     specCalls True top_env rules_for_fn good_calls fn rhs }
        ; let spec_binds1 = [NonRec b r | (b,r) <- spec_pairs]
              -- After the rules kick in we may get recursion, but
              -- we rely on a global GlomBinds to sort that out later
@@ -753,7 +741,7 @@ spec_import dflags this_mod top_env callers
 
               -- Now specialise any cascaded calls
        -- ; debugTraceMsg (text "specImport 2" <+> (ppr fn $$ ppr rules1 $$ ppr spec_binds1))
-       ; (rules2, spec_binds2) <- spec_imports dflags this_mod top_env
+       ; (rules2, spec_binds2) <- spec_imports top_env
                                                (fn:callers)
                                                (extendRuleBaseList rb rules1)
                                                (dict_binds `unionBags` dict_binds1)
@@ -769,11 +757,34 @@ spec_import dflags this_mod top_env callers
        ; return ([], [])}
 
   where
-    unfolding = realIdUnfolding fn   -- We want to see the unfolding even for loop breakers
+    dflags = se_dflags top_env
     good_calls = filterCalls cis dict_binds
        -- SUPER IMPORTANT!  Drop calls that (directly or indirectly) refer to fn
        -- See Note [Avoiding loops in specImports]
 
+canSpecImport :: DynFlags -> Id -> Maybe CoreExpr
+-- See Note [Specialise imported INLINABLE things]
+canSpecImport dflags fn
+  | CoreUnfolding { uf_src = src, uf_tmpl = rhs } <- unf
+  , isStableSource src
+  = Just rhs   -- By default, specialise only imported things that have a stable
+               -- unfolding; that is, have an INLINE or INLINABLE pragma
+               -- Specialise even INLINE things; it hasn't inlined yet,
+               -- so perhaps it never will.  Moreover it may have calls
+               -- inside it that we want to specialise
+
+    -- CoreUnfolding case does /not/ include DFunUnfoldings;
+    -- We only specialise DFunUnfoldings with -fspecialise-aggressively
+    -- See Note [Do not specialise imported DFuns]
+
+  | gopt Opt_SpecialiseAggressively dflags
+  = maybeUnfoldingTemplate unf  -- With -fspecialise-aggressively, specialise anything
+                                -- with an unfolding, stable or not, DFun or not
+
+  | otherwise = Nothing
+  where
+    unf = realIdUnfolding fn   -- We want to see the unfolding even for loop breakers
+
 -- | Returns whether or not to show a missed-spec warning.
 -- If -Wall-missed-specializations is on, show the warning.
 -- Otherwise, if -Wmissed-specializations is on, only show a warning
@@ -798,24 +809,47 @@ tryWarnMissingSpecs dflags callers fn calls_for_fn
           , whenPprDebug (text "calls:" <+> vcat (map (pprCallInfo fn) calls_for_fn))
           , text "Probable fix: add INLINABLE pragma on" <+> quotes (ppr fn) ])
 
-wantSpecImport :: DynFlags -> Unfolding -> Bool
--- See Note [Specialise imported INLINABLE things]
-wantSpecImport dflags unf
- = case unf of
-     NoUnfolding      -> False
-     BootUnfolding    -> False
-     OtherCon {}      -> False
-     DFunUnfolding {} -> True
-     CoreUnfolding { uf_src = src, uf_guidance = _guidance }
-       | gopt Opt_SpecialiseAggressively dflags -> True
-       | isStableSource src -> True
-               -- Specialise even INLINE things; it hasn't inlined yet,
-               -- so perhaps it never will.  Moreover it may have calls
-               -- inside it that we want to specialise
-       | otherwise -> False    -- Stable, not INLINE, hence INLINABLE
 
-{- Note [Avoiding loops in specImports]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+{- Note [Do not specialise imported DFuns]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Ticket #18223 shows that specialising calls of DFuns is can cause a huge
+and entirely unnecessary blowup in program size.  Consider a call to
+    f @[[[[[[[[T]]]]]]]] d1 x
+where df :: C a => C [a]
+      d1 :: C [[[[[[[[T]]]]]]]] = dfC[] @[[[[[[[T]]]]]]] d1
+      d2 :: C [[[[[[[T]]]]]]]   = dfC[] @[[[[[[T]]]]]] d3
+      ...
+Now we'll specialise f's RHS, which may give rise to calls to 'g',
+also overloaded, which we will specialise, and so on.  However, if
+we specialise the calls to dfC[], we'll generate specialised copies of
+all methods of C, at all types; and the same for C's superclasses.
+
+And many of these specialised functions will never be called.  We are
+going to call the specialised 'f', and the specialised 'g', but DFuns
+group functions into a tuple, many of whose elements may never be used.
+
+With deeply-nested types this can lead to a simply overwhelming number
+of specialisations: see #18223 for a simple example (from the wild).
+I measured the number of specialisations for various numbers of calls
+of `flip evalStateT ()`, and got this
+
+                       Size after one simplification
+  #calls    #SPEC rules    Terms     Types
+      5         56          3100     10600
+      9        108         13660     77206
+
+The real tests case has 60+ calls, which blew GHC out of the water.
+
+Solution: don't specialise DFuns.  The downside is that if we end
+up with (h (dfun d)), /and/ we don't specialise 'h', then we won't
+pass to 'h' a tuple of specialised functions.
+
+However, the flag -fspecialise-aggressively (experimental, off by default)
+allows DFuns to specialise as well.
+
+Note [Avoiding loops in specImports]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 We must take great care when specialising instance declarations
 (functions like $fOrdList) lest we accidentally build a recursive
 dictionary. See Note [Avoiding loops].
@@ -1003,6 +1037,9 @@ data SpecEnv
              -- Dict Ids that we know something about
              -- and hence may be worth specialising against
              -- See Note [Interesting dictionary arguments]
+
+       , se_module :: Module
+       , se_dflags :: DynFlags
      }
 
 instance Outputable SpecEnv where
@@ -1310,7 +1347,7 @@ specDefn :: SpecEnv
 specDefn env body_uds fn rhs
   = do { let (body_uds_without_me, calls_for_me) = callsForMe fn body_uds
              rules_for_me = idCoreRules fn
-       ; (rules, spec_defns, spec_uds) <- specCalls Nothing env rules_for_me
+       ; (rules, spec_defns, spec_uds) <- specCalls False env rules_for_me
                                                     calls_for_me fn rhs
        ; return ( fn `addIdSpecialisations` rules
                 , spec_defns
@@ -1323,8 +1360,8 @@ specDefn env body_uds fn rhs
                 -- body_uds_without_me
 
 ---------------------------
-specCalls :: Maybe Module      -- Just this_mod  =>  specialising imported fn
-                               -- Nothing        =>  specialising local fn
+specCalls :: Bool              -- True  =>  specialising imported fn
+                               -- False =>  specialising local fn
           -> SpecEnv
           -> [CoreRule]        -- Existing RULES for the fn
           -> [CallInfo]
@@ -1339,7 +1376,7 @@ type SpecInfo = ( [CoreRule]       -- Specialisation rules
                 , [(Id,CoreExpr)]  -- Specialised definition
                 , UsageDetails )   -- Usage details from specialised RHSs
 
-specCalls mb_mod env existing_rules calls_for_me fn rhs
+specCalls spec_imp env existing_rules calls_for_me fn rhs
         -- The first case is the interesting one
   |  notNull calls_for_me               -- And there are some calls to specialise
   && not (isNeverActive (idInlineActivation fn))
@@ -1370,7 +1407,9 @@ specCalls mb_mod env existing_rules calls_for_me fn rhs
     inl_act   = inlinePragmaActivation inl_prag
     is_local  = isLocalId fn
     is_dfun   = isDFunId fn
-
+    dflags    = se_dflags env
+    ropts     = initRuleOpts dflags
+    this_mod  = se_module env
         -- Figure out whether the function has an INLINE pragma
         -- See Note [Inline specialisations]
 
@@ -1412,8 +1451,6 @@ specCalls mb_mod env existing_rules calls_for_me fn rhs
 --                                        , ppr dx_binds ]) $
 --             return ()
 
-           ; dflags <- getDynFlags
-           ; let ropts = initRuleOpts dflags
            ; if not useful  -- No useful specialisation
                 || already_covered ropts rules_acc rule_lhs_args
              then return spec_acc
@@ -1441,17 +1478,15 @@ specCalls mb_mod env existing_rules calls_for_me fn rhs
                                  = Nothing
 
            ; spec_fn <- newSpecIdSM fn spec_fn_ty spec_join_arity
-           ; this_mod <- getModule
            ; let
                 -- The rule to put in the function's specialisation is:
                 --      forall x @b d1' d2'.
                 --          f x @T1 @b @T2 d1' d2' = f1 x @b
                 -- See Note [Specialising Calls]
-                herald = case mb_mod of
-                           Nothing        -- Specialising local fn
-                               -> text "SPEC"
-                           Just this_mod  -- Specialising imported fn
-                               -> text "SPEC/" <> ppr this_mod
+                herald | spec_imp  = -- Specialising imported fn
+                                     text "SPEC/" <> ppr this_mod
+                       | otherwise = -- Specialising local fn
+                                     text "SPEC"
 
                 rule_name = mkFastString $ showSDoc dflags $
                             herald <+> ftext (occNameFS (getOccName fn))
@@ -2480,15 +2515,15 @@ mkCallUDs env f args
     res = mkCallUDs' env f args
 
 mkCallUDs' env f args
-  |  not (want_calls_for f)  -- Imported from elsewhere
-  || null ci_key             -- No useful specialisation
-   -- See also Note [Specialisations already covered]
+  | wantCallsFor env f    -- We want it, and...
+  , not (null ci_key)     -- this call site has a useful specialisation
+  = -- pprTrace "mkCallUDs: keeping" _trace_doc
+    singleCall f ci_key
+
+  | otherwise  -- See also Note [Specialisations already covered]
   = -- pprTrace "mkCallUDs: discarding" _trace_doc
     emptyUDs
 
-  | otherwise
-  = -- pprTrace "mkCallUDs: keeping" _trace_doc
-    singleCall f ci_key
   where
     _trace_doc = vcat [ppr f, ppr args, ppr ci_key]
     pis                = fst $ splitPiTys $ idType f
@@ -2525,12 +2560,23 @@ mkCallUDs' env f args
     mk_spec_arg _ (Anon VisArg _)
       = UnspecArg
 
-    want_calls_for f = isLocalId f || isJust (maybeUnfoldingTemplate (realIdUnfolding f))
-         -- For imported things, we gather call instances if
-         -- there is an unfolding that we could in principle specialise
-         -- We might still decide not to use it (consulting dflags)
-         -- in specImports
-         -- Use 'realIdUnfolding' to ignore the loop-breaker flag!
+wantCallsFor :: SpecEnv -> Id -> Bool
+wantCallsFor _env _f = True
+ -- We could reduce the size of the UsageDetails by being less eager
+ -- about collecting calls for LocalIds: there is no point for
+ -- ones that are lambda-bound.  We can't decide this by looking at
+ -- the (absence of an) unfolding, because unfoldings for local
+ -- functions are discarded by cloneBindSM, so no local binder will
+ -- have an unfolding at this stage.  We'd have to keep a candidate
+ -- set of let-binders.
+ --
+ -- Not many lambda-bound variables have dictionary arguments, so
+ -- this would make little difference anyway.
+ --
+ -- For imported Ids we could check for an unfolding, but we have to
+ -- do so anyway in canSpecImport, and it seems better to have it
+ -- all in one place.  So we simply collect usage info for imported
+ -- overloaded functions.
 
 {- Note [Type determines value]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2780,55 +2826,12 @@ deleteCallsFor bs calls = delDVarEnvList calls bs
 ************************************************************************
 -}
 
-newtype SpecM a = SpecM (State SpecState a) deriving (Functor)
-
-data SpecState = SpecState {
-                     spec_uniq_supply :: UniqSupply,
-                     spec_module :: Module,
-                     spec_dflags :: DynFlags
-                 }
-
-instance Applicative SpecM where
-    pure x = SpecM $ return x
-    (<*>) = ap
-
-instance Monad SpecM where
-    SpecM x >>= f = SpecM $ do y <- x
-                               case f y of
-                                   SpecM z ->
-                                       z
-
-instance MonadFail SpecM where
-   fail str = SpecM $ error str
-
-instance MonadUnique SpecM where
-    getUniqueSupplyM
-        = SpecM $ do st <- get
-                     let (us1, us2) = splitUniqSupply $ spec_uniq_supply st
-                     put $ st { spec_uniq_supply = us2 }
-                     return us1
-
-    getUniqueM
-        = SpecM $ do st <- get
-                     let (u,us') = takeUniqFromSupply $ spec_uniq_supply st
-                     put $ st { spec_uniq_supply = us' }
-                     return u
-
-instance HasDynFlags SpecM where
-    getDynFlags = SpecM $ liftM spec_dflags get
-
-instance HasModule SpecM where
-    getModule = SpecM $ liftM spec_module get
-
-runSpecM :: DynFlags -> Module -> SpecM a -> CoreM a
-runSpecM dflags this_mod (SpecM spec)
-    = do us <- getUniqueSupplyM
-         let initialState = SpecState {
-                                spec_uniq_supply = us,
-                                spec_module = this_mod,
-                                spec_dflags = dflags
-                            }
-         return $ evalState spec initialState
+type SpecM a = UniqSM a
+
+runSpecM :: SpecM a -> CoreM a
+runSpecM thing_inside
+  = do { us <- getUniqueSupplyM
+       ; return (initUs_ us thing_inside) }
 
 mapAndCombineSM :: (a -> SpecM (b, UsageDetails)) -> [a] -> SpecM ([b], UsageDetails)
 mapAndCombineSM _ []     = return ([], emptyUDs)
diff --git a/compiler/GHC/Core/Ppr.hs b/compiler/GHC/Core/Ppr.hs
index c0b2749359..79c5acae23 100644
--- a/compiler/GHC/Core/Ppr.hs
+++ b/compiler/GHC/Core/Ppr.hs
@@ -161,15 +161,20 @@ pprOptCo co = sdocOption sdocSuppressCoercions $ \case
               True  -> angleBrackets (text "Co:" <> int (coercionSize co))
               False -> parens $ sep [ppr co, dcolon <+> ppr (coercionType co)]
 
+ppr_id_occ :: (SDoc -> SDoc) -> Id -> SDoc
+ppr_id_occ add_par id
+  | isJoinId id = add_par ((text "jump") <+> pp_id)
+  | otherwise   = pp_id
+  where
+    pp_id = ppr id  -- We could use pprPrefixOcc to print (+) etc, but this is
+                    -- Core where we don't print things infix anyway, so doing
+                    -- so just adds extra redundant parens
+
 ppr_expr :: OutputableBndr b => (SDoc -> SDoc) -> Expr b -> SDoc
         -- The function adds parens in context that need
         -- an atomic value (e.g. function args)
 
-ppr_expr add_par (Var name)
- | isJoinId name               = add_par ((text "jump") <+> pp_name)
- | otherwise                   = pp_name
- where
-   pp_name = pprPrefixOcc name
+ppr_expr add_par (Var id)      = ppr_id_occ add_par id
 ppr_expr add_par (Type ty)     = add_par (text "TYPE:" <+> ppr ty)       -- Weird
 ppr_expr add_par (Coercion co) = add_par (text "CO:" <+> ppr co)
 ppr_expr add_par (Lit lit)     = pprLiteral add_par lit
@@ -212,8 +217,7 @@ ppr_expr add_par expr@(App {})
 
                    _ -> parens (hang fun_doc 2 pp_args)
                    where
-                     fun_doc | isJoinId f = text "jump" <+> ppr f
-                             | otherwise  = ppr f
+                     fun_doc = ppr_id_occ noParens f
 
         _ -> parens (hang (pprParendExpr fun) 2 pp_args)
     }
diff --git a/compiler/GHC/Core/SimpleOpt.hs b/compiler/GHC/Core/SimpleOpt.hs
index 8529b13025..67f08cdd23 100644
--- a/compiler/GHC/Core/SimpleOpt.hs
+++ b/compiler/GHC/Core/SimpleOpt.hs
@@ -18,17 +18,14 @@ module GHC.Core.SimpleOpt (
         -- ** Predicates on expressions
         exprIsConApp_maybe, exprIsLiteral_maybe, exprIsLambda_maybe,
 
-        -- ** Coercions and casts
-        pushCoArg, pushCoValArg, pushCoTyArg, collectBindersPushingCo
     ) where
 
 #include "HsVersions.h"
 
 import GHC.Prelude
 
-import GHC.Core.Opt.Arity( etaExpandToJoinPoint )
-
 import GHC.Core
+import GHC.Core.Opt.Arity
 import GHC.Core.Subst
 import GHC.Core.Utils
 import GHC.Core.FVs
@@ -48,16 +45,12 @@ import GHC.Core.Coercion.Opt ( optCoercion, OptCoercionOpts (..) )
 import GHC.Core.Type hiding ( substTy, extendTvSubst, extendCvSubst, extendTvSubstList
                             , isInScope, substTyVarBndr, cloneTyVarBndr )
 import GHC.Core.Coercion hiding ( substCo, substCoVarBndr )
-import GHC.Core.TyCon ( tyConArity )
-import GHC.Core.Multiplicity
 import GHC.Builtin.Types
 import GHC.Builtin.Names
 import GHC.Types.Basic
 import GHC.Unit.Module ( Module )
-import GHC.Driver.Ppr
 import GHC.Utils.Outputable
 import GHC.Utils.Panic
-import GHC.Data.Pair
 import GHC.Utils.Misc
 import GHC.Data.Maybe       ( orElse )
 import GHC.Data.FastString
@@ -782,6 +775,28 @@ a good cause. And it won't hurt other RULES and such that it comes across.
 ************************************************************************
 -}
 
+{- Note [Strictness and join points]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Suppose we have
+
+   let f = \x.  if x>200 then e1 else e1
+
+and we know that f is strict in x.  Then if we subsequently
+discover that f is an arity-2 join point, we'll eta-expand it to
+
+   let f = \x y.  if x>200 then e1 else e1
+
+and now it's only strict if applied to two arguments.  So we should
+adjust the strictness info.
+
+A more common case is when
+
+   f = \x. error ".."
+
+and again its arity increases (#15517)
+-}
+
+
 -- | Returns Just (bndr,rhs) if the binding is a join point:
 -- If it's a JoinId, just return it
 -- If it's not yet a JoinId but is always tail-called,
@@ -815,27 +830,6 @@ joinPointBindings_maybe bndrs
   = mapM (uncurry joinPointBinding_maybe) bndrs
 
 
-{- Note [Strictness and join points]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Suppose we have
-
-   let f = \x.  if x>200 then e1 else e1
-
-and we know that f is strict in x.  Then if we subsequently
-discover that f is an arity-2 join point, we'll eta-expand it to
-
-   let f = \x y.  if x>200 then e1 else e1
-
-and now it's only strict if applied to two arguments.  So we should
-adjust the strictness info.
-
-A more common case is when
-
-   f = \x. error ".."
-
-and again its arity increases (#15517)
--}
-
 {- *********************************************************************
 *                                                                      *
          exprIsConApp_maybe
@@ -1350,275 +1344,3 @@ exprIsLambda_maybe _ _e
       Nothing
 
 
-{- *********************************************************************
-*                                                                      *
-              The "push rules"
-*                                                                      *
-************************************************************************
-
-Here we implement the "push rules" from FC papers:
-
-* The push-argument rules, where we can move a coercion past an argument.
-  We have
-      (fun |> co) arg
-  and we want to transform it to
-    (fun arg') |> co'
-  for some suitable co' and transformed arg'.
-
-* The PushK rule for data constructors.  We have
-       (K e1 .. en) |> co
-  and we want to transform to
-       (K e1' .. en')
-  by pushing the coercion into the arguments
--}
-
-pushCoArgs :: CoercionR -> [CoreArg] -> Maybe ([CoreArg], MCoercion)
-pushCoArgs co []         = return ([], MCo co)
-pushCoArgs co (arg:args) = do { (arg',  m_co1) <- pushCoArg  co  arg
-                              ; case m_co1 of
-                                  MCo co1 -> do { (args', m_co2) <- pushCoArgs co1 args
-                                                 ; return (arg':args', m_co2) }
-                                  MRefl  -> return (arg':args, MRefl) }
-
-pushCoArg :: CoercionR -> CoreArg -> Maybe (CoreArg, MCoercion)
--- We have (fun |> co) arg, and we want to transform it to
---         (fun arg) |> co
--- This may fail, e.g. if (fun :: N) where N is a newtype
--- C.f. simplCast in GHC.Core.Opt.Simplify
--- 'co' is always Representational
--- If the returned coercion is Nothing, then it would have been reflexive
-pushCoArg co (Type ty) = do { (ty', m_co') <- pushCoTyArg co ty
-                            ; return (Type ty', m_co') }
-pushCoArg co val_arg   = do { (arg_co, m_co') <- pushCoValArg co
-                            ; return (val_arg `mkCast` arg_co, m_co') }
-
-pushCoTyArg :: CoercionR -> Type -> Maybe (Type, MCoercionR)
--- We have (fun |> co) @ty
--- Push the coercion through to return
---         (fun @ty') |> co'
--- 'co' is always Representational
--- If the returned coercion is Nothing, then it would have been reflexive;
--- it's faster not to compute it, though.
-pushCoTyArg co ty
-  -- The following is inefficient - don't do `eqType` here, the coercion
-  -- optimizer will take care of it. See #14737.
-  -- -- | tyL `eqType` tyR
-  -- -- = Just (ty, Nothing)
-
-  | isReflCo co
-  = Just (ty, MRefl)
-
-  | isForAllTy_ty tyL
-  = ASSERT2( isForAllTy_ty tyR, ppr co $$ ppr ty )
-    Just (ty `mkCastTy` co1, MCo co2)
-
-  | otherwise
-  = Nothing
-  where
-    Pair tyL tyR = coercionKind co
-       -- co :: tyL ~ tyR
-       -- tyL = forall (a1 :: k1). ty1
-       -- tyR = forall (a2 :: k2). ty2
-
-    co1 = mkSymCo (mkNthCo Nominal 0 co)
-       -- co1 :: k2 ~N k1
-       -- Note that NthCo can extract a Nominal equality between the
-       -- kinds of the types related by a coercion between forall-types.
-       -- See the NthCo case in GHC.Core.Lint.
-
-    co2 = mkInstCo co (mkGReflLeftCo Nominal ty co1)
-        -- co2 :: ty1[ (ty|>co1)/a1 ] ~ ty2[ ty/a2 ]
-        -- Arg of mkInstCo is always nominal, hence mkNomReflCo
-
-pushCoValArg :: CoercionR -> Maybe (Coercion, MCoercion)
--- We have (fun |> co) arg
--- Push the coercion through to return
---         (fun (arg |> co_arg)) |> co_res
--- 'co' is always Representational
--- If the second returned Coercion is actually Nothing, then no cast is necessary;
--- the returned coercion would have been reflexive.
-pushCoValArg co
-  -- The following is inefficient - don't do `eqType` here, the coercion
-  -- optimizer will take care of it. See #14737.
-  -- -- | tyL `eqType` tyR
-  -- -- = Just (mkRepReflCo arg, Nothing)
-
-  | isReflCo co
-  = Just (mkRepReflCo arg, MRefl)
-
-  | isFunTy tyL
-  , (co_mult, co1, co2) <- decomposeFunCo Representational co
-  , isReflexiveCo co_mult
-    -- We can't push the coercion in the case where co_mult isn't reflexivity:
-    -- it could be an unsafe axiom, and losing this information could yield
-    -- ill-typed terms. For instance (fun x ::(1) Int -> (fun _ -> () |> co) x)
-    -- with co :: (Int -> ()) ~ (Int #-> ()), would reduce to (fun x ::(1) Int
-    -- -> (fun _ ::(Many) Int -> ()) x) which is ill-typed
-
-              -- If   co  :: (tyL1 -> tyL2) ~ (tyR1 -> tyR2)
-              -- then co1 :: tyL1 ~ tyR1
-              --      co2 :: tyL2 ~ tyR2
-  = ASSERT2( isFunTy tyR, ppr co $$ ppr arg )
-    Just (mkSymCo co1, MCo co2)
-
-  | otherwise
-  = Nothing
-  where
-    arg = funArgTy tyR
-    Pair tyL tyR = coercionKind co
-
-pushCoercionIntoLambda
-    :: InScopeSet -> Var -> CoreExpr -> CoercionR -> Maybe (Var, CoreExpr)
--- This implements the Push rule from the paper on coercions
---    (\x. e) |> co
--- ===>
---    (\x'. e |> co')
-pushCoercionIntoLambda in_scope x e co
-    | ASSERT(not (isTyVar x) && not (isCoVar x)) True
-    , Pair s1s2 t1t2 <- coercionKind co
-    , Just (_, _s1,_s2) <- splitFunTy_maybe s1s2
-    , Just (w1, t1,_t2) <- splitFunTy_maybe t1t2
-    , (co_mult, co1, co2) <- decomposeFunCo Representational co
-    , isReflexiveCo co_mult
-      -- We can't push the coercion in the case where co_mult isn't
-      -- reflexivity. See pushCoValArg for more details.
-    = let
-          -- Should we optimize the coercions here?
-          -- Otherwise they might not match too well
-          x' = x `setIdType` t1 `setIdMult` w1
-          in_scope' = in_scope `extendInScopeSet` x'
-          subst = extendIdSubst (mkEmptySubst in_scope')
-                                x
-                                (mkCast (Var x') co1)
-      in Just (x', substExpr subst e `mkCast` co2)
-    | otherwise
-    = pprTrace "exprIsLambda_maybe: Unexpected lambda in case" (ppr (Lam x e))
-      Nothing
-
-pushCoDataCon :: DataCon -> [CoreExpr] -> Coercion
-              -> Maybe (DataCon
-                       , [Type]      -- Universal type args
-                       , [CoreExpr]) -- All other args incl existentials
--- Implement the KPush reduction rule as described in "Down with kinds"
--- The transformation applies iff we have
---      (C e1 ... en) `cast` co
--- where co :: (T t1 .. tn) ~ to_ty
--- The left-hand one must be a T, because exprIsConApp returned True
--- but the right-hand one might not be.  (Though it usually will.)
-pushCoDataCon dc dc_args co
-  | isReflCo co || from_ty `eqType` to_ty  -- try cheap test first
-  , let (univ_ty_args, rest_args) = splitAtList (dataConUnivTyVars dc) dc_args
-  = Just (dc, map exprToType univ_ty_args, rest_args)
-
-  | Just (to_tc, to_tc_arg_tys) <- splitTyConApp_maybe to_ty
-  , to_tc == dataConTyCon dc
-        -- These two tests can fail; we might see
-        --      (C x y) `cast` (g :: T a ~ S [a]),
-        -- where S is a type function.  In fact, exprIsConApp
-        -- will probably not be called in such circumstances,
-        -- but there's nothing wrong with it
-
-  = let
-        tc_arity       = tyConArity to_tc
-        dc_univ_tyvars = dataConUnivTyVars dc
-        dc_ex_tcvars   = dataConExTyCoVars dc
-        arg_tys        = dataConRepArgTys dc
-
-        non_univ_args  = dropList dc_univ_tyvars dc_args
-        (ex_args, val_args) = splitAtList dc_ex_tcvars non_univ_args
-
-        -- Make the "Psi" from the paper
-        omegas = decomposeCo tc_arity co (tyConRolesRepresentational to_tc)
-        (psi_subst, to_ex_arg_tys)
-          = liftCoSubstWithEx Representational
-                              dc_univ_tyvars
-                              omegas
-                              dc_ex_tcvars
-                              (map exprToType ex_args)
-
-          -- Cast the value arguments (which include dictionaries)
-        new_val_args = zipWith cast_arg (map scaledThing arg_tys) val_args
-        cast_arg arg_ty arg = mkCast arg (psi_subst arg_ty)
-
-        to_ex_args = map Type to_ex_arg_tys
-
-        dump_doc = vcat [ppr dc,      ppr dc_univ_tyvars, ppr dc_ex_tcvars,
-                         ppr arg_tys, ppr dc_args,
-                         ppr ex_args, ppr val_args, ppr co, ppr from_ty, ppr to_ty, ppr to_tc
-                         , ppr $ mkTyConApp to_tc (map exprToType $ takeList dc_univ_tyvars dc_args) ]
-    in
-    ASSERT2( eqType from_ty (mkTyConApp to_tc (map exprToType $ takeList dc_univ_tyvars dc_args)), dump_doc )
-    ASSERT2( equalLength val_args arg_tys, dump_doc )
-    Just (dc, to_tc_arg_tys, to_ex_args ++ new_val_args)
-
-  | otherwise
-  = Nothing
-
-  where
-    Pair from_ty to_ty = coercionKind co
-
-collectBindersPushingCo :: CoreExpr -> ([Var], CoreExpr)
--- Collect lambda binders, pushing coercions inside if possible
--- E.g.   (\x.e) |> g         g :: <Int> -> blah
---        = (\x. e |> Nth 1 g)
---
--- That is,
---
--- collectBindersPushingCo ((\x.e) |> g) === ([x], e |> Nth 1 g)
-collectBindersPushingCo e
-  = go [] e
-  where
-    -- Peel off lambdas until we hit a cast.
-    go :: [Var] -> CoreExpr -> ([Var], CoreExpr)
-    -- The accumulator is in reverse order
-    go bs (Lam b e)   = go (b:bs) e
-    go bs (Cast e co) = go_c bs e co
-    go bs e           = (reverse bs, e)
-
-    -- We are in a cast; peel off casts until we hit a lambda.
-    go_c :: [Var] -> CoreExpr -> CoercionR -> ([Var], CoreExpr)
-    -- (go_c bs e c) is same as (go bs e (e |> c))
-    go_c bs (Cast e co1) co2 = go_c bs e (co1 `mkTransCo` co2)
-    go_c bs (Lam b e)    co  = go_lam bs b e co
-    go_c bs e            co  = (reverse bs, mkCast e co)
-
-    -- We are in a lambda under a cast; peel off lambdas and build a
-    -- new coercion for the body.
-    go_lam :: [Var] -> Var -> CoreExpr -> CoercionR -> ([Var], CoreExpr)
-    -- (go_lam bs b e c) is same as (go_c bs (\b.e) c)
-    go_lam bs b e co
-      | isTyVar b
-      , let Pair tyL tyR = coercionKind co
-      , ASSERT( isForAllTy_ty tyL )
-        isForAllTy_ty tyR
-      , isReflCo (mkNthCo Nominal 0 co)  -- See Note [collectBindersPushingCo]
-      = go_c (b:bs) e (mkInstCo co (mkNomReflCo (mkTyVarTy b)))
-
-      | isCoVar b
-      , let Pair tyL tyR = coercionKind co
-      , ASSERT( isForAllTy_co tyL )
-        isForAllTy_co tyR
-      , isReflCo (mkNthCo Nominal 0 co)  -- See Note [collectBindersPushingCo]
-      , let cov = mkCoVarCo b
-      = go_c (b:bs) e (mkInstCo co (mkNomReflCo (mkCoercionTy cov)))
-
-      | isId b
-      , let Pair tyL tyR = coercionKind co
-      , ASSERT( isFunTy tyL) isFunTy tyR
-      , (co_mult, co_arg, co_res) <- decomposeFunCo Representational co
-      , isReflCo co_mult -- See Note [collectBindersPushingCo]
-      , isReflCo co_arg  -- See Note [collectBindersPushingCo]
-      = go_c (b:bs) e co_res
-
-      | otherwise = (reverse bs, mkCast (Lam b e) co)
-
-{-
-
-Note [collectBindersPushingCo]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-We just look for coercions of form
-   <type> # w -> blah
-(and similarly for foralls) to keep this function simple.  We could do
-more elaborate stuff, but it'd involve substitution etc.
-
--}
diff --git a/compiler/GHC/Core/Subst.hs b/compiler/GHC/Core/Subst.hs
index f1d104df66..d57a0e2bf0 100644
--- a/compiler/GHC/Core/Subst.hs
+++ b/compiler/GHC/Core/Subst.hs
@@ -343,6 +343,8 @@ instance Outputable Subst where
 
 substExprSC :: HasDebugCallStack => Subst -> CoreExpr -> CoreExpr
 -- Just like substExpr, but a no-op if the substitution is empty
+-- Note that this does /not/ replace occurrences of free vars with
+-- their canonical representatives in the in-scope set
 substExprSC subst orig_expr
   | isEmptySubst subst = orig_expr
   | otherwise          = -- pprTrace "enter subst-expr" (doc $$ ppr orig_expr) $
@@ -628,6 +630,9 @@ substIdInfo subst new_id info
 
 ------------------
 -- | Substitutes for the 'Id's within an unfolding
+-- NB: substUnfolding /discards/ any unfolding without
+--     without a Stable source.  This is usually what we want,
+--     but it may be a bit unexpected
 substUnfolding, substUnfoldingSC :: Subst -> Unfolding -> Unfolding
         -- Seq'ing on the returned Unfolding is enough to cause
         -- all the substitutions to happen completely
diff --git a/testsuite/tests/perf/compiler/Makefile b/testsuite/tests/perf/compiler/Makefile
index b27c842e91..ded99684b3 100644
--- a/testsuite/tests/perf/compiler/Makefile
+++ b/testsuite/tests/perf/compiler/Makefile
@@ -7,7 +7,3 @@ T4007:
 	$(RM) -f T4007.hi T4007.o
 	'$(TEST_HC)' $(TEST_HC_OPTS) -c -O -ddump-rule-firings T4007.hs
 
-T16473:
-	$(RM) -f T16473.hi T16473.o
-	'$(TEST_HC)' $(TEST_HC_OPTS) -c -O -ddump-rule-firings T16473.hs
-
diff --git a/testsuite/tests/perf/compiler/T16473.stdout b/testsuite/tests/perf/compiler/T16473.stdout
index e6063e01e8..c3ac783e70 100644
--- a/testsuite/tests/perf/compiler/T16473.stdout
+++ b/testsuite/tests/perf/compiler/T16473.stdout
@@ -1,97 +1 @@
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op liftA2 (BUILTIN)
-Rule fired: Class op <*> (BUILTIN)
-Rule fired: Class op $p1Applicative (BUILTIN)
-Rule fired: Class op <$ (BUILTIN)
-Rule fired: Class op <*> (BUILTIN)
-Rule fired: Class op $p1Applicative (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op get (BUILTIN)
-Rule fired: Class op return (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op >> (BUILTIN)
-Rule fired: Class op put (BUILTIN)
-Rule fired: Class op return (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: Class op return (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op get (BUILTIN)
-Rule fired: Class op return (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op >> (BUILTIN)
-Rule fired: Class op put (BUILTIN)
-Rule fired: Class op return (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: Class op return (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op show (BUILTIN)
-Rule fired: Class op mempty (BUILTIN)
-Rule fired: Class op fromInteger (BUILTIN)
-Rule fired: Integer -> Int# (BUILTIN)
-Rule fired: Class op <> (BUILTIN)
-Rule fired: Class op + (BUILTIN)
-Rule fired: Class op enumFromTo (BUILTIN)
-Rule fired: Class op *> (BUILTIN)
-Rule fired: Class op *> (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: Class op *> (BUILTIN)
-Rule fired: Class op *> (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: Class op *> (BUILTIN)
-Rule fired: Class op *> (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: fold/build (GHC.Base)
-Rule fired: Class op $p1Monad (BUILTIN)
-Rule fired: Class op $p1Applicative (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op $p1Monad (BUILTIN)
-Rule fired: Class op $p1Applicative (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op $p1Monad (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op $p1Monad (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: Class op $p1Monad (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: ># (BUILTIN)
-Rule fired: ==# (BUILTIN)
-Rule fired: Class op $p1Monad (BUILTIN)
-Rule fired: Class op <*> (BUILTIN)
-Rule fired: Class op $p1Monad (BUILTIN)
-Rule fired: Class op $p1Applicative (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op $p1Monad (BUILTIN)
-Rule fired: Class op pure (BUILTIN)
-Rule fired: Class op $p1Monad (BUILTIN)
-Rule fired: Class op $p1Applicative (BUILTIN)
-Rule fired: SPEC/Main $fMonadStateT_$c>>= @Identity _ (Main)
-Rule fired: SPEC/Main $fApplicativeStateT_$c<*> @Identity _ (Main)
-Rule fired: SPEC/Main $fApplicativeStateT_$cpure @Identity _ (Main)
-Rule fired: SPEC/Main $fFunctorStateT @Identity _ (Main)
-Rule fired: SPEC/Main $fFunctorStateT_$cfmap @Identity _ (Main)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: SPEC/Main $fFunctorStateT_$cfmap @Identity _ (Main)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op fmap (BUILTIN)
-Rule fired: Class op return (BUILTIN)
-Rule fired: Class op return (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op return (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op >>= (BUILTIN)
-Rule fired: Class op return (BUILTIN)
+5050
diff --git a/testsuite/tests/perf/compiler/T18223.hs b/testsuite/tests/perf/compiler/T18223.hs
new file mode 100644
index 0000000000..3e160cc957
--- /dev/null
+++ b/testsuite/tests/perf/compiler/T18223.hs
@@ -0,0 +1,78 @@
+{-# LANGUAGE PartialTypeSignatures #-}
+{-# LANGUAGE Strict #-}
+
+import Control.Monad.State
+
+tester :: MonadState a m => m ()
+tester = modify' id
+
+-- manyState :: StateT () (StateT () IO) () -> IO ()
+-- manyState :: _ -> IO ()
+manyState x =
+  (flip evalStateT ()     -- 1
+  . flip evalStateT ()    -- 2
+  . flip evalStateT ()    -- 3
+  . flip evalStateT ()    -- 4
+  . flip evalStateT ()    -- 5
+  . flip evalStateT ()    -- 6
+  . flip evalStateT ()    -- 7
+  . flip evalStateT ()    -- 8
+  . flip evalStateT ()    -- 9
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  . flip evalStateT ()
+  ) x :: IO ()
+
+main :: IO ()
+main = manyState tester >>= print
diff --git a/testsuite/tests/perf/compiler/all.T b/testsuite/tests/perf/compiler/all.T
index a8eea78729..f6e56c56b3 100644
--- a/testsuite/tests/perf/compiler/all.T
+++ b/testsuite/tests/perf/compiler/all.T
@@ -367,7 +367,13 @@ test('T16190',
       multimod_compile,
       ['T16190.hs', '-v0'])
 
-test('T16473', normal, makefile_test, ['T16473'])
+# Run this program. If specialisation fails, it'll start to allocate much more
+test ('T16473',
+      [ collect_stats('bytes allocated',5)
+      , only_ways(['normal'])
+      ],
+      compile_and_run,
+      ['-O2 -flate-specialise'])
 
 test('T17516',
       [ collect_compiler_stats('bytes allocated', 5),
@@ -415,3 +421,8 @@ test ('T13253-spj',
       ],
       compile,
       ['-v0 -O'])
+test ('T18223',
+      [ collect_compiler_stats('bytes allocated',2)
+      ],
+      compile,
+      ['-v0 -O'])
diff --git a/testsuite/tests/printer/T18052a.stderr b/testsuite/tests/printer/T18052a.stderr
index a45c9c958c..d8d9f94ba3 100644
--- a/testsuite/tests/printer/T18052a.stderr
+++ b/testsuite/tests/printer/T18052a.stderr
@@ -20,7 +20,7 @@ T18052a.$b:||: = \ (@a) (@b) (x :: a) (y :: b) -> (x, y)
 -- RHS size: {terms: 1, types: 0, coercions: 0, joins: 0/0}
 (+++) :: forall {a}. [a] -> [a] -> [a]
 [GblId]
-(+++) = (++)
+(+++) = ++
 
 -- RHS size: {terms: 13, types: 20, coercions: 0, joins: 0/0}
 T18052a.$m:||:
diff --git a/testsuite/tests/simplCore/should_compile/T17966.stdout b/testsuite/tests/simplCore/should_compile/T17966.stdout
index dc34ec8fae..9abaee2af2 100644
--- a/testsuite/tests/simplCore/should_compile/T17966.stdout
+++ b/testsuite/tests/simplCore/should_compile/T17966.stdout
@@ -1,5 +1,2 @@
  RULES: "SPEC $cm @()" [0]
  RULES: "SPEC f @Bool @() @(Maybe Integer)" [0]
-"SPEC/T17966 $fShowMaybe_$cshow @Integer"
-"SPEC/T17966 $fShowMaybe_$cshowList @Integer"
-"SPEC/T17966 $fShowMaybe @Integer"
diff --git a/testsuite/tests/stranal/should_compile/T18122.stderr b/testsuite/tests/stranal/should_compile/T18122.stderr
index ff18350279..f94751fb55 100644
--- a/testsuite/tests/stranal/should_compile/T18122.stderr
+++ b/testsuite/tests/stranal/should_compile/T18122.stderr
@@ -13,9 +13,8 @@ Lib.$trModule4 = "main"#
 -- RHS size: {terms: 2, types: 0, coercions: 0, joins: 0/0}
 Lib.$trModule3 :: GHC.Types.TrName
 [GblId,
- Cpr=m1,
  Unf=Unf{Src=<vanilla>, TopLvl=True, Value=True, ConLike=True,
-         WorkFree=True, Expandable=True, Guidance=IF_ARGS [] 10 20}]
+         WorkFree=True, Expandable=True, Guidance=IF_ARGS [] 10 10}]
 Lib.$trModule3 = GHC.Types.TrNameS Lib.$trModule4
 
 -- RHS size: {terms: 1, types: 0, coercions: 0, joins: 0/0}
@@ -28,27 +27,25 @@ Lib.$trModule2 = "Lib"#
 -- RHS size: {terms: 2, types: 0, coercions: 0, joins: 0/0}
 Lib.$trModule1 :: GHC.Types.TrName
 [GblId,
- Cpr=m1,
  Unf=Unf{Src=<vanilla>, TopLvl=True, Value=True, ConLike=True,
-         WorkFree=True, Expandable=True, Guidance=IF_ARGS [] 10 20}]
+         WorkFree=True, Expandable=True, Guidance=IF_ARGS [] 10 10}]
 Lib.$trModule1 = GHC.Types.TrNameS Lib.$trModule2
 
 -- RHS size: {terms: 3, types: 0, coercions: 0, joins: 0/0}
 Lib.$trModule :: GHC.Types.Module
 [GblId,
- Cpr=m1,
  Unf=Unf{Src=<vanilla>, TopLvl=True, Value=True, ConLike=True,
-         WorkFree=True, Expandable=True, Guidance=IF_ARGS [] 10 30}]
+         WorkFree=True, Expandable=True, Guidance=IF_ARGS [] 10 10}]
 Lib.$trModule = GHC.Types.Module Lib.$trModule3 Lib.$trModule1
 
 -- RHS size: {terms: 1, types: 0, coercions: 0, joins: 0/0}
 Lib.$wfoo [InlPrag=NOINLINE]
   :: GHC.Prim.Int# -> GHC.Prim.Int# -> GHC.Prim.Int#
 [GblId, Arity=2, Str=<L,U><L,U>, Unf=OtherCon []]
-Lib.$wfoo = (GHC.Prim.+#)
+Lib.$wfoo = GHC.Prim.+#
 
 -- RHS size: {terms: 18, types: 14, coercions: 0, joins: 0/0}
-foo [InlPrag=NOUSERINLINE[0]] :: (Int, Int) -> Int -> Int
+foo [InlPrag=NOUSERINLINE[final]] :: (Int, Int) -> Int -> Int
 [GblId,
  Arity=2,
  Str=<S(SL),1*U(1*U(U),A)><S,1*U(U)>,
@@ -56,24 +53,25 @@ foo [InlPrag=NOUSERINLINE[0]] :: (Int, Int) -> Int -> Int
  Unf=Unf{Src=InlineStable, TopLvl=True, Value=True, ConLike=True,
          WorkFree=True, Expandable=True,
          Guidance=ALWAYS_IF(arity=2,unsat_ok=True,boring_ok=False)
-         Tmpl= \ (w_sHs [Occ=Once!] :: (Int, Int))
-                 (w1_sHt [Occ=Once!] :: Int) ->
-                 case w_sHs of { (ww1_sHw [Occ=Once!], _ [Occ=Dead]) ->
-                 case ww1_sHw of { GHC.Types.I# ww4_sHz [Occ=Once] ->
-                 case w1_sHt of { GHC.Types.I# ww6_sHF [Occ=Once] ->
-                 case Lib.$wfoo ww4_sHz ww6_sHF of ww7_sHJ [Occ=Once] { __DEFAULT ->
-                 GHC.Types.I# ww7_sHJ
+         Tmpl= \ (w_sEf [Occ=Once1!] :: (Int, Int))
+                 (w1_sEg [Occ=Once1!] :: Int) ->
+                 case w_sEf of { (ww1_sEj [Occ=Once1!], _ [Occ=Dead]) ->
+                 case ww1_sEj of { GHC.Types.I# ww4_sEm [Occ=Once1] ->
+                 case w1_sEg of { GHC.Types.I# ww6_sEs [Occ=Once1] ->
+                 case Lib.$wfoo ww4_sEm ww6_sEs of ww7_sEw [Occ=Once1]
+                 { __DEFAULT ->
+                 GHC.Types.I# ww7_sEw
                  }
                  }
                  }
                  }}]
 foo
-  = \ (w_sHs :: (Int, Int)) (w1_sHt :: Int) ->
-      case w_sHs of { (ww1_sHw, ww2_sHB) ->
-      case ww1_sHw of { GHC.Types.I# ww4_sHz ->
-      case w1_sHt of { GHC.Types.I# ww6_sHF ->
-      case Lib.$wfoo ww4_sHz ww6_sHF of ww7_sHJ { __DEFAULT ->
-      GHC.Types.I# ww7_sHJ
+  = \ (w_sEf :: (Int, Int)) (w1_sEg :: Int) ->
+      case w_sEf of { (ww1_sEj, ww2_sEo) ->
+      case ww1_sEj of { GHC.Types.I# ww4_sEm ->
+      case w1_sEg of { GHC.Types.I# ww6_sEs ->
+      case Lib.$wfoo ww4_sEm ww6_sEs of ww7_sEw { __DEFAULT ->
+      GHC.Types.I# ww7_sEw
       }
       }
       }
author	Simon Peyton Jones <simonpj@microsoft.com>	2020-08-24 14:36:57 +0100
committer	Simon Peyton Jones <simonpj@microsoft.com>	2020-09-21 16:15:14 +0100
commit	ff5a843e2003abed15f99d10eb1195cf9d572e06 (patch)
tree	74e015a51530a05adb25daab808f97dd5b050a54
parent	9df77fed8918bb335874a584a829ee32325cefb5 (diff)
download	haskell-wip/T18223.tar.gz