5 files changed, 679 insertions, 237 deletions
diff --git a/compiler/GHC/Core/Opt/Arity.hs b/compiler/GHC/Core/Opt/Arity.hs
index b55d91767e..2471470814 100644
--- a/compiler/GHC/Core/Opt/Arity.hs
+++ b/compiler/GHC/Core/Opt/Arity.hs
@@ -15,10 +15,18 @@ module GHC.Core.Opt.Arity
    ( manifestArity, joinRhsArity, exprArity, typeArity
    , exprEtaExpandArity, findRhsArity
    , etaExpand, etaExpandAT
-   , etaExpandToJoinPoint, etaExpandToJoinPointRule
    , exprBotStrictness_maybe
+
+        -- ** ArityType
    , ArityType(..), expandableArityType, arityTypeArity
    , maxWithArity, isBotArityType, idArityType
+
+        -- ** Join points
+   , etaExpandToJoinPoint, etaExpandToJoinPointRule
+
+        -- ** Coercions and casts
+   , pushCoArg, pushCoArgs, pushCoValArg, pushCoTyArg
+   , pushCoercionIntoLambda, pushCoDataCon, collectBindersPushingCo
    )
 where
 
@@ -31,15 +39,21 @@ import GHC.Driver.Ppr
 import GHC.Core
 import GHC.Core.FVs
 import GHC.Core.Utils
-import GHC.Core.Subst
 import GHC.Types.Demand
 import GHC.Types.Var
 import GHC.Types.Var.Env
 import GHC.Types.Id
-import GHC.Core.Type as Type
-import GHC.Core.TyCon     ( initRecTc, checkRecTc )
+
+-- We have two sorts of substitution:
+--   GHC.Core.Subst.Subst, and GHC.Core.TyCo.TCvSubst
+-- Both have substTy, substCo  Hence need for qualification
+import GHC.Core.Subst    as Core
+import GHC.Core.Type     as Type
+import GHC.Core.Coercion as Type
+
+import GHC.Core.DataCon
+import GHC.Core.TyCon     ( initRecTc, checkRecTc, tyConArity )
 import GHC.Core.Predicate ( isDictTy )
-import GHC.Core.Coercion as Coercion
 import GHC.Core.Multiplicity
 import GHC.Types.Var.Set
 import GHC.Types.Basic
@@ -48,7 +62,8 @@ import GHC.Driver.Session ( DynFlags, GeneralFlag(..), gopt )
 import GHC.Utils.Outputable
 import GHC.Utils.Panic
 import GHC.Data.FastString
-import GHC.Utils.Misc     ( lengthAtLeast )
+import GHC.Data.Pair
+import GHC.Utils.Misc
 
 {-
 ************************************************************************
@@ -1076,12 +1091,11 @@ eta_expand one_shots orig_expr
     go oss (Cast expr co) = Cast (go oss expr) co
 
     go oss expr
-      = -- pprTrace "ee" (vcat [ppr orig_expr, ppr expr, ppr etas]) $
-        retick $ etaInfoAbs etas (etaInfoApp subst' sexpr etas)
+      = -- pprTrace "ee" (vcat [ppr orig_expr, ppr expr, pprEtaInfos etas]) $
+        retick $ etaInfoAbs etas (etaInfoApp in_scope' sexpr etas)
       where
           in_scope = mkInScopeSet (exprFreeVars expr)
           (in_scope', etas) = mkEtaWW oss (ppr orig_expr) in_scope (exprType expr)
-          subst' = mkEmptySubst in_scope'
 
           -- Find ticks behind type apps.
           -- See Note [Eta expansion and source notes]
@@ -1090,76 +1104,197 @@ eta_expand one_shots orig_expr
           sexpr = foldl' App expr'' args
           retick expr = foldr mkTick expr ticks
 
-                                -- Abstraction    Application
+{- *********************************************************************
+*                                                                      *
+              The EtaInfo mechanism
+          mkEtaWW, etaInfoAbs, etaInfoApp
+*                                                                      *
+********************************************************************* -}
+
+{- Note [The EtaInfo mechanism]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Suppose we have (e :: ty) and we want to eta-expand it to arity N.
+This what eta_expand does.  We do it in two steps:
+
+1.  mkEtaWW: from 'ty' and 'N' build a [EtaInfo] which describes
+    the shape of the expansion necessary to expand to arity N.
+
+2.  Build the term
+       \ v1..vn.  e v1 .. vn
+    where those abstractions and applications are described by
+    the same [EtaInfo].  Specifically we build the term
+
+       etaInfoAbs etas (etaInfoApp in_scope e etas)
+
+   where etas :: [EtaInfo]#
+         etaInfoAbs builds the lambdas
+         etaInfoApp builds the applictions
+
+   Note that the /same/ [EtaInfo] drives both etaInfoAbs and etaInfoApp
+
+To a first approximation [EtaInfo] is just [Var].  But
+casts complicate the question.  If we have
+   newtype N a = MkN (S -> a)
+and
+   ty = N (N Int)
+then the eta-expansion must look like
+        (\x (\y. ((e |> co1) x) |> co2) y)
+           |> sym co2)
+        |> sym co1
+where
+  co1 :: N (N Int) ~ S -> N Int
+  co2 :: N Int     ~ S -> Int
+
+Blimey!  Look at all those casts.  Moreover, if the type
+is very deeply nested (as happens in #18223), the repetition
+of types can make the overall term very large.  So there is a big
+payoff in cancelling out casts aggressively wherever possible.
+(See also Note [No crap in eta-expanded code].)
+
+This matters a lot in etaEInfoApp, where we
+* Do beta-reduction on the fly
+* Use getARg_mabye to get a cast out of the way,
+  so that we can do beta reduction
+Together this makes a big difference.  Consider when e is
+   case x of
+      True  -> (\x -> e1) |> c1
+      False -> (\p -> e2) |> c2
+
+When we eta-expand this to arity 1, say, etaInfoAbs will wrap
+a (\eta) around the outside and use etaInfoApp to apply each
+alternative to 'eta'.  We want to beta-reduce all that junk
+away.
+
+#18223 was a dramtic example in which the intermediate term was
+grotesquely huge, even though the next Simplifier iteration squashed
+it.  Better to kill it at birth.
+-}
+
 --------------
-data EtaInfo = EtaVar Var       -- /\a. []        [] a
-                                -- \x.  []        [] x
-             | EtaCo Coercion   -- [] |> sym co   [] |> co
+data EtaInfo            -- Abstraction      Application
+  = EtaVar Var          -- /\a. []         [] a
+                        -- (\x. [])        [] x
+  | EtaCo CoercionR     -- [] |> sym co    [] |> co
 
 instance Outputable EtaInfo where
-   ppr (EtaVar v) = text "EtaVar" <+> ppr v
-   ppr (EtaCo co) = text "EtaCo"  <+> ppr co
+   ppr (EtaVar v) = text "EtaVar" <+> ppr v  <+> dcolon <+> ppr (idType v)
+   ppr (EtaCo co) = text "EtaCo"  <+> hang (ppr co) 2 (dcolon <+> ppr (coercionType co))
+
+-- Used in debug-printing
+-- pprEtaInfos :: [EtaInfo] -> SDoc
+-- pprEtaInfos eis = brackets $ vcat $ punctuate comma $ map ppr eis
 
 pushCoercion :: Coercion -> [EtaInfo] -> [EtaInfo]
+-- Puts a EtaCo on the front of a [EtaInfo], but combining
+-- with an existing EtaCo if possible
+-- A minor improvement
 pushCoercion co1 (EtaCo co2 : eis)
   | isReflCo co = eis
   | otherwise   = EtaCo co : eis
   where
     co = co1 `mkTransCo` co2
 
-pushCoercion co eis = EtaCo co : eis
+pushCoercion co eis
+  = EtaCo co : eis
+
+getArg_maybe :: [EtaInfo] -> Maybe (CoreArg, [EtaInfo])
+-- Get an argument to the front of the [EtaInfo], if possible,
+-- by pushing any EtaCo through the argument
+getArg_maybe eis = go MRefl eis
+  where
+    go :: MCoercion -> [EtaInfo] -> Maybe (CoreArg, [EtaInfo])
+    go _         []                = Nothing
+    go mco       (EtaCo co2 : eis) = go (mkTransMCoL mco co2) eis
+    go MRefl     (EtaVar v : eis)  = Just (varToCoreExpr v, eis)
+    go (MCo co)  (EtaVar v : eis)
+      | Just (arg, mco) <- pushCoArg co (varToCoreExpr v)
+      = case mco of
+           MRefl  -> Just (arg, eis)
+           MCo co -> Just (arg, pushCoercion co eis)
+      | otherwise
+      = Nothing
+
+mkCastMCo :: CoreExpr -> MCoercionR -> CoreExpr
+mkCastMCo e MRefl    = e
+mkCastMCo e (MCo co) = Cast e co
+  -- We are careful to use (MCo co) only when co is not reflexive
+  -- Hence (Cast e co) rather than (mkCast e co)
+
+mkPiMCo :: Var -> MCoercionR -> MCoercionR
+mkPiMCo _  MRefl   = MRefl
+mkPiMCo v (MCo co) = MCo (mkPiCo Representational v co)
 
 --------------
 etaInfoAbs :: [EtaInfo] -> CoreExpr -> CoreExpr
-etaInfoAbs []               expr = expr
-etaInfoAbs (EtaVar v : eis) expr = Lam v (etaInfoAbs eis expr)
-etaInfoAbs (EtaCo co : eis) expr = Cast (etaInfoAbs eis expr) (mkSymCo co)
+-- See Note [The EtaInfo mechanism]
+etaInfoAbs eis expr
+  | null eis  = expr
+  | otherwise = case final_mco of
+                   MRefl  -> expr'
+                   MCo co -> mkCast expr' co
+  where
+     (expr', final_mco) = foldr do_one (split_cast expr) eis
+
+     do_one :: EtaInfo -> (CoreExpr, MCoercion) -> (CoreExpr, MCoercion)
+     -- Implements the "Abstraction" column in the comments for data EtaInfo
+     -- In both argument and result the pair (e,mco) denotes (e |> mco)
+     do_one (EtaVar v) (expr, mco) = (Lam v expr, mkPiMCo v mco)
+     do_one (EtaCo co) (expr, mco) = (expr, mco `mkTransMCoL` mkSymCo co)
+
+     split_cast :: CoreExpr -> (CoreExpr, MCoercion)
+     split_cast (Cast e co) = (e, MCo co)
+     split_cast e           = (e, MRefl)
+     -- We could look in the body of lets, and the branches of a case
+     -- But then we would have to worry about whether the cast mentioned
+     -- any of the bound variables, which is tiresome. Later maybe.
+     -- Result: we may end up with
+     --     (\(x::Int). case x of { DEFAULT -> e1 |> co }) |> sym (<Int>->co)
+     -- and fail to optimise it away
 
 --------------
-etaInfoApp :: Subst -> CoreExpr -> [EtaInfo] -> CoreExpr
+etaInfoApp :: InScopeSet -> CoreExpr -> [EtaInfo] -> CoreExpr
 -- (etaInfoApp s e eis) returns something equivalent to
---             ((substExpr s e) `appliedto` eis)
-
-etaInfoApp subst (Lam v1 e) (EtaVar v2 : eis)
-  = etaInfoApp (GHC.Core.Subst.extendSubstWithVar subst v1 v2) e eis
-
-etaInfoApp subst (Cast e co1) eis
-  = etaInfoApp subst e (pushCoercion co' eis)
-  where
-    co' = GHC.Core.Subst.substCo subst co1
+--             (substExpr s e `appliedto` eis)
+-- See Note [The EtaInfo mechanism]
 
-etaInfoApp subst (Case e b ty alts) eis
-  = Case (subst_expr subst e) b1 ty' alts'
+etaInfoApp in_scope expr eis
+  = go (mkEmptySubst in_scope) expr eis
   where
-    (subst1, b1) = substBndr subst b
-    alts' = map subst_alt alts
-    ty'   = etaInfoAppTy (GHC.Core.Subst.substTy subst ty) eis
-    subst_alt (con, bs, rhs) = (con, bs', etaInfoApp subst2 rhs eis)
-              where
-                 (subst2,bs') = substBndrs subst1 bs
-
-etaInfoApp subst (Let b e) eis
-  | not (isJoinBind b)
-    -- See Note [Eta expansion for join points]
-  = Let b' (etaInfoApp subst' e eis)
-  where
-    (subst', b') = substBindSC subst b
+    go :: Subst -> CoreExpr -> [EtaInfo] -> CoreExpr
+    -- 'go' pushed down the eta-infos into the branch of a case
+    -- and the body of a let; and does beta-reduction if possible
+    go subst (Tick t e) eis
+      = Tick (substTickish subst t) (go subst e eis)
+    go subst (Cast e co) eis
+      = go subst e (pushCoercion (Core.substCo subst co) eis)
+    go subst (Case e b ty alts) eis
+      = Case (Core.substExprSC subst e) b1 ty' alts'
+      where
+        (subst1, b1) = Core.substBndr subst b
+        alts' = map subst_alt alts
+        ty'   = etaInfoAppTy (Core.substTy subst ty) eis
+        subst_alt (con, bs, rhs) = (con, bs', go subst2 rhs eis)
+                 where
+                    (subst2,bs') = Core.substBndrs subst1 bs
+    go subst (Let b e) eis
+      | not (isJoinBind b) -- See Note [Eta expansion for join points]
+      = Let b' (go subst' e eis)
+      where
+        (subst', b') = Core.substBindSC subst b
 
-etaInfoApp subst (Tick t e) eis
-  = Tick (substTickish subst t) (etaInfoApp subst e eis)
+    -- Beta-reduction if possible, using getArg_maybe to push
+    -- any intervening casts past the argument
+    -- See Note [The EtaInfo mechansim]
+    go subst (Lam v e) eis
+      | Just (arg, eis') <- getArg_maybe eis
+      = go (Core.extendSubst subst v arg) e eis'
 
-etaInfoApp subst expr _
-  | (Var fun, _) <- collectArgs expr
-  , Var fun' <- lookupIdSubst subst fun
-  , isJoinId fun'
-  = subst_expr subst expr
+    -- Stop pushing down; just wrap the expression up
+    go subst e eis = wrap (Core.substExprSC subst e) eis
 
-etaInfoApp subst e eis
-  = go (subst_expr subst e) eis
-  where
-    go e []                  = e
-    go e (EtaVar v    : eis) = go (App e (varToCoreExpr v)) eis
-    go e (EtaCo co    : eis) = go (Cast e co) eis
+    wrap e []               = e
+    wrap e (EtaVar v : eis) = wrap (App e (varToCoreExpr v)) eis
+    wrap e (EtaCo co : eis) = wrap (Cast e co) eis
 
 
 --------------
@@ -1235,7 +1370,7 @@ mkEtaWW orig_oss ppr_orig_expr in_scope orig_ty
        -- We want to get
        --      coerce T (\x::[T] -> (coerce ([T]->Int) e) x)
        | Just (co, ty') <- topNormaliseNewType_maybe ty
-       , let co' = Coercion.substCo subst co
+       , let co' = Type.substCo subst co
              -- Remember to apply the substitution to co (#16979)
              -- (or we could have applied to ty, but then
              --  we'd have had to zap it for the recursive call)
@@ -1253,21 +1388,290 @@ mkEtaWW orig_oss ppr_orig_expr in_scope orig_ty
         -- with an explicit lambda having a non-function type
 
 
+{- *********************************************************************
+*                                                                      *
+              The "push rules"
+*                                                                      *
+************************************************************************
 
-------------
-subst_expr :: Subst -> CoreExpr -> CoreExpr
--- Apply a substitution to an expression.  We use substExpr
--- not substExprSC (short-cutting substitution) because
--- we may be changing the types of join points, so applying
--- the in-scope set is necessary.
+Here we implement the "push rules" from FC papers:
+
+* The push-argument rules, where we can move a coercion past an argument.
+  We have
+      (fun |> co) arg
+  and we want to transform it to
+    (fun arg') |> co'
+  for some suitable co' and transformed arg'.
+
+* The PushK rule for data constructors.  We have
+       (K e1 .. en) |> co
+  and we want to transform to
+       (K e1' .. en')
+  by pushing the coercion into the arguments
+-}
+
+pushCoArgs :: CoercionR -> [CoreArg] -> Maybe ([CoreArg], MCoercion)
+pushCoArgs co []         = return ([], MCo co)
+pushCoArgs co (arg:args) = do { (arg',  m_co1) <- pushCoArg  co  arg
+                              ; case m_co1 of
+                                  MCo co1 -> do { (args', m_co2) <- pushCoArgs co1 args
+                                                 ; return (arg':args', m_co2) }
+                                  MRefl  -> return (arg':args, MRefl) }
+
+pushCoArg :: CoercionR -> CoreArg -> Maybe (CoreArg, MCoercion)
+-- We have (fun |> co) arg, and we want to transform it to
+--         (fun arg) |> co
+-- This may fail, e.g. if (fun :: N) where N is a newtype
+-- C.f. simplCast in GHC.Core.Opt.Simplify
+-- 'co' is always Representational
+-- If the returned coercion is Nothing, then it would have been reflexive
+pushCoArg co (Type ty) = do { (ty', m_co') <- pushCoTyArg co ty
+                            ; return (Type ty', m_co') }
+pushCoArg co val_arg   = do { (arg_co, m_co') <- pushCoValArg co
+                            ; return (val_arg `mkCastMCo` arg_co, m_co') }
+
+pushCoTyArg :: CoercionR -> Type -> Maybe (Type, MCoercionR)
+-- We have (fun |> co) @ty
+-- Push the coercion through to return
+--         (fun @ty') |> co'
+-- 'co' is always Representational
+-- If the returned coercion is Nothing, then it would have been reflexive;
+-- it's faster not to compute it, though.
+pushCoTyArg co ty
+  -- The following is inefficient - don't do `eqType` here, the coercion
+  -- optimizer will take care of it. See #14737.
+  -- -- | tyL `eqType` tyR
+  -- -- = Just (ty, Nothing)
+
+  | isReflCo co
+  = Just (ty, MRefl)
+
+  | isForAllTy_ty tyL
+  = ASSERT2( isForAllTy_ty tyR, ppr co $$ ppr ty )
+    Just (ty `mkCastTy` co1, MCo co2)
+
+  | otherwise
+  = Nothing
+  where
+    Pair tyL tyR = coercionKind co
+       -- co :: tyL ~ tyR
+       -- tyL = forall (a1 :: k1). ty1
+       -- tyR = forall (a2 :: k2). ty2
+
+    co1 = mkSymCo (mkNthCo Nominal 0 co)
+       -- co1 :: k2 ~N k1
+       -- Note that NthCo can extract a Nominal equality between the
+       -- kinds of the types related by a coercion between forall-types.
+       -- See the NthCo case in GHC.Core.Lint.
+
+    co2 = mkInstCo co (mkGReflLeftCo Nominal ty co1)
+        -- co2 :: ty1[ (ty|>co1)/a1 ] ~ ty2[ ty/a2 ]
+        -- Arg of mkInstCo is always nominal, hence mkNomReflCo
+
+pushCoValArg :: CoercionR -> Maybe (MCoercionR, MCoercionR)
+-- We have (fun |> co) arg
+-- Push the coercion through to return
+--         (fun (arg |> co_arg)) |> co_res
+-- 'co' is always Representational
+-- If the second returned Coercion is actually Nothing, then no cast is necessary;
+-- the returned coercion would have been reflexive.
+pushCoValArg co
+  -- The following is inefficient - don't do `eqType` here, the coercion
+  -- optimizer will take care of it. See #14737.
+  -- -- | tyL `eqType` tyR
+  -- -- = Just (mkRepReflCo arg, Nothing)
+
+  | isReflCo co
+  = Just (MRefl, MRefl)
+
+  | isFunTy tyL
+  , (co_mult, co1, co2) <- decomposeFunCo Representational co
+  , isReflexiveCo co_mult
+    -- We can't push the coercion in the case where co_mult isn't reflexivity:
+    -- it could be an unsafe axiom, and losing this information could yield
+    -- ill-typed terms. For instance (fun x ::(1) Int -> (fun _ -> () |> co) x)
+    -- with co :: (Int -> ()) ~ (Int #-> ()), would reduce to (fun x ::(1) Int
+    -- -> (fun _ ::(Many) Int -> ()) x) which is ill-typed
+
+              -- If   co  :: (tyL1 -> tyL2) ~ (tyR1 -> tyR2)
+              -- then co1 :: tyL1 ~ tyR1
+              --      co2 :: tyL2 ~ tyR2
+  = ASSERT2( isFunTy tyR, ppr co $$ ppr arg )
+    Just (coToMCo (mkSymCo co1), coToMCo co2)
+    -- Critically, coToMCo to checks for ReflCo; the whole coercion may not
+    -- be reflexive, but either of its components might be
+    -- We could use isReflexiveCo, but it's not clear if the benefit
+    -- is worth the cost, and it makes no difference in #18223
+
+  | otherwise
+  = Nothing
+  where
+    arg = funArgTy tyR
+    Pair tyL tyR = coercionKind co
+
+pushCoercionIntoLambda
+    :: InScopeSet -> Var -> CoreExpr -> CoercionR -> Maybe (Var, CoreExpr)
+-- This implements the Push rule from the paper on coercions
+--    (\x. e) |> co
+-- ===>
+--    (\x'. e |> co')
+pushCoercionIntoLambda in_scope x e co
+    | ASSERT(not (isTyVar x) && not (isCoVar x)) True
+    , Pair s1s2 t1t2 <- coercionKind co
+    , Just (_, _s1,_s2) <- splitFunTy_maybe s1s2
+    , Just (w1, t1,_t2) <- splitFunTy_maybe t1t2
+    , (co_mult, co1, co2) <- decomposeFunCo Representational co
+    , isReflexiveCo co_mult
+      -- We can't push the coercion in the case where co_mult isn't
+      -- reflexivity. See pushCoValArg for more details.
+    = let
+          -- Should we optimize the coercions here?
+          -- Otherwise they might not match too well
+          x' = x `setIdType` t1 `setIdMult` w1
+          in_scope' = in_scope `extendInScopeSet` x'
+          subst = extendIdSubst (mkEmptySubst in_scope')
+                                x
+                                (mkCast (Var x') co1)
+      in Just (x', substExpr subst e `mkCast` co2)
+    | otherwise
+    = pprTrace "exprIsLambda_maybe: Unexpected lambda in case" (ppr (Lam x e))
+      Nothing
+
+pushCoDataCon :: DataCon -> [CoreExpr] -> Coercion
+              -> Maybe (DataCon
+                       , [Type]      -- Universal type args
+                       , [CoreExpr]) -- All other args incl existentials
+-- Implement the KPush reduction rule as described in "Down with kinds"
+-- The transformation applies iff we have
+--      (C e1 ... en) `cast` co
+-- where co :: (T t1 .. tn) ~ to_ty
+-- The left-hand one must be a T, because exprIsConApp returned True
+-- but the right-hand one might not be.  (Though it usually will.)
+pushCoDataCon dc dc_args co
+  | isReflCo co || from_ty `eqType` to_ty  -- try cheap test first
+  , let (univ_ty_args, rest_args) = splitAtList (dataConUnivTyVars dc) dc_args
+  = Just (dc, map exprToType univ_ty_args, rest_args)
+
+  | Just (to_tc, to_tc_arg_tys) <- splitTyConApp_maybe to_ty
+  , to_tc == dataConTyCon dc
+        -- These two tests can fail; we might see
+        --      (C x y) `cast` (g :: T a ~ S [a]),
+        -- where S is a type function.  In fact, exprIsConApp
+        -- will probably not be called in such circumstances,
+        -- but there's nothing wrong with it
+
+  = let
+        tc_arity       = tyConArity to_tc
+        dc_univ_tyvars = dataConUnivTyVars dc
+        dc_ex_tcvars   = dataConExTyCoVars dc
+        arg_tys        = dataConRepArgTys dc
+
+        non_univ_args  = dropList dc_univ_tyvars dc_args
+        (ex_args, val_args) = splitAtList dc_ex_tcvars non_univ_args
+
+        -- Make the "Psi" from the paper
+        omegas = decomposeCo tc_arity co (tyConRolesRepresentational to_tc)
+        (psi_subst, to_ex_arg_tys)
+          = liftCoSubstWithEx Representational
+                              dc_univ_tyvars
+                              omegas
+                              dc_ex_tcvars
+                              (map exprToType ex_args)
+
+          -- Cast the value arguments (which include dictionaries)
+        new_val_args = zipWith cast_arg (map scaledThing arg_tys) val_args
+        cast_arg arg_ty arg = mkCast arg (psi_subst arg_ty)
+
+        to_ex_args = map Type to_ex_arg_tys
+
+        dump_doc = vcat [ppr dc,      ppr dc_univ_tyvars, ppr dc_ex_tcvars,
+                         ppr arg_tys, ppr dc_args,
+                         ppr ex_args, ppr val_args, ppr co, ppr from_ty, ppr to_ty, ppr to_tc
+                         , ppr $ mkTyConApp to_tc (map exprToType $ takeList dc_univ_tyvars dc_args) ]
+    in
+    ASSERT2( eqType from_ty (mkTyConApp to_tc (map exprToType $ takeList dc_univ_tyvars dc_args)), dump_doc )
+    ASSERT2( equalLength val_args arg_tys, dump_doc )
+    Just (dc, to_tc_arg_tys, to_ex_args ++ new_val_args)
+
+  | otherwise
+  = Nothing
+
+  where
+    Pair from_ty to_ty = coercionKind co
+
+collectBindersPushingCo :: CoreExpr -> ([Var], CoreExpr)
+-- Collect lambda binders, pushing coercions inside if possible
+-- E.g.   (\x.e) |> g         g :: <Int> -> blah
+--        = (\x. e |> Nth 1 g)
+--
+-- That is,
 --
--- ToDo: we could instead check if we actually *are*
--- changing any join points' types, and if not use substExprSC.
-subst_expr = substExpr
+-- collectBindersPushingCo ((\x.e) |> g) === ([x], e |> Nth 1 g)
+collectBindersPushingCo e
+  = go [] e
+  where
+    -- Peel off lambdas until we hit a cast.
+    go :: [Var] -> CoreExpr -> ([Var], CoreExpr)
+    -- The accumulator is in reverse order
+    go bs (Lam b e)   = go (b:bs) e
+    go bs (Cast e co) = go_c bs e co
+    go bs e           = (reverse bs, e)
+
+    -- We are in a cast; peel off casts until we hit a lambda.
+    go_c :: [Var] -> CoreExpr -> CoercionR -> ([Var], CoreExpr)
+    -- (go_c bs e c) is same as (go bs e (e |> c))
+    go_c bs (Cast e co1) co2 = go_c bs e (co1 `mkTransCo` co2)
+    go_c bs (Lam b e)    co  = go_lam bs b e co
+    go_c bs e            co  = (reverse bs, mkCast e co)
+
+    -- We are in a lambda under a cast; peel off lambdas and build a
+    -- new coercion for the body.
+    go_lam :: [Var] -> Var -> CoreExpr -> CoercionR -> ([Var], CoreExpr)
+    -- (go_lam bs b e c) is same as (go_c bs (\b.e) c)
+    go_lam bs b e co
+      | isTyVar b
+      , let Pair tyL tyR = coercionKind co
+      , ASSERT( isForAllTy_ty tyL )
+        isForAllTy_ty tyR
+      , isReflCo (mkNthCo Nominal 0 co)  -- See Note [collectBindersPushingCo]
+      = go_c (b:bs) e (mkInstCo co (mkNomReflCo (mkTyVarTy b)))
+
+      | isCoVar b
+      , let Pair tyL tyR = coercionKind co
+      , ASSERT( isForAllTy_co tyL )
+        isForAllTy_co tyR
+      , isReflCo (mkNthCo Nominal 0 co)  -- See Note [collectBindersPushingCo]
+      , let cov = mkCoVarCo b
+      = go_c (b:bs) e (mkInstCo co (mkNomReflCo (mkCoercionTy cov)))
+
+      | isId b
+      , let Pair tyL tyR = coercionKind co
+      , ASSERT( isFunTy tyL) isFunTy tyR
+      , (co_mult, co_arg, co_res) <- decomposeFunCo Representational co
+      , isReflCo co_mult -- See Note [collectBindersPushingCo]
+      , isReflCo co_arg  -- See Note [collectBindersPushingCo]
+      = go_c (b:bs) e co_res
+
+      | otherwise = (reverse bs, mkCast (Lam b e) co)
 
+{-
 
---------------
+Note [collectBindersPushingCo]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We just look for coercions of form
+   <type> # w -> blah
+(and similarly for foralls) to keep this function simple.  We could do
+more elaborate stuff, but it'd involve substitution etc.
+
+-}
+
+{- *********************************************************************
+*                                                                      *
+                Join points
+*                                                                      *
+********************************************************************* -}
 
+-------------------
 -- | Split an expression into the given number of binders and a body,
 -- eta-expanding if necessary. Counts value *and* type binders.
 etaExpandToJoinPoint :: JoinArity -> CoreExpr -> ([CoreBndr], CoreExpr)
@@ -1307,7 +1711,7 @@ etaBodyForJoinPoint need_args body
       = (reverse rev_bs, e)
     go n ty subst rev_bs e
       | Just (tv, res_ty) <- splitForAllTy_maybe ty
-      , let (subst', tv') = Type.substVarBndr subst tv
+      , let (subst', tv') = substVarBndr subst tv
       = go (n-1) res_ty subst' (tv' : rev_bs) (e `App` varToCoreExpr tv')
       | Just (mult, arg_ty, res_ty) <- splitFunTy_maybe ty
       , let (subst', b) = freshEtaId n subst (Scaled mult arg_ty)
@@ -1318,6 +1722,8 @@ etaBodyForJoinPoint need_args body
 
     init_subst e = mkEmptyTCvSubst (mkInScopeSet (exprFreeVars e))
 
+
+
 --------------
 freshEtaId :: Int -> TCvSubst -> Scaled Type -> (TCvSubst, Id)
 -- Make a fresh Id, with specified type (after applying substitution)
@@ -1336,3 +1742,4 @@ freshEtaId n subst ty
                   -- "OrCoVar" since this can be used to eta-expand
                   -- coercion abstractions
         subst'  = extendTCvInScope subst eta_id'
+
diff --git a/compiler/GHC/Core/Opt/Pipeline.hs b/compiler/GHC/Core/Opt/Pipeline.hs
index c3f2fc9f85..6d0712e634 100644
--- a/compiler/GHC/Core/Opt/Pipeline.hs
+++ b/compiler/GHC/Core/Opt/Pipeline.hs
@@ -315,33 +315,38 @@ getCoreToDo dflags
 
         runWhen do_float_in CoreDoFloatInwards,
 
+        simplify "final",  -- Final tidy-up
+
         maybe_rule_check FinalPhase,
 
+        --------  After this we have -O2 passes -----------------
+        -- None of them run with -O
+
                 -- Case-liberation for -O2.  This should be after
                 -- strictness analysis and the simplification which follows it.
-        runWhen liberate_case (CoreDoPasses [
-            CoreLiberateCase,
-            simplify "post-liberate-case"
-            ]),         -- Run the simplifier after LiberateCase to vastly
-                        -- reduce the possibility of shadowing
-                        -- Reason: see Note [Shadowing] in GHC.Core.Opt.SpecConstr
+        runWhen liberate_case $ CoreDoPasses
+           [ CoreLiberateCase, simplify "post-liberate-case" ],
+           -- Run the simplifier after LiberateCase to vastly
+           -- reduce the possibility of shadowing
+           -- Reason: see Note [Shadowing] in GHC.Core.Opt.SpecConstr
 
-        runWhen spec_constr CoreDoSpecConstr,
+        runWhen spec_constr $ CoreDoPasses
+           [ CoreDoSpecConstr, simplify "post-spec-constr"],
+           -- See Note [Simplify after SpecConstr]
 
         maybe_rule_check FinalPhase,
 
-        runWhen late_specialise
-          (CoreDoPasses [ CoreDoSpecialising
-                        , simplify "post-late-spec"]),
+        runWhen late_specialise $ CoreDoPasses
+           [ CoreDoSpecialising, simplify "post-late-spec"],
 
         -- LiberateCase can yield new CSE opportunities because it peels
         -- off one layer of a recursive function (concretely, I saw this
         -- in wheel-sieve1), and I'm guessing that SpecConstr can too
         -- And CSE is a very cheap pass. So it seems worth doing here.
-        runWhen ((liberate_case || spec_constr) && cse) CoreCSE,
+        runWhen ((liberate_case || spec_constr) && cse) $ CoreDoPasses
+           [ CoreCSE, simplify "post-final-cse" ],
 
-        -- Final clean-up simplification:
-        simplify "final",
+        ---------  End of -O2 passes --------------
 
         runWhen late_dmd_anal $ CoreDoPasses (
             dmd_cpr_ww ++ [simplify "post-late-ww"]
@@ -410,6 +415,27 @@ or with -O0.  Two reasons:
 But watch out: list fusion can prevent floating.  So use phase control
 to switch off those rules until after floating.
 
+Note [Simplify after SpecConstr]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We want to run the simplifier after SpecConstr, and before late-Specialise,
+for two reasons, both shown up in test perf/compiler/T16473,
+with -O2 -flate-specialise
+
+1.  I found that running late-Specialise after SpecConstr, with no
+    simplification in between meant that the carefullly constructed
+    SpecConstr rule never got to fire.  (It was something like
+          lvl = f a   -- Arity 1
+          ....g lvl....
+    SpecConstr specialised g for argument lvl; but Specialise then
+    specialised lvl = f a to lvl = $sf, and inlined. Or something like
+    that.)
+
+2.  Specialise relies on unfoldings being available for top-level dictionary
+    bindings; but SpecConstr kills them all!  The Simplifer restores them.
+
+This extra run of the simplifier has a cost, but this is only with -O2.
+
+
 ************************************************************************
 *                                                                      *
                   The CoreToDo interpreter
diff --git a/compiler/GHC/Core/Opt/Simplify.hs b/compiler/GHC/Core/Opt/Simplify.hs
index e219a0dba9..6c207766bd 100644
--- a/compiler/GHC/Core/Opt/Simplify.hs
+++ b/compiler/GHC/Core/Opt/Simplify.hs
@@ -51,9 +51,9 @@ import GHC.Core.Unfold
 import GHC.Core.Unfold.Make
 import GHC.Core.Utils
 import GHC.Core.Opt.Arity ( ArityType(..), arityTypeArity, isBotArityType
+                          , pushCoTyArg, pushCoValArg
                           , idArityType, etaExpandAT )
-import GHC.Core.SimpleOpt ( pushCoTyArg, pushCoValArg
-                          , joinPointBinding_maybe, joinPointBindings_maybe )
+import GHC.Core.SimpleOpt ( joinPointBinding_maybe, joinPointBindings_maybe )
 import GHC.Core.FVs     ( mkRuleInfo )
 import GHC.Core.Rules   ( lookupRule, getRules, initRuleOpts )
 import GHC.Types.Basic
@@ -318,7 +318,7 @@ simplLazyBind env top_lvl is_rec bndr bndr1 rhs rhs_se
         ; let rhs_cont = mkRhsStop (substTy body_env (exprType body))
         ; (body_floats0, body0) <- {-#SCC "simplExprF" #-} simplExprF body_env body rhs_cont
 
-              -- Never float join-floats out of a non-join let-binding
+              -- Never float join-floats out of a non-join let-binding (which this is)
               -- So wrap the body in the join-floats right now
               -- Hence: body_floats1 consists only of let-floats
         ; let (body_floats1, body1) = wrapJoinFloatsX body_floats0 body0
@@ -1414,25 +1414,23 @@ simplCast env body co0 cont0
                                         -- type of the hole changes (#16312)
 
         -- (f |> co) e   ===>   (f (e |> co1)) |> co2
-        -- where   co :: (s1->s2) ~ (t1~t2)
+        -- where   co :: (s1->s2) ~ (t1->t2)
         --         co1 :: t1 ~ s1
         --         co2 :: s2 ~ t2
         addCoerce co cont@(ApplyToVal { sc_arg = arg, sc_env = arg_se
                                       , sc_dup = dup, sc_cont = tail })
-          | Just (co1, m_co2) <- pushCoValArg co
-          , let new_ty = coercionRKind co1
-          , not (isTypeLevPoly new_ty)  -- Without this check, we get a lev-poly arg
-                                        -- See Note [Levity polymorphism invariants] in GHC.Core
-                                        -- test: typecheck/should_run/EtaExpandLevPoly
+          | Just (m_co1, m_co2) <- pushCoValArg co
+          , levity_ok m_co1
           = {-#SCC "addCoerce-pushCoValArg" #-}
             do { tail' <- addCoerceM m_co2 tail
-               ; if isReflCo co1
-                 then return (cont { sc_cont = tail'
-                                   , sc_hole_ty = coercionLKind co })
+               ; case m_co1 of {
+                   MRefl -> return (cont { sc_cont = tail'
+                                         , sc_hole_ty = coercionLKind co }) ;
                       -- Avoid simplifying if possible;
                       -- See Note [Avoiding exponential behaviour]
-                 else do
-               { (dup', arg_se', arg') <- simplArg env dup arg_se arg
+
+                   MCo co1 ->
+            do { (dup', arg_se', arg') <- simplArg env dup arg_se arg
                     -- When we build the ApplyTo we can't mix the OutCoercion
                     -- 'co' with the InExpr 'arg', so we simplify
                     -- to make it all consistent.  It's a bit messy.
@@ -1442,7 +1440,7 @@ simplCast env body co0 cont0
                                     , sc_env  = arg_se'
                                     , sc_dup  = dup'
                                     , sc_cont = tail'
-                                    , sc_hole_ty = coercionLKind co }) } }
+                                    , sc_hole_ty = coercionLKind co }) } } }
 
         addCoerce co cont
           | isReflexiveCo co = return cont  -- Having this at the end makes a huge
@@ -1450,6 +1448,13 @@ simplCast env body co0 cont0
                                             -- See Note [Optimising reflexivity]
           | otherwise        = return (CastIt co cont)
 
+        levity_ok :: MCoercionR -> Bool
+        levity_ok MRefl = True
+        levity_ok (MCo co) = not $ isTypeLevPoly $ coercionRKind co
+          -- Without this check, we get a lev-poly arg
+          -- See Note [Levity polymorphism invariants] in GHC.Core
+          -- test: typecheck/should_run/EtaExpandLevPoly
+
 simplArg :: SimplEnv -> DupFlag -> StaticEnv -> CoreExpr
          -> SimplM (DupFlag, StaticEnv, OutExpr)
 simplArg env dup_flag arg_env arg
@@ -3114,7 +3119,7 @@ knownCon :: SimplEnv
 
 knownCon env scrut dc_floats dc dc_ty_args dc_args bndr bs rhs cont
   = do  { (floats1, env1)  <- bind_args env bs dc_args
-        ; (floats2, env2) <- bind_case_bndr env1
+        ; (floats2, env2)  <- bind_case_bndr env1
         ; (floats3, expr') <- simplExprF env2 rhs cont
         ; case dc_floats of
             [] ->
@@ -3240,6 +3245,7 @@ altsWouldDup [_] = False
 altsWouldDup (alt:alts)
   | is_bot_alt alt = altsWouldDup alts
   | otherwise      = not (all is_bot_alt alts)
+    -- otherwise case: first alt is non-bot, so all the rest must be bot
   where
     is_bot_alt (_,_,rhs) = exprIsDeadEnd rhs
 
diff --git a/compiler/GHC/Core/Opt/Simplify/Env.hs b/compiler/GHC/Core/Opt/Simplify/Env.hs
index 4ceaf637ed..7049e3e578 100644
--- a/compiler/GHC/Core/Opt/Simplify/Env.hs
+++ b/compiler/GHC/Core/Opt/Simplify/Env.hs
@@ -596,7 +596,7 @@ addJoinFlts :: JoinFloats -> JoinFloats -> JoinFloats
 addJoinFlts = appOL
 
 mkRecFloats :: SimplFloats -> SimplFloats
--- Flattens the floats from env2 into a single Rec group,
+-- Flattens the floats into a single Rec group,
 -- They must either all be lifted LetFloats or all JoinFloats
 mkRecFloats floats@(SimplFloats { sfLetFloats  = LetFloats bs ff
                                 , sfJoinFloats = jbs
diff --git a/compiler/GHC/Core/Opt/Specialise.hs b/compiler/GHC/Core/Opt/Specialise.hs
index 8e9e35d208..ba5679778b 100644
--- a/compiler/GHC/Core/Opt/Specialise.hs
+++ b/compiler/GHC/Core/Opt/Specialise.hs
@@ -21,22 +21,21 @@ import GHC.Tc.Utils.TcType hiding( substTy )
 import GHC.Core.Type  hiding( substTy, extendTvSubstList )
 import GHC.Core.Multiplicity
 import GHC.Core.Predicate
-import GHC.Unit.Module( Module, HasModule(..) )
+import GHC.Unit.Module( Module )
 import GHC.Core.Coercion( Coercion )
 import GHC.Core.Opt.Monad
 import qualified GHC.Core.Subst as Core
-import GHC.Core.Unfold
 import GHC.Core.Unfold.Make
 import GHC.Types.Var      ( isLocalVar )
 import GHC.Types.Var.Set
 import GHC.Types.Var.Env
 import GHC.Core
 import GHC.Core.Rules
-import GHC.Core.SimpleOpt ( collectBindersPushingCo )
 import GHC.Core.Utils     ( exprIsTrivial, getIdFromTrivialExpr_maybe
                           , mkCast, exprType )
 import GHC.Core.FVs
-import GHC.Core.Opt.Arity     ( etaExpandToJoinPointRule )
+import GHC.Core.Opt.Arity     ( collectBindersPushingCo
+                              , etaExpandToJoinPointRule )
 import GHC.Types.Unique.Supply
 import GHC.Types.Name
 import GHC.Types.Id.Make  ( voidArgId, voidPrimId )
@@ -53,12 +52,9 @@ import GHC.Utils.Misc
 import GHC.Utils.Outputable
 import GHC.Utils.Panic
 import GHC.Data.FastString
-import GHC.Utils.Monad.State
 import GHC.Types.Unique.DFM
 import GHC.Core.TyCo.Rep (TyCoBinder (..))
 
-import Control.Monad
-
 {-
 ************************************************************************
 *                                                                      *
@@ -592,28 +588,29 @@ specProgram guts@(ModGuts { mg_module = this_mod
                           , mg_binds = binds })
   = do { dflags <- getDynFlags
 
+              -- We need to start with a Subst that knows all the things
+              -- that are in scope, so that the substitution engine doesn't
+              -- accidentally re-use a unique that's already in use
+              -- Easiest thing is to do it all at once, as if all the top-level
+              -- decls were mutually recursive
+       ; let top_env = SE { se_subst = Core.mkEmptySubst $ mkInScopeSet $ mkVarSet $
+                                       bindersOfBinds binds
+                          , se_interesting = emptyVarSet
+                          , se_module = this_mod
+                          , se_dflags = dflags }
+
+             go []           = return ([], emptyUDs)
+             go (bind:binds) = do (binds', uds) <- go binds
+                                  (bind', uds') <- specBind top_env bind uds
+                                  return (bind' ++ binds', uds')
+
              -- Specialise the bindings of this module
-       ; (binds', uds) <- runSpecM dflags this_mod (go binds)
+       ; (binds', uds) <- runSpecM (go binds)
 
-       ; (spec_rules, spec_binds) <- specImports dflags this_mod top_env
-                                                 local_rules uds
+       ; (spec_rules, spec_binds) <- specImports top_env local_rules uds
 
        ; return (guts { mg_binds = spec_binds ++ binds'
                       , mg_rules = spec_rules ++ local_rules }) }
-  where
-        -- We need to start with a Subst that knows all the things
-        -- that are in scope, so that the substitution engine doesn't
-        -- accidentally re-use a unique that's already in use
-        -- Easiest thing is to do it all at once, as if all the top-level
-        -- decls were mutually recursive
-    top_env = SE { se_subst = Core.mkEmptySubst $ mkInScopeSet $ mkVarSet $
-                              bindersOfBinds binds
-                 , se_interesting = emptyVarSet }
-
-    go []           = return ([], emptyUDs)
-    go (bind:binds) = do (binds', uds) <- go binds
-                         (bind', uds') <- specBind top_env bind uds
-                         return (bind' ++ binds', uds')
 
 {-
 Note [Wrap bindings returned by specImports]
@@ -643,13 +640,13 @@ See #10491
 *                                                                      *
 ********************************************************************* -}
 
-specImports :: DynFlags -> Module -> SpecEnv
+specImports :: SpecEnv
             -> [CoreRule]
             -> UsageDetails
             -> CoreM ([CoreRule], [CoreBind])
-specImports dflags this_mod top_env local_rules
+specImports top_env local_rules
             (MkUD { ud_binds = dict_binds, ud_calls = calls })
-  | not $ gopt Opt_CrossModuleSpecialise dflags
+  | not $ gopt Opt_CrossModuleSpecialise (se_dflags top_env)
     -- See Note [Disabling cross-module specialisation]
   = return ([], wrapDictBinds dict_binds [])
 
@@ -657,8 +654,7 @@ specImports dflags this_mod top_env local_rules
   = do { hpt_rules <- getRuleBase
        ; let rule_base = extendRuleBaseList hpt_rules local_rules
 
-       ; (spec_rules, spec_binds) <- spec_imports dflags this_mod top_env
-                                                  [] rule_base
+       ; (spec_rules, spec_binds) <- spec_imports top_env [] rule_base
                                                   dict_binds calls
 
              -- Don't forget to wrap the specialized bindings with
@@ -674,9 +670,7 @@ specImports dflags this_mod top_env local_rules
     }
 
 -- | Specialise a set of calls to imported bindings
-spec_imports :: DynFlags
-             -> Module
-             -> SpecEnv          -- Passed in so that all top-level Ids are in scope
+spec_imports :: SpecEnv          -- Passed in so that all top-level Ids are in scope
              -> [Id]             -- Stack of imported functions being specialised
                                  -- See Note [specImport call stack]
              -> RuleBase         -- Rules from this module and the home package
@@ -686,8 +680,7 @@ spec_imports :: DynFlags
              -> CallDetails      -- Calls for imported things
              -> CoreM ( [CoreRule]   -- New rules
                       , [CoreBind] ) -- Specialised bindings
-spec_imports dflags this_mod top_env
-             callers rule_base dict_binds calls
+spec_imports top_env callers rule_base dict_binds calls
   = do { let import_calls = dVarEnvElts calls
        -- ; debugTraceMsg (text "specImports {" <+>
        --                  vcat [ text "calls:" <+> ppr import_calls
@@ -701,16 +694,13 @@ spec_imports dflags this_mod top_env
     go _ [] = return ([], [])
     go rb (cis : other_calls)
       = do { -- debugTraceMsg (text "specImport {" <+> ppr cis)
-           ; (rules1, spec_binds1) <- spec_import dflags this_mod top_env
-                                                  callers rb dict_binds cis
+           ; (rules1, spec_binds1) <- spec_import top_env callers rb dict_binds cis
            -- ; debugTraceMsg (text "specImport }" <+> ppr cis)
 
            ; (rules2, spec_binds2) <- go (extendRuleBaseList rb rules1) other_calls
            ; return (rules1 ++ rules2, spec_binds1 ++ spec_binds2) }
 
-spec_import :: DynFlags
-            -> Module
-            -> SpecEnv               -- Passed in so that all top-level Ids are in scope
+spec_import :: SpecEnv               -- Passed in so that all top-level Ids are in scope
             -> [Id]                  -- Stack of imported functions being specialised
                                      -- See Note [specImport call stack]
             -> RuleBase              -- Rules from this module
@@ -719,8 +709,7 @@ spec_import :: DynFlags
             -> CallInfoSet           -- Imported function and calls for it
             -> CoreM ( [CoreRule]    -- New rules
                      , [CoreBind] )  -- Specialised bindings
-spec_import dflags this_mod top_env callers
-            rb dict_binds cis@(CIS fn _)
+spec_import top_env callers rb dict_binds cis@(CIS fn _)
   | isIn "specImport" fn callers
   = return ([], [])     -- No warning.  This actually happens all the time
                         -- when specialising a recursive function, because
@@ -731,8 +720,7 @@ spec_import dflags this_mod top_env callers
   = do { -- debugTraceMsg (text "specImport:no valid calls")
        ; return ([], []) }
 
-  | wantSpecImport dflags unfolding
-  , Just rhs <- maybeUnfoldingTemplate unfolding
+  | Just rhs <- canSpecImport dflags fn
   = do {     -- Get rules from the external package state
              -- We keep doing this in case we "page-fault in"
              -- more rules as we go along
@@ -744,8 +732,8 @@ spec_import dflags this_mod top_env callers
 
        ; (rules1, spec_pairs, MkUD { ud_binds = dict_binds1, ud_calls = new_calls })
              <- do { -- debugTraceMsg (text "specImport1" <+> vcat [ppr fn, ppr good_calls, ppr rhs])
-                   ; runSpecM dflags this_mod $
-                     specCalls (Just this_mod) top_env rules_for_fn good_calls fn rhs }
+                   ; runSpecM $
+                     specCalls True top_env rules_for_fn good_calls fn rhs }
        ; let spec_binds1 = [NonRec b r | (b,r) <- spec_pairs]
              -- After the rules kick in we may get recursion, but
              -- we rely on a global GlomBinds to sort that out later
@@ -753,7 +741,7 @@ spec_import dflags this_mod top_env callers
 
               -- Now specialise any cascaded calls
        -- ; debugTraceMsg (text "specImport 2" <+> (ppr fn $$ ppr rules1 $$ ppr spec_binds1))
-       ; (rules2, spec_binds2) <- spec_imports dflags this_mod top_env
+       ; (rules2, spec_binds2) <- spec_imports top_env
                                                (fn:callers)
                                                (extendRuleBaseList rb rules1)
                                                (dict_binds `unionBags` dict_binds1)
@@ -769,11 +757,34 @@ spec_import dflags this_mod top_env callers
        ; return ([], [])}
 
   where
-    unfolding = realIdUnfolding fn   -- We want to see the unfolding even for loop breakers
+    dflags = se_dflags top_env
     good_calls = filterCalls cis dict_binds
        -- SUPER IMPORTANT!  Drop calls that (directly or indirectly) refer to fn
        -- See Note [Avoiding loops in specImports]
 
+canSpecImport :: DynFlags -> Id -> Maybe CoreExpr
+-- See Note [Specialise imported INLINABLE things]
+canSpecImport dflags fn
+  | CoreUnfolding { uf_src = src, uf_tmpl = rhs } <- unf
+  , isStableSource src
+  = Just rhs   -- By default, specialise only imported things that have a stable
+               -- unfolding; that is, have an INLINE or INLINABLE pragma
+               -- Specialise even INLINE things; it hasn't inlined yet,
+               -- so perhaps it never will.  Moreover it may have calls
+               -- inside it that we want to specialise
+
+    -- CoreUnfolding case does /not/ include DFunUnfoldings;
+    -- We only specialise DFunUnfoldings with -fspecialise-aggressively
+    -- See Note [Do not specialise imported DFuns]
+
+  | gopt Opt_SpecialiseAggressively dflags
+  = maybeUnfoldingTemplate unf  -- With -fspecialise-aggressively, specialise anything
+                                -- with an unfolding, stable or not, DFun or not
+
+  | otherwise = Nothing
+  where
+    unf = realIdUnfolding fn   -- We want to see the unfolding even for loop breakers
+
 -- | Returns whether or not to show a missed-spec warning.
 -- If -Wall-missed-specializations is on, show the warning.
 -- Otherwise, if -Wmissed-specializations is on, only show a warning
@@ -798,24 +809,47 @@ tryWarnMissingSpecs dflags callers fn calls_for_fn
           , whenPprDebug (text "calls:" <+> vcat (map (pprCallInfo fn) calls_for_fn))
           , text "Probable fix: add INLINABLE pragma on" <+> quotes (ppr fn) ])
 
-wantSpecImport :: DynFlags -> Unfolding -> Bool
--- See Note [Specialise imported INLINABLE things]
-wantSpecImport dflags unf
- = case unf of
-     NoUnfolding      -> False
-     BootUnfolding    -> False
-     OtherCon {}      -> False
-     DFunUnfolding {} -> True
-     CoreUnfolding { uf_src = src, uf_guidance = _guidance }
-       | gopt Opt_SpecialiseAggressively dflags -> True
-       | isStableSource src -> True
-               -- Specialise even INLINE things; it hasn't inlined yet,
-               -- so perhaps it never will.  Moreover it may have calls
-               -- inside it that we want to specialise
-       | otherwise -> False    -- Stable, not INLINE, hence INLINABLE
 
-{- Note [Avoiding loops in specImports]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+{- Note [Do not specialise imported DFuns]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Ticket #18223 shows that specialising calls of DFuns is can cause a huge
+and entirely unnecessary blowup in program size.  Consider a call to
+    f @[[[[[[[[T]]]]]]]] d1 x
+where df :: C a => C [a]
+      d1 :: C [[[[[[[[T]]]]]]]] = dfC[] @[[[[[[[T]]]]]]] d1
+      d2 :: C [[[[[[[T]]]]]]]   = dfC[] @[[[[[[T]]]]]] d3
+      ...
+Now we'll specialise f's RHS, which may give rise to calls to 'g',
+also overloaded, which we will specialise, and so on.  However, if
+we specialise the calls to dfC[], we'll generate specialised copies of
+all methods of C, at all types; and the same for C's superclasses.
+
+And many of these specialised functions will never be called.  We are
+going to call the specialised 'f', and the specialised 'g', but DFuns
+group functions into a tuple, many of whose elements may never be used.
+
+With deeply-nested types this can lead to a simply overwhelming number
+of specialisations: see #18223 for a simple example (from the wild).
+I measured the number of specialisations for various numbers of calls
+of `flip evalStateT ()`, and got this
+
+                       Size after one simplification
+  #calls    #SPEC rules    Terms     Types
+      5         56          3100     10600
+      9        108         13660     77206
+
+The real tests case has 60+ calls, which blew GHC out of the water.
+
+Solution: don't specialise DFuns.  The downside is that if we end
+up with (h (dfun d)), /and/ we don't specialise 'h', then we won't
+pass to 'h' a tuple of specialised functions.
+
+However, the flag -fspecialise-aggressively (experimental, off by default)
+allows DFuns to specialise as well.
+
+Note [Avoiding loops in specImports]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 We must take great care when specialising instance declarations
 (functions like $fOrdList) lest we accidentally build a recursive
 dictionary. See Note [Avoiding loops].
@@ -1003,6 +1037,9 @@ data SpecEnv
              -- Dict Ids that we know something about
              -- and hence may be worth specialising against
              -- See Note [Interesting dictionary arguments]
+
+       , se_module :: Module
+       , se_dflags :: DynFlags
      }
 
 instance Outputable SpecEnv where
@@ -1310,7 +1347,7 @@ specDefn :: SpecEnv
 specDefn env body_uds fn rhs
   = do { let (body_uds_without_me, calls_for_me) = callsForMe fn body_uds
              rules_for_me = idCoreRules fn
-       ; (rules, spec_defns, spec_uds) <- specCalls Nothing env rules_for_me
+       ; (rules, spec_defns, spec_uds) <- specCalls False env rules_for_me
                                                     calls_for_me fn rhs
        ; return ( fn `addIdSpecialisations` rules
                 , spec_defns
@@ -1323,8 +1360,8 @@ specDefn env body_uds fn rhs
                 -- body_uds_without_me
 
 ---------------------------
-specCalls :: Maybe Module      -- Just this_mod  =>  specialising imported fn
-                               -- Nothing        =>  specialising local fn
+specCalls :: Bool              -- True  =>  specialising imported fn
+                               -- False =>  specialising local fn
           -> SpecEnv
           -> [CoreRule]        -- Existing RULES for the fn
           -> [CallInfo]
@@ -1339,7 +1376,7 @@ type SpecInfo = ( [CoreRule]       -- Specialisation rules
                 , [(Id,CoreExpr)]  -- Specialised definition
                 , UsageDetails )   -- Usage details from specialised RHSs
 
-specCalls mb_mod env existing_rules calls_for_me fn rhs
+specCalls spec_imp env existing_rules calls_for_me fn rhs
         -- The first case is the interesting one
   |  notNull calls_for_me               -- And there are some calls to specialise
   && not (isNeverActive (idInlineActivation fn))
@@ -1370,7 +1407,9 @@ specCalls mb_mod env existing_rules calls_for_me fn rhs
     inl_act   = inlinePragmaActivation inl_prag
     is_local  = isLocalId fn
     is_dfun   = isDFunId fn
-
+    dflags    = se_dflags env
+    ropts     = initRuleOpts dflags
+    this_mod  = se_module env
         -- Figure out whether the function has an INLINE pragma
         -- See Note [Inline specialisations]
 
@@ -1412,8 +1451,6 @@ specCalls mb_mod env existing_rules calls_for_me fn rhs
 --                                        , ppr dx_binds ]) $
 --             return ()
 
-           ; dflags <- getDynFlags
-           ; let ropts = initRuleOpts dflags
            ; if not useful  -- No useful specialisation
                 || already_covered ropts rules_acc rule_lhs_args
              then return spec_acc
@@ -1441,17 +1478,15 @@ specCalls mb_mod env existing_rules calls_for_me fn rhs
                                  = Nothing
 
            ; spec_fn <- newSpecIdSM fn spec_fn_ty spec_join_arity
-           ; this_mod <- getModule
            ; let
                 -- The rule to put in the function's specialisation is:
                 --      forall x @b d1' d2'.
                 --          f x @T1 @b @T2 d1' d2' = f1 x @b
                 -- See Note [Specialising Calls]
-                herald = case mb_mod of
-                           Nothing        -- Specialising local fn
-                               -> text "SPEC"
-                           Just this_mod  -- Specialising imported fn
-                               -> text "SPEC/" <> ppr this_mod
+                herald | spec_imp  = -- Specialising imported fn
+                                     text "SPEC/" <> ppr this_mod
+                       | otherwise = -- Specialising local fn
+                                     text "SPEC"
 
                 rule_name = mkFastString $ showSDoc dflags $
                             herald <+> ftext (occNameFS (getOccName fn))
@@ -2480,15 +2515,15 @@ mkCallUDs env f args
     res = mkCallUDs' env f args
 
 mkCallUDs' env f args
-  |  not (want_calls_for f)  -- Imported from elsewhere
-  || null ci_key             -- No useful specialisation
-   -- See also Note [Specialisations already covered]
+  | wantCallsFor env f    -- We want it, and...
+  , not (null ci_key)     -- this call site has a useful specialisation
+  = -- pprTrace "mkCallUDs: keeping" _trace_doc
+    singleCall f ci_key
+
+  | otherwise  -- See also Note [Specialisations already covered]
   = -- pprTrace "mkCallUDs: discarding" _trace_doc
     emptyUDs
 
-  | otherwise
-  = -- pprTrace "mkCallUDs: keeping" _trace_doc
-    singleCall f ci_key
   where
     _trace_doc = vcat [ppr f, ppr args, ppr ci_key]
     pis                = fst $ splitPiTys $ idType f
@@ -2525,12 +2560,23 @@ mkCallUDs' env f args
     mk_spec_arg _ (Anon VisArg _)
       = UnspecArg
 
-    want_calls_for f = isLocalId f || isJust (maybeUnfoldingTemplate (realIdUnfolding f))
-         -- For imported things, we gather call instances if
-         -- there is an unfolding that we could in principle specialise
-         -- We might still decide not to use it (consulting dflags)
-         -- in specImports
-         -- Use 'realIdUnfolding' to ignore the loop-breaker flag!
+wantCallsFor :: SpecEnv -> Id -> Bool
+wantCallsFor _env _f = True
+ -- We could reduce the size of the UsageDetails by being less eager
+ -- about collecting calls for LocalIds: there is no point for
+ -- ones that are lambda-bound.  We can't decide this by looking at
+ -- the (absence of an) unfolding, because unfoldings for local
+ -- functions are discarded by cloneBindSM, so no local binder will
+ -- have an unfolding at this stage.  We'd have to keep a candidate
+ -- set of let-binders.
+ --
+ -- Not many lambda-bound variables have dictionary arguments, so
+ -- this would make little difference anyway.
+ --
+ -- For imported Ids we could check for an unfolding, but we have to
+ -- do so anyway in canSpecImport, and it seems better to have it
+ -- all in one place.  So we simply collect usage info for imported
+ -- overloaded functions.
 
 {- Note [Type determines value]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2780,55 +2826,12 @@ deleteCallsFor bs calls = delDVarEnvList calls bs
 ************************************************************************
 -}
 
-newtype SpecM a = SpecM (State SpecState a) deriving (Functor)
-
-data SpecState = SpecState {
-                     spec_uniq_supply :: UniqSupply,
-                     spec_module :: Module,
-                     spec_dflags :: DynFlags
-                 }
-
-instance Applicative SpecM where
-    pure x = SpecM $ return x
-    (<*>) = ap
-
-instance Monad SpecM where
-    SpecM x >>= f = SpecM $ do y <- x
-                               case f y of
-                                   SpecM z ->
-                                       z
-
-instance MonadFail SpecM where
-   fail str = SpecM $ error str
-
-instance MonadUnique SpecM where
-    getUniqueSupplyM
-        = SpecM $ do st <- get
-                     let (us1, us2) = splitUniqSupply $ spec_uniq_supply st
-                     put $ st { spec_uniq_supply = us2 }
-                     return us1
-
-    getUniqueM
-        = SpecM $ do st <- get
-                     let (u,us') = takeUniqFromSupply $ spec_uniq_supply st
-                     put $ st { spec_uniq_supply = us' }
-                     return u
-
-instance HasDynFlags SpecM where
-    getDynFlags = SpecM $ liftM spec_dflags get
-
-instance HasModule SpecM where
-    getModule = SpecM $ liftM spec_module get
-
-runSpecM :: DynFlags -> Module -> SpecM a -> CoreM a
-runSpecM dflags this_mod (SpecM spec)
-    = do us <- getUniqueSupplyM
-         let initialState = SpecState {
-                                spec_uniq_supply = us,
-                                spec_module = this_mod,
-                                spec_dflags = dflags
-                            }
-         return $ evalState spec initialState
+type SpecM a = UniqSM a
+
+runSpecM :: SpecM a -> CoreM a
+runSpecM thing_inside
+  = do { us <- getUniqueSupplyM
+       ; return (initUs_ us thing_inside) }
 
 mapAndCombineSM :: (a -> SpecM (b, UsageDetails)) -> [a] -> SpecM ([b], UsageDetails)
 mapAndCombineSM _ []     = return ([], emptyUDs)