From c9e7c782bc1414f960d1116267c683da28f961df Mon Sep 17 00:00:00 2001 From: Andreas Klebinger Date: Mon, 13 Apr 2020 15:53:46 +0200 Subject: Update comments/notes Also use eqExpr for match_append_lit. --- compiler/GHC/Core/Op/ConstantFold.hs | 101 +++++++++++++++++++++++------------ libraries/ghc-prim/GHC/CString.hs | 58 ++++++++++++++++++++ 2 files changed, 124 insertions(+), 35 deletions(-) diff --git a/compiler/GHC/Core/Op/ConstantFold.hs b/compiler/GHC/Core/Op/ConstantFold.hs index 251f25244c..c3ea80dce9 100644 --- a/compiler/GHC/Core/Op/ConstantFold.hs +++ b/compiler/GHC/Core/Op/ConstantFold.hs @@ -36,6 +36,7 @@ import GHC.Types.Id import GHC.Types.Literal import GHC.Core.SimpleOpt ( exprIsLiteral_maybe, exprIsLambda_maybe , exprIsStrippableLiteral_maybe ) +import GHC.Core.FVs import PrimOp ( PrimOp(..), tagToEnumKey ) import TysWiredIn import TysPrim @@ -44,7 +45,7 @@ import GHC.Core.TyCon , isNewTyCon, unwrapNewTyCon_maybe, tyConDataCons , tyConFamilySize ) import GHC.Core.DataCon ( dataConTagZ, dataConTyCon, dataConWrapId, dataConWorkId ) -import GHC.Core.Utils ( cheapEqExpr, cheapEqExpr', exprIsHNF, exprType +import GHC.Core.Utils ( eqExpr, cheapEqExpr, cheapEqExpr', exprIsHNF, exprType , stripTicksTop, stripTicksTopT, mkTicks ) import GHC.Core.Unfold ( exprIsConApp_maybe ) import GHC.Core.Type @@ -69,7 +70,8 @@ import Data.Int import Data.Ratio import Data.Word - +import GHC.Types.Var.Set +import GHC.Types.Var.Env {- Note [Constant folding] ~~~~~~~~~~~~~~~~~~~~~~~ @@ -1467,11 +1469,20 @@ match_append_lit foldVariant _ id_unf _ `App` c2 `App` n) <- stripTicksTop tickishFloatable e2 , unpk `hasKey` foldVariant - , cheapEqExpr' tickishFloatable c1 c2 - , (c1Ticks, c1') <- stripTicksTop tickishFloatable c1 - , c2Ticks <- stripTicksTopT tickishFloatable c2 + , pprTrace "match_append_lit" (ppr [ Type ty1 + , lit1 + , c1 + , e2 + ]) True , Just (LitString s1) <- exprIsLiteral_maybe id_unf lit1 , Just (LitString s2) <- exprIsLiteral_maybe id_unf lit2 + , pprTraceIt "eqExpr" $ eqExpr + (mkInScopeSet (exprFreeVars c1 `unionVarSet` exprFreeVars c2)) + c1 c2 + , (c1Ticks, c1') <- stripTicksTop tickishFloatable c1 + , c2Ticks <- stripTicksTopT tickishFloatable c2 + , pprTrace "areLits?" (ppr c2Ticks) True + , pprTrace "Matched!" (ppr . show $ (s1,s2)) True = ASSERT( ty1 `eqType` ty2 ) Just $ mkTicks strTicks $ Var unpk `App` Type ty1 @@ -1492,6 +1503,9 @@ match_eq_string _ id_unf _ | unpk_key1 <- getUnique unpk1 , unpk_key2 <- getUnique unpk2 , unpk_key1 == unpk_key2 + -- For now we insist the literals have to agree in their encoding + -- to keep the rule simple. But we could check if the decoded strings + -- compare equal in here as well. , unpk_key1 `elem` [unpackCStringUtf8IdKey, unpackCStringIdKey] , Just (LitString s1) <- exprIsLiteral_maybe id_unf lit1 , Just (LitString s2) <- exprIsLiteral_maybe id_unf lit2 @@ -1506,30 +1520,39 @@ We start of with this userwritten code in haskell elem x "" -which we would like to transform into: +After optimization we would like the resul + +which we would like to transform this into an +expression semantically equivalent to: case x of + '<' -> True 'k' -> True - 'k' -> True - '_' -> True + 'n' -> True + 'o' -> True + ... + '>' -> True -For lists this would happen automatically via rules -associated with elem. However these do not fire for -string literals as we do not translate string literals -into actual lists for performance reasons. (At least -past a certain string size) +This is then either compiled to a binary search or a lookup table, so at +worst log(n) in complexity. -For string literals what happens in abensence of any -built in rules what would happen is: +For proper lists this happens via rules +associated with elem (elem/build). However these do not achieve the same +outcome when operating over string literals because we do not desugar them +to actual lists. See Note [String literals in GHC] for how strings are +desugared in general. + +For elem on string literals what would happen would be: 1) Start with: elem x "" 2) Desugar: elem x (unpackCString# ""#) -3) Apply "unpack" RULE in GHC.Base: [Phase 2 and earlier] +3) Apply "unpack" RULE in GHC.Base: [InitialPhase, Phase 2 and later]: elem x (build (unpackFoldrCString# ""#)) 3) Apply "build/elem" rule [Phase 2 and later] - + inline foldCString# [Phase 0]: + + inline foldCString# [Phase 0]. + Which gives this code: joinrec { unpack_a1ui (nh_ayh :: Int#) @@ -1547,24 +1570,32 @@ built in rules what would happen is: } Which is not *bad* we do not allocate, but we perform linear search -on the string. - -So instead we use match_elem to look for applications of elem to known -strings during all phases and transform them into case expressions -explicitly. Depending on the content of the string we then end up -with either a decision trees or lookup tables. - -Userwritten patterns of `elem x ""` will all -be caught during the initial phase since we match on both the -desugared form and the form after applying the "unpack" rule. -So it does not matter which rule ends up firing first. - -Should we get this pattern in a later pass it's a toin coss -if we get a case statement or linear search. But linear search -isn't bad enough to loose sleep over this scenario. - -The only exception is that we disable this rule for overly large -strings (>= 500 bytes). This avoid code size blow up if we check +on the string which being O(n) is in most cases worse. + +So what we do is: +a) Match on elem applications to known strings. +b) Transform them into a case via a builtin rule. + +However we want to match on applications before we hit phase 2 as +otherwise the "build/elem" rule might fire first and we end up +with linear search. + +For this reason we match on both, the desugared from and the form +after the "unpack" rule has been applied. This way it does not +matter if the unpack rule or the builtin rule fires first and we +can finish the whole transformation in the InitialPhase during which +"build/elem" is not yet active. + +Should the patterns we match on only emerge in later phases because +of inlining or other optimizations it's a toin coss which rule fires +first ("build/elem" or the one defined by match_elem) so we either +get a case statement or linear search. +But this is relatively rare and linear search isn't bad enough to +loose sleep over this scenario. Especially since there is no easy +way to change this. + +We do disable this optimiatzion for overly long strings strings +(>= 500 bytes). This avoids code size blow up if we check for many sparse values. -} diff --git a/libraries/ghc-prim/GHC/CString.hs b/libraries/ghc-prim/GHC/CString.hs index 9a53bc1448..95a8704dbe 100644 --- a/libraries/ghc-prim/GHC/CString.hs +++ b/libraries/ghc-prim/GHC/CString.hs @@ -30,6 +30,64 @@ module GHC.CString ( import GHC.Types import GHC.Prim +{- +Note [String literals in GHC] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +String literals get quite a bit of special handling in GHC. This Note +summarises the moving parts. + +* Desugaring: see GHC.HsToCore.Match.Literal.dsLit, which in + turn calls GHC.Core.Make.mkStringExprFS. + + The desugarer desugars the Haskell literal "foo" into Core + GHC.CString.unpackCString# "foo"# + where "foo"# is primitive string literal (of type Addr#). + + When the string cannot be encoded as a C string, we use UTF8: + GHC.CString.unpackCStringUtf8# "foo"# + +* The library module ghc-prim:GHC.CString has a bunch of functions that + work over primitive strings, including GHC.CString.unpackCString# + +* GHC.Core.Op.ConstantFold has some RULES that optimise treatment of + literal strings. These include things like: + + + Special handling of elem over string literals. + + Constant folding the desugared form of ("foo" ++ "bar") + into ("foobar") + + and more + +* GHC.Base has a number of regular rules for String literals. + + + a rule "eqString": (==) @String = eqString + where GHC.Base.eqString :: String -> String -> Bool + + ConstantFold has a RULE for eqString on literals: + eqString (Lit "foo"#) (Lit "bar"#) --> False + + This allows compile time evaluation of things like "foo" == "bar" + + + A bunch of rules to promote fusion: + + "unpack" [~1] forall a . unpackCString# a = build (unpackFoldrCString# a) + "unpack-list" [1] forall a . unpackFoldrCString# a (:) [] = unpackCString# a + "unpack-append" forall a n . unpackFoldrCString# a (:) n = unpackAppendCString# a n + + And UTF8 variants of these rules. + +* We allow primitive (unlifted) literal strings to be top-level + bindings, breaking out usual rule. See GHC.Core + Note [Core top-level string literals] + +* TODO: There is work on a special code-gen path for top-level boxed strings + str :: [Char] + str = unpackCString# "foo"# + so that they can all share a common code pointer + + There is a WIP MR on gitlab for this: !3012 + +-} + ----------------------------------------------------------------------------- -- Unpacking C strings ----------------------------------------------------------------------------- -- cgit v1.2.1