summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Klebinger <klebinger.andreas@gmx.at>2020-04-13 15:53:46 +0200
committerAndreas Klebinger <klebinger.andreas@gmx.at>2020-04-14 02:46:45 +0200
commitc9e7c782bc1414f960d1116267c683da28f961df (patch)
tree6e0aa5d477e4ec188e131f8bb8c28b1ccd5ade5b
parentec44e160ec3a11d398e07e4c04844c56574ddf50 (diff)
downloadhaskell-wip/andreask/elem_rule_rework.tar.gz
Update comments/noteswip/andreask/elem_rule_rework
Also use eqExpr for match_append_lit.
-rw-r--r--compiler/GHC/Core/Op/ConstantFold.hs101
-rw-r--r--libraries/ghc-prim/GHC/CString.hs58
2 files changed, 124 insertions, 35 deletions
diff --git a/compiler/GHC/Core/Op/ConstantFold.hs b/compiler/GHC/Core/Op/ConstantFold.hs
index 251f25244c..c3ea80dce9 100644
--- a/compiler/GHC/Core/Op/ConstantFold.hs
+++ b/compiler/GHC/Core/Op/ConstantFold.hs
@@ -36,6 +36,7 @@ import GHC.Types.Id
import GHC.Types.Literal
import GHC.Core.SimpleOpt ( exprIsLiteral_maybe, exprIsLambda_maybe
, exprIsStrippableLiteral_maybe )
+import GHC.Core.FVs
import PrimOp ( PrimOp(..), tagToEnumKey )
import TysWiredIn
import TysPrim
@@ -44,7 +45,7 @@ import GHC.Core.TyCon
, isNewTyCon, unwrapNewTyCon_maybe, tyConDataCons
, tyConFamilySize )
import GHC.Core.DataCon ( dataConTagZ, dataConTyCon, dataConWrapId, dataConWorkId )
-import GHC.Core.Utils ( cheapEqExpr, cheapEqExpr', exprIsHNF, exprType
+import GHC.Core.Utils ( eqExpr, cheapEqExpr, cheapEqExpr', exprIsHNF, exprType
, stripTicksTop, stripTicksTopT, mkTicks )
import GHC.Core.Unfold ( exprIsConApp_maybe )
import GHC.Core.Type
@@ -69,7 +70,8 @@ import Data.Int
import Data.Ratio
import Data.Word
-
+import GHC.Types.Var.Set
+import GHC.Types.Var.Env
{-
Note [Constant folding]
~~~~~~~~~~~~~~~~~~~~~~~
@@ -1467,11 +1469,20 @@ match_append_lit foldVariant _ id_unf _
`App` c2
`App` n) <- stripTicksTop tickishFloatable e2
, unpk `hasKey` foldVariant
- , cheapEqExpr' tickishFloatable c1 c2
- , (c1Ticks, c1') <- stripTicksTop tickishFloatable c1
- , c2Ticks <- stripTicksTopT tickishFloatable c2
+ , pprTrace "match_append_lit" (ppr [ Type ty1
+ , lit1
+ , c1
+ , e2
+ ]) True
, Just (LitString s1) <- exprIsLiteral_maybe id_unf lit1
, Just (LitString s2) <- exprIsLiteral_maybe id_unf lit2
+ , pprTraceIt "eqExpr" $ eqExpr
+ (mkInScopeSet (exprFreeVars c1 `unionVarSet` exprFreeVars c2))
+ c1 c2
+ , (c1Ticks, c1') <- stripTicksTop tickishFloatable c1
+ , c2Ticks <- stripTicksTopT tickishFloatable c2
+ , pprTrace "areLits?" (ppr c2Ticks) True
+ , pprTrace "Matched!" (ppr . show $ (s1,s2)) True
= ASSERT( ty1 `eqType` ty2 )
Just $ mkTicks strTicks
$ Var unpk `App` Type ty1
@@ -1492,6 +1503,9 @@ match_eq_string _ id_unf _
| unpk_key1 <- getUnique unpk1
, unpk_key2 <- getUnique unpk2
, unpk_key1 == unpk_key2
+ -- For now we insist the literals have to agree in their encoding
+ -- to keep the rule simple. But we could check if the decoded strings
+ -- compare equal in here as well.
, unpk_key1 `elem` [unpackCStringUtf8IdKey, unpackCStringIdKey]
, Just (LitString s1) <- exprIsLiteral_maybe id_unf lit1
, Just (LitString s2) <- exprIsLiteral_maybe id_unf lit2
@@ -1506,30 +1520,39 @@ We start of with this userwritten code in haskell
elem x "<knownString>"
-which we would like to transform into:
+After optimization we would like the resul
+
+which we would like to transform this into an
+expression semantically equivalent to:
case x of
+ '<' -> True
'k' -> True
- 'k' -> True
- '_' -> True
+ 'n' -> True
+ 'o' -> True
+ ...
+ '>' -> True
-For lists this would happen automatically via rules
-associated with elem. However these do not fire for
-string literals as we do not translate string literals
-into actual lists for performance reasons. (At least
-past a certain string size)
+This is then either compiled to a binary search or a lookup table, so at
+worst log(n) in complexity.
-For string literals what happens in abensence of any
-built in rules what would happen is:
+For proper lists this happens via rules
+associated with elem (elem/build). However these do not achieve the same
+outcome when operating over string literals because we do not desugar them
+to actual lists. See Note [String literals in GHC] for how strings are
+desugared in general.
+
+For elem on string literals what would happen would be:
1) Start with:
elem x "<knownString>"
2) Desugar:
elem x (unpackCString# "<knownString>"#)
-3) Apply "unpack" RULE in GHC.Base: [Phase 2 and earlier]
+3) Apply "unpack" RULE in GHC.Base: [InitialPhase, Phase 2 and later]:
elem x (build (unpackFoldrCString# "<knownString>"#))
3) Apply "build/elem" rule [Phase 2 and later]
- + inline foldCString# [Phase 0]:
+ + inline foldCString# [Phase 0].
+ Which gives this code:
joinrec {
unpack_a1ui (nh_ayh :: Int#)
@@ -1547,24 +1570,32 @@ built in rules what would happen is:
}
Which is not *bad* we do not allocate, but we perform linear search
-on the string.
-
-So instead we use match_elem to look for applications of elem to known
-strings during all phases and transform them into case expressions
-explicitly. Depending on the content of the string we then end up
-with either a decision trees or lookup tables.
-
-Userwritten patterns of `elem x "<knownString>"` will all
-be caught during the initial phase since we match on both the
-desugared form and the form after applying the "unpack" rule.
-So it does not matter which rule ends up firing first.
-
-Should we get this pattern in a later pass it's a toin coss
-if we get a case statement or linear search. But linear search
-isn't bad enough to loose sleep over this scenario.
-
-The only exception is that we disable this rule for overly large
-strings (>= 500 bytes). This avoid code size blow up if we check
+on the string which being O(n) is in most cases worse.
+
+So what we do is:
+a) Match on elem applications to known strings.
+b) Transform them into a case via a builtin rule.
+
+However we want to match on applications before we hit phase 2 as
+otherwise the "build/elem" rule might fire first and we end up
+with linear search.
+
+For this reason we match on both, the desugared from and the form
+after the "unpack" rule has been applied. This way it does not
+matter if the unpack rule or the builtin rule fires first and we
+can finish the whole transformation in the InitialPhase during which
+"build/elem" is not yet active.
+
+Should the patterns we match on only emerge in later phases because
+of inlining or other optimizations it's a toin coss which rule fires
+first ("build/elem" or the one defined by match_elem) so we either
+get a case statement or linear search.
+But this is relatively rare and linear search isn't bad enough to
+loose sleep over this scenario. Especially since there is no easy
+way to change this.
+
+We do disable this optimiatzion for overly long strings strings
+(>= 500 bytes). This avoids code size blow up if we check
for many sparse values.
-}
diff --git a/libraries/ghc-prim/GHC/CString.hs b/libraries/ghc-prim/GHC/CString.hs
index 9a53bc1448..95a8704dbe 100644
--- a/libraries/ghc-prim/GHC/CString.hs
+++ b/libraries/ghc-prim/GHC/CString.hs
@@ -30,6 +30,64 @@ module GHC.CString (
import GHC.Types
import GHC.Prim
+{-
+Note [String literals in GHC]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+String literals get quite a bit of special handling in GHC. This Note
+summarises the moving parts.
+
+* Desugaring: see GHC.HsToCore.Match.Literal.dsLit, which in
+ turn calls GHC.Core.Make.mkStringExprFS.
+
+ The desugarer desugars the Haskell literal "foo" into Core
+ GHC.CString.unpackCString# "foo"#
+ where "foo"# is primitive string literal (of type Addr#).
+
+ When the string cannot be encoded as a C string, we use UTF8:
+ GHC.CString.unpackCStringUtf8# "foo"#
+
+* The library module ghc-prim:GHC.CString has a bunch of functions that
+ work over primitive strings, including GHC.CString.unpackCString#
+
+* GHC.Core.Op.ConstantFold has some RULES that optimise treatment of
+ literal strings. These include things like:
+
+ + Special handling of elem over string literals.
+ + Constant folding the desugared form of ("foo" ++ "bar")
+ into ("foobar")
+ + and more
+
+* GHC.Base has a number of regular rules for String literals.
+
+ + a rule "eqString": (==) @String = eqString
+ where GHC.Base.eqString :: String -> String -> Bool
+
+ ConstantFold has a RULE for eqString on literals:
+ eqString (Lit "foo"#) (Lit "bar"#) --> False
+
+ This allows compile time evaluation of things like "foo" == "bar"
+
+ + A bunch of rules to promote fusion:
+
+ "unpack" [~1] forall a . unpackCString# a = build (unpackFoldrCString# a)
+ "unpack-list" [1] forall a . unpackFoldrCString# a (:) [] = unpackCString# a
+ "unpack-append" forall a n . unpackFoldrCString# a (:) n = unpackAppendCString# a n
+
+ And UTF8 variants of these rules.
+
+* We allow primitive (unlifted) literal strings to be top-level
+ bindings, breaking out usual rule. See GHC.Core
+ Note [Core top-level string literals]
+
+* TODO: There is work on a special code-gen path for top-level boxed strings
+ str :: [Char]
+ str = unpackCString# "foo"#
+ so that they can all share a common code pointer
+
+ There is a WIP MR on gitlab for this: !3012
+
+-}
+
-----------------------------------------------------------------------------
-- Unpacking C strings
-----------------------------------------------------------------------------