diff options
-rw-r--r-- | compiler/prelude/PrelNames.hs | 27 | ||||
-rw-r--r-- | compiler/prelude/PrelRules.hs | 45 | ||||
-rw-r--r-- | libraries/base/GHC/Base.hs | 8 | ||||
-rw-r--r-- | libraries/base/GHC/List.hs | 4 | ||||
-rw-r--r-- | libraries/base/changelog.md | 13 | ||||
-rw-r--r-- | libraries/base/tests/perf/Makefile | 15 | ||||
-rw-r--r-- | libraries/base/tests/perf/T17752.hs | 18 | ||||
-rw-r--r-- | libraries/base/tests/perf/T17752.stdout | 2 | ||||
-rw-r--r-- | libraries/base/tests/perf/all.T | 5 | ||||
-rw-r--r-- | libraries/ghc-prim/GHC/CString.hs | 273 | ||||
-rw-r--r-- | libraries/ghc-prim/changelog.md | 21 |
11 files changed, 360 insertions, 71 deletions
diff --git a/compiler/prelude/PrelNames.hs b/compiler/prelude/PrelNames.hs index 978c6f5b26..4f5ecde4f2 100644 --- a/compiler/prelude/PrelNames.hs +++ b/compiler/prelude/PrelNames.hs @@ -343,8 +343,9 @@ basicKnownKeyNames groupWithName, -- Strings and lists - unpackCStringName, - unpackCStringFoldrName, unpackCStringUtf8Name, + unpackCStringName, unpackCStringUtf8Name, + unpackCStringFoldrName, unpackCStringFoldrUtf8Name, + cstringLengthName, -- Overloaded lists isListClassName, @@ -704,11 +705,12 @@ ioDataCon_RDR :: RdrName ioDataCon_RDR = nameRdrName ioDataConName eqString_RDR, unpackCString_RDR, unpackCStringFoldr_RDR, - unpackCStringUtf8_RDR :: RdrName + unpackCStringFoldrUtf8_RDR, unpackCStringUtf8_RDR :: RdrName eqString_RDR = nameRdrName eqStringName unpackCString_RDR = nameRdrName unpackCStringName unpackCStringFoldr_RDR = nameRdrName unpackCStringFoldrName unpackCStringUtf8_RDR = nameRdrName unpackCStringUtf8Name +unpackCStringFoldrUtf8_RDR = nameRdrName unpackCStringFoldrUtf8Name newStablePtr_RDR :: RdrName newStablePtr_RDR = nameRdrName newStablePtrName @@ -1001,11 +1003,13 @@ modIntName = varQual gHC_CLASSES (fsLit "modInt#") modIntIdKey -- Base strings Strings unpackCStringName, unpackCStringFoldrName, - unpackCStringUtf8Name, eqStringName :: Name + unpackCStringUtf8Name, unpackCStringFoldrUtf8Name, + eqStringName, cstringLengthName :: Name unpackCStringName = varQual gHC_CSTRING (fsLit "unpackCString#") unpackCStringIdKey unpackCStringFoldrName = varQual gHC_CSTRING (fsLit "unpackFoldrCString#") unpackCStringFoldrIdKey unpackCStringUtf8Name = varQual gHC_CSTRING (fsLit "unpackCStringUtf8#") unpackCStringUtf8IdKey eqStringName = varQual gHC_BASE (fsLit "eqString") eqStringIdKey +unpackCStringFoldrUtf8Name = varQual gHC_CSTRING (fsLit "unpackFoldrCStringUtf8#") unpackCStringFoldrUtf8IdKey -- The 'inline' function inlineIdName :: Name @@ -2080,7 +2084,8 @@ wildCardKey, absentErrorIdKey, augmentIdKey, appendIdKey, runtimeErrorIdKey, patErrorIdKey, voidPrimIdKey, realWorldPrimIdKey, recConErrorIdKey, unpackCStringUtf8IdKey, unpackCStringAppendIdKey, - unpackCStringFoldrIdKey, unpackCStringIdKey, + unpackCStringFoldrIdKey, unpackCStringFoldrUtf8IdKey, + unpackCStringIdKey, typeErrorIdKey, divIntIdKey, modIntIdKey, absentSumFieldErrorIdKey :: Unique @@ -2103,12 +2108,14 @@ recConErrorIdKey = mkPreludeMiscIdUnique 16 unpackCStringUtf8IdKey = mkPreludeMiscIdUnique 17 unpackCStringAppendIdKey = mkPreludeMiscIdUnique 18 unpackCStringFoldrIdKey = mkPreludeMiscIdUnique 19 + unpackCStringIdKey = mkPreludeMiscIdUnique 20 -voidPrimIdKey = mkPreludeMiscIdUnique 21 -typeErrorIdKey = mkPreludeMiscIdUnique 22 -divIntIdKey = mkPreludeMiscIdUnique 23 -modIntIdKey = mkPreludeMiscIdUnique 24 -absentSumFieldErrorIdKey = mkPreludeMiscIdUnique 9 +unpackCStringFoldrUtf8IdKey = mkPreludeMiscIdUnique 21 +voidPrimIdKey = mkPreludeMiscIdUnique 22 +typeErrorIdKey = mkPreludeMiscIdUnique 23 +divIntIdKey = mkPreludeMiscIdUnique 24 +modIntIdKey = mkPreludeMiscIdUnique 25 +cstringLengthIdKey = mkPreludeMiscIdUnique 26 unsafeCoerceIdKey, concatIdKey, filterIdKey, zipIdKey, bindIOIdKey, returnIOIdKey, newStablePtrIdKey, diff --git a/compiler/prelude/PrelRules.hs b/compiler/prelude/PrelRules.hs index ffee79da36..370a026768 100644 --- a/compiler/prelude/PrelRules.hs +++ b/compiler/prelude/PrelRules.hs @@ -1206,7 +1206,10 @@ builtinRules :: [CoreRule] builtinRules = [BuiltinRule { ru_name = fsLit "AppendLitString", ru_fn = unpackCStringFoldrName, - ru_nargs = 4, ru_try = match_append_lit }, + ru_nargs = 4, ru_try = match_append_lit_C }, + BuiltinRule { ru_name = fsLit "AppendLitStringUtf8", + ru_fn = unpackCStringFoldrUtf8Name, + ru_nargs = 4, ru_try = match_append_lit_utf8 }, BuiltinRule { ru_name = fsLit "EqString", ru_fn = eqStringName, ru_nargs = 2, ru_try = match_eq_string }, BuiltinRule { ru_name = fsLit "Inline", ru_fn = inlineIdName, @@ -1378,11 +1381,22 @@ builtinNaturalRules = --------------------------------------------------- -- The rule is this: --- unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n) --- = unpackFoldrCString# "foobaz" c n +-- unpackFoldrCString*# "foo"# c (unpackFoldrCString*# "baz"# c n) +-- = unpackFoldrCString*# "foobaz"# c n +-- +-- See also Note [String literals in GHC] in CString.hs + +-- CString version +match_append_lit_C :: RuleFun +match_append_lit_C = match_append_lit unpackCStringFoldrIdKey -match_append_lit :: RuleFun -match_append_lit _ id_unf _ +-- CStringUTF8 version +match_append_lit_utf8 :: RuleFun +match_append_lit_utf8 = match_append_lit unpackCStringFoldrUtf8IdKey + +{-# INLINE match_append_lit #-} +match_append_lit :: Unique -> RuleFun +match_append_lit foldVariant _ id_unf _ [ Type ty1 , lit1 , c1 @@ -1395,12 +1409,13 @@ match_append_lit _ id_unf _ `App` lit2 `App` c2 `App` n) <- stripTicksTop tickishFloatable e2 - , unpk `hasKey` unpackCStringFoldrIdKey - , cheapEqExpr' tickishFloatable c1 c2 - , (c1Ticks, c1') <- stripTicksTop tickishFloatable c1 - , c2Ticks <- stripTicksTopT tickishFloatable c2 + , unpk `hasKey` foldVariant , Just (LitString s1) <- exprIsLiteral_maybe id_unf lit1 , Just (LitString s2) <- exprIsLiteral_maybe id_unf lit2 + , let freeVars = (mkInScopeSet (exprFreeVars c1 `unionVarSet` exprFreeVars c2)) + in eqExpr freeVars c1 c2 + , (c1Ticks, c1') <- stripTicksTop tickishFloatable c1 + , c2Ticks <- stripTicksTopT tickishFloatable c2 = ASSERT( ty1 `eqType` ty2 ) Just $ mkTicks strTicks $ Var unpk `App` Type ty1 @@ -1408,17 +1423,23 @@ match_append_lit _ id_unf _ `App` mkTicks (c1Ticks ++ c2Ticks) c1' `App` n -match_append_lit _ _ _ _ = Nothing +match_append_lit _ _ _ _ _ = Nothing --------------------------------------------------- -- The rule is this: -- eqString (unpackCString# (Lit s1)) (unpackCString# (Lit s2)) = s1==s2 +-- Also matches unpackCStringUtf8# match_eq_string :: RuleFun match_eq_string _ id_unf _ [Var unpk1 `App` lit1, Var unpk2 `App` lit2] - | unpk1 `hasKey` unpackCStringIdKey - , unpk2 `hasKey` unpackCStringIdKey + | unpk_key1 <- getUnique unpk1 + , unpk_key2 <- getUnique unpk2 + , unpk_key1 == unpk_key2 + -- For now we insist the literals have to agree in their encoding + -- to keep the rule simple. But we could check if the decoded strings + -- compare equal in here as well. + , unpk_key1 `elem` [unpackCStringUtf8IdKey, unpackCStringIdKey] , Just (LitString s1) <- exprIsLiteral_maybe id_unf lit1 , Just (LitString s2) <- exprIsLiteral_maybe id_unf lit2 = Just (if s1 == s2 then trueValBool else falseValBool) diff --git a/libraries/base/GHC/Base.hs b/libraries/base/GHC/Base.hs index cd0c7e2c15..dae86a9484 100644 --- a/libraries/base/GHC/Base.hs +++ b/libraries/base/GHC/Base.hs @@ -1637,7 +1637,13 @@ a `iShiftRL#` b | isTrue# (b >=# WORD_SIZE_IN_BITS#) = 0# "unpack-list" [1] forall a . unpackFoldrCString# a (:) [] = unpackCString# a "unpack-append" forall a n . unpackFoldrCString# a (:) n = unpackAppendCString# a n --- There's a built-in rule (in PrelRules.hs) for +"unpack-utf8" [~1] forall a . unpackCStringUtf8# a = build (unpackFoldrCStringUtf8# a) +"unpack-list-utf8" [1] forall a . unpackFoldrCStringUtf8# a (:) [] = unpackCStringUtf8# a +"unpack-append-utf8" forall a n . unpackFoldrCStringUtf8# a (:) n = unpackAppendCStringUtf8# a n + +-- There's a built-in rule (in GHC.Core.Op.ConstantFold) for -- unpackFoldr "foo" c (unpackFoldr "baz" c n) = unpackFoldr "foobaz" c n +-- See also the Note [String literals in GHC] in CString.hs + #-} diff --git a/libraries/base/GHC/List.hs b/libraries/base/GHC/List.hs index 6f6d9d670a..f75b9426fb 100644 --- a/libraries/base/GHC/List.hs +++ b/libraries/base/GHC/List.hs @@ -860,7 +860,7 @@ elem _ [] = False elem x (y:ys) = x==y || elem x ys {-# NOINLINE [1] elem #-} {-# RULES -"elem/build" forall x (g :: forall b . Eq a => (a -> b -> b) -> b -> b) +"elem/build" forall x (g :: forall b . (a -> b -> b) -> b -> b) . elem x (build g) = g (\ y r -> (x == y) || r) False #-} #endif @@ -874,7 +874,7 @@ notElem _ [] = True notElem x (y:ys)= x /= y && notElem x ys {-# NOINLINE [1] notElem #-} {-# RULES -"notElem/build" forall x (g :: forall b . Eq a => (a -> b -> b) -> b -> b) +"notElem/build" forall x (g :: forall b . (a -> b -> b) -> b -> b) . notElem x (build g) = g (\ y r -> (x /= y) && r) True #-} #endif diff --git a/libraries/base/changelog.md b/libraries/base/changelog.md index 953f80740d..a760f97970 100644 --- a/libraries/base/changelog.md +++ b/libraries/base/changelog.md @@ -1,5 +1,18 @@ # Changelog for [`base` package](http://hackage.haskell.org/package/base) +## 4.14.2.0 *Mar 2021* + + * Add `Generic` instances to `Fingerprint`, `GiveGCStats`, `GCFlags`, + `ConcFlags`, `DebugFlags`, `CCFlags`, `DoHeapProfile`, `ProfFlags`, + `DoTrace`, `TraceFlags`, `TickyFlags`, `ParFlags`, `RTSFlags`, `RTSStats`, + `GCStats`, `ByteOrder`, `GeneralCategory`, `SrcLoc` + + * Add rules `unpackUtf8`, `unpack-listUtf8` and `unpack-appendUtf8` to `GHC.Base`. + They correspond to their ascii versions and hopefully make it easier + for libraries to handle utf8 encoded strings efficiently. + + * An issue with list fusion and `elem` was fixed. `elem` applied to known + small lists will now compile to a simple case statement more often. ## 4.14.1.0 *Jul 2020* * Bundled with GHC 8.10.2 diff --git a/libraries/base/tests/perf/Makefile b/libraries/base/tests/perf/Makefile new file mode 100644 index 0000000000..28840dc0b9 --- /dev/null +++ b/libraries/base/tests/perf/Makefile @@ -0,0 +1,15 @@ +# This Makefile runs the tests using GHC's testsuite framework. It +# assumes the package is part of a GHC build tree with the testsuite +# installed in ../../../testsuite. + +TOP=../../../../testsuite +include $(TOP)/mk/boilerplate.mk +include $(TOP)/mk/test.mk + + +T17752: + '$(TEST_HC)' $(TEST_HC_OPTS) -O --make T17752 -rtsopts -ddump-simpl -ddump-to-file -dsuppress-uniques -dsuppress-all + # All occurences of elem should be optimized away. + # For strings these should result in loops after inlining foldCString. + # For lists it should result in a case expression. + echo $$(cat T17752.dump-simpl | grep "elem" -A4 ) diff --git a/libraries/base/tests/perf/T17752.hs b/libraries/base/tests/perf/T17752.hs new file mode 100644 index 0000000000..f7f136e1f4 --- /dev/null +++ b/libraries/base/tests/perf/T17752.hs @@ -0,0 +1,18 @@ +module T17752 where + +-- All occurences of elem should be optimized away. +-- For strings these should result in loops after inlining foldCString. +-- For lists it should result in a case expression. + +-- Should compile to a pattern match if the rules fire +isElemList x = x `elem` ['a','b','c'] +isNotElemList x = x `elem` ['x','y','z'] + +isOneOfThese x = x `elem` [1,2,3,4,5::Int] +isNotOneOfThese x = x `notElem` [1,2,3,4,5::Int] + +isElemString x = elem x "foo" +isNotElemString x = notElem x "bar" + +isElemStringUtf x = elem x "foö" +isNotElemStringUtf x = notElem x "bär" diff --git a/libraries/base/tests/perf/T17752.stdout b/libraries/base/tests/perf/T17752.stdout new file mode 100644 index 0000000000..4d70e27e02 --- /dev/null +++ b/libraries/base/tests/perf/T17752.stdout @@ -0,0 +1,2 @@ +[1 of 1] Compiling T17752 ( T17752.hs, T17752.o ) + diff --git a/libraries/base/tests/perf/all.T b/libraries/base/tests/perf/all.T new file mode 100644 index 0000000000..61f42f5420 --- /dev/null +++ b/libraries/base/tests/perf/all.T @@ -0,0 +1,5 @@ +#-------------------------------------- +# Check specialization of elem via rules +#-------------------------------------- + +test('T17752', [only_ways(['normal'])] , makefile_test, ['T17752']) diff --git a/libraries/ghc-prim/GHC/CString.hs b/libraries/ghc-prim/GHC/CString.hs index 0e6199f30f..fd0e3f6625 100644 --- a/libraries/ghc-prim/GHC/CString.hs +++ b/libraries/ghc-prim/GHC/CString.hs @@ -16,13 +16,78 @@ ----------------------------------------------------------------------------- module GHC.CString ( + -- * Ascii variants unpackCString#, unpackAppendCString#, unpackFoldrCString#, - unpackCStringUtf8#, unpackNBytes# + cstringLength#, + + -- * Utf variants + unpackCStringUtf8#, unpackAppendCStringUtf8#, unpackFoldrCStringUtf8#, + + -- * Other + unpackNBytes#, ) where import GHC.Types import GHC.Prim +{- +Note [String literals in GHC] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +String literals get quite a bit of special handling in GHC. This Note +summarises the moving parts. + +* Desugaring: see GHC.HsToCore.Match.Literal.dsLit, which in + turn calls GHC.Core.Make.mkStringExprFS. + + The desugarer desugars the Haskell literal "foo" into Core + GHC.CString.unpackCString# "foo"# + where "foo"# is primitive string literal (of type Addr#). + + When the string cannot be encoded as a C string, we use UTF8: + GHC.CString.unpackCStringUtf8# "foo"# + +* The library module ghc-prim:GHC.CString has a bunch of functions that + work over primitive strings, including GHC.CString.unpackCString# + +* GHC.Core.Op.ConstantFold has some RULES that optimise certain string + operations on literal strings. For example: + + + Constant folding the desugared form of ("foo" ++ "bar") + into ("foobar") + + Comparing strings + + and more + +* GHC.Base has a number of regular rules for String literals. + + + a rule "eqString": (==) @String = eqString + where GHC.Base.eqString :: String -> String -> Bool + + ConstantFold has a RULE for eqString on literals: + eqString (Lit "foo"#) (Lit "bar"#) --> False + + This allows compile time evaluation of things like "foo" == "bar" + + + A bunch of rules to promote fusion: + + "unpack" [~1] forall a . unpackCString# a = build (unpackFoldrCString# a) + "unpack-list" [1] forall a . unpackFoldrCString# a (:) [] = unpackCString# a + "unpack-append" forall a n . unpackFoldrCString# a (:) n = unpackAppendCString# a n + + And UTF8 variants of these rules. + +* We allow primitive (unlifted) literal strings to be top-level + bindings, breaking out usual rule. See GHC.Core + Note [Core top-level string literals] + +* TODO: There is work on a special code-gen path for top-level boxed strings + str :: [Char] + str = unpackCString# "foo"# + so that they can all share a common code pointer + + There is a WIP MR on gitlab for this: !3012 + +-} + ----------------------------------------------------------------------------- -- Unpacking C strings ----------------------------------------------------------------------------- @@ -70,6 +135,49 @@ Moreover, we want to make it CONLIKE, so that: All of this goes for unpackCStringUtf8# too. -} +{- Note [Inlining of unpackFoldrCString] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Usually the unpack-list rule turns unpackFoldrCString# into unpackCString# +It also has a BuiltInRule in PrelRules.hs: + unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n) + = unpackFoldrCString# "foobaz" c n + +We use NOINLINE [0] on the grounds that, unlike +unpackCString#, there *is* some point in inlining +unpackFoldrCString#, because we get better code for the +higher-order function call. + +This can cause a code size increase but it was minimal +when looking at nofib. + +This is especially important for elem which then results in an +allocation free loop. + + Note [unpackCString# iterating over addr] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When unpacking unpackCString# and friends repeatedly return a cons cell +containing: +* The current character we just unpacked. +* A thunk to unpack the rest of the string. + +In order to minimize the size of the thunk we do not index of +the start of the string, offsetting into it, but instead increment +the addr and always use offset 0#. + +This works since these two expressions will read from the same address. +* `indexCharOffAddr# a i` +* `indexCharOffAddr (a `plusAddr#` i) 0#` + +This way we avoid the need for the thunks to close over both the start of +the string and the current offset, saving a word for each character unpacked. + +This has the additional advantage the we can guarantee that only the +increment will happen in the loop. + +-} + unpackCString# :: Addr# -> [Char] {-# NOINLINE CONLIKE unpackCString# #-} unpackCString# addr @@ -93,60 +201,63 @@ unpackAppendCString# addr rest where !ch = indexCharOffAddr# addr nh -unpackFoldrCString# :: Addr# -> (Char -> a -> a) -> a -> a - --- Usually the unpack-list rule turns unpackFoldrCString# into unpackCString# - --- It also has a BuiltInRule in PrelRules.hs: --- unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n) --- = unpackFoldrCString# "foobaz" c n - -{-# NOINLINE unpackFoldrCString# #-} --- At one stage I had NOINLINE [0] on the grounds that, unlike --- unpackCString#, there *is* some point in inlining --- unpackFoldrCString#, because we get better code for the --- higher-order function call. BUT there may be a lot of --- literal strings, and making a separate 'unpack' loop for --- each is highly gratuitous. See nofib/real/anna/PrettyPrint. - -unpackFoldrCString# addr f z - = unpack 0# +-- Usually the unpack-list rule turns unpackFoldrCString# into unpackCString#. +-- See Note [String literals in GHC] for more details. +-- See [Inlining of unpackFoldrCString] +{-# NOINLINE[0] unpackFoldrCString# #-} +unpackFoldrCString# :: Addr# -> (Char -> a -> a) -> a -> a +unpackFoldrCString# str f z_init = go str z_init where - unpack nh + go addr z | isTrue# (ch `eqChar#` '\0'#) = z - | True = C# ch `f` unpack (nh +# 1#) + | True = C# ch `f` go (addr `plusAddr#` 1#) z where - !ch = indexCharOffAddr# addr nh + -- See Note [unpackCString# iterating over addr] + !ch = indexCharOffAddr# addr 0# -- There's really no point in inlining this for the same reasons as -- unpackCString. See Note [Inlining unpackCString#] above for details. unpackCStringUtf8# :: Addr# -> [Char] {-# NOINLINE CONLIKE unpackCStringUtf8# #-} unpackCStringUtf8# addr - = unpack 0# + | isTrue# (ch `eqChar#` '\0'# ) = [] + | True = + let !byte_count = getByteCount ch + !utf_ch = unpackUtf8Char# byte_count ch addr + !addr' = addr `plusBytes` byte_count + in C# utf_ch : unpackCStringUtf8# addr' + where + -- See Note [unpackCString# iterating over addr] + !ch = indexCharOffAddr# addr 0# + + +unpackAppendCStringUtf8# :: Addr# -> [Char] -> [Char] +{-# NOINLINE unpackAppendCStringUtf8# #-} + -- See the NOINLINE note on unpackCString# +unpackAppendCStringUtf8# addr rest + | isTrue# (ch `eqChar#` '\0'#) = rest + | True = + let !byte_count = getByteCount ch + !utf_ch = unpackUtf8Char# byte_count ch addr + !addr' = (addr `plusBytes` byte_count) + in C# utf_ch : unpackAppendCStringUtf8# addr' rest + where + -- See Note [unpackCString# iterating over addr] + !ch = indexCharOffAddr# addr 0# + +-- See Note [Inlining of unpackFoldrCString] +{-# NOINLINE[0] unpackFoldrCStringUtf8# #-} +unpackFoldrCStringUtf8# :: Addr# -> (Char -> a -> a) -> a -> a +unpackFoldrCStringUtf8# addr_init f z_init + = go addr_init z_init where - -- We take care to strictly evaluate the character decoding as - -- indexCharOffAddr# is marked with the can_fail flag and - -- consequently GHC won't evaluate the expression unless it is absolutely - -- needed. - unpack nh - | isTrue# (ch `eqChar#` '\0'# ) = [] - | isTrue# (ch `leChar#` '\x7F'#) = C# ch : unpack (nh +# 1#) - | isTrue# (ch `leChar#` '\xDF'#) = - let !c = C# (chr# (((ord# ch -# 0xC0#) `uncheckedIShiftL#` 6#) +# - (ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#))) - in c : unpack (nh +# 2#) - | isTrue# (ch `leChar#` '\xEF'#) = - let !c = C# (chr# (((ord# ch -# 0xE0#) `uncheckedIShiftL#` 12#) +# - ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 6#) +# - (ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#))) - in c : unpack (nh +# 3#) - | True = - let !c = C# (chr# (((ord# ch -# 0xF0#) `uncheckedIShiftL#` 18#) +# - ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 12#) +# - ((ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#) `uncheckedIShiftL#` 6#) +# - (ord# (indexCharOffAddr# addr (nh +# 3#)) -# 0x80#))) - in c : unpack (nh +# 4#) + go addr z + | isTrue# (ch `eqChar#` '\0'#) = z + | True = + let !byte_count = getByteCount ch + !utf_ch = unpackUtf8Char# byte_count ch addr + !addr' = (addr `plusBytes` byte_count) + in C# utf_ch `f` go addr' z where !ch = indexCharOffAddr# addr nh @@ -163,3 +274,75 @@ unpackNBytes# addr len# = unpack [] (len# -# 1#) case indexCharOffAddr# addr i# of ch -> unpack (C# ch : acc) (i# -# 1#) +-- The return type is not correct here. We really want CSize, +-- but that type is defined in base. However, CSize should always +-- match the size of a machine word (I hope), so this is probably +-- alright on all platforms that GHC supports. +foreign import ccall unsafe "strlen" c_strlen :: Addr# -> Int# + +-- | Compute the length of a NUL-terminated string. This address +-- must refer to immutable memory. GHC includes a built-in rule for +-- constant folding when the argument is a statically-known literal. +-- That is, a core-to-core pass reduces the expression +-- @cstringLength# "hello"#@ to the constant @5#@. +cstringLength# :: Addr# -> Int# +{-# INLINE[0] cstringLength# #-} +cstringLength# = c_strlen + + +------------------------------ +--- UTF8 decoding utilities +------------------------------ +-- +-- These functions make explicit the logic that was originally +-- part of unpackCStringUtf8. Since we want the same support for ascii +-- and non-ascii a variety of functions needs the same logic. Instead +-- of C&P'in the decoding logic all over we have it here once, and then +-- force GHC to inline it. +-- +-- All the overhead of the Bytes argument and calls goes away once all is +-- said and done. And what remains is readable code in Haskell land and +-- performant code in the resulting binary. + +data Bytes = One | Two | Three | Four + +{-# INLINE getByteCount #-} +getByteCount :: Char# -> Bytes +getByteCount ch + | isTrue# (ch `leChar#` '\x7F'#) = One + | isTrue# (ch `leChar#` '\xDF'#) = Two + | isTrue# (ch `leChar#` '\xEF'#) = Three + | True = Four + +{-# INLINE plusBytes #-} +plusBytes :: Addr# -> Bytes -> Addr# +plusBytes addr bytes = + case bytes of + One -> addr `plusAddr#` 1# + Two -> addr `plusAddr#` 2# + Three -> addr `plusAddr#` 3# + Four -> addr `plusAddr#` 4# + +-- | Take the current address, read unicode char of the given size. +-- We obviously want the number of bytes, but we have to read one +-- byte to determine the number of bytes for the current codepoint +-- so we might as well reuse it and avoid a read. +-- +-- Side Note: We don't dare to decode all 4 possibilities at once. +-- Reading past the end of the addr might trigger an exception. +-- For this reason we really have to check the width first and only +-- decode after. +{-# INLINE unpackUtf8Char# #-} +unpackUtf8Char# :: Bytes -> Char# -> Addr# -> Char# +unpackUtf8Char# bytes ch addr = + case bytes of + One -> ch + Two -> (chr# (((ord# ch -# 0xC0#) `uncheckedIShiftL#` 6#) +# + (ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#))) + Three -> (chr# (((ord# ch -# 0xE0#) `uncheckedIShiftL#` 12#) +# + ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 6#) +# + (ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#))) + Four -> (chr# (((ord# ch -# 0xF0#) `uncheckedIShiftL#` 18#) +# + ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 12#) +# + ((ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#) `uncheckedIShiftL#` 6#) +# + (ord# (indexCharOffAddr# (addr `plusAddr#` 3#) 0#) -# 0x80#))) diff --git a/libraries/ghc-prim/changelog.md b/libraries/ghc-prim/changelog.md index 96c648e620..1b52f06260 100644 --- a/libraries/ghc-prim/changelog.md +++ b/libraries/ghc-prim/changelog.md @@ -1,4 +1,23 @@ -## 0.6.1 +## 0.6.2 (edit as necessary) + +- Shipped with GHC 8.12.1 + +- Add known-key `cstringLength#` to `GHC.CString`. This is just the + C function `strlen`, but a built-in rewrite rule allows GHC to + compute the result at compile time when the argument is known. + +- In order to support unicode better the following functions in `GHC.CString` + gained UTF8 counterparts: + + unpackAppendCStringUtf8# :: Addr# -> [Char] -> [Char] + unpackFoldrCStringUtf8# :: Addr# -> (Char -> a -> a) -> a -> a + +- unpackFoldrCString* variants can now inline in phase [0]. + + If the folding function is known this allows for unboxing of the + Char argument resulting in much faster code. + +## 0.6.1 (edit as necessary) - Shipped with GHC 8.10.1 |