diff options
Diffstat (limited to 'libraries/ghc-prim/GHC/CString.hs')
-rw-r--r-- | libraries/ghc-prim/GHC/CString.hs | 273 |
1 files changed, 228 insertions, 45 deletions
diff --git a/libraries/ghc-prim/GHC/CString.hs b/libraries/ghc-prim/GHC/CString.hs index 0e6199f30f..fd0e3f6625 100644 --- a/libraries/ghc-prim/GHC/CString.hs +++ b/libraries/ghc-prim/GHC/CString.hs @@ -16,13 +16,78 @@ ----------------------------------------------------------------------------- module GHC.CString ( + -- * Ascii variants unpackCString#, unpackAppendCString#, unpackFoldrCString#, - unpackCStringUtf8#, unpackNBytes# + cstringLength#, + + -- * Utf variants + unpackCStringUtf8#, unpackAppendCStringUtf8#, unpackFoldrCStringUtf8#, + + -- * Other + unpackNBytes#, ) where import GHC.Types import GHC.Prim +{- +Note [String literals in GHC] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +String literals get quite a bit of special handling in GHC. This Note +summarises the moving parts. + +* Desugaring: see GHC.HsToCore.Match.Literal.dsLit, which in + turn calls GHC.Core.Make.mkStringExprFS. + + The desugarer desugars the Haskell literal "foo" into Core + GHC.CString.unpackCString# "foo"# + where "foo"# is primitive string literal (of type Addr#). + + When the string cannot be encoded as a C string, we use UTF8: + GHC.CString.unpackCStringUtf8# "foo"# + +* The library module ghc-prim:GHC.CString has a bunch of functions that + work over primitive strings, including GHC.CString.unpackCString# + +* GHC.Core.Op.ConstantFold has some RULES that optimise certain string + operations on literal strings. For example: + + + Constant folding the desugared form of ("foo" ++ "bar") + into ("foobar") + + Comparing strings + + and more + +* GHC.Base has a number of regular rules for String literals. + + + a rule "eqString": (==) @String = eqString + where GHC.Base.eqString :: String -> String -> Bool + + ConstantFold has a RULE for eqString on literals: + eqString (Lit "foo"#) (Lit "bar"#) --> False + + This allows compile time evaluation of things like "foo" == "bar" + + + A bunch of rules to promote fusion: + + "unpack" [~1] forall a . unpackCString# a = build (unpackFoldrCString# a) + "unpack-list" [1] forall a . unpackFoldrCString# a (:) [] = unpackCString# a + "unpack-append" forall a n . unpackFoldrCString# a (:) n = unpackAppendCString# a n + + And UTF8 variants of these rules. + +* We allow primitive (unlifted) literal strings to be top-level + bindings, breaking out usual rule. See GHC.Core + Note [Core top-level string literals] + +* TODO: There is work on a special code-gen path for top-level boxed strings + str :: [Char] + str = unpackCString# "foo"# + so that they can all share a common code pointer + + There is a WIP MR on gitlab for this: !3012 + +-} + ----------------------------------------------------------------------------- -- Unpacking C strings ----------------------------------------------------------------------------- @@ -70,6 +135,49 @@ Moreover, we want to make it CONLIKE, so that: All of this goes for unpackCStringUtf8# too. -} +{- Note [Inlining of unpackFoldrCString] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Usually the unpack-list rule turns unpackFoldrCString# into unpackCString# +It also has a BuiltInRule in PrelRules.hs: + unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n) + = unpackFoldrCString# "foobaz" c n + +We use NOINLINE [0] on the grounds that, unlike +unpackCString#, there *is* some point in inlining +unpackFoldrCString#, because we get better code for the +higher-order function call. + +This can cause a code size increase but it was minimal +when looking at nofib. + +This is especially important for elem which then results in an +allocation free loop. + + Note [unpackCString# iterating over addr] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When unpacking unpackCString# and friends repeatedly return a cons cell +containing: +* The current character we just unpacked. +* A thunk to unpack the rest of the string. + +In order to minimize the size of the thunk we do not index of +the start of the string, offsetting into it, but instead increment +the addr and always use offset 0#. + +This works since these two expressions will read from the same address. +* `indexCharOffAddr# a i` +* `indexCharOffAddr (a `plusAddr#` i) 0#` + +This way we avoid the need for the thunks to close over both the start of +the string and the current offset, saving a word for each character unpacked. + +This has the additional advantage the we can guarantee that only the +increment will happen in the loop. + +-} + unpackCString# :: Addr# -> [Char] {-# NOINLINE CONLIKE unpackCString# #-} unpackCString# addr @@ -93,60 +201,63 @@ unpackAppendCString# addr rest where !ch = indexCharOffAddr# addr nh -unpackFoldrCString# :: Addr# -> (Char -> a -> a) -> a -> a - --- Usually the unpack-list rule turns unpackFoldrCString# into unpackCString# - --- It also has a BuiltInRule in PrelRules.hs: --- unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n) --- = unpackFoldrCString# "foobaz" c n - -{-# NOINLINE unpackFoldrCString# #-} --- At one stage I had NOINLINE [0] on the grounds that, unlike --- unpackCString#, there *is* some point in inlining --- unpackFoldrCString#, because we get better code for the --- higher-order function call. BUT there may be a lot of --- literal strings, and making a separate 'unpack' loop for --- each is highly gratuitous. See nofib/real/anna/PrettyPrint. - -unpackFoldrCString# addr f z - = unpack 0# +-- Usually the unpack-list rule turns unpackFoldrCString# into unpackCString#. +-- See Note [String literals in GHC] for more details. +-- See [Inlining of unpackFoldrCString] +{-# NOINLINE[0] unpackFoldrCString# #-} +unpackFoldrCString# :: Addr# -> (Char -> a -> a) -> a -> a +unpackFoldrCString# str f z_init = go str z_init where - unpack nh + go addr z | isTrue# (ch `eqChar#` '\0'#) = z - | True = C# ch `f` unpack (nh +# 1#) + | True = C# ch `f` go (addr `plusAddr#` 1#) z where - !ch = indexCharOffAddr# addr nh + -- See Note [unpackCString# iterating over addr] + !ch = indexCharOffAddr# addr 0# -- There's really no point in inlining this for the same reasons as -- unpackCString. See Note [Inlining unpackCString#] above for details. unpackCStringUtf8# :: Addr# -> [Char] {-# NOINLINE CONLIKE unpackCStringUtf8# #-} unpackCStringUtf8# addr - = unpack 0# + | isTrue# (ch `eqChar#` '\0'# ) = [] + | True = + let !byte_count = getByteCount ch + !utf_ch = unpackUtf8Char# byte_count ch addr + !addr' = addr `plusBytes` byte_count + in C# utf_ch : unpackCStringUtf8# addr' + where + -- See Note [unpackCString# iterating over addr] + !ch = indexCharOffAddr# addr 0# + + +unpackAppendCStringUtf8# :: Addr# -> [Char] -> [Char] +{-# NOINLINE unpackAppendCStringUtf8# #-} + -- See the NOINLINE note on unpackCString# +unpackAppendCStringUtf8# addr rest + | isTrue# (ch `eqChar#` '\0'#) = rest + | True = + let !byte_count = getByteCount ch + !utf_ch = unpackUtf8Char# byte_count ch addr + !addr' = (addr `plusBytes` byte_count) + in C# utf_ch : unpackAppendCStringUtf8# addr' rest + where + -- See Note [unpackCString# iterating over addr] + !ch = indexCharOffAddr# addr 0# + +-- See Note [Inlining of unpackFoldrCString] +{-# NOINLINE[0] unpackFoldrCStringUtf8# #-} +unpackFoldrCStringUtf8# :: Addr# -> (Char -> a -> a) -> a -> a +unpackFoldrCStringUtf8# addr_init f z_init + = go addr_init z_init where - -- We take care to strictly evaluate the character decoding as - -- indexCharOffAddr# is marked with the can_fail flag and - -- consequently GHC won't evaluate the expression unless it is absolutely - -- needed. - unpack nh - | isTrue# (ch `eqChar#` '\0'# ) = [] - | isTrue# (ch `leChar#` '\x7F'#) = C# ch : unpack (nh +# 1#) - | isTrue# (ch `leChar#` '\xDF'#) = - let !c = C# (chr# (((ord# ch -# 0xC0#) `uncheckedIShiftL#` 6#) +# - (ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#))) - in c : unpack (nh +# 2#) - | isTrue# (ch `leChar#` '\xEF'#) = - let !c = C# (chr# (((ord# ch -# 0xE0#) `uncheckedIShiftL#` 12#) +# - ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 6#) +# - (ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#))) - in c : unpack (nh +# 3#) - | True = - let !c = C# (chr# (((ord# ch -# 0xF0#) `uncheckedIShiftL#` 18#) +# - ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 12#) +# - ((ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#) `uncheckedIShiftL#` 6#) +# - (ord# (indexCharOffAddr# addr (nh +# 3#)) -# 0x80#))) - in c : unpack (nh +# 4#) + go addr z + | isTrue# (ch `eqChar#` '\0'#) = z + | True = + let !byte_count = getByteCount ch + !utf_ch = unpackUtf8Char# byte_count ch addr + !addr' = (addr `plusBytes` byte_count) + in C# utf_ch `f` go addr' z where !ch = indexCharOffAddr# addr nh @@ -163,3 +274,75 @@ unpackNBytes# addr len# = unpack [] (len# -# 1#) case indexCharOffAddr# addr i# of ch -> unpack (C# ch : acc) (i# -# 1#) +-- The return type is not correct here. We really want CSize, +-- but that type is defined in base. However, CSize should always +-- match the size of a machine word (I hope), so this is probably +-- alright on all platforms that GHC supports. +foreign import ccall unsafe "strlen" c_strlen :: Addr# -> Int# + +-- | Compute the length of a NUL-terminated string. This address +-- must refer to immutable memory. GHC includes a built-in rule for +-- constant folding when the argument is a statically-known literal. +-- That is, a core-to-core pass reduces the expression +-- @cstringLength# "hello"#@ to the constant @5#@. +cstringLength# :: Addr# -> Int# +{-# INLINE[0] cstringLength# #-} +cstringLength# = c_strlen + + +------------------------------ +--- UTF8 decoding utilities +------------------------------ +-- +-- These functions make explicit the logic that was originally +-- part of unpackCStringUtf8. Since we want the same support for ascii +-- and non-ascii a variety of functions needs the same logic. Instead +-- of C&P'in the decoding logic all over we have it here once, and then +-- force GHC to inline it. +-- +-- All the overhead of the Bytes argument and calls goes away once all is +-- said and done. And what remains is readable code in Haskell land and +-- performant code in the resulting binary. + +data Bytes = One | Two | Three | Four + +{-# INLINE getByteCount #-} +getByteCount :: Char# -> Bytes +getByteCount ch + | isTrue# (ch `leChar#` '\x7F'#) = One + | isTrue# (ch `leChar#` '\xDF'#) = Two + | isTrue# (ch `leChar#` '\xEF'#) = Three + | True = Four + +{-# INLINE plusBytes #-} +plusBytes :: Addr# -> Bytes -> Addr# +plusBytes addr bytes = + case bytes of + One -> addr `plusAddr#` 1# + Two -> addr `plusAddr#` 2# + Three -> addr `plusAddr#` 3# + Four -> addr `plusAddr#` 4# + +-- | Take the current address, read unicode char of the given size. +-- We obviously want the number of bytes, but we have to read one +-- byte to determine the number of bytes for the current codepoint +-- so we might as well reuse it and avoid a read. +-- +-- Side Note: We don't dare to decode all 4 possibilities at once. +-- Reading past the end of the addr might trigger an exception. +-- For this reason we really have to check the width first and only +-- decode after. +{-# INLINE unpackUtf8Char# #-} +unpackUtf8Char# :: Bytes -> Char# -> Addr# -> Char# +unpackUtf8Char# bytes ch addr = + case bytes of + One -> ch + Two -> (chr# (((ord# ch -# 0xC0#) `uncheckedIShiftL#` 6#) +# + (ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#))) + Three -> (chr# (((ord# ch -# 0xE0#) `uncheckedIShiftL#` 12#) +# + ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 6#) +# + (ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#))) + Four -> (chr# (((ord# ch -# 0xF0#) `uncheckedIShiftL#` 18#) +# + ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 12#) +# + ((ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#) `uncheckedIShiftL#` 6#) +# + (ord# (indexCharOffAddr# (addr `plusAddr#` 3#) 0#) -# 0x80#))) |