diff options
author | Andreas Klebinger <klebinger.andreas@gmx.at> | 2020-01-12 18:30:01 +0100 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2020-02-08 10:19:10 -0500 |
commit | 88bf81aaaf67e9c9bb5b12088ab33accd4a55fb3 (patch) | |
tree | 7e56f3ea22993d2dab41ba4b5f9b8745e230d33d | |
parent | 7d04b9f298c9cc0ff291e0717826743f488670bb (diff) | |
download | haskell-88bf81aaaf67e9c9bb5b12088ab33accd4a55fb3.tar.gz |
Optimize unpackCString# to allocate less.
unpackCString# is a recursive function which for each iteration
returns a Cons cell containing the current Char, and a thunk for
unpacking the rest of the string.
In this patch we change from storing addr + offset inside this thunk
to storing only the addr, simply incrementing the address on each
iteration.
This saves one word of allocation per unpacked character.
For a program like "main = print "<largishString>" this amounts
to 2-3% fewer % in bytes allocated.
I also removed the now redundant local unpack definitions.
This removes one call per unpack operation.
-rw-r--r-- | libraries/ghc-prim/GHC/CString.hs | 97 |
1 files changed, 54 insertions, 43 deletions
diff --git a/libraries/ghc-prim/GHC/CString.hs b/libraries/ghc-prim/GHC/CString.hs index 0e6199f30f..5c9dcf95db 100644 --- a/libraries/ghc-prim/GHC/CString.hs +++ b/libraries/ghc-prim/GHC/CString.hs @@ -1,4 +1,5 @@ {-# LANGUAGE MagicHash, NoImplicitPrelude, BangPatterns #-} + ----------------------------------------------------------------------------- -- | -- Module : GHC.CString @@ -70,28 +71,45 @@ Moreover, we want to make it CONLIKE, so that: All of this goes for unpackCStringUtf8# too. -} +{- Note [unpackCString# iterating over addr] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When unpacking unpackCString# and friends repeatedly return a cons cell +containing: +* The current character we just unpacked. +* A thunk to unpack the rest of the string. + +In order to minimize the size of the thunk we do not index of +the start of the string, offsetting into it, but instead increment +the addr and always use offset 0#. + +This works since these two expressions will read from the same address. +* `indexCharOffAddr# a i` +* `indexCharOffAddr (a `plusAddr#` i) 0#` + +This way we avoid the need for the thunks to close over both the start of +the string and the current offset, saving a word for each character unpacked. +-} + unpackCString# :: Addr# -> [Char] {-# NOINLINE CONLIKE unpackCString# #-} unpackCString# addr - = unpack 0# - where - unpack nh - | isTrue# (ch `eqChar#` '\0'#) = [] - | True = C# ch : unpack (nh +# 1#) + | isTrue# (ch `eqChar#` '\0'#) = [] + | True = C# ch : unpackCString# (addr `plusAddr#` 1#) where - !ch = indexCharOffAddr# addr nh + -- See Note [unpackCString# iterating over addr] + !ch = indexCharOffAddr# addr 0# + unpackAppendCString# :: Addr# -> [Char] -> [Char] {-# NOINLINE unpackAppendCString# #-} -- See the NOINLINE note on unpackCString# unpackAppendCString# addr rest - = unpack 0# - where - unpack nh - | isTrue# (ch `eqChar#` '\0'#) = rest - | True = C# ch : unpack (nh +# 1#) + | isTrue# (ch `eqChar#` '\0'#) = rest + | True = C# ch : unpackAppendCString# (addr `plusAddr#` 1#) rest where - !ch = indexCharOffAddr# addr nh + -- See Note [unpackCString# iterating over addr] + !ch = indexCharOffAddr# addr 0# unpackFoldrCString# :: Addr# -> (Char -> a -> a) -> a -> a @@ -110,45 +128,37 @@ unpackFoldrCString# :: Addr# -> (Char -> a -> a) -> a -> a -- each is highly gratuitous. See nofib/real/anna/PrettyPrint. unpackFoldrCString# addr f z - = unpack 0# + | isTrue# (ch `eqChar#` '\0'#) = z + | True = C# ch `f` unpackFoldrCString# (addr `plusAddr#` 1#) f z where - unpack nh - | isTrue# (ch `eqChar#` '\0'#) = z - | True = C# ch `f` unpack (nh +# 1#) - where - !ch = indexCharOffAddr# addr nh + -- See Note [unpackCString# iterating over addr] + !ch = indexCharOffAddr# addr 0# -- There's really no point in inlining this for the same reasons as -- unpackCString. See Note [Inlining unpackCString#] above for details. unpackCStringUtf8# :: Addr# -> [Char] {-# NOINLINE CONLIKE unpackCStringUtf8# #-} unpackCStringUtf8# addr - = unpack 0# - where - -- We take care to strictly evaluate the character decoding as - -- indexCharOffAddr# is marked with the can_fail flag and - -- consequently GHC won't evaluate the expression unless it is absolutely - -- needed. - unpack nh - | isTrue# (ch `eqChar#` '\0'# ) = [] - | isTrue# (ch `leChar#` '\x7F'#) = C# ch : unpack (nh +# 1#) - | isTrue# (ch `leChar#` '\xDF'#) = - let !c = C# (chr# (((ord# ch -# 0xC0#) `uncheckedIShiftL#` 6#) +# - (ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#))) - in c : unpack (nh +# 2#) - | isTrue# (ch `leChar#` '\xEF'#) = - let !c = C# (chr# (((ord# ch -# 0xE0#) `uncheckedIShiftL#` 12#) +# - ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 6#) +# - (ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#))) - in c : unpack (nh +# 3#) - | True = - let !c = C# (chr# (((ord# ch -# 0xF0#) `uncheckedIShiftL#` 18#) +# - ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 12#) +# - ((ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#) `uncheckedIShiftL#` 6#) +# - (ord# (indexCharOffAddr# addr (nh +# 3#)) -# 0x80#))) - in c : unpack (nh +# 4#) + | isTrue# (ch `eqChar#` '\0'# ) = [] + | isTrue# (ch `leChar#` '\x7F'#) = C# ch : unpackCStringUtf8# (addr `plusAddr#` 1#) + | isTrue# (ch `leChar#` '\xDF'#) = + let !c = C# (chr# (((ord# ch -# 0xC0#) `uncheckedIShiftL#` 6#) +# + (ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#))) + in c : unpackCStringUtf8# (addr `plusAddr#` 2#) + | isTrue# (ch `leChar#` '\xEF'#) = + let !c = C# (chr# (((ord# ch -# 0xE0#) `uncheckedIShiftL#` 12#) +# + ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 6#) +# + (ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#))) + in c : unpackCStringUtf8# (addr `plusAddr#` 3#) + | True = + let !c = C# (chr# (((ord# ch -# 0xF0#) `uncheckedIShiftL#` 18#) +# + ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 12#) +# + ((ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#) `uncheckedIShiftL#` 6#) +# + (ord# (indexCharOffAddr# (addr `plusAddr#` 3#) 0#) -# 0x80#))) + in c : unpackCStringUtf8# (addr `plusAddr#` 4#) where - !ch = indexCharOffAddr# addr nh + -- See Note [unpackCString# iterating over addr] + !ch = indexCharOffAddr# addr 0# -- There's really no point in inlining this for the same reasons as -- unpackCString. See Note [Inlining unpackCString#] above for details. @@ -157,6 +167,7 @@ unpackNBytes# :: Addr# -> Int# -> [Char] unpackNBytes# _addr 0# = [] unpackNBytes# addr len# = unpack [] (len# -# 1#) where + unpack :: [Char] -> Int# -> [Char] unpack acc i# | isTrue# (i# <# 0#) = acc | True = |