diff options
author | Thomas Miedema <thomasmiedema@gmail.com> | 2014-09-16 07:56:35 -0500 |
---|---|---|
committer | Austin Seipp <austin@well-typed.com> | 2014-09-16 07:56:35 -0500 |
commit | caf449e39f5e7545eeabd567349661450aa8c6e5 (patch) | |
tree | 2a3124e3f72efc76519fe33c681a41301d14d2d9 | |
parent | 52eab67a99dd928204b730355245233fa96fa24d (diff) | |
download | haskell-caf449e39f5e7545eeabd567349661450aa8c6e5.tar.gz |
Return nBytes instead of nextAddr from utf8DecodeChar
Summary:
While researching D176, I came across the following simplification
opportunity:
Not all functions that call utf8DecodeChar actually need the address
of the next char. And some need the 'number of bytes' read. So returning
nBytes instead of nextAddr should save a few addition and subtraction
operations, and makes the code a bit simpler.
Test Plan: it validates
Reviewers: simonmar, ezyang, austin
Reviewed By: austin
Subscribers: simonmar, ezyang, carter
Differential Revision: https://phabricator.haskell.org/D179
-rw-r--r-- | compiler/utils/Encoding.hs | 50 | ||||
-rw-r--r-- | compiler/utils/FastString.lhs | 3 | ||||
-rw-r--r-- | compiler/utils/StringBuffer.lhs | 4 |
3 files changed, 29 insertions, 28 deletions
diff --git a/compiler/utils/Encoding.hs b/compiler/utils/Encoding.hs index 115703fc69..5c8619baa6 100644 --- a/compiler/utils/Encoding.hs +++ b/compiler/utils/Encoding.hs @@ -47,18 +47,18 @@ import ExtsCompat46 -- before decoding them (see StringBuffer.hs). {-# INLINE utf8DecodeChar# #-} -utf8DecodeChar# :: Addr# -> (# Char#, Addr# #) +utf8DecodeChar# :: Addr# -> (# Char#, Int# #) utf8DecodeChar# a# = let !ch0 = word2Int# (indexWord8OffAddr# a# 0#) in case () of - _ | ch0 <=# 0x7F# -> (# chr# ch0, a# `plusAddr#` 1# #) + _ | ch0 <=# 0x7F# -> (# chr# ch0, 1# #) | ch0 >=# 0xC0# && ch0 <=# 0xDF# -> let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +# (ch1 -# 0x80#)), - a# `plusAddr#` 2# #) + 2# #) | ch0 >=# 0xE0# && ch0 <=# 0xEF# -> let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in @@ -68,7 +68,7 @@ utf8DecodeChar# a# = (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +# ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +# (ch2 -# 0x80#)), - a# `plusAddr#` 3# #) + 3# #) | ch0 >=# 0xF0# && ch0 <=# 0xF8# -> let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in @@ -81,20 +81,21 @@ utf8DecodeChar# a# = ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +# ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +# (ch3 -# 0x80#)), - a# `plusAddr#` 4# #) + 4# #) | otherwise -> fail 1# where -- all invalid sequences end up here: - fail n = (# '\0'#, a# `plusAddr#` n #) + fail :: Int# -> (# Char#, Int# #) + fail nBytes# = (# '\0'#, nBytes# #) -- '\xFFFD' would be the usual replacement character, but -- that's a valid symbol in Haskell, so will result in a -- confusing parse error later on. Instead we use '\0' which -- will signal a lexer error immediately. -utf8DecodeChar :: Ptr Word8 -> (Char, Ptr Word8) +utf8DecodeChar :: Ptr Word8 -> (Char, Int) utf8DecodeChar (Ptr a#) = - case utf8DecodeChar# a# of (# c#, b# #) -> ( C# c#, Ptr b# ) + case utf8DecodeChar# a# of (# c#, nBytes# #) -> ( C# c#, I# nBytes# ) -- UTF-8 is cleverly designed so that we can always figure out where -- the start of the current character is, given any position in a @@ -111,35 +112,36 @@ utf8CharStart p = go p else return p utf8DecodeString :: Ptr Word8 -> Int -> IO [Char] -STRICT2(utf8DecodeString) -utf8DecodeString (Ptr a#) (I# len#) - = unpack a# +utf8DecodeString ptr len + = unpack ptr where - !end# = addr2Int# (a# `plusAddr#` len#) + !end = ptr `plusPtr` len - unpack p# - | addr2Int# p# >=# end# = return [] + unpack p + | p >= end = return [] | otherwise = - case utf8DecodeChar# p# of - (# c#, q# #) -> do - chs <- unpack q# + case utf8DecodeChar# (unPtr p) of + (# c#, nBytes# #) -> do + chs <- unpack (p `plusPtr#` nBytes#) return (C# c# : chs) countUTF8Chars :: Ptr Word8 -> Int -> IO Int -countUTF8Chars ptr bytes = go ptr 0 +countUTF8Chars ptr len = go ptr 0 where - end = ptr `plusPtr` bytes + !end = ptr `plusPtr` len - STRICT2(go) - go ptr n - | ptr >= end = return n + go p !n + | p >= end = return n | otherwise = do - case utf8DecodeChar# (unPtr ptr) of - (# _, a #) -> go (Ptr a) (n+1) + case utf8DecodeChar# (unPtr p) of + (# _, nBytes# #) -> go (p `plusPtr#` nBytes#) (n+1) unPtr :: Ptr a -> Addr# unPtr (Ptr a) = a +plusPtr# :: Ptr a -> Int# -> Ptr a +plusPtr# ptr nBytes# = ptr `plusPtr` (I# nBytes#) + utf8EncodeChar :: Char -> Ptr Word8 -> IO (Ptr Word8) utf8EncodeChar c ptr = let x = ord c in diff --git a/compiler/utils/FastString.lhs b/compiler/utils/FastString.lhs index a38d87e1b5..91236cce11 100644 --- a/compiler/utils/FastString.lhs +++ b/compiler/utils/FastString.lhs @@ -529,8 +529,7 @@ tailFS :: FastString -> FastString tailFS (FastString _ 0 _ _) = panic "tailFS: Empty FastString" tailFS (FastString _ _ bs _) = inlinePerformIO $ BS.unsafeUseAsCString bs $ \ptr -> - do let (_, ptr') = utf8DecodeChar (castPtr ptr) - n = ptr' `minusPtr` ptr + do let (_, n) = utf8DecodeChar (castPtr ptr) return $! mkFastStringByteString (BS.drop n bs) consFS :: Char -> FastString -> FastString diff --git a/compiler/utils/StringBuffer.lhs b/compiler/utils/StringBuffer.lhs index 50d8443b05..d298457bdd 100644 --- a/compiler/utils/StringBuffer.lhs +++ b/compiler/utils/StringBuffer.lhs @@ -179,8 +179,8 @@ nextChar (StringBuffer buf len (I# cur#)) = inlinePerformIO $ do withForeignPtr buf $ \(Ptr a#) -> do case utf8DecodeChar# (a# `plusAddr#` cur#) of - (# c#, b# #) -> - let cur' = I# (b# `minusAddr#` a#) in + (# c#, nBytes# #) -> + let cur' = I# (cur# +# nBytes#) in return (C# c#, StringBuffer buf len cur') currentChar :: StringBuffer -> Char |