diff options
author | Ben Gamari <bgamari.foss@gmail.com> | 2017-04-17 12:26:56 -0400 |
---|---|---|
committer | Ben Gamari <ben@smart-cactus.org> | 2017-04-17 20:34:40 -0400 |
commit | 1cc82d38759c7a5f527ccc6cb514b8ba576cc3d1 (patch) | |
tree | dae54e5e992c487cc9d2d1a2ea435ef25952c826 /compiler | |
parent | 3d3975f2f4caf3af76a7ea27d2882ddaee7db3c9 (diff) | |
download | haskell-1cc82d38759c7a5f527ccc6cb514b8ba576cc3d1.tar.gz |
utils: Lazily decode UTF8 strings
Reviewers: austin, hvr
Subscribers: rwbarton, thomie
GHC Trac Issues: #13527
Differential Revision: https://phabricator.haskell.org/D3442
Diffstat (limited to 'compiler')
-rw-r--r-- | compiler/utils/Encoding.hs | 34 | ||||
-rw-r--r-- | compiler/utils/FastString.hs | 4 | ||||
-rw-r--r-- | compiler/utils/StringBuffer.hs | 4 |
3 files changed, 25 insertions, 17 deletions
diff --git a/compiler/utils/Encoding.hs b/compiler/utils/Encoding.hs
index 712de6ca82..f2b0979995 100644
--- a/compiler/utils/Encoding.hs
+++ b/compiler/utils/Encoding.hs
@@ -17,7 +17,8 @@ module Encoding (
         utf8PrevChar,
         utf8CharStart,
         utf8DecodeChar,
-        utf8DecodeString,
+        utf8DecodeByteString,
+        utf8DecodeStringLazy,
         utf8EncodeChar,
         utf8EncodeString,
         utf8EncodedLength,
@@ -33,9 +34,15 @@ module Encoding (
   ) where
 
 import Foreign
+import Foreign.ForeignPtr.Unsafe
 import Data.Char
 import qualified Data.Char as Char
 import Numeric
+import GHC.IO
+
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Internal as BS
+
 import GHC.Exts
 
 -- -----------------------------------------------------------------------------
@@ -115,19 +122,24 @@ utf8CharStart p = go p
                         then go (p `plusPtr` (-1))
                         else return p
 
-utf8DecodeString :: Ptr Word8 -> Int -> IO [Char]
-utf8DecodeString ptr len
-  = unpack ptr
+utf8DecodeByteString :: ByteString -> [Char]
+utf8DecodeByteString (BS.PS ptr offset len)
+  = utf8DecodeStringLazy ptr offset len
+
+utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
+utf8DecodeStringLazy fptr offset len
+  = unsafeDupablePerformIO $ unpack start
   where
-    !end = ptr `plusPtr` len
+    !start = unsafeForeignPtrToPtr fptr `plusPtr` offset
+    !end = start `plusPtr` len
 
     unpack p
-        | p >= end = return []
-        | otherwise =
-              case utf8DecodeChar# (unPtr p) of
-                (# c#, nBytes# #) -> do
-                  chs <- unpack (p `plusPtr#` nBytes#)
-                  return (C# c# : chs)
+      | p >= end = touchForeignPtr fptr >> return []
+      | otherwise =
+          case utf8DecodeChar# (unPtr p) of
+            (# c#, nBytes# #) -> do
+              rest <- unsafeDupableInterleaveIO $ unpack (p `plusPtr#` nBytes#)
+              return (C# c# : rest)
 
 countUTF8Chars :: Ptr Word8 -> Int -> IO Int
 countUTF8Chars ptr len = go ptr 0
diff --git a/compiler/utils/FastString.hs b/compiler/utils/FastString.hs
index 8d1bbb5c67..8653485e0c 100644
--- a/compiler/utils/FastString.hs
+++ b/compiler/utils/FastString.hs
@@ -485,9 +485,7 @@ nullFS f = BS.null (fs_bs f)
 
 -- | Unpacks and decodes the FastString
 unpackFS :: FastString -> String
-unpackFS (FastString _ _ bs _) =
-  inlinePerformIO $ BS.unsafeUseAsCStringLen bs $ \(ptr, len) ->
-        utf8DecodeString (castPtr ptr) len
+unpackFS (FastString _ _ bs _) = utf8DecodeByteString bs
 
 -- | Gives the UTF-8 encoded bytes corresponding to a 'FastString'
 bytesFS :: FastString -> [Word8]
diff --git a/compiler/utils/StringBuffer.hs b/compiler/utils/StringBuffer.hs
index ec5184a1c2..fcc344554b 100644
--- a/compiler/utils/StringBuffer.hs
+++ b/compiler/utils/StringBuffer.hs
@@ -251,9 +251,7 @@ lexemeToString :: StringBuffer
                -> String
 lexemeToString _ 0 = ""
 lexemeToString (StringBuffer buf _ cur) bytes =
-  inlinePerformIO $
-    withForeignPtr buf $ \ptr ->
-      utf8DecodeString (ptr `plusPtr` cur) bytes
+  utf8DecodeStringLazy buf cur bytes
 
 lexemeToFastString :: StringBuffer
                    -> Int -- ^ @n@, the number of bytes |