diff options
author | Ben Gamari <ben@smart-cactus.org> | 2022-07-16 13:54:57 -0400 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2022-07-22 08:18:41 -0400 |
commit | ae1666353696b5d85938d8a2f5fb11fb66f21678 (patch) | |
tree | 7de167e7c2f27ec2acc300ba815677878c1bd738 | |
parent | 2c5991ccaf45cb7e68e54d59a27ee144a4499edb (diff) | |
download | haskell-ae1666353696b5d85938d8a2f5fb11fb66f21678.tar.gz |
ghc-boot: Clean up UTF-8 codecs
In preparation for moving the UTF-8 codecs into `base`:
* Move them to GHC.Utils.Encoding.UTF8
* Make names more consistent
* Add some Haddocks
-rw-r--r-- | compiler/GHC/CmmToAsm/Dwarf/Types.hs | 2 | ||||
-rw-r--r-- | compiler/GHC/Data/FastString.hs | 6 | ||||
-rw-r--r-- | compiler/GHC/Data/StringBuffer.hs | 10 | ||||
-rw-r--r-- | compiler/GHC/Hs/DocString.hs | 2 | ||||
-rw-r--r-- | compiler/GHC/StgToCmm/Prof.hs | 4 | ||||
-rw-r--r-- | compiler/GHC/Types/Literal.hs | 2 | ||||
-rw-r--r-- | libraries/ghc-boot/GHC/Data/ShortText.hs | 5 | ||||
-rw-r--r-- | libraries/ghc-boot/GHC/Utils/Encoding.hs | 302 | ||||
-rw-r--r-- | libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs | 344 | ||||
-rw-r--r-- | libraries/ghc-boot/ghc-boot.cabal.in | 1 |
10 files changed, 363 insertions, 315 deletions
diff --git a/compiler/GHC/CmmToAsm/Dwarf/Types.hs b/compiler/GHC/CmmToAsm/Dwarf/Types.hs index e29f03e1d6..caa829db21 100644 --- a/compiler/GHC/CmmToAsm/Dwarf/Types.hs +++ b/compiler/GHC/CmmToAsm/Dwarf/Types.hs @@ -599,7 +599,7 @@ pprString str = pprString' $ hcat $ map escapeChar $ if str `lengthIs` utf8EncodedLength str then str - else map (chr . fromIntegral) $ BS.unpack $ utf8EncodeString str + else map (chr . fromIntegral) $ BS.unpack $ utf8EncodeByteString str -- | Escape a single non-unicode character escapeChar :: Char -> SDoc diff --git a/compiler/GHC/Data/FastString.hs b/compiler/GHC/Data/FastString.hs index 131f174c41..98ca34c249 100644 --- a/compiler/GHC/Data/FastString.hs +++ b/compiler/GHC/Data/FastString.hs @@ -531,13 +531,13 @@ mkFastString :: String -> FastString {-# NOINLINE[1] mkFastString #-} mkFastString str = inlinePerformIO $ do - sbs <- utf8EncodeShortByteString str + let !sbs = utf8EncodeShortByteString str mkFastStringWith (mkNewFastStringShortByteString sbs) sbs -- The following rule is used to avoid polluting the non-reclaimable FastString -- table with transient strings when we only want their encoding. {-# RULES -"bytesFS/mkFastString" forall x. bytesFS (mkFastString x) = utf8EncodeString x #-} +"bytesFS/mkFastString" forall x. bytesFS (mkFastString x) = utf8EncodeByteString x #-} -- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@ mkFastStringByteList :: [Word8] -> FastString @@ -554,7 +554,7 @@ mkNewFastStringShortByteString :: ShortByteString -> Int -> FastMutInt -> IO FastString mkNewFastStringShortByteString sbs uid n_zencs = do let zstr = mkZFastString n_zencs sbs - chars <- countUTF8Chars sbs + chars = utf8CountCharsShortByteString sbs return (FastString uid chars sbs zstr) hashStr :: ShortByteString -> Int diff --git a/compiler/GHC/Data/StringBuffer.hs b/compiler/GHC/Data/StringBuffer.hs index e6dcb14b6b..1426cf26e3 100644 --- a/compiler/GHC/Data/StringBuffer.hs +++ b/compiler/GHC/Data/StringBuffer.hs @@ -199,7 +199,7 @@ stringToStringBuffer str = let size = utf8EncodedLength str buf <- mallocForeignPtrArray (size+3) unsafeWithForeignPtr buf $ \ptr -> do - utf8EncodeStringPtr ptr str + utf8EncodePtr ptr str pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0] -- sentinels for UTF-8 decoding return (StringBuffer buf size 0) @@ -297,7 +297,7 @@ prevChar (StringBuffer buf _ cur) _ = inlinePerformIO $ unsafeWithForeignPtr buf $ \p -> do p' <- utf8PrevChar (p `plusPtr` cur) - return (fst (utf8DecodeChar p')) + return (fst (utf8DecodeCharPtr p')) -- ----------------------------------------------------------------------------- -- Moving @@ -383,7 +383,7 @@ lexemeToString :: StringBuffer -> String lexemeToString _ 0 = "" lexemeToString (StringBuffer buf _ cur) bytes = - utf8DecodeStringLazy buf cur bytes + utf8DecodeForeignPtr buf cur bytes lexemeToFastString :: StringBuffer -> Int -- ^ @n@, the number of bytes @@ -405,7 +405,7 @@ decodePrevNChars n (StringBuffer buf _ cur) = go buf0 n acc p | n == 0 || buf0 >= p = return acc go buf0 n acc p = do p' <- utf8PrevChar p - let (c,_) = utf8DecodeChar p' + let (c,_) = utf8DecodeCharPtr p' go buf0 (n - 1) (c:acc) p' -- ----------------------------------------------------------------------------- @@ -414,7 +414,7 @@ parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let go i x | i == len = x - | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of + | otherwise = case fst (utf8DecodeCharPtr (ptr `plusPtr` (cur + i))) of '_' -> go (i + 1) x -- skip "_" (#14473) char -> go (i + 1) (x * radix + toInteger (char_to_int char)) in go 0 0 diff --git a/compiler/GHC/Hs/DocString.hs b/compiler/GHC/Hs/DocString.hs index 3a557ee0e8..7e35004237 100644 --- a/compiler/GHC/Hs/DocString.hs +++ b/compiler/GHC/Hs/DocString.hs @@ -137,7 +137,7 @@ instance Outputable HsDocStringChunk where mkHsDocStringChunk :: String -> HsDocStringChunk -mkHsDocStringChunk s = HsDocStringChunk (utf8EncodeString s) +mkHsDocStringChunk s = HsDocStringChunk (utf8EncodeByteString s) -- | Create a 'HsDocString' from a UTF8-encoded 'ByteString'. mkHsDocStringChunkUtf8ByteString :: ByteString -> HsDocStringChunk diff --git a/compiler/GHC/StgToCmm/Prof.hs b/compiler/GHC/StgToCmm/Prof.hs index 478925122c..8c8f89dbe9 100644 --- a/compiler/GHC/StgToCmm/Prof.hs +++ b/compiler/GHC/StgToCmm/Prof.hs @@ -230,7 +230,7 @@ emitCostCentreDecl cc = do ; modl <- newByteStringCLit (bytesFS $ moduleNameFS $ moduleName $ cc_mod cc) - ; loc <- newByteStringCLit $ utf8EncodeString $ + ; loc <- newByteStringCLit $ utf8EncodeByteString $ renderWithContext ctx (ppr $! costCentreSrcSpan cc) ; let lits = [ zero platform, -- StgInt ccID, @@ -297,7 +297,7 @@ emitInfoTableProv ip = do ctx = stgToCmmContext cfg platform = stgToCmmPlatform cfg ; let (src, label) = maybe ("", "") (first (renderWithContext ctx . ppr)) (infoTableProv ip) - mk_string = newByteStringCLit . utf8EncodeString + mk_string = newByteStringCLit . utf8EncodeByteString ; label <- mk_string label ; modl <- newByteStringCLit (bytesFS $ moduleNameFS $ moduleName mod) diff --git a/compiler/GHC/Types/Literal.hs b/compiler/GHC/Types/Literal.hs index 5b14ecc78d..b525fc94df 100644 --- a/compiler/GHC/Types/Literal.hs +++ b/compiler/GHC/Types/Literal.hs @@ -578,7 +578,7 @@ mkLitChar = LitChar mkLitString :: String -> Literal -- stored UTF-8 encoded mkLitString [] = LitString mempty -mkLitString s = LitString (utf8EncodeString s) +mkLitString s = LitString (utf8EncodeByteString s) mkLitBigNat :: Integer -> Literal mkLitBigNat x = assertPpr (x >= 0) (integer x) diff --git a/libraries/ghc-boot/GHC/Data/ShortText.hs b/libraries/ghc-boot/GHC/Data/ShortText.hs index 2b3038ccfc..929b65b481 100644 --- a/libraries/ghc-boot/GHC/Data/ShortText.hs +++ b/libraries/ghc-boot/GHC/Data/ShortText.hs @@ -67,14 +67,15 @@ instance IsString ShortText where -- | /O(n)/ Returns the length of the 'ShortText' in characters. codepointLength :: ShortText -> Int -codepointLength st = unsafeDupablePerformIO $ countUTF8Chars (contents st) +codepointLength st = utf8CountCharsShortByteString (contents st) + -- | /O(1)/ Returns the length of the 'ShortText' in bytes. byteLength :: ShortText -> Int byteLength st = SBS.length $ contents st -- | /O(n)/ Convert a 'String' into a 'ShortText'. pack :: String -> ShortText -pack s = unsafeDupablePerformIO $ ShortText <$> utf8EncodeShortByteString s +pack s = ShortText $ utf8EncodeShortByteString s -- | /O(n)/ Convert a 'ShortText' into a 'String'. unpack :: ShortText -> String diff --git a/libraries/ghc-boot/GHC/Utils/Encoding.hs b/libraries/ghc-boot/GHC/Utils/Encoding.hs index 16386c69ca..183d29946f 100644 --- a/libraries/ghc-boot/GHC/Utils/Encoding.hs +++ b/libraries/ghc-boot/GHC/Utils/Encoding.hs @@ -17,21 +17,7 @@ module GHC.Utils.Encoding ( -- * UTF-8 - utf8DecodeCharAddr#, - utf8PrevChar, - utf8CharStart, - utf8DecodeChar, - utf8DecodeByteString, - utf8UnconsByteString, - utf8DecodeShortByteString, - utf8CompareShortByteString, - utf8DecodeStringLazy, - utf8EncodeChar, - utf8EncodeString, - utf8EncodeStringPtr, - utf8EncodeShortByteString, - utf8EncodedLength, - countUTF8Chars, + module GHC.Utils.Encoding.UTF8, -- * Z-encoding UserString, @@ -47,295 +33,11 @@ module GHC.Utils.Encoding ( import Prelude import Foreign -import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) import Data.Char import qualified Data.Char as Char import Numeric -import GHC.IO -import GHC.ST -import Data.ByteString (ByteString) -import qualified Data.ByteString.Internal as BS -import Data.ByteString.Short.Internal (ShortByteString(..)) - -import GHC.Exts - --- ----------------------------------------------------------------------------- --- UTF-8 - --- We can't write the decoder as efficiently as we'd like without --- resorting to unboxed extensions, unfortunately. I tried to write --- an IO version of this function, but GHC can't eliminate boxed --- results from an IO-returning function. --- --- We assume we can ignore overflow when parsing a multibyte character here. --- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences --- before decoding them (see "GHC.Data.StringBuffer"). - -{-# INLINE utf8DecodeChar# #-} -utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #) -utf8DecodeChar# indexWord8# = - let !ch0 = word2Int# (indexWord8# 0#) in - case () of - _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #) - - | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) -> - let !ch1 = word2Int# (indexWord8# 1#) in - if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else - (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +# - (ch1 -# 0x80#)), - 2# #) - - | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) -> - let !ch1 = word2Int# (indexWord8# 1#) in - if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else - let !ch2 = word2Int# (indexWord8# 2#) in - if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else - (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +# - ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +# - (ch2 -# 0x80#)), - 3# #) - - | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) -> - let !ch1 = word2Int# (indexWord8# 1#) in - if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else - let !ch2 = word2Int# (indexWord8# 2#) in - if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else - let !ch3 = word2Int# (indexWord8# 3#) in - if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else - (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +# - ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +# - ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +# - (ch3 -# 0x80#)), - 4# #) - - | otherwise -> fail 1# - where - -- all invalid sequences end up here: - fail :: Int# -> (# Char#, Int# #) - fail nBytes# = (# '\0'#, nBytes# #) - -- '\xFFFD' would be the usual replacement character, but - -- that's a valid symbol in Haskell, so will result in a - -- confusing parse error later on. Instead we use '\0' which - -- will signal a lexer error immediately. - -utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #) -utf8DecodeCharAddr# a# off# = -#if !MIN_VERSION_base(4,16,0) - utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#)) -#else - utf8DecodeChar# (\i# -> word8ToWord# (indexWord8OffAddr# a# (i# +# off#))) -#endif - -utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #) -utf8DecodeCharByteArray# ba# off# = -#if !MIN_VERSION_base(4,16,0) - utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#)) -#else - utf8DecodeChar# (\i# -> word8ToWord# (indexWord8Array# ba# (i# +# off#))) -#endif - - -utf8DecodeChar :: Ptr Word8 -> (Char, Int) -utf8DecodeChar !(Ptr a#) = - case utf8DecodeCharAddr# a# 0# of - (# c#, nBytes# #) -> ( C# c#, I# nBytes# ) - --- UTF-8 is cleverly designed so that we can always figure out where --- the start of the current character is, given any position in a --- stream. This function finds the start of the previous character, --- assuming there *is* a previous character. -utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8) -utf8PrevChar p = utf8CharStart (p `plusPtr` (-1)) - -utf8CharStart :: Ptr Word8 -> IO (Ptr Word8) -utf8CharStart p = go p - where go p = do w <- peek p - if w >= 0x80 && w < 0xC0 - then go (p `plusPtr` (-1)) - else return p - -{-# INLINE utf8DecodeLazy# #-} -utf8DecodeLazy# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char] -utf8DecodeLazy# retain decodeChar# len# - = unpack 0# - where - unpack i# - | isTrue# (i# >=# len#) = retain >> return [] - | otherwise = - case decodeChar# i# of - (# c#, nBytes# #) -> do - rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#) - return (C# c# : rest) - -utf8DecodeByteString :: ByteString -> [Char] -utf8DecodeByteString (BS.PS fptr offset len) - = utf8DecodeStringLazy fptr offset len - -utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString) -utf8UnconsByteString (BS.PS _ _ 0) = Nothing -utf8UnconsByteString (BS.PS fptr offset len) - = unsafeDupablePerformIO $ - withForeignPtr fptr $ \ptr -> do - let (c,n) = utf8DecodeChar (ptr `plusPtr` offset) - return $ Just (c, BS.PS fptr (offset + n) (len - n)) - -utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char] -utf8DecodeStringLazy fp offset (I# len#) - = unsafeDupablePerformIO $ do - let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset - utf8DecodeLazy# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len# --- Note that since utf8DecodeLazy# returns a thunk the lifetime of the --- ForeignPtr actually needs to be longer than the lexical lifetime --- withForeignPtr would provide here. That's why we use touchForeignPtr to --- keep the fp alive until the last character has actually been decoded. - -utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering -utf8CompareShortByteString (SBS a1) (SBS a2) = go 0# 0# - -- UTF-8 has the property that sorting by bytes values also sorts by - -- code-points. - -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property - -- doesn't hold and we must explicitly check this case here. - -- Note that decoding every code point would also work but it would be much - -- more costly. - where - !sz1 = sizeofByteArray# a1 - !sz2 = sizeofByteArray# a2 - go off1 off2 - | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ - | isTrue# (off1 >=# sz1) = LT - | isTrue# (off2 >=# sz2) = GT - | otherwise = -#if !MIN_VERSION_base(4,16,0) - let !b1_1 = indexWord8Array# a1 off1 - !b2_1 = indexWord8Array# a2 off2 -#else - let !b1_1 = word8ToWord# (indexWord8Array# a1 off1) - !b2_1 = word8ToWord# (indexWord8Array# a2 off2) -#endif - in case b1_1 of - 0xC0## -> case b2_1 of - 0xC0## -> go (off1 +# 1#) (off2 +# 1#) -#if !MIN_VERSION_base(4,16,0) - _ -> case indexWord8Array# a1 (off1 +# 1#) of -#else - _ -> case word8ToWord# (indexWord8Array# a1 (off1 +# 1#)) of -#endif - 0x80## -> LT - _ -> go (off1 +# 1#) (off2 +# 1#) - _ -> case b2_1 of -#if !MIN_VERSION_base(4,16,0) - 0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of -#else - 0xC0## -> case word8ToWord# (indexWord8Array# a2 (off2 +# 1#)) of -#endif - 0x80## -> GT - _ -> go (off1 +# 1#) (off2 +# 1#) - _ | isTrue# (b1_1 `gtWord#` b2_1) -> GT - | isTrue# (b1_1 `ltWord#` b2_1) -> LT - | otherwise -> go (off1 +# 1#) (off2 +# 1#) - -utf8DecodeShortByteString :: ShortByteString -> [Char] -utf8DecodeShortByteString (SBS ba#) - = unsafeDupablePerformIO $ - let len# = sizeofByteArray# ba# in - utf8DecodeLazy# (return ()) (utf8DecodeCharByteArray# ba#) len# - -countUTF8Chars :: ShortByteString -> IO Int -countUTF8Chars (SBS ba) = go 0# 0# - where - len# = sizeofByteArray# ba - go i# n# - | isTrue# (i# >=# len#) = - return (I# n#) - | otherwise = do - case utf8DecodeCharByteArray# ba i# of - (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#) - -{-# INLINE utf8EncodeChar #-} -utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s) - -> Char -> ST s Int -utf8EncodeChar write# c = - let x = fromIntegral (ord c) in - case () of - _ | x > 0 && x <= 0x007f -> do - write 0 x - return 1 - -- NB. '\0' is encoded as '\xC0\x80', not '\0'. This is so that we - -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8). - | x <= 0x07ff -> do - write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F)) - write 1 (0x80 .|. (x .&. 0x3F)) - return 2 - | x <= 0xffff -> do - write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F) - write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F) - write 2 (0x80 .|. (x .&. 0x3F)) - return 3 - | otherwise -> do - write 0 (0xF0 .|. (x `shiftR` 18)) - write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F)) - write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F)) - write 3 (0x80 .|. (x .&. 0x3F)) - return 4 - where - {-# INLINE write #-} - write (I# off#) (W# c#) = ST $ \s -> -#if !MIN_VERSION_base(4,16,0) - case write# off# (narrowWord8# c#) s of -#else - case write# off# (wordToWord8# c#) s of -#endif - s -> (# s, () #) - -utf8EncodeString :: String -> ByteString -utf8EncodeString s = - unsafePerformIO $ do - let len = utf8EncodedLength s - buf <- mallocForeignPtrBytes len - withForeignPtr buf $ \ptr -> do - utf8EncodeStringPtr ptr s - pure (BS.fromForeignPtr buf 0 len) - -utf8EncodeStringPtr :: Ptr Word8 -> String -> IO () -utf8EncodeStringPtr (Ptr a#) str = go a# str - where go !_ [] = return () - go a# (c:cs) = do -#if !MIN_VERSION_base(4,16,0) - -- writeWord8OffAddr# was taking a Word# - I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c -#else - I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c -#endif - go (a# `plusAddr#` off#) cs - -utf8EncodeShortByteString :: String -> IO ShortByteString -utf8EncodeShortByteString str = IO $ \s -> - case utf8EncodedLength str of { I# len# -> - case newByteArray# len# s of { (# s, mba# #) -> - case go mba# 0# str of { ST f_go -> - case f_go s of { (# s, () #) -> - case unsafeFreezeByteArray# mba# s of { (# s, ba# #) -> - (# s, SBS ba# #) }}}}} - where - go _ _ [] = return () - go mba# i# (c:cs) = do -#if !MIN_VERSION_base(4,16,0) - -- writeWord8Array# was taking a Word# - I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c -#else - I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c -#endif - go mba# (i# +# off#) cs - -utf8EncodedLength :: String -> Int -utf8EncodedLength str = go 0 str - where go !n [] = n - go n (c:cs) - | ord c > 0 && ord c <= 0x007f = go (n+1) cs - | ord c <= 0x07ff = go (n+2) cs - | ord c <= 0xffff = go (n+3) cs - | otherwise = go (n+4) cs +import GHC.Utils.Encoding.UTF8 -- ----------------------------------------------------------------------------- -- Note [Z-Encoding] diff --git a/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs b/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs new file mode 100644 index 0000000000..d1d470ab25 --- /dev/null +++ b/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs @@ -0,0 +1,344 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, MultiWayIf #-} +{-# OPTIONS_GHC -O2 -fno-warn-name-shadowing #-} +-- We always optimise this, otherwise performance of a non-optimised +-- compiler is severely affected. This module used to live in the `ghc` +-- package but has been moved to `ghc-boot` because the definition +-- of the package database (needed in both ghc and in ghc-pkg) lives in +-- `ghc-boot` and uses ShortText, which in turn depends on this module. + +-- | Simple, non-streaming UTF-8 codecs. +module GHC.Utils.Encoding.UTF8 + ( -- * Decoding single characters + utf8DecodeCharAddr# + , utf8DecodeCharPtr + , utf8DecodeCharByteArray# + , utf8PrevChar + , utf8CharStart + , utf8UnconsByteString + -- * Decoding strings + , utf8DecodeByteString + , utf8DecodeShortByteString + , utf8DecodeForeignPtr + , utf8DecodeByteArray# + -- * Counting characters + , utf8CountCharsShortByteString + , utf8CountCharsByteArray# + -- * Comparison + , utf8CompareByteArray# + , utf8CompareShortByteString + -- * Encoding strings + , utf8EncodeByteArray# + , utf8EncodePtr + , utf8EncodeByteString + , utf8EncodeShortByteString + , utf8EncodedLength + ) where + + +import Prelude + +import Foreign +import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) +import Data.Char +import GHC.IO +import GHC.ST + +import Data.ByteString (ByteString) +import qualified Data.ByteString.Internal as BS +import Data.ByteString.Short.Internal (ShortByteString(..)) + +import GHC.Exts + +-- | Find the start of the codepoint preceding the codepoint at the given +-- 'Ptr'. This is undefined if there is no previous valid codepoint. +utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8) +utf8PrevChar p = utf8CharStart (p `plusPtr` (-1)) + +-- | Find the start of the codepoint at the given 'Ptr'. This is undefined if +-- there is no previous valid codepoint. +utf8CharStart :: Ptr Word8 -> IO (Ptr Word8) +utf8CharStart p = go p + where go p = do w <- peek p + if w >= 0x80 && w < 0xC0 + then go (p `plusPtr` (-1)) + else return p + +utf8CountCharsShortByteString :: ShortByteString -> Int +utf8CountCharsShortByteString (SBS ba) = utf8CountCharsByteArray# ba + +utf8DecodeShortByteString :: ShortByteString -> [Char] +utf8DecodeShortByteString (SBS ba#) = utf8DecodeByteArray# ba# + +-- | Decode a 'ByteString' containing a UTF-8 string. +utf8DecodeByteString :: ByteString -> [Char] +utf8DecodeByteString (BS.PS fptr offset len) + = utf8DecodeForeignPtr fptr offset len + +utf8EncodeShortByteString :: String -> ShortByteString +utf8EncodeShortByteString str = SBS (utf8EncodeByteArray# str) + +-- | Encode a 'String' into a 'ByteString'. +utf8EncodeByteString :: String -> ByteString +utf8EncodeByteString s = + unsafePerformIO $ do + let len = utf8EncodedLength s + buf <- mallocForeignPtrBytes len + withForeignPtr buf $ \ptr -> do + utf8EncodePtr ptr s + pure (BS.fromForeignPtr buf 0 len) + +utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString) +utf8UnconsByteString (BS.PS _ _ 0) = Nothing +utf8UnconsByteString (BS.PS fptr offset len) + = unsafeDupablePerformIO $ + withForeignPtr fptr $ \ptr -> do + let (c,n) = utf8DecodeCharPtr (ptr `plusPtr` offset) + return $ Just (c, BS.PS fptr (offset + n) (len - n)) + +utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering +utf8CompareShortByteString (SBS a1) (SBS a2) = utf8CompareByteArray# a1 a2 + +-- We can't write the decoder as efficiently as we'd like without +-- resorting to unboxed extensions, unfortunately. I tried to write +-- an IO version of this function, but GHC can't eliminate boxed +-- results from an IO-returning function. +-- +-- We assume we can ignore overflow when parsing a multibyte character here. +-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences +-- before decoding them (see "GHC.Data.StringBuffer"). + +{-# INLINE utf8DecodeChar# #-} +-- | Decode a single codepoint from a byte buffer indexed by the given indexing +-- function. +utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #) +utf8DecodeChar# indexWord8# = + let !ch0 = word2Int# (indexWord8# 0#) in + case () of + _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #) + + | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) -> + let !ch1 = word2Int# (indexWord8# 1#) in + if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else + (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +# + (ch1 -# 0x80#)), + 2# #) + + | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) -> + let !ch1 = word2Int# (indexWord8# 1#) in + if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else + let !ch2 = word2Int# (indexWord8# 2#) in + if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else + (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +# + ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +# + (ch2 -# 0x80#)), + 3# #) + + | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) -> + let !ch1 = word2Int# (indexWord8# 1#) in + if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else + let !ch2 = word2Int# (indexWord8# 2#) in + if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else + let !ch3 = word2Int# (indexWord8# 3#) in + if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else + (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +# + ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +# + ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +# + (ch3 -# 0x80#)), + 4# #) + + | otherwise -> fail 1# + where + -- all invalid sequences end up here: + fail :: Int# -> (# Char#, Int# #) + fail nBytes# = (# '\0'#, nBytes# #) + -- '\xFFFD' would be the usual replacement character, but + -- that's a valid symbol in Haskell, so will result in a + -- confusing parse error later on. Instead we use '\0' which + -- will signal a lexer error immediately. + +-- | Decode a single character at the given 'Addr#'. +utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #) +utf8DecodeCharAddr# a# off# = +#if !MIN_VERSION_base(4,16,0) + utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#)) +#else + utf8DecodeChar# (\i# -> word8ToWord# (indexWord8OffAddr# a# (i# +# off#))) +#endif + +-- | Decode a single codepoint starting at the given 'Ptr'. +utf8DecodeCharPtr :: Ptr Word8 -> (Char, Int) +utf8DecodeCharPtr !(Ptr a#) = + case utf8DecodeCharAddr# a# 0# of + (# c#, nBytes# #) -> ( C# c#, I# nBytes# ) + +-- | Decode a single codepoint starting at the given byte offset into a +-- 'ByteArray#'. +utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #) +utf8DecodeCharByteArray# ba# off# = +#if !MIN_VERSION_base(4,16,0) + utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#)) +#else + utf8DecodeChar# (\i# -> word8ToWord# (indexWord8Array# ba# (i# +# off#))) +#endif + +{-# INLINE utf8Decode# #-} +utf8Decode# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char] +utf8Decode# retain decodeChar# len# + = unpack 0# + where + unpack i# + | isTrue# (i# >=# len#) = retain >> return [] + | otherwise = + case decodeChar# i# of + (# c#, nBytes# #) -> do + rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#) + return (C# c# : rest) + +utf8DecodeForeignPtr :: ForeignPtr Word8 -> Int -> Int -> [Char] +utf8DecodeForeignPtr fp offset (I# len#) + = unsafeDupablePerformIO $ do + let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset + utf8Decode# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len# +-- Note that since utf8Decode# returns a thunk the lifetime of the +-- ForeignPtr actually needs to be longer than the lexical lifetime +-- withForeignPtr would provide here. That's why we use touchForeignPtr to +-- keep the fp alive until the last character has actually been decoded. + +utf8DecodeByteArray# :: ByteArray# -> [Char] +utf8DecodeByteArray# ba# + = unsafeDupablePerformIO $ + let len# = sizeofByteArray# ba# in + utf8Decode# (return ()) (utf8DecodeCharByteArray# ba#) len# + +utf8CompareByteArray# :: ByteArray# -> ByteArray# -> Ordering +utf8CompareByteArray# a1 a2 = go 0# 0# + -- UTF-8 has the property that sorting by bytes values also sorts by + -- code-points. + -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property + -- doesn't hold and we must explicitly check this case here. + -- Note that decoding every code point would also work but it would be much + -- more costly. + where + !sz1 = sizeofByteArray# a1 + !sz2 = sizeofByteArray# a2 + go off1 off2 + | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ + | isTrue# (off1 >=# sz1) = LT + | isTrue# (off2 >=# sz2) = GT + | otherwise = +#if !MIN_VERSION_base(4,16,0) + let !b1_1 = indexWord8Array# a1 off1 + !b2_1 = indexWord8Array# a2 off2 +#else + let !b1_1 = word8ToWord# (indexWord8Array# a1 off1) + !b2_1 = word8ToWord# (indexWord8Array# a2 off2) +#endif + in case b1_1 of + 0xC0## -> case b2_1 of + 0xC0## -> go (off1 +# 1#) (off2 +# 1#) +#if !MIN_VERSION_base(4,16,0) + _ -> case indexWord8Array# a1 (off1 +# 1#) of +#else + _ -> case word8ToWord# (indexWord8Array# a1 (off1 +# 1#)) of +#endif + 0x80## -> LT + _ -> go (off1 +# 1#) (off2 +# 1#) + _ -> case b2_1 of +#if !MIN_VERSION_base(4,16,0) + 0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of +#else + 0xC0## -> case word8ToWord# (indexWord8Array# a2 (off2 +# 1#)) of +#endif + 0x80## -> GT + _ -> go (off1 +# 1#) (off2 +# 1#) + _ | isTrue# (b1_1 `gtWord#` b2_1) -> GT + | isTrue# (b1_1 `ltWord#` b2_1) -> LT + | otherwise -> go (off1 +# 1#) (off2 +# 1#) + +utf8CountCharsByteArray# :: ByteArray# -> Int +utf8CountCharsByteArray# ba = go 0# 0# + where + len# = sizeofByteArray# ba + go i# n# + | isTrue# (i# >=# len#) = I# n# + | otherwise = + case utf8DecodeCharByteArray# ba i# of + (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#) + +{-# INLINE utf8EncodeChar #-} +utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s) + -> Char -> ST s Int +utf8EncodeChar write# c = + let x = fromIntegral (ord c) in + case () of + _ | x > 0 && x <= 0x007f -> do + write 0 x + return 1 + -- NB. '\0' is encoded as '\xC0\x80', not '\0'. This is so that we + -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8). + | x <= 0x07ff -> do + write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F)) + write 1 (0x80 .|. (x .&. 0x3F)) + return 2 + | x <= 0xffff -> do + write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F) + write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F) + write 2 (0x80 .|. (x .&. 0x3F)) + return 3 + | otherwise -> do + write 0 (0xF0 .|. (x `shiftR` 18)) + write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F)) + write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F)) + write 3 (0x80 .|. (x .&. 0x3F)) + return 4 + where + {-# INLINE write #-} + write (I# off#) (W# c#) = ST $ \s -> +#if !MIN_VERSION_base(4,16,0) + case write# off# (narrowWord8# c#) s of +#else + case write# off# (wordToWord8# c#) s of +#endif + s -> (# s, () #) + +utf8EncodePtr :: Ptr Word8 -> String -> IO () +utf8EncodePtr (Ptr a#) str = go a# str + where go !_ [] = return () + go a# (c:cs) = do +#if !MIN_VERSION_base(4,16,0) + -- writeWord8OffAddr# was taking a Word# + I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c +#else + I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c +#endif + go (a# `plusAddr#` off#) cs + +utf8EncodeByteArray# :: String -> ByteArray# +utf8EncodeByteArray# str = runRW# $ \s -> + case utf8EncodedLength str of { I# len# -> + case newByteArray# len# s of { (# s, mba# #) -> + case go mba# 0# str of { ST f_go -> + case f_go s of { (# s, () #) -> + case unsafeFreezeByteArray# mba# s of { (# _, ba# #) -> + ba# }}}}} + where + go _ _ [] = return () + go mba# i# (c:cs) = do +#if !MIN_VERSION_base(4,16,0) + -- writeWord8Array# was taking a Word# + I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c +#else + I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c +#endif + go mba# (i# +# off#) cs + +utf8EncodedLength :: String -> Int +utf8EncodedLength str = go 0 str + where go !n [] = n + go n (c:cs) + | ord c > 0 && ord c <= 0x007f = go (n+1) cs + | ord c <= 0x07ff = go (n+2) cs + | ord c <= 0xffff = go (n+3) cs + | otherwise = go (n+4) cs + diff --git a/libraries/ghc-boot/ghc-boot.cabal.in b/libraries/ghc-boot/ghc-boot.cabal.in index 3ca83a5e1c..531d445b57 100644 --- a/libraries/ghc-boot/ghc-boot.cabal.in +++ b/libraries/ghc-boot/ghc-boot.cabal.in @@ -45,6 +45,7 @@ Library GHC.Data.ShortText GHC.Data.SizedSeq GHC.Utils.Encoding + GHC.Utils.Encoding.UTF8 GHC.LanguageExtensions GHC.Unit.Database GHC.Serialized |