ghc-boot: Clean up UTF-8 codecs

In preparation for moving the UTF-8 codecs into `base`: * Move them to GHC.Utils.Encoding.UTF8 * Make names more consistent * Add some Haddocks
author: Ben Gamari <ben@smart-cactus.org> 2022-07-16 13:54:57 -0400
committer: Marge Bot <ben+marge-bot@smart-cactus.org> 2022-07-22 08:18:41 -0400
commit: ae1666353696b5d85938d8a2f5fb11fb66f21678 (patch)
tree: 7de167e7c2f27ec2acc300ba815677878c1bd738
parent: 2c5991ccaf45cb7e68e54d59a27ee144a4499edb (diff)
download: haskell-ae1666353696b5d85938d8a2f5fb11fb66f21678.tar.gz
10 files changed, 363 insertions, 315 deletions
diff --git a/compiler/GHC/CmmToAsm/Dwarf/Types.hs b/compiler/GHC/CmmToAsm/Dwarf/Types.hs
index e29f03e1d6..caa829db21 100644
--- a/compiler/GHC/CmmToAsm/Dwarf/Types.hs
+++ b/compiler/GHC/CmmToAsm/Dwarf/Types.hs
@@ -599,7 +599,7 @@ pprString str
   = pprString' $ hcat $ map escapeChar $
     if str `lengthIs` utf8EncodedLength str
     then str
-    else map (chr . fromIntegral) $ BS.unpack $ utf8EncodeString str
+    else map (chr . fromIntegral) $ BS.unpack $ utf8EncodeByteString str
 
 -- | Escape a single non-unicode character
 escapeChar :: Char -> SDoc
diff --git a/compiler/GHC/Data/FastString.hs b/compiler/GHC/Data/FastString.hs
index 131f174c41..98ca34c249 100644
--- a/compiler/GHC/Data/FastString.hs
+++ b/compiler/GHC/Data/FastString.hs
@@ -531,13 +531,13 @@ mkFastString :: String -> FastString
 {-# NOINLINE[1] mkFastString #-}
 mkFastString str =
   inlinePerformIO $ do
-    sbs <- utf8EncodeShortByteString str
+    let !sbs = utf8EncodeShortByteString str
     mkFastStringWith (mkNewFastStringShortByteString sbs) sbs
 
 -- The following rule is used to avoid polluting the non-reclaimable FastString
 -- table with transient strings when we only want their encoding.
 {-# RULES
-"bytesFS/mkFastString" forall x. bytesFS (mkFastString x) = utf8EncodeString x #-}
+"bytesFS/mkFastString" forall x. bytesFS (mkFastString x) = utf8EncodeByteString x #-}
 
 -- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
 mkFastStringByteList :: [Word8] -> FastString
@@ -554,7 +554,7 @@ mkNewFastStringShortByteString :: ShortByteString -> Int
                                -> FastMutInt -> IO FastString
 mkNewFastStringShortByteString sbs uid n_zencs = do
   let zstr = mkZFastString n_zencs sbs
-  chars <- countUTF8Chars sbs
+      chars = utf8CountCharsShortByteString sbs
   return (FastString uid chars sbs zstr)
 
 hashStr  :: ShortByteString -> Int
diff --git a/compiler/GHC/Data/StringBuffer.hs b/compiler/GHC/Data/StringBuffer.hs
index e6dcb14b6b..1426cf26e3 100644
--- a/compiler/GHC/Data/StringBuffer.hs
+++ b/compiler/GHC/Data/StringBuffer.hs
@@ -199,7 +199,7 @@ stringToStringBuffer str =
   let size = utf8EncodedLength str
   buf <- mallocForeignPtrArray (size+3)
   unsafeWithForeignPtr buf $ \ptr -> do
-    utf8EncodeStringPtr ptr str
+    utf8EncodePtr ptr str
     pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
     -- sentinels for UTF-8 decoding
   return (StringBuffer buf size 0)
@@ -297,7 +297,7 @@ prevChar (StringBuffer buf _   cur) _     =
   inlinePerformIO $
     unsafeWithForeignPtr buf $ \p -> do
       p' <- utf8PrevChar (p `plusPtr` cur)
-      return (fst (utf8DecodeChar p'))
+      return (fst (utf8DecodeCharPtr p'))
 
 -- -----------------------------------------------------------------------------
 -- Moving
@@ -383,7 +383,7 @@ lexemeToString :: StringBuffer
                -> String
 lexemeToString _ 0 = ""
 lexemeToString (StringBuffer buf _ cur) bytes =
-  utf8DecodeStringLazy buf cur bytes
+  utf8DecodeForeignPtr buf cur bytes
 
 lexemeToFastString :: StringBuffer
                    -> Int               -- ^ @n@, the number of bytes
@@ -405,7 +405,7 @@ decodePrevNChars n (StringBuffer buf _ cur) =
     go buf0 n acc p | n == 0 || buf0 >= p = return acc
     go buf0 n acc p = do
         p' <- utf8PrevChar p
-        let (c,_) = utf8DecodeChar p'
+        let (c,_) = utf8DecodeCharPtr p'
         go buf0 (n - 1) (c:acc) p'
 
 -- -----------------------------------------------------------------------------
@@ -414,7 +414,7 @@ parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
   = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
     go i x | i == len  = x
-           | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
+           | otherwise = case fst (utf8DecodeCharPtr (ptr `plusPtr` (cur + i))) of
                '_'  -> go (i + 1) x    -- skip "_" (#14473)
                char -> go (i + 1) (x * radix + toInteger (char_to_int char))
   in go 0 0
diff --git a/compiler/GHC/Hs/DocString.hs b/compiler/GHC/Hs/DocString.hs
index 3a557ee0e8..7e35004237 100644
--- a/compiler/GHC/Hs/DocString.hs
+++ b/compiler/GHC/Hs/DocString.hs
@@ -137,7 +137,7 @@ instance Outputable HsDocStringChunk where
 
 
 mkHsDocStringChunk :: String -> HsDocStringChunk
-mkHsDocStringChunk s = HsDocStringChunk (utf8EncodeString s)
+mkHsDocStringChunk s = HsDocStringChunk (utf8EncodeByteString s)
 
 -- | Create a 'HsDocString' from a UTF8-encoded 'ByteString'.
 mkHsDocStringChunkUtf8ByteString :: ByteString -> HsDocStringChunk
diff --git a/compiler/GHC/StgToCmm/Prof.hs b/compiler/GHC/StgToCmm/Prof.hs
index 478925122c..8c8f89dbe9 100644
--- a/compiler/GHC/StgToCmm/Prof.hs
+++ b/compiler/GHC/StgToCmm/Prof.hs
@@ -230,7 +230,7 @@ emitCostCentreDecl cc = do
   ; modl  <- newByteStringCLit (bytesFS $ moduleNameFS
                                         $ moduleName
                                         $ cc_mod cc)
-  ; loc <- newByteStringCLit $ utf8EncodeString $
+  ; loc <- newByteStringCLit $ utf8EncodeByteString $
                    renderWithContext ctx (ppr $! costCentreSrcSpan cc)
   ; let
      lits = [ zero platform,  -- StgInt ccID,
@@ -297,7 +297,7 @@ emitInfoTableProv ip = do
         ctx      = stgToCmmContext  cfg
         platform = stgToCmmPlatform cfg
   ; let (src, label) = maybe ("", "") (first (renderWithContext ctx . ppr)) (infoTableProv ip)
-        mk_string    = newByteStringCLit . utf8EncodeString
+        mk_string    = newByteStringCLit . utf8EncodeByteString
   ; label <- mk_string label
   ; modl  <- newByteStringCLit (bytesFS $ moduleNameFS
                                         $ moduleName mod)
diff --git a/compiler/GHC/Types/Literal.hs b/compiler/GHC/Types/Literal.hs
index 5b14ecc78d..b525fc94df 100644
--- a/compiler/GHC/Types/Literal.hs
+++ b/compiler/GHC/Types/Literal.hs
@@ -578,7 +578,7 @@ mkLitChar = LitChar
 mkLitString :: String -> Literal
 -- stored UTF-8 encoded
 mkLitString [] = LitString mempty
-mkLitString s  = LitString (utf8EncodeString s)
+mkLitString s  = LitString (utf8EncodeByteString s)
 
 mkLitBigNat :: Integer -> Literal
 mkLitBigNat x = assertPpr (x >= 0) (integer x)
diff --git a/libraries/ghc-boot/GHC/Data/ShortText.hs b/libraries/ghc-boot/GHC/Data/ShortText.hs
index 2b3038ccfc..929b65b481 100644
--- a/libraries/ghc-boot/GHC/Data/ShortText.hs
+++ b/libraries/ghc-boot/GHC/Data/ShortText.hs
@@ -67,14 +67,15 @@ instance IsString ShortText where
 
 -- | /O(n)/ Returns the length of the 'ShortText' in characters.
 codepointLength :: ShortText -> Int
-codepointLength st = unsafeDupablePerformIO $ countUTF8Chars (contents st)
+codepointLength st = utf8CountCharsShortByteString (contents st)
+
 -- | /O(1)/ Returns the length of the 'ShortText' in bytes.
 byteLength :: ShortText -> Int
 byteLength st = SBS.length $ contents st
 
 -- | /O(n)/ Convert a 'String' into a 'ShortText'.
 pack :: String -> ShortText
-pack s = unsafeDupablePerformIO $ ShortText <$> utf8EncodeShortByteString s
+pack s = ShortText $ utf8EncodeShortByteString s
 
 -- | /O(n)/ Convert a 'ShortText' into a 'String'.
 unpack :: ShortText -> String
diff --git a/libraries/ghc-boot/GHC/Utils/Encoding.hs b/libraries/ghc-boot/GHC/Utils/Encoding.hs
index 16386c69ca..183d29946f 100644
--- a/libraries/ghc-boot/GHC/Utils/Encoding.hs
+++ b/libraries/ghc-boot/GHC/Utils/Encoding.hs
@@ -17,21 +17,7 @@
 
 module GHC.Utils.Encoding (
         -- * UTF-8
-        utf8DecodeCharAddr#,
-        utf8PrevChar,
-        utf8CharStart,
-        utf8DecodeChar,
-        utf8DecodeByteString,
-        utf8UnconsByteString,
-        utf8DecodeShortByteString,
-        utf8CompareShortByteString,
-        utf8DecodeStringLazy,
-        utf8EncodeChar,
-        utf8EncodeString,
-        utf8EncodeStringPtr,
-        utf8EncodeShortByteString,
-        utf8EncodedLength,
-        countUTF8Chars,
+        module GHC.Utils.Encoding.UTF8,
 
         -- * Z-encoding
         UserString,
@@ -47,295 +33,11 @@ module GHC.Utils.Encoding (
 import Prelude
 
 import Foreign
-import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
 import Data.Char
 import qualified Data.Char as Char
 import Numeric
-import GHC.IO
-import GHC.ST
 
-import Data.ByteString (ByteString)
-import qualified Data.ByteString.Internal as BS
-import Data.ByteString.Short.Internal (ShortByteString(..))
-
-import GHC.Exts
-
--- -----------------------------------------------------------------------------
--- UTF-8
-
--- We can't write the decoder as efficiently as we'd like without
--- resorting to unboxed extensions, unfortunately.  I tried to write
--- an IO version of this function, but GHC can't eliminate boxed
--- results from an IO-returning function.
---
--- We assume we can ignore overflow when parsing a multibyte character here.
--- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
--- before decoding them (see "GHC.Data.StringBuffer").
-
-{-# INLINE utf8DecodeChar# #-}
-utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
-utf8DecodeChar# indexWord8# =
-  let !ch0 = word2Int# (indexWord8# 0#) in
-  case () of
-    _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
-
-      | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
-        let !ch1 = word2Int# (indexWord8# 1#) in
-        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
-        (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
-                  (ch1 -# 0x80#)),
-           2# #)
-
-      | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
-        let !ch1 = word2Int# (indexWord8# 1#) in
-        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
-        let !ch2 = word2Int# (indexWord8# 2#) in
-        if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
-        (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
-                 ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
-                  (ch2 -# 0x80#)),
-           3# #)
-
-     | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
-        let !ch1 = word2Int# (indexWord8# 1#) in
-        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
-        let !ch2 = word2Int# (indexWord8# 2#) in
-        if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
-        let !ch3 = word2Int# (indexWord8# 3#) in
-        if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
-        (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
-                 ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
-                 ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
-                  (ch3 -# 0x80#)),
-           4# #)
-
-      | otherwise -> fail 1#
-  where
-        -- all invalid sequences end up here:
-        fail :: Int# -> (# Char#, Int# #)
-        fail nBytes# = (# '\0'#, nBytes# #)
-        -- '\xFFFD' would be the usual replacement character, but
-        -- that's a valid symbol in Haskell, so will result in a
-        -- confusing parse error later on.  Instead we use '\0' which
-        -- will signal a lexer error immediately.
-
-utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
-utf8DecodeCharAddr# a# off# =
-#if !MIN_VERSION_base(4,16,0)
-    utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
-#else
-    utf8DecodeChar# (\i# -> word8ToWord# (indexWord8OffAddr# a# (i# +# off#)))
-#endif
-
-utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
-utf8DecodeCharByteArray# ba# off# =
-#if !MIN_VERSION_base(4,16,0)
-    utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
-#else
-    utf8DecodeChar# (\i# -> word8ToWord# (indexWord8Array# ba# (i# +# off#)))
-#endif
-
-
-utf8DecodeChar :: Ptr Word8 -> (Char, Int)
-utf8DecodeChar !(Ptr a#) =
-  case utf8DecodeCharAddr# a# 0# of
-    (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
-
--- UTF-8 is cleverly designed so that we can always figure out where
--- the start of the current character is, given any position in a
--- stream.  This function finds the start of the previous character,
--- assuming there *is* a previous character.
-utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
-utf8PrevChar p = utf8CharStart (p `plusPtr` (-1))
-
-utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
-utf8CharStart p = go p
- where go p = do w <- peek p
-                 if w >= 0x80 && w < 0xC0
-                        then go (p `plusPtr` (-1))
-                        else return p
-
-{-# INLINE utf8DecodeLazy# #-}
-utf8DecodeLazy# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
-utf8DecodeLazy# retain decodeChar# len#
-  = unpack 0#
-  where
-    unpack i#
-        | isTrue# (i# >=# len#) = retain >> return []
-        | otherwise =
-            case decodeChar# i# of
-              (# c#, nBytes# #) -> do
-                rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
-                return (C# c# : rest)
-
-utf8DecodeByteString :: ByteString -> [Char]
-utf8DecodeByteString (BS.PS fptr offset len)
-  = utf8DecodeStringLazy fptr offset len
-
-utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
-utf8UnconsByteString (BS.PS _ _ 0) = Nothing
-utf8UnconsByteString (BS.PS fptr offset len)
-  = unsafeDupablePerformIO $
-      withForeignPtr fptr $ \ptr -> do
-        let (c,n) = utf8DecodeChar (ptr `plusPtr` offset)
-        return $ Just (c, BS.PS fptr (offset + n) (len - n))
-
-utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
-utf8DecodeStringLazy fp offset (I# len#)
-  = unsafeDupablePerformIO $ do
-      let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
-      utf8DecodeLazy# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
--- Note that since utf8DecodeLazy# returns a thunk the lifetime of the
--- ForeignPtr actually needs to be longer than the lexical lifetime
--- withForeignPtr would provide here. That's why we use touchForeignPtr to
--- keep the fp alive until the last character has actually been decoded.
-
-utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
-utf8CompareShortByteString (SBS a1) (SBS a2) = go 0# 0#
-   -- UTF-8 has the property that sorting by bytes values also sorts by
-   -- code-points.
-   -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property
-   -- doesn't hold and we must explicitly check this case here.
-   -- Note that decoding every code point would also work but it would be much
-   -- more costly.
-   where
-       !sz1 = sizeofByteArray# a1
-       !sz2 = sizeofByteArray# a2
-       go off1 off2
-         | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ
-         | isTrue# (off1 >=# sz1)                          = LT
-         | isTrue# (off2 >=# sz2)                          = GT
-         | otherwise =
-#if !MIN_VERSION_base(4,16,0)
-               let !b1_1 = indexWord8Array# a1 off1
-                   !b2_1 = indexWord8Array# a2 off2
-#else
-               let !b1_1 = word8ToWord# (indexWord8Array# a1 off1)
-                   !b2_1 = word8ToWord# (indexWord8Array# a2 off2)
-#endif
-               in case b1_1 of
-                  0xC0## -> case b2_1 of
-                     0xC0## -> go (off1 +# 1#) (off2 +# 1#)
-#if !MIN_VERSION_base(4,16,0)
-                     _      -> case indexWord8Array# a1 (off1 +# 1#) of
-#else
-                     _      -> case word8ToWord# (indexWord8Array# a1 (off1 +# 1#)) of
-#endif
-                        0x80## -> LT
-                        _      -> go (off1 +# 1#) (off2 +# 1#)
-                  _      -> case b2_1 of
-#if !MIN_VERSION_base(4,16,0)
-                     0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
-#else
-                     0xC0## -> case word8ToWord# (indexWord8Array# a2 (off2 +# 1#)) of
-#endif
-                        0x80## -> GT
-                        _      -> go (off1 +# 1#) (off2 +# 1#)
-                     _   | isTrue# (b1_1 `gtWord#` b2_1) -> GT
-                         | isTrue# (b1_1 `ltWord#` b2_1) -> LT
-                         | otherwise                     -> go (off1 +# 1#) (off2 +# 1#)
-
-utf8DecodeShortByteString :: ShortByteString -> [Char]
-utf8DecodeShortByteString (SBS ba#)
-  = unsafeDupablePerformIO $
-      let len# = sizeofByteArray# ba# in
-      utf8DecodeLazy# (return ()) (utf8DecodeCharByteArray# ba#) len#
-
-countUTF8Chars :: ShortByteString -> IO Int
-countUTF8Chars (SBS ba) = go 0# 0#
-  where
-    len# = sizeofByteArray# ba
-    go i# n#
-      | isTrue# (i# >=# len#) =
-          return (I# n#)
-      | otherwise = do
-          case utf8DecodeCharByteArray# ba i# of
-            (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
-
-{-# INLINE utf8EncodeChar #-}
-utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s)
-               -> Char -> ST s Int
-utf8EncodeChar write# c =
-  let x = fromIntegral (ord c) in
-  case () of
-    _ | x > 0 && x <= 0x007f -> do
-          write 0 x
-          return 1
-        -- NB. '\0' is encoded as '\xC0\x80', not '\0'.  This is so that we
-        -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).
-      | x <= 0x07ff -> do
-          write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
-          write 1 (0x80 .|. (x .&. 0x3F))
-          return 2
-      | x <= 0xffff -> do
-          write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
-          write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
-          write 2 (0x80 .|. (x .&. 0x3F))
-          return 3
-      | otherwise -> do
-          write 0 (0xF0 .|. (x `shiftR` 18))
-          write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
-          write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
-          write 3 (0x80 .|. (x .&. 0x3F))
-          return 4
-  where
-    {-# INLINE write #-}
-    write (I# off#) (W# c#) = ST $ \s ->
-#if !MIN_VERSION_base(4,16,0)
-      case write# off# (narrowWord8# c#) s of
-#else
-      case write# off# (wordToWord8# c#) s of
-#endif
-        s -> (# s, () #)
-
-utf8EncodeString :: String -> ByteString
-utf8EncodeString s =
-  unsafePerformIO $ do
-    let len = utf8EncodedLength s
-    buf <- mallocForeignPtrBytes len
-    withForeignPtr buf $ \ptr -> do
-      utf8EncodeStringPtr ptr s
-      pure (BS.fromForeignPtr buf 0 len)
-
-utf8EncodeStringPtr :: Ptr Word8 -> String -> IO ()
-utf8EncodeStringPtr (Ptr a#) str = go a# str
-  where go !_   []   = return ()
-        go a# (c:cs) = do
-#if !MIN_VERSION_base(4,16,0)
-          -- writeWord8OffAddr# was taking a Word#
-          I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c
-#else
-          I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
-#endif
-          go (a# `plusAddr#` off#) cs
-
-utf8EncodeShortByteString :: String -> IO ShortByteString
-utf8EncodeShortByteString str = IO $ \s ->
-  case utf8EncodedLength str         of { I# len# ->
-  case newByteArray# len# s          of { (# s, mba# #) ->
-  case go mba# 0# str                of { ST f_go ->
-  case f_go s                        of { (# s, () #) ->
-  case unsafeFreezeByteArray# mba# s of { (# s, ba# #) ->
-  (# s, SBS ba# #) }}}}}
-  where
-    go _ _ [] = return ()
-    go mba# i# (c:cs) = do
-#if !MIN_VERSION_base(4,16,0)
-      -- writeWord8Array# was taking a Word#
-      I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c
-#else
-      I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
-#endif
-      go mba# (i# +# off#) cs
-
-utf8EncodedLength :: String -> Int
-utf8EncodedLength str = go 0 str
-  where go !n [] = n
-        go n (c:cs)
-          | ord c > 0 && ord c <= 0x007f = go (n+1) cs
-          | ord c <= 0x07ff = go (n+2) cs
-          | ord c <= 0xffff = go (n+3) cs
-          | otherwise       = go (n+4) cs
+import GHC.Utils.Encoding.UTF8
 
 -- -----------------------------------------------------------------------------
 -- Note [Z-Encoding]
diff --git a/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs b/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs
new file mode 100644
index 0000000000..d1d470ab25
--- /dev/null
+++ b/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs
@@ -0,0 +1,344 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, MultiWayIf #-}
+{-# OPTIONS_GHC -O2 -fno-warn-name-shadowing #-}
+-- We always optimise this, otherwise performance of a non-optimised
+-- compiler is severely affected. This module used to live in the `ghc`
+-- package but has been moved to `ghc-boot` because the definition
+-- of the package database (needed in both ghc and in ghc-pkg) lives in
+-- `ghc-boot` and uses ShortText, which in turn depends on this module.
+
+-- | Simple, non-streaming UTF-8 codecs.
+module GHC.Utils.Encoding.UTF8
+    ( -- * Decoding single characters
+      utf8DecodeCharAddr#
+    , utf8DecodeCharPtr
+    , utf8DecodeCharByteArray#
+    , utf8PrevChar
+    , utf8CharStart
+    , utf8UnconsByteString
+      -- * Decoding strings
+    , utf8DecodeByteString
+    , utf8DecodeShortByteString
+    , utf8DecodeForeignPtr
+    , utf8DecodeByteArray#
+      -- * Counting characters
+    , utf8CountCharsShortByteString
+    , utf8CountCharsByteArray#
+      -- * Comparison
+    , utf8CompareByteArray#
+    , utf8CompareShortByteString
+      -- * Encoding strings
+    , utf8EncodeByteArray#
+    , utf8EncodePtr
+    , utf8EncodeByteString
+    , utf8EncodeShortByteString
+    , utf8EncodedLength
+    ) where
+
+
+import Prelude
+
+import Foreign
+import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
+import Data.Char
+import GHC.IO
+import GHC.ST
+
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Internal as BS
+import Data.ByteString.Short.Internal (ShortByteString(..))
+
+import GHC.Exts
+
+-- | Find the start of the codepoint preceding the codepoint at the given
+-- 'Ptr'. This is undefined if there is no previous valid codepoint.
+utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
+utf8PrevChar p = utf8CharStart (p `plusPtr` (-1))
+
+-- | Find the start of the codepoint at the given 'Ptr'. This is undefined if
+-- there is no previous valid codepoint.
+utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
+utf8CharStart p = go p
+ where go p = do w <- peek p
+                 if w >= 0x80 && w < 0xC0
+                        then go (p `plusPtr` (-1))
+                        else return p
+
+utf8CountCharsShortByteString :: ShortByteString -> Int
+utf8CountCharsShortByteString (SBS ba) = utf8CountCharsByteArray# ba
+
+utf8DecodeShortByteString :: ShortByteString -> [Char]
+utf8DecodeShortByteString (SBS ba#) = utf8DecodeByteArray# ba#
+
+-- | Decode a 'ByteString' containing a UTF-8 string.
+utf8DecodeByteString :: ByteString -> [Char]
+utf8DecodeByteString (BS.PS fptr offset len)
+  = utf8DecodeForeignPtr fptr offset len
+
+utf8EncodeShortByteString :: String -> ShortByteString
+utf8EncodeShortByteString str = SBS (utf8EncodeByteArray# str)
+
+-- | Encode a 'String' into a 'ByteString'.
+utf8EncodeByteString :: String -> ByteString
+utf8EncodeByteString s =
+  unsafePerformIO $ do
+    let len = utf8EncodedLength s
+    buf <- mallocForeignPtrBytes len
+    withForeignPtr buf $ \ptr -> do
+      utf8EncodePtr ptr s
+      pure (BS.fromForeignPtr buf 0 len)
+
+utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
+utf8UnconsByteString (BS.PS _ _ 0) = Nothing
+utf8UnconsByteString (BS.PS fptr offset len)
+  = unsafeDupablePerformIO $
+      withForeignPtr fptr $ \ptr -> do
+        let (c,n) = utf8DecodeCharPtr (ptr `plusPtr` offset)
+        return $ Just (c, BS.PS fptr (offset + n) (len - n))
+
+utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
+utf8CompareShortByteString (SBS a1) (SBS a2) = utf8CompareByteArray# a1 a2
+
+-- We can't write the decoder as efficiently as we'd like without
+-- resorting to unboxed extensions, unfortunately.  I tried to write
+-- an IO version of this function, but GHC can't eliminate boxed
+-- results from an IO-returning function.
+--
+-- We assume we can ignore overflow when parsing a multibyte character here.
+-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
+-- before decoding them (see "GHC.Data.StringBuffer").
+
+{-# INLINE utf8DecodeChar# #-}
+-- | Decode a single codepoint from a byte buffer indexed by the given indexing
+-- function.
+utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
+utf8DecodeChar# indexWord8# =
+  let !ch0 = word2Int# (indexWord8# 0#) in
+  case () of
+    _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
+
+      | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
+        let !ch1 = word2Int# (indexWord8# 1#) in
+        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+        (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
+                  (ch1 -# 0x80#)),
+           2# #)
+
+      | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
+        let !ch1 = word2Int# (indexWord8# 1#) in
+        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+        let !ch2 = word2Int# (indexWord8# 2#) in
+        if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+        (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
+                 ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
+                  (ch2 -# 0x80#)),
+           3# #)
+
+     | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
+        let !ch1 = word2Int# (indexWord8# 1#) in
+        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+        let !ch2 = word2Int# (indexWord8# 2#) in
+        if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+        let !ch3 = word2Int# (indexWord8# 3#) in
+        if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
+        (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
+                 ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
+                 ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
+                  (ch3 -# 0x80#)),
+           4# #)
+
+      | otherwise -> fail 1#
+  where
+        -- all invalid sequences end up here:
+        fail :: Int# -> (# Char#, Int# #)
+        fail nBytes# = (# '\0'#, nBytes# #)
+        -- '\xFFFD' would be the usual replacement character, but
+        -- that's a valid symbol in Haskell, so will result in a
+        -- confusing parse error later on.  Instead we use '\0' which
+        -- will signal a lexer error immediately.
+
+-- | Decode a single character at the given 'Addr#'.
+utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharAddr# a# off# =
+#if !MIN_VERSION_base(4,16,0)
+    utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
+#else
+    utf8DecodeChar# (\i# -> word8ToWord# (indexWord8OffAddr# a# (i# +# off#)))
+#endif
+
+-- | Decode a single codepoint starting at the given 'Ptr'.
+utf8DecodeCharPtr :: Ptr Word8 -> (Char, Int)
+utf8DecodeCharPtr !(Ptr a#) =
+  case utf8DecodeCharAddr# a# 0# of
+    (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
+
+-- | Decode a single codepoint starting at the given byte offset into a
+-- 'ByteArray#'.
+utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharByteArray# ba# off# =
+#if !MIN_VERSION_base(4,16,0)
+    utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
+#else
+    utf8DecodeChar# (\i# -> word8ToWord# (indexWord8Array# ba# (i# +# off#)))
+#endif
+
+{-# INLINE utf8Decode# #-}
+utf8Decode# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
+utf8Decode# retain decodeChar# len#
+  = unpack 0#
+  where
+    unpack i#
+        | isTrue# (i# >=# len#) = retain >> return []
+        | otherwise =
+            case decodeChar# i# of
+              (# c#, nBytes# #) -> do
+                rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
+                return (C# c# : rest)
+
+utf8DecodeForeignPtr :: ForeignPtr Word8 -> Int -> Int -> [Char]
+utf8DecodeForeignPtr fp offset (I# len#)
+  = unsafeDupablePerformIO $ do
+      let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
+      utf8Decode# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
+-- Note that since utf8Decode# returns a thunk the lifetime of the
+-- ForeignPtr actually needs to be longer than the lexical lifetime
+-- withForeignPtr would provide here. That's why we use touchForeignPtr to
+-- keep the fp alive until the last character has actually been decoded.
+
+utf8DecodeByteArray# :: ByteArray# -> [Char]
+utf8DecodeByteArray# ba#
+  = unsafeDupablePerformIO $
+      let len# = sizeofByteArray# ba# in
+      utf8Decode# (return ()) (utf8DecodeCharByteArray# ba#) len#
+
+utf8CompareByteArray# :: ByteArray# -> ByteArray# -> Ordering
+utf8CompareByteArray# a1 a2 = go 0# 0#
+   -- UTF-8 has the property that sorting by bytes values also sorts by
+   -- code-points.
+   -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property
+   -- doesn't hold and we must explicitly check this case here.
+   -- Note that decoding every code point would also work but it would be much
+   -- more costly.
+   where
+       !sz1 = sizeofByteArray# a1
+       !sz2 = sizeofByteArray# a2
+       go off1 off2
+         | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ
+         | isTrue# (off1 >=# sz1)                          = LT
+         | isTrue# (off2 >=# sz2)                          = GT
+         | otherwise =
+#if !MIN_VERSION_base(4,16,0)
+               let !b1_1 = indexWord8Array# a1 off1
+                   !b2_1 = indexWord8Array# a2 off2
+#else
+               let !b1_1 = word8ToWord# (indexWord8Array# a1 off1)
+                   !b2_1 = word8ToWord# (indexWord8Array# a2 off2)
+#endif
+               in case b1_1 of
+                  0xC0## -> case b2_1 of
+                     0xC0## -> go (off1 +# 1#) (off2 +# 1#)
+#if !MIN_VERSION_base(4,16,0)
+                     _      -> case indexWord8Array# a1 (off1 +# 1#) of
+#else
+                     _      -> case word8ToWord# (indexWord8Array# a1 (off1 +# 1#)) of
+#endif
+                        0x80## -> LT
+                        _      -> go (off1 +# 1#) (off2 +# 1#)
+                  _      -> case b2_1 of
+#if !MIN_VERSION_base(4,16,0)
+                     0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
+#else
+                     0xC0## -> case word8ToWord# (indexWord8Array# a2 (off2 +# 1#)) of
+#endif
+                        0x80## -> GT
+                        _      -> go (off1 +# 1#) (off2 +# 1#)
+                     _   | isTrue# (b1_1 `gtWord#` b2_1) -> GT
+                         | isTrue# (b1_1 `ltWord#` b2_1) -> LT
+                         | otherwise                     -> go (off1 +# 1#) (off2 +# 1#)
+
+utf8CountCharsByteArray# :: ByteArray# -> Int
+utf8CountCharsByteArray# ba = go 0# 0#
+  where
+    len# = sizeofByteArray# ba
+    go i# n#
+      | isTrue# (i# >=# len#) = I# n#
+      | otherwise =
+          case utf8DecodeCharByteArray# ba i# of
+            (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
+
+{-# INLINE utf8EncodeChar #-}
+utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s)
+               -> Char -> ST s Int
+utf8EncodeChar write# c =
+  let x = fromIntegral (ord c) in
+  case () of
+    _ | x > 0 && x <= 0x007f -> do
+          write 0 x
+          return 1
+        -- NB. '\0' is encoded as '\xC0\x80', not '\0'.  This is so that we
+        -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).
+      | x <= 0x07ff -> do
+          write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
+          write 1 (0x80 .|. (x .&. 0x3F))
+          return 2
+      | x <= 0xffff -> do
+          write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
+          write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
+          write 2 (0x80 .|. (x .&. 0x3F))
+          return 3
+      | otherwise -> do
+          write 0 (0xF0 .|. (x `shiftR` 18))
+          write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
+          write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
+          write 3 (0x80 .|. (x .&. 0x3F))
+          return 4
+  where
+    {-# INLINE write #-}
+    write (I# off#) (W# c#) = ST $ \s ->
+#if !MIN_VERSION_base(4,16,0)
+      case write# off# (narrowWord8# c#) s of
+#else
+      case write# off# (wordToWord8# c#) s of
+#endif
+        s -> (# s, () #)
+
+utf8EncodePtr :: Ptr Word8 -> String -> IO ()
+utf8EncodePtr (Ptr a#) str = go a# str
+  where go !_   []   = return ()
+        go a# (c:cs) = do
+#if !MIN_VERSION_base(4,16,0)
+          -- writeWord8OffAddr# was taking a Word#
+          I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c
+#else
+          I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
+#endif
+          go (a# `plusAddr#` off#) cs
+
+utf8EncodeByteArray# :: String -> ByteArray#
+utf8EncodeByteArray# str = runRW# $ \s ->
+  case utf8EncodedLength str         of { I# len# ->
+  case newByteArray# len# s          of { (# s, mba# #) ->
+  case go mba# 0# str                of { ST f_go ->
+  case f_go s                        of { (# s, () #) ->
+  case unsafeFreezeByteArray# mba# s of { (# _, ba# #) ->
+  ba# }}}}}
+  where
+    go _ _ [] = return ()
+    go mba# i# (c:cs) = do
+#if !MIN_VERSION_base(4,16,0)
+      -- writeWord8Array# was taking a Word#
+      I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c
+#else
+      I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
+#endif
+      go mba# (i# +# off#) cs
+
+utf8EncodedLength :: String -> Int
+utf8EncodedLength str = go 0 str
+  where go !n [] = n
+        go n (c:cs)
+          | ord c > 0 && ord c <= 0x007f = go (n+1) cs
+          | ord c <= 0x07ff = go (n+2) cs
+          | ord c <= 0xffff = go (n+3) cs
+          | otherwise       = go (n+4) cs
+
diff --git a/libraries/ghc-boot/ghc-boot.cabal.in b/libraries/ghc-boot/ghc-boot.cabal.in
index 3ca83a5e1c..531d445b57 100644
--- a/libraries/ghc-boot/ghc-boot.cabal.in
+++ b/libraries/ghc-boot/ghc-boot.cabal.in
@@ -45,6 +45,7 @@ Library
             GHC.Data.ShortText
             GHC.Data.SizedSeq
             GHC.Utils.Encoding
+            GHC.Utils.Encoding.UTF8
             GHC.LanguageExtensions
             GHC.Unit.Database
             GHC.Serialized
author	Ben Gamari <ben@smart-cactus.org>	2022-07-16 13:54:57 -0400
committer	Marge Bot <ben+marge-bot@smart-cactus.org>	2022-07-22 08:18:41 -0400
commit	ae1666353696b5d85938d8a2f5fb11fb66f21678 (patch)
tree	7de167e7c2f27ec2acc300ba815677878c1bd738
parent	2c5991ccaf45cb7e68e54d59a27ee144a4499edb (diff)
download	haskell-ae1666353696b5d85938d8a2f5fb11fb66f21678.tar.gz