base: Introduce GHC.Encoding.UTF8

Here we copy a subset of the UTF-8 implementation living in `ghc-boot` into `base`, with the intent of dropping the former in the future. For this reason, the `ghc-boot` copy is now CPP-guarded on `MIN_VERSION_base(4,18,0)`. Naturally, we can't copy *all* of the functions defined by `ghc-boot` as some depend upon `bytestring`; we rather just copy those which only depend upon `base` and `ghc-prim`. Further consolidation? ---------------------- Currently GHC ships with at least five UTF-8 implementations: * the implementation used by GHC in `ghc-boot:GHC.Utils.Encoding`; this can be used at a number of types including `Addr#`, `ByteArray#`, `ForeignPtr`, `Ptr`, `ShortByteString`, and `ByteString`. Most of this can be removed in GHC 9.6+2, when the copies in `base` will become available to `ghc-boot`. * the copy of the `ghc-boot` definition now exported by `base:GHC.Encoding.UTF8`. This can be used at `Addr#`, `Ptr`, `ByteArray#`, and `ForeignPtr` * the decoder used by `unpackCStringUtf8#` in `ghc-prim:GHC.CString`; this is specialised at `Addr#`. * the codec used by the IO subsystem in `base:GHC.IO.Encoding.UTF8`; this is specialised at `Addr#` but, unlike the above, supports recovery in the presence of partial codepoints (since in IO contexts codepoints may be broken across buffers) * the implementation provided by the `text` library This does seem a tad silly. On the other hand, these implementations *do* materially differ from one another (e.g. in the types they support, the detail in errors they can report, and the ability to recover from partial codepoints). Consequently, it's quite unclear that further consolidate would be worthwhile.
author: Ben Gamari <ben@smart-cactus.org> 2022-07-16 15:57:15 -0400
committer: Marge Bot <ben+marge-bot@smart-cactus.org> 2022-07-22 08:18:41 -0400
commit: e8ac91dbaeeb1296c4ae22506b01faf3b6579e9a (patch)
tree: 613e766873eb151289bafe7e128f9c88e2430bf9
parent: ae1666353696b5d85938d8a2f5fb11fb66f21678 (diff)
download: haskell-e8ac91dbaeeb1296c4ae22506b01faf3b6579e9a.tar.gz
3 files changed, 288 insertions, 0 deletions
diff --git a/libraries/base/GHC/Encoding/UTF8.hs b/libraries/base/GHC/Encoding/UTF8.hs
new file mode 100644
index 0000000000..4563ce2e61
--- /dev/null
+++ b/libraries/base/GHC/Encoding/UTF8.hs
@@ -0,0 +1,278 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, NoImplicitPrelude #-}
+{-# OPTIONS_GHC -O2 -fno-warn-name-shadowing #-}
+
+-- | Simple UTF-8 codecs supporting non-streaming encoding/decoding.
+-- For encoding where codepoints may be broken across buffers,
+-- see "GHC.IO.Encoding.UTF8".
+module GHC.Encoding.UTF8
+    ( -- * Decoding single characters
+      utf8DecodeCharAddr#
+    , utf8DecodeCharPtr
+    , utf8DecodeCharByteArray#
+      -- * Decoding strings
+    , utf8DecodeByteArray#
+    , utf8DecodeForeignPtr
+      -- * Counting characters
+    , utf8CountCharsByteArray#
+      -- * Comparison
+    , utf8CompareByteArray#
+      -- * Encoding strings
+    , utf8EncodePtr
+    , utf8EncodeByteArray#
+    , utf8EncodedLength
+    ) where
+
+import GHC.Types
+import GHC.Base
+import GHC.IO
+import GHC.ST
+import GHC.Word
+import GHC.ForeignPtr
+import GHC.Num
+import GHC.Bits
+import GHC.Real
+import GHC.Ptr
+
+-- We can't write the decoder as efficiently as we'd like without
+-- resorting to unboxed extensions, unfortunately.  I tried to write
+-- an IO version of this function, but GHC can't eliminate boxed
+-- results from an IO-returning function.
+--
+-- We assume we can ignore overflow when parsing a multibyte character here.
+-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
+-- before decoding them (see "GHC.Data.StringBuffer").
+
+{-# INLINE utf8DecodeChar# #-}
+-- | Decode a single codepoint from a byte buffer indexed by the given indexing
+-- function.
+utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
+utf8DecodeChar# indexWord8# =
+  let !ch0 = word2Int# (indexWord8# 0#) in
+  case () of
+    _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
+
+      | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
+        let !ch1 = word2Int# (indexWord8# 1#) in
+        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+        (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
+                  (ch1 -# 0x80#)),
+           2# #)
+
+      | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
+        let !ch1 = word2Int# (indexWord8# 1#) in
+        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+        let !ch2 = word2Int# (indexWord8# 2#) in
+        if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+        (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
+                 ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
+                  (ch2 -# 0x80#)),
+           3# #)
+
+     | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
+        let !ch1 = word2Int# (indexWord8# 1#) in
+        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+        let !ch2 = word2Int# (indexWord8# 2#) in
+        if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+        let !ch3 = word2Int# (indexWord8# 3#) in
+        if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
+        (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
+                 ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
+                 ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
+                  (ch3 -# 0x80#)),
+           4# #)
+
+      | otherwise -> fail 1#
+  where
+        -- all invalid sequences end up here:
+        fail :: Int# -> (# Char#, Int# #)
+        fail nBytes# = (# '\0'#, nBytes# #)
+        -- '\xFFFD' would be the usual replacement character, but
+        -- that's a valid symbol in Haskell, so will result in a
+        -- confusing parse error later on.  Instead we use '\0' which
+        -- will signal a lexer error immediately.
+
+-- | Decode a single character at the given 'Addr#'.
+utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharAddr# a# off# =
+#if !MIN_VERSION_base(4,16,0)
+    utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
+#else
+    utf8DecodeChar# (\i# -> word8ToWord# (indexWord8OffAddr# a# (i# +# off#)))
+#endif
+
+-- | Decode a single codepoint starting at the given 'Ptr'.
+utf8DecodeCharPtr :: Ptr Word8 -> (Char, Int)
+utf8DecodeCharPtr !(Ptr a#) =
+  case utf8DecodeCharAddr# a# 0# of
+    (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
+
+-- | Decode a single codepoint starting at the given byte offset into a
+-- 'ByteArray#'.
+utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharByteArray# ba# off# =
+#if !MIN_VERSION_base(4,16,0)
+    utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
+#else
+    utf8DecodeChar# (\i# -> word8ToWord# (indexWord8Array# ba# (i# +# off#)))
+#endif
+
+{-# INLINE utf8Decode# #-}
+utf8Decode# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
+utf8Decode# retain decodeChar# len#
+  = unpack 0#
+  where
+    unpack i#
+        | isTrue# (i# >=# len#) = retain >> return []
+        | otherwise =
+            case decodeChar# i# of
+              (# c#, nBytes# #) -> do
+                rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
+                return (C# c# : rest)
+
+utf8DecodeForeignPtr :: ForeignPtr Word8 -> Int -> Int -> [Char]
+utf8DecodeForeignPtr fp offset (I# len#)
+  = unsafeDupablePerformIO $ do
+      let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
+      utf8Decode# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
+-- Note that since utf8Decode# returns a thunk the lifetime of the
+-- ForeignPtr actually needs to be longer than the lexical lifetime
+-- withForeignPtr would provide here. That's why we use touchForeignPtr to
+-- keep the fp alive until the last character has actually been decoded.
+
+utf8DecodeByteArray# :: ByteArray# -> [Char]
+utf8DecodeByteArray# ba#
+  = unsafeDupablePerformIO $
+      let len# = sizeofByteArray# ba# in
+      utf8Decode# (return ()) (utf8DecodeCharByteArray# ba#) len#
+
+utf8CompareByteArray# :: ByteArray# -> ByteArray# -> Ordering
+utf8CompareByteArray# a1 a2 = go 0# 0#
+   -- UTF-8 has the property that sorting by bytes values also sorts by
+   -- code-points.
+   -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property
+   -- doesn't hold and we must explicitly check this case here.
+   -- Note that decoding every code point would also work but it would be much
+   -- more costly.
+   where
+       !sz1 = sizeofByteArray# a1
+       !sz2 = sizeofByteArray# a2
+       go off1 off2
+         | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ
+         | isTrue# (off1 >=# sz1)                          = LT
+         | isTrue# (off2 >=# sz2)                          = GT
+         | otherwise =
+#if !MIN_VERSION_base(4,16,0)
+               let !b1_1 = indexWord8Array# a1 off1
+                   !b2_1 = indexWord8Array# a2 off2
+#else
+               let !b1_1 = word8ToWord# (indexWord8Array# a1 off1)
+                   !b2_1 = word8ToWord# (indexWord8Array# a2 off2)
+#endif
+               in case b1_1 of
+                  0xC0## -> case b2_1 of
+                     0xC0## -> go (off1 +# 1#) (off2 +# 1#)
+#if !MIN_VERSION_base(4,16,0)
+                     _      -> case indexWord8Array# a1 (off1 +# 1#) of
+#else
+                     _      -> case word8ToWord# (indexWord8Array# a1 (off1 +# 1#)) of
+#endif
+                        0x80## -> LT
+                        _      -> go (off1 +# 1#) (off2 +# 1#)
+                  _      -> case b2_1 of
+#if !MIN_VERSION_base(4,16,0)
+                     0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
+#else
+                     0xC0## -> case word8ToWord# (indexWord8Array# a2 (off2 +# 1#)) of
+#endif
+                        0x80## -> GT
+                        _      -> go (off1 +# 1#) (off2 +# 1#)
+                     _   | isTrue# (b1_1 `gtWord#` b2_1) -> GT
+                         | isTrue# (b1_1 `ltWord#` b2_1) -> LT
+                         | otherwise                     -> go (off1 +# 1#) (off2 +# 1#)
+
+utf8CountCharsByteArray# :: ByteArray# -> Int
+utf8CountCharsByteArray# ba = go 0# 0#
+  where
+    len# = sizeofByteArray# ba
+    go i# n#
+      | isTrue# (i# >=# len#) = I# n#
+      | otherwise =
+          case utf8DecodeCharByteArray# ba i# of
+            (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
+
+{-# INLINE utf8EncodeChar #-}
+utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s)
+               -> Char -> ST s Int
+utf8EncodeChar write# c =
+  let x = fromIntegral (ord c) in
+  case () of
+    _ | x > 0 && x <= 0x007f -> do
+          write 0 x
+          return 1
+        -- NB. '\0' is encoded as '\xC0\x80', not '\0'.  This is so that we
+        -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).
+      | x <= 0x07ff -> do
+          write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
+          write 1 (0x80 .|. (x .&. 0x3F))
+          return 2
+      | x <= 0xffff -> do
+          write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
+          write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
+          write 2 (0x80 .|. (x .&. 0x3F))
+          return 3
+      | otherwise -> do
+          write 0 (0xF0 .|. (x `shiftR` 18))
+          write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
+          write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
+          write 3 (0x80 .|. (x .&. 0x3F))
+          return 4
+  where
+    {-# INLINE write #-}
+    write (I# off#) (W# c#) = ST $ \s ->
+#if !MIN_VERSION_base(4,16,0)
+      case write# off# (narrowWord8# c#) s of
+#else
+      case write# off# (wordToWord8# c#) s of
+#endif
+        s -> (# s, () #)
+
+utf8EncodePtr :: Ptr Word8 -> String -> IO ()
+utf8EncodePtr (Ptr a#) str = go a# str
+  where go !_   []   = return ()
+        go a# (c:cs) = do
+#if !MIN_VERSION_base(4,16,0)
+          -- writeWord8OffAddr# was taking a Word#
+          I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c
+#else
+          I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
+#endif
+          go (a# `plusAddr#` off#) cs
+
+utf8EncodeByteArray# :: String -> ByteArray#
+utf8EncodeByteArray# str = runRW# $ \s ->
+  case utf8EncodedLength str         of { I# len# ->
+  case newByteArray# len# s          of { (# s, mba# #) ->
+  case go mba# 0# str                of { ST f_go ->
+  case f_go s                        of { (# s, () #) ->
+  case unsafeFreezeByteArray# mba# s of { (# _, ba# #) ->
+  ba# }}}}}
+  where
+    go _ _ [] = return ()
+    go mba# i# (c:cs) = do
+#if !MIN_VERSION_base(4,16,0)
+      -- writeWord8Array# was taking a Word#
+      I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c
+#else
+      I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
+#endif
+      go mba# (i# +# off#) cs
+
+utf8EncodedLength :: String -> Int
+utf8EncodedLength str = go 0 str
+  where go !n [] = n
+        go n (c:cs)
+          | ord c > 0 && ord c <= 0x007f = go (n+1) cs
+          | ord c <= 0x07ff = go (n+2) cs
+          | ord c <= 0xffff = go (n+3) cs
+          | otherwise       = go (n+4) cs
diff --git a/libraries/base/base.cabal b/libraries/base/base.cabal
index 58e11e30f7..6965827c0a 100644
--- a/libraries/base/base.cabal
+++ b/libraries/base/base.cabal
@@ -202,6 +202,7 @@ Library
         GHC.ConsoleHandler
         GHC.Constants
         GHC.Desugar
+        GHC.Encoding.UTF8
         GHC.Enum
         GHC.Environment
         GHC.Err
diff --git a/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs b/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs
index d1d470ab25..fd63f84b0b 100644
--- a/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs
+++ b/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs
@@ -99,6 +99,14 @@ utf8UnconsByteString (BS.PS fptr offset len)
 utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
 utf8CompareShortByteString (SBS a1) (SBS a2) = utf8CompareByteArray# a1 a2
 
+---------------------------------------------------------
+-- Everything below was moved into base in GHC 9.6
+--
+-- These can be dropped in GHC 9.6 + 2 major releases.
+---------------------------------------------------------
+
+#if !MIN_VERSION_base(4,18,0)
+
 -- We can't write the decoder as efficiently as we'd like without
 -- resorting to unboxed extensions, unfortunately.  I tried to write
 -- an IO version of this function, but GHC can't eliminate boxed
@@ -342,3 +350,4 @@ utf8EncodedLength str = go 0 str
           | ord c <= 0xffff = go (n+3) cs
           | otherwise       = go (n+4) cs
 
+#endif /* MIN_VERSION_base(4,18,0) */
author	Ben Gamari <ben@smart-cactus.org>	2022-07-16 15:57:15 -0400
committer	Marge Bot <ben+marge-bot@smart-cactus.org>	2022-07-22 08:18:41 -0400
commit	e8ac91dbaeeb1296c4ae22506b01faf3b6579e9a (patch)
tree	613e766873eb151289bafe7e128f9c88e2430bf9
parent	ae1666353696b5d85938d8a2f5fb11fb66f21678 (diff)
download	haskell-e8ac91dbaeeb1296c4ae22506b01faf3b6579e9a.tar.gz