diff options
author | Ben Gamari <ben@smart-cactus.org> | 2022-07-17 10:48:25 -0400 |
---|---|---|
committer | Ben Gamari <ben@smart-cactus.org> | 2022-07-19 01:45:54 -0400 |
commit | 2c614d1868c3ea9c8d8bee6dd1f5b03f437903de (patch) | |
tree | dd90b486a9be9f1f9b08700c6e731e99fc1cfbdb | |
parent | 24c7168d87ba5aed1db9ef06bf00f99249b91d3e (diff) | |
download | haskell-wip/base-utf8-codecs.tar.gz |
Add a Note summarising GHC's UTF-8 implementations (branch: wip/base-utf8-codecs)
GHC has a somewhat dizzying array of UTF-8 implementations. This note
describes why this is the case.
-rw-r--r-- | libraries/base/GHC/Encoding/UTF8.hs | 40 | ||||
-rw-r--r-- | libraries/base/GHC/IO/Encoding/UTF8.hs | 4 | ||||
-rw-r--r-- | libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs | 5 | ||||
-rw-r--r-- | libraries/ghc-prim/GHC/CString.hs | 7 |
4 files changed, 54 insertions, 2 deletions
diff --git a/libraries/base/GHC/Encoding/UTF8.hs b/libraries/base/GHC/Encoding/UTF8.hs index 4563ce2e61..72ac176263 100644 --- a/libraries/base/GHC/Encoding/UTF8.hs +++ b/libraries/base/GHC/Encoding/UTF8.hs @@ -5,6 +5,11 @@ -- | Simple UTF-8 codecs supporting non-streaming encoding/decoding. -- For encoding where codepoints may be broken across buffers, -- see "GHC.IO.Encoding.UTF8". +-- +-- This is one of several UTF-8 implementations provided by GHC; see Note +-- [GHC's many UTF-8 implementations] in "GHC.Encoding.UTF8" for an +-- overview. +-- module GHC.Encoding.UTF8 ( -- * Decoding single characters utf8DecodeCharAddr# @@ -34,6 +39,41 @@ import GHC.Bits import GHC.Real import GHC.Ptr +{- +Note [GHC's many UTF-8 implementations] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Currently GHC ships with at least five UTF-8 implementations: + +a. the implementation used by GHC in `ghc-boot:GHC.Utils.Encoding`; this can be + used at a number of types including `Addr#`, `ByteArray#`, `ForeignPtr`, + `Ptr`, `ShortByteString`, and `ByteString`. Most of this can be removed in + GHC 9.6+2, when the copies in `base` will become available to `ghc-boot`. + +b. the copy of the `ghc-boot` definition now exported by `base:GHC.Encoding.UTF8`. + This can be used at `Addr#`, `Ptr`, `ByteArray#`, and `ForeignPtr`. + +c. the decoder used by `unpackCStringUtf8#` in `ghc-prim:GHC.CString`; this is + specialised at `Addr#`. + +d. the codec used by the IO subsystem in `base:GHC.IO.Encoding.UTF8`; this is + specialised at `Addr#` but, unlike the above, supports recovery in the presence + of partial codepoints (since in IO contexts codepoints may be broken across + buffers) + +e. the implementation provided by the `text` library + +On its face, this seems a tad silly. On the other hand, these implementations do +materially differ from one another (e.g. in the types they support, the +detail in errors they can report, and the ability to recover from partial +codepoints). 
Consequently, it's quite unclear that further consolidation +would be worthwhile. + +The most obvious opportunity is to move (b) into `ghc-prim` and use it to +implement (c) (namely `unpackCStringUtf8#` and friends). However, it's not +clear that this would be worthwhile as several of the types supported by (b) +are defined in `base`. +-} + -- We can't write the decoder as efficiently as we'd like without -- resorting to unboxed extensions, unfortunately. I tried to write -- an IO version of this function, but GHC can't eliminate boxed diff --git a/libraries/base/GHC/IO/Encoding/UTF8.hs b/libraries/base/GHC/IO/Encoding/UTF8.hs index 4513ab68b6..a8d30d9749 100644 --- a/libraries/base/GHC/IO/Encoding/UTF8.hs +++ b/libraries/base/GHC/IO/Encoding/UTF8.hs @@ -18,6 +18,10 @@ -- -- UTF-8 Codec for the IO library -- +-- This is one of several UTF-8 implementations provided by GHC; see Note +-- [GHC's many UTF-8 implementations] in "GHC.Encoding.UTF8" for an +-- overview. +-- -- Portions Copyright : (c) Tom Harper 2008-2009, -- (c) Bryan O'Sullivan 2009, -- (c) Duncan Coutts 2009 diff --git a/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs b/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs index fd63f84b0b..91e7b381c9 100644 --- a/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs +++ b/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs @@ -8,6 +8,11 @@ -- `ghc-boot` and uses ShortText, which in turn depends on this module. -- | Simple, non-streaming UTF-8 codecs. +-- +-- This is one of several UTF-8 implementations provided by GHC; see Note +-- [GHC's many UTF-8 implementations] in "GHC.Encoding.UTF8" for an +-- overview. 
+-- module GHC.Utils.Encoding.UTF8 ( -- * Decoding single characters utf8DecodeCharAddr# diff --git a/libraries/ghc-prim/GHC/CString.hs b/libraries/ghc-prim/GHC/CString.hs index 680d3c8a39..003bd9af65 100644 --- a/libraries/ghc-prim/GHC/CString.hs +++ b/libraries/ghc-prim/GHC/CString.hs @@ -225,7 +225,6 @@ unpackCStringUtf8# addr -- See Note [unpackCString# iterating over addr] !ch = indexCharOffAddr# addr 0# - unpackAppendCStringUtf8# :: Addr# -> [Char] -> [Char] {-# NOINLINE unpackAppendCStringUtf8# #-} -- See the NOINLINE note on unpackCString# @@ -288,9 +287,13 @@ cstringLength# = c_strlen ------------------------------ ---- UTF8 decoding utilities +--- UTF-8 decoding utilities ------------------------------ -- +-- This is one of several UTF-8 implementations provided by GHC; see Note +-- [GHC's many UTF-8 implementations] in "GHC.Encoding.UTF8" for an +-- overview. +-- -- These functions make explicit the logic that was originally -- part of unpackCStringUtf8. Since we want the same support for ascii -- and non-ascii a variety of functions needs the same logic. Instead |