summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libraries/base/GHC/IO/Encoding.hs28
1 files changed, 24 insertions, 4 deletions
diff --git a/libraries/base/GHC/IO/Encoding.hs b/libraries/base/GHC/IO/Encoding.hs
index bd541828e5..052955c2ea 100644
--- a/libraries/base/GHC/IO/Encoding.hs
+++ b/libraries/base/GHC/IO/Encoding.hs
@@ -175,8 +175,8 @@ char8 = Latin1.latin1
--
-- * @UTF-32@, @UTF-32BE@, @UTF-32LE@
--
--- On systems using GNU iconv (e.g. Linux), there is additional
--- notation for specifying how illegal characters are handled:
+-- There is additional notation (borrowed from GNU iconv) for specifying
+-- how illegal characters are handled:
--
-- * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
-- all illegal sequences on input to be ignored, and on output
@@ -186,6 +186,28 @@ char8 = Latin1.latin1
-- * a suffix of @\/\/TRANSLIT@ will choose a replacement character
-- for illegal sequences or code points.
--
+-- * a suffix of @\/\/ROUNDTRIP@ will use a PEP 383-style escape mechanism
+-- to represent any invalid bytes in the input as Unicode codepoints (specifically,
+-- as lone surrogates, which are normally invalid in UTF-32).
+-- Upon output, these special codepoints are detected and turned back into the
+-- corresponding original byte.
+--
+-- In theory, this mechanism allows arbitrary data to be roundtripped via
+-- a 'String' with no loss of data. In practice, there are two limitations
+-- to be aware of:
+--
+-- 1. This only stands a chance of working for an encoding which is an ASCII
+-- superset, as for security reasons we refuse to escape any bytes smaller
+-- than 128. Many encodings of interest are ASCII supersets (in particular,
+-- you can assume that the locale encoding is an ASCII superset) but others
+-- (such as UTF-16) are not.
+--
+-- 2. If the underlying encoding is not itself roundtrippable, this mechanism
+-- can fail. Roundtrippable encodings are those which have an injective mapping
+-- into Unicode. Almost all encodings meet this criterion, but some do not. Notably,
+-- Shift-JIS (CP932) and Big5 contain several different encodings of the same
+-- Unicode codepoint.
+--
-- On Windows, you can access supported code pages with the prefix
-- @CP@; for example, @\"CP1250\"@.
--
@@ -194,8 +216,6 @@ mkTextEncoding e = case mb_coding_failure_mode of
Nothing -> unknownEncodingErr e
Just cfm -> mkTextEncoding' cfm enc
where
- -- The only problem with actually documenting //IGNORE and //TRANSLIT as
- -- supported suffixes is that they are not necessarily supported with non-GNU iconv
(enc, suffix) = span (/= '/') e
mb_coding_failure_mode = case suffix of
"" -> Just ErrorOnCodingFailure