libraries/base/GHC/IO/Encoding/Failure.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207

{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE NoImplicitPrelude #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  GHC.IO.Encoding.Failure
-- Copyright   :  (c) The University of Glasgow, 2008-2011
-- License     :  see libraries/base/LICENSE
--
-- Maintainer  :  libraries@haskell.org
-- Stability   :  internal
-- Portability :  non-portable
--
-- Types for specifying how text encoding/decoding fails
--
-----------------------------------------------------------------------------

module GHC.IO.Encoding.Failure (
    CodingFailureMode(..), codingFailureModeSuffix,
    isSurrogate,
    recoverDecode, recoverEncode
  ) where

import GHC.IO
import GHC.IO.Buffer
import GHC.IO.Exception

import GHC.Base
import GHC.Char
import GHC.Word
import GHC.Show
import GHC.Num
import GHC.Real ( fromIntegral )

--import System.Posix.Internals

-- | The 'CodingFailureMode' is used to construct 'System.IO.TextEncoding's,
-- and specifies how they handle illegal sequences.
data CodingFailureMode
  = ErrorOnCodingFailure
       -- ^ Throw an error when an illegal sequence is encountered
  | IgnoreCodingFailure
       -- ^ Attempt to ignore and recover if an illegal sequence is
       -- encountered
  | TransliterateCodingFailure
       -- ^ Replace with the closest visual match upon an illegal
       -- sequence
  | RoundtripFailure
       -- ^ Use the private-use escape mechanism to attempt to allow
       -- illegal sequences to be roundtripped.
  deriving ( Show -- ^ @since 4.4.0.0
           )
       -- This will only work properly for those encodings which are
       -- strict supersets of ASCII in the sense that valid ASCII data
       -- is also valid in that encoding. This is not true for
       -- e.g. UTF-16, because ASCII characters must be padded to two
       -- bytes to retain their meaning.

-- Note [Roundtripping]
-- ~~~~~~~~~~~~~~~~~~~~
-- Roundtripping is based on the ideas of PEP383.
--
-- We used to use the range of private-use characters from 0xEF80 to
-- 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registry
-- to encode these characters.
--
-- However, people didn't like this because it means we don't get
-- guaranteed roundtripping for byte sequences that look like a UTF-8
-- encoded codepoint 0xEFxx.
--
-- So now like PEP383 we use lone surrogate codepoints 0xDCxx to escape
-- undecodable bytes, even though that may confuse Unicode processing
-- software written in Haskell. This guarantees roundtripping because
-- unicode input that includes lone surrogate codepoints is invalid by
-- definition.
--
--
-- When we used private-use characters there was a technical problem when it
-- came to encoding back to bytes using iconv. The iconv code will not fail when
-- it tries to encode a private-use character (as it would if trying to encode
-- a surrogate), which means that we wouldn't get a chance to replace it
-- with the byte we originally escaped.
--
-- To work around this, when filling the buffer to be encoded (in
-- writeBlocks/withEncodedCString/newEncodedCString), we replaced the
-- private-use characters with lone surrogates again! Likewise, when
-- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we had
-- to do the inverse process.
--
-- The user of String would never see these lone surrogates, but it
-- ensured that iconv will throw an error when encountering them.  We
-- used lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.

codingFailureModeSuffix :: CodingFailureMode -> String
codingFailureModeSuffix ErrorOnCodingFailure       = ""
codingFailureModeSuffix IgnoreCodingFailure        = "//IGNORE"
codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
codingFailureModeSuffix RoundtripFailure           = "//ROUNDTRIP"

-- | In transliterate mode, we use this character when decoding
-- unknown bytes.
--
-- This is the defined Unicode replacement character:
-- <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
unrepresentableChar :: Char
unrepresentableChar = '\xFFFD'

-- It is extraordinarily important that this series of
-- predicates/transformers gets inlined, because they tend to be used
-- in inner loops related to text encoding. In particular,
-- surrogatifyRoundtripCharacter must be inlined (see #5536)

-- | Some characters are actually "surrogate" codepoints defined for
-- use in UTF-16. We need to signal an invalid character if we detect
-- them when encoding a sequence of 'Char's into 'Word8's because they
-- won't give valid Unicode.
--
-- We may also need to signal an invalid character if we detect them
-- when encoding a sequence of 'Char's into 'Word8's because the
-- 'RoundtripFailure' mode creates these to round-trip bytes through
-- our internal UTF-16 encoding.
{-# INLINE isSurrogate #-}
isSurrogate :: Char -> Bool
isSurrogate c = (0xD800 <= x && x <= 0xDBFF)
             || (0xDC00 <= x && x <= 0xDFFF)
  where x = ord c

-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
{-# INLINE escapeToRoundtripCharacterSurrogate #-}
escapeToRoundtripCharacterSurrogate :: Word8 -> Char
escapeToRoundtripCharacterSurrogate b
  | b < 128   = chr (fromIntegral b)
      -- Disallow 'smuggling' of ASCII bytes. For roundtripping to
      -- work, this assumes encoding is ASCII-superset.
  | otherwise = chr (0xDC00 + fromIntegral b)

-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
{-# INLINE unescapeRoundtripCharacterSurrogate #-}
unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
unescapeRoundtripCharacterSurrogate c
    | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
    | otherwise                 = Nothing
  where x = ord c

recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char
              -> IO (Buffer Word8, Buffer Char)
recoverDecode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
                  output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } =
 --puts $ "recoverDecode " ++ show ir
 case cfm of
  ErrorOnCodingFailure       -> do
      b <- readWord8Buf iraw ir
      ioe_decodingError b
  IgnoreCodingFailure        -> return (input { bufL=ir+1 }, output)
  TransliterateCodingFailure -> do
      ow' <- writeCharBuf oraw ow unrepresentableChar
      return (input { bufL=ir+1 }, output { bufR=ow' })
  RoundtripFailure           -> do
      b <- readWord8Buf iraw ir
      ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
      return (input { bufL=ir+1 }, output { bufR=ow' })

recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8
              -> IO (Buffer Char, Buffer Word8)
recoverEncode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
                  output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
  (c,ir') <- readCharBuf iraw ir
  --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
  case cfm of
    IgnoreCodingFailure        -> return (input { bufL=ir' }, output)
    TransliterateCodingFailure -> do
        if c == '?'
         then return (input { bufL=ir' }, output)
         else do
          -- XXX: evil hack! To implement transliteration, we just
          -- poke an ASCII ? into the input buffer and tell the caller
          -- to try and decode again. This is *probably* safe given
          -- current uses of TextEncoding.
          --
          -- The "if" test above ensures we skip if the encoding fails
          -- to deal with the ?, though this should never happen in
          -- practice as all encodings are in fact capable of
          -- reperesenting all ASCII characters.
          _ir' <- writeCharBuf iraw ir '?'
          return (input, output)

        -- This implementation does not work because e.g. UTF-16
        -- requires 2 bytes to encode a simple ASCII value
        --writeWord8Buf oraw ow unrepresentableByte
        --return (input { bufL=ir' }, output { bufR=ow+1 })
    RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
        writeWord8Buf oraw ow x
        return (input { bufL=ir' }, output { bufR=ow+1 })
    _                          -> ioe_encodingError c

ioe_decodingError :: Word8 -> IO a
ioe_decodingError b = ioException
    (IOError Nothing InvalidArgument "recoverDecode"
        ("cannot decode byte sequence starting from " ++ show b) Nothing Nothing)

ioe_encodingError :: Char -> IO a
ioe_encodingError ch = ioException
    (IOError Nothing InvalidArgument "recoverEncode"
        -- This assumes that @show ch@ escapes non-ASCII symbols
        -- and thus does not cause recursive encoding failures.
        ("cannot encode character " ++ show ch) Nothing Nothing)