summaryrefslogtreecommitdiff
path: root/libraries/base/GHC/IO/Encoding/Failure.hs
blob: 8cee4b3ff79fb7374e0828acd71e11141145ac6c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE NoImplicitPrelude, PatternGuards #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  GHC.IO.Encoding.Failure
-- Copyright   :  (c) The University of Glasgow, 2008-2011
-- License     :  see libraries/base/LICENSE
-- 
-- Maintainer  :  libraries@haskell.org
-- Stability   :  internal
-- Portability :  non-portable
--
-- Types for specifying how text encoding/decoding fails
--
-----------------------------------------------------------------------------

module GHC.IO.Encoding.Failure (
    CodingFailureMode(..), codingFailureModeSuffix,
    isSurrogate,
    surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
    recoverDecode, recoverEncode
  ) where

import GHC.IO
import GHC.IO.Buffer
import GHC.IO.Exception

import GHC.Base
import GHC.Word
import GHC.Show
import GHC.Num
import GHC.Real ( fromIntegral )

--import System.Posix.Internals

import Data.Maybe


-- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and
-- specifies how they handle illegal sequences.
data CodingFailureMode
  = ErrorOnCodingFailure
       -- ^ Throw an error when an illegal sequence is encountered
  | IgnoreCodingFailure
       -- ^ Attempt to ignore and recover if an illegal sequence is
       -- encountered
  | TransliterateCodingFailure
       -- ^ Replace with the closest visual match upon an illegal
       -- sequence
  | RoundtripFailure
       -- ^ Use the private-use escape mechanism to attempt to allow
       -- illegal sequences to be roundtripped.
  deriving (Show)
       -- This will only work properly for those encodings which are
       -- strict supersets of ASCII in the sense that valid ASCII data
       -- is also valid in that encoding. This is not true for
       -- e.g. UTF-16, because ASCII characters must be padded to two
       -- bytes to retain their meaning.

-- Note [Roundtripping]
-- ~~~~~~~~~~~~~~~~~~~~
--
-- Roundtripping is based on the ideas of PEP383. However, unlike
-- PEP383 we do not wish to use lone surrogate codepoints to escape
-- undecodable bytes, because that may confuse Unicode processing
-- software written in Haskell. Instead, we use the range of
-- private-use characters from 0xEF80 to 0xEFFF designated for
-- "encoding hacks" by the ConScript Unicode Registery.
--
-- This introduces a technical problem when it comes to encoding back
-- to bytes using iconv. The iconv code will not fail when it tries to
-- encode a private-use character (as it would if trying to encode a
-- surrogate), which means that we won't get a chance to replace it
-- with the byte we originally escaped.
--
-- To work around this, when filling the buffer to be encoded (in
-- writeBlocks/withEncodedCString/newEncodedCString), we replace the
-- private-use characters with lone surrogates again! Likewise, when
-- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we have
-- to do the inverse process.
--
-- The user of String should never see these lone surrogates, but it
-- ensures that iconv will throw an error when encountering them.  We
-- use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.

codingFailureModeSuffix :: CodingFailureMode -> String
codingFailureModeSuffix ErrorOnCodingFailure       = ""
codingFailureModeSuffix IgnoreCodingFailure        = "//IGNORE"
codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
codingFailureModeSuffix RoundtripFailure           = "//ROUNDTRIP"

-- | In transliterate mode, we use this character when decoding
-- unknown bytes.
--
-- This is the defined Unicode replacement character:
-- <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
unrepresentableChar :: Char
unrepresentableChar = '\xFFFD'

-- It is extraordinarily important that this series of
-- predicates/transformers gets inlined, because they tend to be used
-- in inner loops related to text encoding. In particular,
-- surrogatifyRoundtripCharacter must be inlined (see #5536)

-- | Some characters are actually "surrogate" codepoints defined for
-- use in UTF-16. We need to signal an invalid character if we detect
-- them when encoding a sequence of 'Char's into 'Word8's because they
-- won't give valid Unicode.
--
-- We may also need to signal an invalid character if we detect them
-- when encoding a sequence of 'Char's into 'Word8's because the
-- 'RoundtripFailure' mode creates these to round-trip bytes through
-- our internal UTF-16 encoding.
{-# INLINE isSurrogate #-}
isSurrogate :: Char -> Bool
isSurrogate c = (0xD800 <= x && x <= 0xDBFF)
             || (0xDC00 <= x && x <= 0xDFFF)
  where x = ord c

-- | Private use characters (in Strings) --> lone surrogates (in
-- Buffer CharBufElem) (We use some private-use characters for
-- roundtripping unknown bytes through a String)
{-# INLINE surrogatifyRoundtripCharacter #-}
surrogatifyRoundtripCharacter :: Char -> Char
surrogatifyRoundtripCharacter c
  | 0xEF00 <= x && x < 0xF000 = chr (x - (0xEF00 - 0xDC00))
  | otherwise                 = c
  where x = ord c

-- | Lone surrogates (in Buffer CharBufElem) --> private use
-- characters (in Strings) (We use some surrogate characters for
-- roundtripping unknown bytes through a String)
{-# INLINE desurrogatifyRoundtripCharacter #-}
desurrogatifyRoundtripCharacter :: Char -> Char
desurrogatifyRoundtripCharacter c
  | 0xDC00 <= x && x < 0xDD00 = chr (x - (0xDC00 - 0xEF00))
  | otherwise                 = c
  where x = ord c

-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
{-# INLINE escapeToRoundtripCharacterSurrogate #-}
escapeToRoundtripCharacterSurrogate :: Word8 -> Char
escapeToRoundtripCharacterSurrogate b
  | b < 128   = chr (fromIntegral b)
      -- Disallow 'smuggling' of ASCII bytes. For roundtripping to
      -- work, this assumes encoding is ASCII-superset.
  | otherwise = chr (0xDC00 + fromIntegral b)

-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
{-# INLINE unescapeRoundtripCharacterSurrogate #-}
unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
unescapeRoundtripCharacterSurrogate c
    | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
    | otherwise                 = Nothing
  where x = ord c

recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char
              -> IO (Buffer Word8, Buffer Char)
recoverDecode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
                  output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
 --puts $ "recoverDecode " ++ show ir
 case cfm of
  ErrorOnCodingFailure       -> ioe_decodingError
  IgnoreCodingFailure        -> return (input { bufL=ir+1 }, output)
  TransliterateCodingFailure -> do
      ow' <- writeCharBuf oraw ow unrepresentableChar
      return (input { bufL=ir+1 }, output { bufR=ow' })
  RoundtripFailure           -> do
      b <- readWord8Buf iraw ir
      ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
      return (input { bufL=ir+1 }, output { bufR=ow' })

recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8
              -> IO (Buffer Char, Buffer Word8)
recoverEncode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
                  output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
  (c,ir') <- readCharBuf iraw ir
  --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
  case cfm of
    IgnoreCodingFailure        -> return (input { bufL=ir' }, output)
    TransliterateCodingFailure -> do
        if c == '?'
         then return (input { bufL=ir' }, output)
         else do
          -- XXX: evil hack! To implement transliteration, we just
          -- poke an ASCII ? into the input buffer and tell the caller
          -- to try and decode again. This is *probably* safe given
          -- current uses of TextEncoding.
          --
          -- The "if" test above ensures we skip if the encoding fails
          -- to deal with the ?, though this should never happen in
          -- practice as all encodings are in fact capable of
          -- reperesenting all ASCII characters.
          _ir' <- writeCharBuf iraw ir '?'
          return (input, output)
        
        -- This implementation does not work because e.g. UTF-16
        -- requires 2 bytes to encode a simple ASCII value
        --writeWord8Buf oraw ow unrepresentableByte
        --return (input { bufL=ir' }, output { bufR=ow+1 })
    RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
        writeWord8Buf oraw ow x
        return (input { bufL=ir' }, output { bufR=ow+1 })
    _                          -> ioe_encodingError

ioe_decodingError :: IO a
ioe_decodingError = ioException
    (IOError Nothing InvalidArgument "recoverDecode"
        "invalid byte sequence" Nothing Nothing)

ioe_encodingError :: IO a
ioe_encodingError = ioException
    (IOError Nothing InvalidArgument "recoverEncode"
        "invalid character" Nothing Nothing)