diff options
author | Simon Marlow <marlowsd@gmail.com> | 2011-10-12 10:26:56 +0100 |
---|---|---|
committer | Simon Marlow <marlowsd@gmail.com> | 2011-10-13 11:11:32 +0100 |
commit | 77edace13eab20959cccc64ba3f1480806166458 (patch) | |
tree | ff4ab72cf2111bdedabc7bf5e232a39dba030cad /libraries | |
parent | 4d4740420a9406062f8526b6088f00c7207cd9bd (diff) | |
download | haskell-77edace13eab20959cccc64ba3f1480806166458.tar.gz |
80-columnify
Diffstat (limited to 'libraries')
-rw-r--r-- | libraries/base/GHC/IO/Encoding/Failure.hs | 137 |
1 files changed, 87 insertions, 50 deletions
diff --git a/libraries/base/GHC/IO/Encoding/Failure.hs b/libraries/base/GHC/IO/Encoding/Failure.hs index f1e03dd688..8cee4b3ff7 100644 --- a/libraries/base/GHC/IO/Encoding/Failure.hs +++ b/libraries/base/GHC/IO/Encoding/Failure.hs @@ -35,33 +35,53 @@ import GHC.Real ( fromIntegral ) import Data.Maybe --- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and specifies --- how they handle illegal sequences. -data CodingFailureMode = ErrorOnCodingFailure -- ^ Throw an error when an illegal sequence is encountered - | IgnoreCodingFailure -- ^ Attempt to ignore and recover if an illegal sequence is encountered - | TransliterateCodingFailure -- ^ Replace with the closest visual match upon an illegal sequence - | RoundtripFailure -- ^ Use the private-use escape mechanism to attempt to allow illegal sequences to be roundtripped. - deriving (Show) -- This will only work properly for those encodings which are strict supersets of ASCII in the sense - -- that valid ASCII data is also valid in that encoding. This is not true for e.g. UTF-16, because - -- ASCII characters must be padded to two bytes to retain their meaning. + +-- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and +-- specifies how they handle illegal sequences. +data CodingFailureMode + = ErrorOnCodingFailure + -- ^ Throw an error when an illegal sequence is encountered + | IgnoreCodingFailure + -- ^ Attempt to ignore and recover if an illegal sequence is + -- encountered + | TransliterateCodingFailure + -- ^ Replace with the closest visual match upon an illegal + -- sequence + | RoundtripFailure + -- ^ Use the private-use escape mechanism to attempt to allow + -- illegal sequences to be roundtripped. + deriving (Show) + -- This will only work properly for those encodings which are + -- strict supersets of ASCII in the sense that valid ASCII data + -- is also valid in that encoding. This is not true for + -- e.g. UTF-16, because ASCII characters must be padded to two + -- bytes to retain their meaning. -- Note [Roundtripping] -- ~~~~~~~~~~~~~~~~~~~~ -- --- Roundtripping is based on the ideas of PEP383. However, unlike PEP383 we do not wish to use lone surrogate codepoints --- to escape undecodable bytes, because that may confuse Unicode processing software written in Haskell. Instead, we use --- the range of private-use characters from 0xEF80 to 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery. +-- Roundtripping is based on the ideas of PEP383. However, unlike +-- PEP383 we do not wish to use lone surrogate codepoints to escape +-- undecodable bytes, because that may confuse Unicode processing +-- software written in Haskell. Instead, we use the range of +-- private-use characters from 0xEF80 to 0xEFFF designated for +-- "encoding hacks" by the ConScript Unicode Registery. -- --- This introduces a technical problem when it comes to encoding back to bytes using iconv. The iconv code will not fail when --- it tries to encode a private-use character (as it would if trying to encode a surrogate), which means that we won't get a --- chance to replace it with the byte we originally escaped. +-- This introduces a technical problem when it comes to encoding back +-- to bytes using iconv. The iconv code will not fail when it tries to +-- encode a private-use character (as it would if trying to encode a +-- surrogate), which means that we won't get a chance to replace it +-- with the byte we originally escaped. -- --- To work around this, when filling the buffer to be encoded (in writeBlocks/withEncodedCString/newEncodedCString), we replace --- the private-use characters with lone surrogates again! Likewise, when reading from a buffer (unpack/unpack_nl/peekEncodedCString) --- we have to do the inverse process. +-- To work around this, when filling the buffer to be encoded (in +-- writeBlocks/withEncodedCString/newEncodedCString), we replace the +-- private-use characters with lone surrogates again! Likewise, when +-- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we have +-- to do the inverse process. -- --- The user of String should never see these lone surrogates, but it ensures that iconv will throw an error when encountering them. --- We use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose. +-- The user of String should never see these lone surrogates, but it +-- ensures that iconv will throw an error when encountering them. We +-- use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose. codingFailureModeSuffix :: CodingFailureMode -> String codingFailureModeSuffix ErrorOnCodingFailure = "" @@ -69,48 +89,61 @@ codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE" codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT" codingFailureModeSuffix RoundtripFailure = "//ROUNDTRIP" --- | In transliterate mode, we use this character when decoding unknown bytes. +-- | In transliterate mode, we use this character when decoding +-- unknown bytes. -- --- This is the defined Unicode replacement character: <http://www.fileformat.info/info/unicode/char/0fffd/index.htm> +-- This is the defined Unicode replacement character: +-- <http://www.fileformat.info/info/unicode/char/0fffd/index.htm> unrepresentableChar :: Char unrepresentableChar = '\xFFFD' --- It is extraordinarily important that this series of predicates/transformers gets inlined, because --- they tend to be used in inner loops related to text encoding. In particular, surrogatifyRoundtripCharacter --- must be inlined (see #5536) +-- It is extraordinarily important that this series of +-- predicates/transformers gets inlined, because they tend to be used +-- in inner loops related to text encoding. In particular, +-- surrogatifyRoundtripCharacter must be inlined (see #5536) --- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an --- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because they won't --- give valid Unicode. +-- | Some characters are actually "surrogate" codepoints defined for +-- use in UTF-16. We need to signal an invalid character if we detect +-- them when encoding a sequence of 'Char's into 'Word8's because they +-- won't give valid Unicode. -- --- We may also need to signal an invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's --- because the 'RoundtripFailure' mode creates these to round-trip bytes through our internal UTF-16 encoding. +-- We may also need to signal an invalid character if we detect them +-- when encoding a sequence of 'Char's into 'Word8's because the +-- 'RoundtripFailure' mode creates these to round-trip bytes through +-- our internal UTF-16 encoding. {-# INLINE isSurrogate #-} isSurrogate :: Char -> Bool -isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF) +isSurrogate c = (0xD800 <= x && x <= 0xDBFF) + || (0xDC00 <= x && x <= 0xDFFF) where x = ord c --- | Private use characters (in Strings) --> lone surrogates (in Buffer CharBufElem) --- (We use some private-use characters for roundtripping unknown bytes through a String) +-- | Private use characters (in Strings) --> lone surrogates (in +-- Buffer CharBufElem) (We use some private-use characters for +-- roundtripping unknown bytes through a String) {-# INLINE surrogatifyRoundtripCharacter #-} surrogatifyRoundtripCharacter :: Char -> Char -surrogatifyRoundtripCharacter c | 0xEF00 <= x && x < 0xF000 = chr (x - (0xEF00 - 0xDC00)) - | otherwise = c +surrogatifyRoundtripCharacter c + | 0xEF00 <= x && x < 0xF000 = chr (x - (0xEF00 - 0xDC00)) + | otherwise = c where x = ord c --- | Lone surrogates (in Buffer CharBufElem) --> private use characters (in Strings) --- (We use some surrogate characters for roundtripping unknown bytes through a String) +-- | Lone surrogates (in Buffer CharBufElem) --> private use +-- characters (in Strings) (We use some surrogate characters for +-- roundtripping unknown bytes through a String) {-# INLINE desurrogatifyRoundtripCharacter #-} desurrogatifyRoundtripCharacter :: Char -> Char -desurrogatifyRoundtripCharacter c | 0xDC00 <= x && x < 0xDD00 = chr (x - (0xDC00 - 0xEF00)) - | otherwise = c +desurrogatifyRoundtripCharacter c + | 0xDC00 <= x && x < 0xDD00 = chr (x - (0xDC00 - 0xEF00)) + | otherwise = c where x = ord c -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem) {-# INLINE escapeToRoundtripCharacterSurrogate #-} escapeToRoundtripCharacterSurrogate :: Word8 -> Char escapeToRoundtripCharacterSurrogate b - | b < 128 = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset. + | b < 128 = chr (fromIntegral b) + -- Disallow 'smuggling' of ASCII bytes. For roundtripping to + -- work, this assumes encoding is ASCII-superset. | otherwise = chr (0xDC00 + fromIntegral b) -- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8) @@ -121,7 +154,8 @@ unescapeRoundtripCharacterSurrogate c | otherwise = Nothing where x = ord c -recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char) +recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char + -> IO (Buffer Word8, Buffer Char) recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do --puts $ "recoverDecode " ++ show ir @@ -136,7 +170,8 @@ recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b) return (input { bufL=ir+1 }, output { bufR=ow' }) -recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8) +recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 + -> IO (Buffer Char, Buffer Word8) recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do (c,ir') <- readCharBuf iraw ir @@ -147,18 +182,20 @@ recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } if c == '?' then return (input { bufL=ir' }, output) else do - -- XXX: evil hack! To implement transliteration, we just poke an - -- ASCII ? into the input buffer and tell the caller to try and decode - -- again. This is *probably* safe given current uses of TextEncoding. + -- XXX: evil hack! To implement transliteration, we just + -- poke an ASCII ? into the input buffer and tell the caller + -- to try and decode again. This is *probably* safe given + -- current uses of TextEncoding. -- - -- The "if" test above ensures we skip if the encoding fails to deal with - -- the ?, though this should never happen in practice as all encodings are - -- in fact capable of reperesenting all ASCII characters. + -- The "if" test above ensures we skip if the encoding fails + -- to deal with the ?, though this should never happen in + -- practice as all encodings are in fact capable of + -- reperesenting all ASCII characters. _ir' <- writeCharBuf iraw ir '?' return (input, output) - -- This implementation does not work because e.g. UTF-16 requires 2 bytes to - -- encode a simple ASCII value + -- This implementation does not work because e.g. UTF-16 + -- requires 2 bytes to encode a simple ASCII value --writeWord8Buf oraw ow unrepresentableByte --return (input { bufL=ir' }, output { bufR=ow+1 }) RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do |