diff options
author | Max Bolingbroke <batterseapower@hotmail.com> | 2011-05-14 22:50:46 +0100 |
---|---|---|
committer | Max Bolingbroke <batterseapower@hotmail.com> | 2011-05-14 22:50:46 +0100 |
commit | dc58b7398910a433259a6c0f58a0d05a48555191 (patch) | |
tree | a01062281a0cf1dd42329110ff0d0326be407f2b /libraries/base/GHC/IO/Encoding/Iconv.hs | |
parent | cdbce1218d9f9fb4152bdabffe8bbdee09f5ce60 (diff) | |
download | haskell-dc58b7398910a433259a6c0f58a0d05a48555191.tar.gz |
Big patch to improve Unicode support in GHC. Validated on OS X and Windows, this
patch series fixes #5061, #1414, #3309, #3308, #3307, #4006 and #4855.
The major changes are:
1) Make Foreign.C.String.*CString use the locale encoding
This change follows the FFI specification in Haskell 98, which
has never actually been implemented before.
The functions exported from Foreign.C.String are partially-applied
versions of those from GHC.Foreign, which allows the user to supply
their own TextEncoding.
We also introduce foreignEncoding as the name of the text encoding
that follows the FFI appendix in that it transliterates encoding
errors.
2) I also changed the code so that mkTextEncoding always tries the
native-Haskell decoders in preference to those from iconv, even on
non-Windows. The motivation here is simply that it is better for
compatibility if we do this, and those are the ones you get for
the utf* and latin1* predefined TextEncodings anyway.
3) Implement surrogate-byte error handling mode for TextEncoding
This implements PEP383-like behaviour so that we are able to
roundtrip byte strings through Strings without loss of information.
The withFilePath function now uses this encoding to get to/from CStrings,
so any code that uses that will get the right PEP383 behaviour automatically.
4) Implement three other coding failure modes: ignore, throw error, transliterate
These mimic the behaviour of the GNU Iconv extensions.
Diffstat (limited to 'libraries/base/GHC/IO/Encoding/Iconv.hs')
-rw-r--r-- | libraries/base/GHC/IO/Encoding/Iconv.hs | 114 |
1 files changed, 36 insertions, 78 deletions
diff --git a/libraries/base/GHC/IO/Encoding/Iconv.hs b/libraries/base/GHC/IO/Encoding/Iconv.hs index 6d87595444..d91907132c 100644 --- a/libraries/base/GHC/IO/Encoding/Iconv.hs +++ b/libraries/base/GHC/IO/Encoding/Iconv.hs @@ -21,12 +21,8 @@ -- #hide module GHC.IO.Encoding.Iconv ( #if !defined(mingw32_HOST_OS) - mkTextEncoding, - latin1, - utf8, - utf16, utf16le, utf16be, - utf32, utf32le, utf32be, - localeEncoding + iconvEncoding, mkIconvEncoding, + localeEncoding, mkLocaleEncoding #endif ) where @@ -40,6 +36,7 @@ import Foreign.C import Data.Maybe import GHC.Base import GHC.IO.Buffer +import GHC.IO.Encoding.Failure import GHC.IO.Encoding.Types import GHC.List (span) import GHC.Num @@ -56,47 +53,9 @@ iconv_trace s | c_DEBUG_DUMP = puts s | otherwise = return () -puts :: String -> IO () -puts s = do _ <- withCAStringLen (s ++ "\n") $ \(p, len) -> - -- In reality should be withCString, but assume ASCII to avoid loop - c_write 1 (castPtr p) (fromIntegral len) - return () - -- ----------------------------------------------------------------------------- -- iconv encoders/decoders -{-# NOINLINE latin1 #-} -latin1 :: TextEncoding -latin1 = unsafePerformIO (mkTextEncoding "Latin1") - -{-# NOINLINE utf8 #-} -utf8 :: TextEncoding -utf8 = unsafePerformIO (mkTextEncoding "UTF8") - -{-# NOINLINE utf16 #-} -utf16 :: TextEncoding -utf16 = unsafePerformIO (mkTextEncoding "UTF16") - -{-# NOINLINE utf16le #-} -utf16le :: TextEncoding -utf16le = unsafePerformIO (mkTextEncoding "UTF16LE") - -{-# NOINLINE utf16be #-} -utf16be :: TextEncoding -utf16be = unsafePerformIO (mkTextEncoding "UTF16BE") - -{-# NOINLINE utf32 #-} -utf32 :: TextEncoding -utf32 = unsafePerformIO (mkTextEncoding "UTF32") - -{-# NOINLINE utf32le #-} -utf32le :: TextEncoding -utf32le = unsafePerformIO (mkTextEncoding "UTF32LE") - -{-# NOINLINE utf32be #-} -utf32be :: TextEncoding -utf32be = unsafePerformIO (mkTextEncoding "UTF32BE") - {-# NOINLINE localeEncodingName #-} localeEncodingName :: String localeEncodingName = unsafePerformIO $ do @@ -105,9 +64,11 @@ localeEncodingName = unsafePerformIO $ do cstr <- c_localeEncoding peekCAString cstr -- Assume charset names are ASCII -{-# NOINLINE localeEncoding #-} localeEncoding :: TextEncoding -localeEncoding = unsafePerformIO $ mkTextEncoding localeEncodingName +localeEncoding = mkLocaleEncoding ErrorOnCodingFailure + +mkLocaleEncoding :: CodingFailureMode -> TextEncoding +mkLocaleEncoding cfm = unsafePerformIO $ mkIconvEncoding cfm localeEncodingName -- We hope iconv_t is a storable type. It should be, since it has at least the -- value -1, which is a possible return value from iconv_open. @@ -139,21 +100,25 @@ char_shift :: Int char_shift | charSize == 2 = 1 | otherwise = 2 -mkTextEncoding :: String -> IO TextEncoding -mkTextEncoding charset = do +iconvEncoding :: String -> IO TextEncoding +iconvEncoding = mkIconvEncoding ErrorOnCodingFailure + +mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding +mkIconvEncoding cfm charset = do return (TextEncoding { textEncodingName = charset, - mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) iconvDecode, - mkTextEncoder = newIConv haskellChar charset iconvEncode}) + mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) (recoverDecode cfm) iconvDecode, + mkTextEncoder = newIConv haskellChar charset (recoverEncode cfm) iconvEncode}) where -- An annoying feature of GNU iconv is that the //PREFIXES only take -- effect when they appear on the tocode parameter to iconv_open: (raw_charset, suffix) = span (/= '/') charset newIConv :: String -> String - -> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b)) + -> (Buffer a -> Buffer b -> IO (Buffer a, Buffer b)) + -> (IConv -> Buffer a -> Buffer b -> IO (CodingProgress, Buffer a, Buffer b)) -> IO (BufferCodec a b ()) -newIConv from to fn = +newIConv from to rec fn = -- Assume charset names are ASCII withCAString from $ \ from_str -> withCAString to $ \ to_str -> do @@ -161,22 +126,21 @@ newIConv from to fn = let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt return BufferCodec{ encode = fn iconvt, + recover = rec, close = iclose, -- iconv doesn't supply a way to save/restore the state getState = return (), setState = const $ return () } -iconvDecode :: IConv -> Buffer Word8 -> Buffer CharBufElem - -> IO (Buffer Word8, Buffer CharBufElem) +iconvDecode :: IConv -> DecodeBuffer iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift -iconvEncode :: IConv -> Buffer CharBufElem -> Buffer Word8 - -> IO (Buffer CharBufElem, Buffer Word8) +iconvEncode :: IConv -> EncodeBuffer iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0 iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int - -> IO (Buffer a, Buffer b) + -> IO (CodingProgress, Buffer a, Buffer b) iconvRecode iconv_t input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } iscale output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } oscale @@ -205,29 +169,23 @@ iconvRecode iconv_t iconv_trace ("iconvRecode after, output=" ++ show (summaryBuffer new_output)) if (res /= -1) then do -- all input translated - return (new_input, new_output) + return (InputUnderflow, new_input, new_output) else do errno <- getErrno case errno of - e | e == eINVAL || e == e2BIG - || e == eILSEQ && new_inleft' /= (iw-ir) -> do - iconv_trace ("iconv ignoring error: " ++ show (errnoToIOError "iconv" e Nothing Nothing)) - -- Output overflow is harmless - -- - -- Similarly, we ignore EILSEQ unless we converted no - -- characters. Sometimes iconv reports EILSEQ for a - -- character in the input even when there is no room - -- in the output; in this case we might be about to - -- change the encoding anyway, so the following bytes - -- could very well be in a different encoding. - -- This also helps with pinpointing EILSEQ errors: we - -- don't report it until the rest of the characters in - -- the buffer have been drained. - return (new_input, new_output) - - e -> do - iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing)) - throwErrno "iconvRecoder" - -- illegal sequence, or some other error + e | e == e2BIG -> return (OutputUnderflow, new_input, new_output) + | e == eINVAL -> return (InputUnderflow, new_input, new_output) + -- Sometimes iconv reports EILSEQ for a + -- character in the input even when there is no room + -- in the output; in this case we might be about to + -- change the encoding anyway, so the following bytes + -- could very well be in a different encoding. + -- + -- Because we can only say InvalidSequence if there is at least + -- one element left in the output, we have to special case this. + | e == eILSEQ -> return (if new_outleft' == 0 then OutputUnderflow else InvalidSequence, new_input, new_output) + | otherwise -> do + iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing)) + throwErrno "iconvRecoder" #endif /* !mingw32_HOST_OS */ |