summaryrefslogtreecommitdiff
path: root/libraries/base/GHC/IO/Encoding/Iconv.hs
diff options
context:
space:
mode:
authorMax Bolingbroke <batterseapower@hotmail.com>2011-05-14 22:50:46 +0100
committerMax Bolingbroke <batterseapower@hotmail.com>2011-05-14 22:50:46 +0100
commitdc58b7398910a433259a6c0f58a0d05a48555191 (patch)
treea01062281a0cf1dd42329110ff0d0326be407f2b /libraries/base/GHC/IO/Encoding/Iconv.hs
parentcdbce1218d9f9fb4152bdabffe8bbdee09f5ce60 (diff)
downloadhaskell-dc58b7398910a433259a6c0f58a0d05a48555191.tar.gz
Big patch to improve Unicode support in GHC. Validated on OS X and Windows, this
patch series fixes #5061, #1414, #3309, #3308, #3307, #4006 and #4855. The major changes are: 1) Make Foreign.C.String.*CString use the locale encoding This change follows the FFI specification in Haskell 98, which has never actually been implemented before. The functions exported from Foreign.C.String are partially-applied versions of those from GHC.Foreign, which allows the user to supply their own TextEncoding. We also introduce foreignEncoding as the name of the text encoding that follows the FFI appendix in that it transliterates encoding errors. 2) I also changed the code so that mkTextEncoding always tries the native-Haskell decoders in preference to those from iconv, even on non-Windows. The motivation here is simply that it is better for compatibility if we do this, and those are the ones you get for the utf* and latin1* predefined TextEncodings anyway. 3) Implement surrogate-byte error handling mode for TextEncoding This implements PEP383-like behaviour so that we are able to roundtrip byte strings through Strings without loss of information. The withFilePath function now uses this encoding to get to/from CStrings, so any code that uses that will get the right PEP383 behaviour automatically. 4) Implement three other coding failure modes: ignore, throw error, transliterate These mimic the behaviour of the GNU Iconv extensions.
Diffstat (limited to 'libraries/base/GHC/IO/Encoding/Iconv.hs')
-rw-r--r--libraries/base/GHC/IO/Encoding/Iconv.hs114
1 files changed, 36 insertions, 78 deletions
diff --git a/libraries/base/GHC/IO/Encoding/Iconv.hs b/libraries/base/GHC/IO/Encoding/Iconv.hs
index 6d87595444..d91907132c 100644
--- a/libraries/base/GHC/IO/Encoding/Iconv.hs
+++ b/libraries/base/GHC/IO/Encoding/Iconv.hs
@@ -21,12 +21,8 @@
-- #hide
module GHC.IO.Encoding.Iconv (
#if !defined(mingw32_HOST_OS)
- mkTextEncoding,
- latin1,
- utf8,
- utf16, utf16le, utf16be,
- utf32, utf32le, utf32be,
- localeEncoding
+ iconvEncoding, mkIconvEncoding,
+ localeEncoding, mkLocaleEncoding
#endif
) where
@@ -40,6 +36,7 @@ import Foreign.C
import Data.Maybe
import GHC.Base
import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.List (span)
import GHC.Num
@@ -56,47 +53,9 @@ iconv_trace s
| c_DEBUG_DUMP = puts s
| otherwise = return ()
-puts :: String -> IO ()
-puts s = do _ <- withCAStringLen (s ++ "\n") $ \(p, len) ->
- -- In reality should be withCString, but assume ASCII to avoid loop
- c_write 1 (castPtr p) (fromIntegral len)
- return ()
-
-- -----------------------------------------------------------------------------
-- iconv encoders/decoders
-{-# NOINLINE latin1 #-}
-latin1 :: TextEncoding
-latin1 = unsafePerformIO (mkTextEncoding "Latin1")
-
-{-# NOINLINE utf8 #-}
-utf8 :: TextEncoding
-utf8 = unsafePerformIO (mkTextEncoding "UTF8")
-
-{-# NOINLINE utf16 #-}
-utf16 :: TextEncoding
-utf16 = unsafePerformIO (mkTextEncoding "UTF16")
-
-{-# NOINLINE utf16le #-}
-utf16le :: TextEncoding
-utf16le = unsafePerformIO (mkTextEncoding "UTF16LE")
-
-{-# NOINLINE utf16be #-}
-utf16be :: TextEncoding
-utf16be = unsafePerformIO (mkTextEncoding "UTF16BE")
-
-{-# NOINLINE utf32 #-}
-utf32 :: TextEncoding
-utf32 = unsafePerformIO (mkTextEncoding "UTF32")
-
-{-# NOINLINE utf32le #-}
-utf32le :: TextEncoding
-utf32le = unsafePerformIO (mkTextEncoding "UTF32LE")
-
-{-# NOINLINE utf32be #-}
-utf32be :: TextEncoding
-utf32be = unsafePerformIO (mkTextEncoding "UTF32BE")
-
{-# NOINLINE localeEncodingName #-}
localeEncodingName :: String
localeEncodingName = unsafePerformIO $ do
@@ -105,9 +64,11 @@ localeEncodingName = unsafePerformIO $ do
cstr <- c_localeEncoding
peekCAString cstr -- Assume charset names are ASCII
-{-# NOINLINE localeEncoding #-}
localeEncoding :: TextEncoding
-localeEncoding = unsafePerformIO $ mkTextEncoding localeEncodingName
+localeEncoding = mkLocaleEncoding ErrorOnCodingFailure
+
+mkLocaleEncoding :: CodingFailureMode -> TextEncoding
+mkLocaleEncoding cfm = unsafePerformIO $ mkIconvEncoding cfm localeEncodingName
-- We hope iconv_t is a storable type. It should be, since it has at least the
-- value -1, which is a possible return value from iconv_open.
@@ -139,21 +100,25 @@ char_shift :: Int
char_shift | charSize == 2 = 1
| otherwise = 2
-mkTextEncoding :: String -> IO TextEncoding
-mkTextEncoding charset = do
+iconvEncoding :: String -> IO TextEncoding
+iconvEncoding = mkIconvEncoding ErrorOnCodingFailure
+
+mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding
+mkIconvEncoding cfm charset = do
return (TextEncoding {
textEncodingName = charset,
- mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) iconvDecode,
- mkTextEncoder = newIConv haskellChar charset iconvEncode})
+ mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) (recoverDecode cfm) iconvDecode,
+ mkTextEncoder = newIConv haskellChar charset (recoverEncode cfm) iconvEncode})
where
-- An annoying feature of GNU iconv is that the //PREFIXES only take
-- effect when they appear on the tocode parameter to iconv_open:
(raw_charset, suffix) = span (/= '/') charset
newIConv :: String -> String
- -> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
+ -> (Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
+ -> (IConv -> Buffer a -> Buffer b -> IO (CodingProgress, Buffer a, Buffer b))
-> IO (BufferCodec a b ())
-newIConv from to fn =
+newIConv from to rec fn =
-- Assume charset names are ASCII
withCAString from $ \ from_str ->
withCAString to $ \ to_str -> do
@@ -161,22 +126,21 @@ newIConv from to fn =
let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt
return BufferCodec{
encode = fn iconvt,
+ recover = rec,
close = iclose,
-- iconv doesn't supply a way to save/restore the state
getState = return (),
setState = const $ return ()
}
-iconvDecode :: IConv -> Buffer Word8 -> Buffer CharBufElem
- -> IO (Buffer Word8, Buffer CharBufElem)
+iconvDecode :: IConv -> DecodeBuffer
iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift
-iconvEncode :: IConv -> Buffer CharBufElem -> Buffer Word8
- -> IO (Buffer CharBufElem, Buffer Word8)
+iconvEncode :: IConv -> EncodeBuffer
iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0
iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int
- -> IO (Buffer a, Buffer b)
+ -> IO (CodingProgress, Buffer a, Buffer b)
iconvRecode iconv_t
input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } iscale
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } oscale
@@ -205,29 +169,23 @@ iconvRecode iconv_t
iconv_trace ("iconvRecode after, output=" ++ show (summaryBuffer new_output))
if (res /= -1)
then do -- all input translated
- return (new_input, new_output)
+ return (InputUnderflow, new_input, new_output)
else do
errno <- getErrno
case errno of
- e | e == eINVAL || e == e2BIG
- || e == eILSEQ && new_inleft' /= (iw-ir) -> do
- iconv_trace ("iconv ignoring error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
- -- Output overflow is harmless
- --
- -- Similarly, we ignore EILSEQ unless we converted no
- -- characters. Sometimes iconv reports EILSEQ for a
- -- character in the input even when there is no room
- -- in the output; in this case we might be about to
- -- change the encoding anyway, so the following bytes
- -- could very well be in a different encoding.
- -- This also helps with pinpointing EILSEQ errors: we
- -- don't report it until the rest of the characters in
- -- the buffer have been drained.
- return (new_input, new_output)
-
- e -> do
- iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
- throwErrno "iconvRecoder"
- -- illegal sequence, or some other error
+ e | e == e2BIG -> return (OutputUnderflow, new_input, new_output)
+ | e == eINVAL -> return (InputUnderflow, new_input, new_output)
+ -- Sometimes iconv reports EILSEQ for a
+ -- character in the input even when there is no room
+ -- in the output; in this case we might be about to
+ -- change the encoding anyway, so the following bytes
+ -- could very well be in a different encoding.
+ --
+ -- Because we can only say InvalidSequence if there is at least
+ -- one element left in the output, we have to special case this.
+ | e == eILSEQ -> return (if new_outleft' == 0 then OutputUnderflow else InvalidSequence, new_input, new_output)
+ | otherwise -> do
+ iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
+ throwErrno "iconvRecoder"
#endif /* !mingw32_HOST_OS */