Big patch to improve Unicode support in GHC. Validated on OS X and Windows, this

patch series fixes #5061, #1414, #3309, #3308, #3307, #4006 and #4855. The major changes are: 1) Make Foreign.C.String.*CString use the locale encoding This change follows the FFI specification in Haskell 98, which has never actually been implemented before. The functions exported from Foreign.C.String are partially-applied versions of those from GHC.Foreign, which allows the user to supply their own TextEncoding. We also introduce foreignEncoding as the name of the text encoding that follows the FFI appendix in that it transliterates encoding errors. 2) I also changed the code so that mkTextEncoding always tries the native-Haskell decoders in preference to those from iconv, even on non-Windows. The motivation here is simply that it is better for compatibility if we do this, and those are the ones you get for the utf* and latin1* predefined TextEncodings anyway. 3) Implement surrogate-byte error handling mode for TextEncoding This implements PEP383-like behaviour so that we are able to roundtrip byte strings through Strings without loss of information. The withFilePath function now uses this encoding to get to/from CStrings, so any code that uses that will get the right PEP383 behaviour automatically. 4) Implement three other coding failure modes: ignore, throw error, transliterate These mimic the behaviour of the GNU Iconv extensions.
author: Max Bolingbroke <batterseapower@hotmail.com> 2011-05-14 22:50:46 +0100
committer: Max Bolingbroke <batterseapower@hotmail.com> 2011-05-14 22:50:46 +0100
commit: dc58b7398910a433259a6c0f58a0d05a48555191 (patch)
tree: a01062281a0cf1dd42329110ff0d0326be407f2b /libraries/base/GHC/IO/Encoding.hs
parent: cdbce1218d9f9fb4152bdabffe8bbdee09f5ce60 (diff)
download: haskell-dc58b7398910a433259a6c0f58a0d05a48555191.tar.gz
1 files changed, 56 insertions, 22 deletions
diff --git a/libraries/base/GHC/IO/Encoding.hs b/libraries/base/GHC/IO/Encoding.hs
index 5d8ecb4c70..953fc2e259 100644
--- a/libraries/base/GHC/IO/Encoding.hs
+++ b/libraries/base/GHC/IO/Encoding.hs
@@ -16,18 +16,20 @@
 -----------------------------------------------------------------------------
 
 module GHC.IO.Encoding (
-  BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
+  BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
   latin1, latin1_encode, latin1_decode,
   utf8, utf8_bom,
   utf16, utf16le, utf16be,
   utf32, utf32le, utf32be, 
-  localeEncoding,
+  localeEncoding, fileSystemEncoding, foreignEncoding,
   mkTextEncoding,
   ) where
 
 import GHC.Base
 --import GHC.IO
+import GHC.IO.Exception
 import GHC.IO.Buffer
+import GHC.IO.Encoding.Failure
 import GHC.IO.Encoding.Types
 import GHC.Word
 #if !defined(mingw32_HOST_OS)
@@ -41,10 +43,8 @@ import qualified GHC.IO.Encoding.UTF8   as UTF8
 import qualified GHC.IO.Encoding.UTF16  as UTF16
 import qualified GHC.IO.Encoding.UTF32  as UTF32
 
-#if defined(mingw32_HOST_OS)
+import Data.List
 import Data.Maybe
-import GHC.IO.Exception
-#endif
 
 -- -----------------------------------------------------------------------------
 
@@ -97,11 +97,32 @@ utf32be  :: TextEncoding
 utf32be = UTF32.utf32be
 
 -- | The Unicode encoding of the current locale
-localeEncoding  :: TextEncoding
+localeEncoding :: TextEncoding
+
+-- | The Unicode encoding of the current locale, but allowing arbitrary
+-- undecodable bytes to be round-tripped through it.
+--
+-- This 'TextEncoding' is used to decode and encode command line arguments
+-- and environment variables on non-Windows platforms.
+--
+-- On Windows, this encoding *should not* be used if possible because
+-- the use of code pages is deprecated: Strings should be retrieved
+-- via the "wide" W-family of UTF-16 APIs instead
+fileSystemEncoding :: TextEncoding
+
+-- | The Unicode encoding of the current locale, but where undecodable
+-- bytes are replaced with their closest visual match. Used for
+-- the 'CString' marshalling functions in "Foreign.C.String"
+foreignEncoding :: TextEncoding
+
 #if !defined(mingw32_HOST_OS)
 localeEncoding = Iconv.localeEncoding
+fileSystemEncoding = Iconv.mkLocaleEncoding SurrogateEscapeFailure
+foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure
 #else
 localeEncoding = CodePage.localeEncoding
+fileSystemEncoding = CodePage.mkLocaleEncoding SurrogateEscapeFailure
+foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
 #endif
 
 -- | Look up the named Unicode encoding.  May fail with 
@@ -131,27 +152,40 @@ localeEncoding = CodePage.localeEncoding
 -- @CP@; for example, @\"CP1250\"@.
 --
 mkTextEncoding :: String -> IO TextEncoding
-#if !defined(mingw32_HOST_OS)
-mkTextEncoding = Iconv.mkTextEncoding
+mkTextEncoding e = case mb_coding_failure_mode of
+  Nothing -> unknown_encoding
+  Just cfm -> case enc of
+    "UTF-8"    -> return $ UTF8.mkUTF8 cfm
+    "UTF-16"   -> return $ UTF16.mkUTF16 cfm
+    "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
+    "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
+    "UTF-32"   -> return $ UTF32.mkUTF32 cfm
+    "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
+    "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
+#if defined(mingw32_HOST_OS)
+    'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
+    _ -> unknown_encoding
 #else
-mkTextEncoding "UTF-8"    = return utf8
-mkTextEncoding "UTF-16"   = return utf16
-mkTextEncoding "UTF-16LE" = return utf16le
-mkTextEncoding "UTF-16BE" = return utf16be
-mkTextEncoding "UTF-32"   = return utf32
-mkTextEncoding "UTF-32LE" = return utf32le
-mkTextEncoding "UTF-32BE" = return utf32be
-mkTextEncoding ('C':'P':n)
-    | [(cp,"")] <- reads n = return $ CodePage.codePageEncoding cp
-mkTextEncoding e = ioException
-     (IOError Nothing NoSuchThing "mkTextEncoding"
-          ("unknown encoding:" ++ e)  Nothing Nothing)
+    _ -> Iconv.mkIconvEncoding cfm enc
 #endif
+  where
+    -- The only problem with actually documenting //IGNORE and //TRANSLIT as
+    -- supported suffixes is that they are not necessarily supported with non-GNU iconv
+    (enc, suffix) = span (/= '/') e
+    mb_coding_failure_mode = case suffix of
+        ""            -> Just ErrorOnCodingFailure
+        "//IGNORE"    -> Just IgnoreCodingFailure
+        "//TRANSLIT"  -> Just TransliterateCodingFailure
+        "//SURROGATE" -> Just SurrogateEscapeFailure
+        _             -> Nothing
+    
+    unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding"
+                                            ("unknown encoding:" ++ e)  Nothing Nothing)
 
 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
-latin1_encode = Latin1.latin1_encode -- unchecked, used for binary
+latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for binary
 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode
 
 latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
-latin1_decode = Latin1.latin1_decode
+latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output
 --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode
author	Max Bolingbroke <batterseapower@hotmail.com>	2011-05-14 22:50:46 +0100
committer	Max Bolingbroke <batterseapower@hotmail.com>	2011-05-14 22:50:46 +0100
commit	dc58b7398910a433259a6c0f58a0d05a48555191 (patch)
tree	a01062281a0cf1dd42329110ff0d0326be407f2b /libraries/base/GHC/IO/Encoding.hs
parent	cdbce1218d9f9fb4152bdabffe8bbdee09f5ce60 (diff)
download	haskell-dc58b7398910a433259a6c0f58a0d05a48555191.tar.gz