diff options
author | Reid Barton <rwbarton@gmail.com> | 2015-07-21 19:13:20 +0200 |
---|---|---|
committer | Ben Gamari <ben@smart-cactus.org> | 2015-07-21 21:42:54 +0200 |
commit | dbe6dac96543f426297a59d8d16c3f5afacf42d4 (patch) | |
tree | ece8ced882e19add696a4dfc99e298a93ad1a247 /libraries/base/tests | |
parent | 97a50d5096e134a1b713a085a7ba859bdd7143d1 (diff) | |
download | haskell-dbe6dac96543f426297a59d8d16c3f5afacf42d4.tar.gz |
When iconv is unavailable, use an ASCII encoding to encode ASCII
D898 and D1059 implemented a fallback behavior to handle the case
that the end user's iconv installation is broken (typically due to
running inside a chroot in which the necessary locale files and/or
gconv modules have not been installed). In this case, if the
program requests an ASCII locale, GHC's char8 encoding is used
rather than the program failing.
However, silently mangling data, as char8 does, when the programmer
did not ask for it is poor behavior, for reasons described in D1059.
This commit implements an ASCII encoding and uses it in the fallback
case when iconv is unavailable and the user has requested ASCII.
Test Plan:
Added tests for the encodings defined in GHC.IO.Encoding.Latin1.
Also, manually ran a statically-linked executable of that test
in a chroot and the tests passed up to, but not including, the ones
that call mkTextEncoding "LATIN1", since there is no fallback from
iconv for that case yet.
Reviewers: austin, hvr, hsyl20, bgamari
Reviewed By: hsyl20, bgamari
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D1085
GHC Trac Issues: #7695, #10623
Diffstat (limited to 'libraries/base/tests')
-rw-r--r-- | libraries/base/tests/.gitignore | 1 | ||||
-rw-r--r-- | libraries/base/tests/IO/all.T | 1 | ||||
-rw-r--r-- | libraries/base/tests/IO/encoding005.hs | 115 | ||||
-rw-r--r-- | libraries/base/tests/IO/encoding005.stdout | 5 |
4 files changed, 122 insertions, 0 deletions
diff --git a/libraries/base/tests/.gitignore b/libraries/base/tests/.gitignore
index af90b5e47c..a430bd700a 100644
--- a/libraries/base/tests/.gitignore
+++ b/libraries/base/tests/.gitignore
@@ -105,6 +105,7 @@
 /IO/encoding002
 /IO/encoding003
 /IO/encoding004
+/IO/encoding005
 /IO/encodingerror001
 /IO/environment001
 /IO/finalization001
diff --git a/libraries/base/tests/IO/all.T b/libraries/base/tests/IO/all.T
index 43d94da452..29779457e4 100644
--- a/libraries/base/tests/IO/all.T
+++ b/libraries/base/tests/IO/all.T
@@ -138,6 +138,7 @@ test('encoding001',
 test('encoding002', normal, compile_and_run, [''])
 test('encoding003', normal, compile_and_run, [''])
 test('encoding004', normal, compile_and_run, [''])
+test('encoding005', normal, compile_and_run, [''])
 
 test('environment001',
      [extra_clean(['environment001'])],
diff --git a/libraries/base/tests/IO/encoding005.hs b/libraries/base/tests/IO/encoding005.hs
new file mode 100644
index 0000000000..99db84af59
--- /dev/null
+++ b/libraries/base/tests/IO/encoding005.hs
@@ -0,0 +1,115 @@
+import Control.Monad
+import Data.Word (Word8)
+import Foreign.Ptr
+import Foreign.Marshal.Array
+import GHC.Foreign (peekCStringLen, withCStringLen)
+import GHC.IO.Encoding.Failure (CodingFailureMode(..))
+import qualified GHC.IO.Encoding.Latin1 as Latin1
+import System.IO
+import System.IO.Error
+
+-- Tests for single-byte encodings that map directly to Unicode
+-- (module GHC.IO.Encoding.Latin1)
+
+eitherToMaybe :: Either a b -> Maybe b
+eitherToMaybe (Left _) = Nothing
+eitherToMaybe (Right b) = Just b
+
+decode :: TextEncoding -> [Word8] -> IO (Maybe String)
+decode enc xs = fmap eitherToMaybe . tryIOError $ withArrayLen xs (\sz p -> peekCStringLen enc (castPtr p, sz))
+
+encode :: TextEncoding -> String -> IO (Maybe [Word8])
+encode enc cs = fmap eitherToMaybe . tryIOError $ withCStringLen enc cs (\(p, sz) -> peekArray sz (castPtr p))
+
+testIO :: (Eq a, Show a) => IO a -> a -> IO ()
+testIO action expected = do
+  result <- action
+  when (result /= expected) $
+    putStrLn $ "Test failed: expected " ++ show expected ++ ", but got " ++ show result
+
+-- Test char8-like encodings
+test_char8 :: TextEncoding -> IO ()
+test_char8 enc = do
+  testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff']
+
+  testIO (encode enc ['\0'..'\x200']) $ Just ([0..0xff] ++ [0..0xff] ++ [0])
+
+-- Test latin1-like encodings
+test_latin1 :: CodingFailureMode -> TextEncoding -> IO ()
+test_latin1 cfm enc = do
+  testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff']
+
+  testIO (encode enc ['\0'..'\xff']) $ Just [0..0xff]
+  testIO (encode enc "\xfe\xff\x100\x101\x100\xff\xfe") $ case cfm of
+    ErrorOnCodingFailure -> Nothing
+    IgnoreCodingFailure -> Just [0xfe,0xff,0xff,0xfe]
+    TransliterateCodingFailure -> Just [0xfe,0xff,0x3f,0x3f,0x3f,0xff,0xfe]
+      -- N.B. The argument "LATIN1//TRANSLIT" to mkTextEncoding does not
+      -- correspond to "LATIN1//TRANSLIT" in iconv! Instead GHC asks iconv
+      -- to encode to "LATIN1" and uses its own "evil hack" to insert '?'
+      -- (ASCII 0x3f) in place of failures. See GHC.IO.Encoding.recoverEncode.
+      --
+      -- U+0100 is LATIN CAPITAL LETTER A WITH MACRON, which iconv would
+      -- transliterate to 'A' (ASCII 0x41). Similarly iconv would
+      -- transliterate U+0101 LATIN SMALL LETTER A WITH MACRON to 'a'
+      -- (ASCII 0x61).
+    RoundtripFailure -> Nothing
+
+test_ascii :: CodingFailureMode -> TextEncoding -> IO ()
+test_ascii cfm enc = do
+  testIO (decode enc [0..0x7f]) $ Just ['\0'..'\x7f']
+  testIO (decode enc [0x7e,0x7f,0x80,0x81,0x80,0x7f,0x7e]) $ case cfm of
+    ErrorOnCodingFailure -> Nothing
+    IgnoreCodingFailure -> Just "\x7e\x7f\x7f\x7e"
+    TransliterateCodingFailure -> Just "\x7e\x7f\xfffd\xfffd\xfffd\x7f\x7e"
+      -- Another GHC special: decode invalid input to the Char U+FFFD
+      -- REPLACEMENT CHARACTER.
+    RoundtripFailure -> Just "\x7e\x7f\xdc80\xdc81\xdc80\x7f\x7e"
+      -- GHC's PEP383-style String-encoding of invalid input,
+      -- see Note [Roundtripping]
+
+  testIO (encode enc ['\0'..'\x7f']) $ Just [0..0x7f]
+  testIO (encode enc "\x7e\x7f\x80\x81\x80\x7f\xe9") $ case cfm of
+    ErrorOnCodingFailure -> Nothing
+    IgnoreCodingFailure -> Just [0x7e,0x7f,0x7f]
+    TransliterateCodingFailure -> Just [0x7e,0x7f,0x3f,0x3f,0x3f,0x7f,0x3f]
+      -- See comment in test_latin1. iconv -t ASCII//TRANSLIT would encode
+      -- U+00E9 LATIN SMALL LETTER E WITH ACUTE as 'e' (ASCII 0x65).
+    RoundtripFailure -> Nothing
+
+  -- Test roundtripping for good measure
+  case cfm of
+    RoundtripFailure -> do
+      Just s <- decode enc [0..0xff]
+      testIO (encode enc s) $ Just [0..0xff]
+    _ -> return ()
+
+main = do
+  putStrLn "char8 tests"
+  test_char8 char8 -- char8 never fails in either direction
+
+  -- These use GHC's own implementation
+  putStrLn "Latin1.ascii tests"
+  test_ascii ErrorOnCodingFailure (Latin1.ascii)
+  test_ascii IgnoreCodingFailure (Latin1.mkAscii IgnoreCodingFailure)
+  test_ascii TransliterateCodingFailure (Latin1.mkAscii TransliterateCodingFailure)
+  test_ascii RoundtripFailure (Latin1.mkAscii RoundtripFailure)
+
+  putStrLn "Latin1.latin1_checked tests"
+  test_latin1 ErrorOnCodingFailure (Latin1.latin1_checked)
+  test_latin1 IgnoreCodingFailure (Latin1.mkLatin1_checked IgnoreCodingFailure)
+  test_latin1 TransliterateCodingFailure (Latin1.mkLatin1_checked TransliterateCodingFailure)
+  test_latin1 RoundtripFailure (Latin1.mkLatin1_checked RoundtripFailure)
+
+  -- These use iconv (normally, unless it is broken)
+  putStrLn "mkTextEncoding ASCII tests"
+  test_ascii ErrorOnCodingFailure =<< mkTextEncoding "ASCII"
+  test_ascii IgnoreCodingFailure =<< mkTextEncoding "ASCII//IGNORE"
+  test_ascii TransliterateCodingFailure =<< mkTextEncoding "ASCII//TRANSLIT"
+  test_ascii RoundtripFailure =<< mkTextEncoding "ASCII//ROUNDTRIP"
+
+  putStrLn "mkTextEncoding LATIN1 tests"
+  test_latin1 ErrorOnCodingFailure =<< mkTextEncoding "LATIN1"
+  test_latin1 IgnoreCodingFailure =<< mkTextEncoding "LATIN1//IGNORE"
+  test_latin1 TransliterateCodingFailure =<< mkTextEncoding "LATIN1//TRANSLIT"
+  test_latin1 RoundtripFailure =<< mkTextEncoding "LATIN1//ROUNDTRIP"
diff --git a/libraries/base/tests/IO/encoding005.stdout b/libraries/base/tests/IO/encoding005.stdout
new file mode 100644
index 0000000000..664a193592
--- /dev/null
+++ b/libraries/base/tests/IO/encoding005.stdout
@@ -0,0 +1,5 @@
+char8 tests
+Latin1.ascii tests
+Latin1.latin1_checked tests
+mkTextEncoding ASCII tests
+mkTextEncoding LATIN1 tests