summaryrefslogtreecommitdiff
path: root/testsuite/tests/ghc-regress/lib/IO/encoding002.hs
blob: 65d60a399335b57a984b117d54613a74d48657f5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import Control.Monad

import System.IO
import Control.Exception

import Foreign.Marshal.Array
import Foreign.Ptr

import GHC.Foreign
import GHC.IO.Encoding (TextEncoding, mkTextEncoding)

import Data.Char
import Data.Word

import Prelude hiding (catch)


decode :: TextEncoding -> [Word8] -> IO String
decode enc xs = withArrayLen xs (\sz p -> peekCStringLen enc (castPtr p, sz)) `catch` \e -> return (show (e :: IOException))

encode :: TextEncoding -> String -> IO [Word8]
encode enc cs = withCStringLen enc cs (\(p, sz) -> peekArray sz (castPtr p)) `catch` \e -> return (const [] (e :: IOException))

asc :: Char -> Word8
asc = fromIntegral . ord

families = [ ([asc 'H', asc 'i', 0xED, 0xB2, 0x80, asc '!'],
              ["UTF-8",    "UTF-8//IGNORE",    "UTF-8//TRANSLIT",    "UTF-8//ROUNDTRIP"])
           , ([asc 'H', 0, asc 'i', 0, 0xFF, 0xDF, 0xFF, 0xDF, asc '!', 0],
              ["UTF-16LE", "UTF-16LE//IGNORE", "UTF-16LE//TRANSLIT", "UTF-16LE//ROUNDTRIP"])
           , ([0, asc 'H', 0, asc 'i', 0xDF, 0xFF, 0xDF, 0xFF, 0, asc '!'],
              ["UTF-16BE", "UTF-16BE//IGNORE", "UTF-16BE//TRANSLIT", "UTF-16BE//ROUNDTRIP"])
           , ([asc 'H', 0, 0, 0, asc 'i', 0, 0, 0, 0xED, 0xB2, 0x80, 0, asc '!', 0, 0, 0],
              ["UTF-32LE", "UTF-32LE//IGNORE", "UTF-32LE//TRANSLIT", "UTF-32LE//ROUNDTRIP"])
           , ([0, 0, 0, asc 'H', 0, 0, 0, asc 'i', 0, 0x80, 0xB2, 0xED, 0, 0, 0, asc '!'],
              ["UTF-32BE", "UTF-32BE//IGNORE", "UTF-32BE//TRANSLIT", "UTF-32BE//ROUNDTRIP"])
           ]

main = do
  surrogate_enc <- mkTextEncoding "UTF-8//ROUNDTRIP"
    
  -- Test that invalid input is correctly roundtripped as surrogates
  -- This only works for the UTF-8 UTF since it is the only UTF which
  -- is an ASCII superset.
  putStrLn $ "== UTF-8: roundtripping"
  let invalid_bytes = [asc 'H', asc 'i', 0xED, 0xB2, 0x80, asc '!']
  surrogates <- decode surrogate_enc invalid_bytes
  invalid_bytes' <- encode surrogate_enc surrogates
  print invalid_bytes
  print surrogates
  print invalid_bytes'
  print (invalid_bytes == invalid_bytes')
  putStrLn ""
  
  forM families $ \(invalid_bytes, enc_names) -> do
    encs <- mapM mkTextEncoding enc_names
    let name = head enc_names
    
    -- How we deal with decoding errors in the various modes:
    putStrLn $ "== " ++ name ++ ": decoding"
    forM encs $ \enc -> decode enc invalid_bytes >>= print
    
    -- How about encoding errors, particularly those from embedded surrogates?
    putStrLn $ "== " ++ name ++ ": encoding"
    forM encs $ \enc -> encode enc "Hi\xDC80!" >>= print
    
    putStrLn ""