summaryrefslogtreecommitdiff
path: root/libraries/base/GHC/IO/Encoding/Iconv.hs
blob: f8dfb88ae702ef06fd7749053e75cd3ca38ee91c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE CPP
           , NoImplicitPrelude
           , ForeignFunctionInterface
           , NondecreasingIndentation
  #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  GHC.IO.Encoding.Iconv
-- Copyright   :  (c) The University of Glasgow, 2008-2009
-- License     :  see libraries/base/LICENSE
-- 
-- Maintainer  :  libraries@haskell.org
-- Stability   :  internal
-- Portability :  non-portable
--
-- This module provides text encoding/decoding using iconv
--
-----------------------------------------------------------------------------

-- #hide
module GHC.IO.Encoding.Iconv (
#if !defined(mingw32_HOST_OS)
   iconvEncoding, mkIconvEncoding,
   localeEncoding, mkLocaleEncoding
#endif
 ) where

#include "MachDeps.h"
#include "HsBaseConfig.h"

#if !defined(mingw32_HOST_OS)

import Foreign.Safe
import Foreign.C
import Data.Maybe
import GHC.Base
import GHC.IO.Buffer
import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.List (span)
import GHC.Num
import GHC.Show
import GHC.Real
import System.IO.Unsafe (unsafePerformIO)
import System.Posix.Internals

c_DEBUG_DUMP :: Bool
c_DEBUG_DUMP = False

iconv_trace :: String -> IO ()
iconv_trace s
 | c_DEBUG_DUMP = puts s
 | otherwise    = return ()

-- -----------------------------------------------------------------------------
-- iconv encoders/decoders

{-# NOINLINE localeEncodingName #-}
localeEncodingName :: String
localeEncodingName = unsafePerformIO $ do
   -- Use locale_charset() or nl_langinfo(CODESET) to get the encoding
   -- if we have either of them.
   cstr <- c_localeEncoding
   peekCAString cstr -- Assume charset names are ASCII

localeEncoding :: TextEncoding
localeEncoding = mkLocaleEncoding ErrorOnCodingFailure

mkLocaleEncoding :: CodingFailureMode -> TextEncoding
mkLocaleEncoding cfm = unsafePerformIO $ mkIconvEncoding cfm localeEncodingName

-- We hope iconv_t is a storable type.  It should be, since it has at least the
-- value -1, which is a possible return value from iconv_open.
type IConv = CLong -- ToDo: (#type iconv_t)

foreign import ccall unsafe "hs_iconv_open"
    hs_iconv_open :: CString -> CString -> IO IConv

foreign import ccall unsafe "hs_iconv_close"
    hs_iconv_close :: IConv -> IO CInt

foreign import ccall unsafe "hs_iconv"
    hs_iconv :: IConv -> Ptr CString -> Ptr CSize -> Ptr CString -> Ptr CSize
	  -> IO CSize

foreign import ccall unsafe "localeEncoding"
    c_localeEncoding :: IO CString

haskellChar :: String
#ifdef WORDS_BIGENDIAN
haskellChar | charSize == 2 = "UTF-16BE"
            | otherwise     = "UTF-32BE"
#else
haskellChar | charSize == 2 = "UTF-16LE"
            | otherwise     = "UTF-32LE"
#endif

char_shift :: Int
char_shift | charSize == 2 = 1
           | otherwise     = 2

iconvEncoding :: String -> IO TextEncoding
iconvEncoding = mkIconvEncoding ErrorOnCodingFailure

mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding
mkIconvEncoding cfm charset = do
  return (TextEncoding { 
                textEncodingName = charset,
		mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) (recoverDecode cfm) iconvDecode,
		mkTextEncoder = newIConv haskellChar charset                 (recoverEncode cfm) iconvEncode})
  where
    -- An annoying feature of GNU iconv is that the //PREFIXES only take
    -- effect when they appear on the tocode parameter to iconv_open:
    (raw_charset, suffix) = span (/= '/') charset

newIConv :: String -> String
   -> (Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
   -> (IConv -> Buffer a -> Buffer b -> IO (CodingProgress, Buffer a, Buffer b))
   -> IO (BufferCodec a b ())
newIConv from to rec fn =
  -- Assume charset names are ASCII
  withCAString from $ \ from_str ->
  withCAString to   $ \ to_str -> do
    iconvt <- throwErrnoIfMinus1 "mkTextEncoding" $ hs_iconv_open to_str from_str
    let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt
    return BufferCodec{
                encode = fn iconvt,
                recover = rec,
                close  = iclose,
                -- iconv doesn't supply a way to save/restore the state
                getState = return (),
                setState = const $ return ()
                }

iconvDecode :: IConv -> DecodeBuffer
iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift

iconvEncode :: IConv -> EncodeBuffer
iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0

iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int 
            -> IO (CodingProgress, Buffer a, Buffer b)
iconvRecode iconv_t
  input@Buffer{  bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_  }  iscale
  output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow, bufSize=os }  oscale
  = do
    iconv_trace ("haskelChar=" ++ show haskellChar)
    iconv_trace ("iconvRecode before, input=" ++ show (summaryBuffer input))
    iconv_trace ("iconvRecode before, output=" ++ show (summaryBuffer output))
    withRawBuffer iraw $ \ piraw -> do
    withRawBuffer oraw $ \ poraw -> do
    with (piraw `plusPtr` (ir `shiftL` iscale)) $ \ p_inbuf -> do
    with (poraw `plusPtr` (ow `shiftL` oscale)) $ \ p_outbuf -> do
    with (fromIntegral ((iw-ir) `shiftL` iscale)) $ \ p_inleft -> do
    with (fromIntegral ((os-ow) `shiftL` oscale)) $ \ p_outleft -> do
      res <- hs_iconv iconv_t p_inbuf p_inleft p_outbuf p_outleft
      new_inleft  <- peek p_inleft
      new_outleft <- peek p_outleft
      let 
	  new_inleft'  = fromIntegral new_inleft `shiftR` iscale
	  new_outleft' = fromIntegral new_outleft `shiftR` oscale
	  new_input  
            | new_inleft == 0  = input { bufL = 0, bufR = 0 }
	    | otherwise        = input { bufL = iw - new_inleft' }
	  new_output = output{ bufR = os - new_outleft' }
      iconv_trace ("iconv res=" ++ show res)
      iconv_trace ("iconvRecode after,  input=" ++ show (summaryBuffer new_input))
      iconv_trace ("iconvRecode after,  output=" ++ show (summaryBuffer new_output))
      if (res /= -1)
	then do -- all input translated
	   return (InputUnderflow, new_input, new_output)
	else do
      errno <- getErrno
      case errno of
        e | e == e2BIG  -> return (OutputUnderflow, new_input, new_output)
          | e == eINVAL -> return (InputUnderflow, new_input, new_output)
           -- Sometimes iconv reports EILSEQ for a
           -- character in the input even when there is no room
           -- in the output; in this case we might be about to
           -- change the encoding anyway, so the following bytes
           -- could very well be in a different encoding.
           --
           -- Because we can only say InvalidSequence if there is at least
           -- one element left in the output, we have to special case this.
          | e == eILSEQ -> return (if new_outleft' == 0 then OutputUnderflow else InvalidSequence, new_input, new_output)
          | otherwise -> do
              iconv_trace ("iconv returned error: " ++ show (errnoToIOError "iconv" e Nothing Nothing))
              throwErrno "iconvRecoder"

#endif /* !mingw32_HOST_OS */