libraries/base/codepages/MakeTable.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265

{--
This is a script to generate the necessary tables to support Windows code page
encoding/decoding.

License: see libraries/base/LICENSE

The code page tables are available from :
http://www.unicode.org/Public/MAPPINGS/

To run this script, use e.g.
runghc MakeTable.hs <module-name> <output-file> <codepage-dir>/*.TXT

Currently, this script only supports single-byte encodings, since the lookup
tables required for the CJK double-byte codepages are too large to be
statically linked into every executable.  We plan to add support for them once
GHC is able to produce Windows DLLs.
--}

module Main where

import System.FilePath
import qualified Data.Map as Map
import System.IO
import Data.Maybe (mapMaybe)
import Data.List (intersperse)
import Data.Word
import Numeric
import Control.Monad.State
import System.Environment
import Control.Exception(evaluate)

main :: IO ()
main = do
    moduleName:outFile:files <- getArgs
    let badFiles = -- These fail with an error like
                   --     MakeTable: Enum.toEnum{Word8}: tag (33088) is outside of bounds (0,255)
                   -- I have no idea what's going on, so for now we just
                   -- skip them.
                   ["CPs/CP932.TXT",
                    "CPs/CP936.TXT",
                    "CPs/CP949.TXT",
                    "CPs/CP950.TXT"]
    let files' = filter (`notElem` badFiles) files
    sbes <- mapM readMapAndIx files'
    putStrLn "Writing output"
    withBinaryFile outFile WriteMode $ flip hPutStr
        $ unlines $ makeTableFile moduleName files' sbes
  where
    readMapAndIx f = do
        putStrLn ("Reading " ++ f)
        m <- readMap f
        return (codePageNum f, m)

-- filenames are assumed to be of the form "CP1250.TXT"
codePageNum :: FilePath -> Int
codePageNum = read . drop 2 . takeBaseName

readMap :: (Ord a, Enum a) => FilePath -> IO (Map.Map a Char)
readMap f  = withBinaryFile f ReadMode $ \h -> do
    contents <- hGetContents h
    let ms = Map.fromList $ mapMaybe parseLine $ lines contents
    evaluate $ Map.size ms
    return ms

parseLine :: Enum a => String -> Maybe (a,Char)
parseLine s = case words s of
    ('#':_):_           -> Nothing
    bs:"#DBCS":_        -> Just (readHex' bs, toEnum 0xDC00)
    bs:"#UNDEFINED":_   -> Just (readHex' bs, toEnum 0)
    bs:cs:('#':_):_     -> Just (readHex' bs, readCharHex cs)
    _                   -> Nothing

readHex' :: Enum a => String -> a
readHex' ('0':'x':s) = case readHex s of
    [(n,"")] -> toEnum n -- explicitly call toEnum to catch overflow errors.
    _ -> error $ "Can't read hex: " ++ show s
readHex' s = error $ "Can't read hex: " ++ show s

readCharHex :: String -> Char
readCharHex s = if c > fromEnum (maxBound :: Word16)
                    then error "Can't handle non-BMP character."
                    else toEnum c
    where c = readHex' s


-------------------------------------------
-- Writing out the main data values.

makeTableFile :: String -> [FilePath] -> [(Int,Map.Map Word8 Char)] -> [String]
makeTableFile moduleName files maps = concat
    [ languageDirectives, firstComment files, header,
        theImports, theTypes, blockSizeText, tablePart]
  where
    header = [ "module " ++ moduleName ++ " where"
             , ""
             ]
    tablePart = [ "codePageMap :: [(Word32, CodePageArrays)]"
                , "codePageMap = ["
                ] ++ (intersperse "\n    ," $ map mkTableEntry maps)
                ++ ["    ]"]
    mkTableEntry (i,m) = "    (" ++ show i ++ ", " ++ makeSBE m ++ "    )"
    blockSizeText = ["blockBitSize :: Int", "blockBitSize = " ++ show blockBitSize]


makeSBE :: Map.Map Word8 Char -> String
makeSBE m = unlines
                [ "SingleByteCP {"
                , "     decoderArray = " ++ mkConvArray es
                , "     , encoderArray = " ++ mkCompactArray (swapMap m)
                , "   }"
                ]
  where
    es = [Map.findWithDefault '\0' x m | x <- [minBound..maxBound]]

swapMap :: (Ord a, Ord b, Enum a, Enum b) => Map.Map a b -> Map.Map b a
swapMap = Map.insert (toEnum 0) (toEnum 0) . Map.fromList . map swap . Map.toList
  where
    swap (x,y) = (y,x)


mkConvArray :: Embed a => [a] -> String
mkConvArray xs = "ConvArray \"" ++ concatMap mkHex xs ++ "\"#"


-------------------------------------------
-- Compact arrays
--
-- The decoding map (from Word8 to Char) can be implemented with a simple array
-- of 256 Word16's.  Bytes which do not belong to the code page are mapped to
-- '\0'.
--
-- However, a naive table mapping Char to Word8 would require 2^16 Word8's.  We
-- can use much less space with the right data structure, since at most 256 of
-- those entries are nonzero.
--
-- We use "compact arrays", as described in "Unicode Demystified" by Richard
-- Gillam.
--
-- Fix a block size S which is a power of two.  We compress an array of N
-- entries (where N>>S) as follows.  First, split the array into blocks of size
-- S, then remove all repeate blocks to form the "value" array.  Then construct
-- a separate "index" array which maps the position of blocks in the old array
-- to a position in the value array.
--
-- For example, assume that S=32 we have six blocks ABABCA, each with 32
-- elements.
--
-- Then the compressed table consists of two arrays:
-- 1) An array "values", concatenating the unique blocks ABC
-- 2) An array "indices" which equals [0,1,0,1,2,0].
--
-- To look up '\100', first calculate divMod 100 32 = (3,4).  Since
-- indices[3]=1, we look at the second unique block B; thus the encoded byte is
-- B[4].
--
-- The upshot of this representation is that the lookup is very quick as it only
-- requires two array accesses plus some bit masking/shifting.

-- From testing, this is an optimal size.
blockBitSize :: Int
blockBitSize = 6

mkCompactArray :: (Embed a, Embed b) => Map.Map a b -> String
mkCompactArray m = unlines [
            ""
            , " CompactArray {"
            , "        encoderIndices = " ++ mkConvArray is'
            , "        , encoderValues = "
                    ++ mkConvArray (concat $ Map.elems vs)
            , "        , encoderMax = " ++ show (fst $ Map.findMax m)
            , "        }"
            ]
  where
    blockSize = 2 ^ blockBitSize
    (is,(vs,_)) = compress blockSize $ m
    is' = map (* blockSize) is

type CompressState b = (Map.Map Int [b], Map.Map [b] Int)
-- each entry in the list corresponds to a block of size n.
compress :: (Bounded a, Enum a, Ord a, Bounded b, Ord b) => Int -> Map.Map a b
        -> ([Int], CompressState b)
compress n ms = runState (mapM lookupOrAdd chunks) (Map.empty, Map.empty)
    where
        chunks = mkChunks $ map (\i -> Map.findWithDefault minBound i ms)
                    $ [minBound..fst (Map.findMax ms)]
        mkChunks [] = []
        mkChunks xs = take n xs : mkChunks (drop n xs)
        lookupOrAdd xs = do
            (m,rm) <- get
            case Map.lookup xs rm of
                Just i -> return i
                Nothing -> do
                    let i = if Map.null m
                                then 0
                                else 1 + fst (Map.findMax m)
                    put (Map.insert i xs m, Map.insert xs i rm)
                    return i

-------------------------------------------
-- Static parts of the generated module.

languageDirectives :: [String]
languageDirectives = ["{-# LANGUAGE CPP, MagicHash #-}"]


firstComment :: [FilePath] -> [String]
firstComment files = map ("-- " ++) $
    [ "Do not edit this file directly!"
    , "It was generated by the MakeTable.hs script using the files below."
    , "To regenerate it, run \"make\" in ../../../../codepages/"
    , ""
    , "Files:"
    ] ++ map takeFileName files

theImports :: [String]
theImports = map ("import " ++ )
    ["GHC.Prim", "GHC.Base", "GHC.Word"]

theTypes :: [String]
theTypes = [ "data ConvArray a = ConvArray Addr#"
           , "data CompactArray a b = CompactArray {"
           , "    encoderMax :: !a,"
           , "    encoderIndices :: !(ConvArray Int),"
           , "    encoderValues :: !(ConvArray b)"
           , "  }"
           , ""
           , "data CodePageArrays = SingleByteCP {"
           , "    decoderArray :: !(ConvArray Char),"
           , "    encoderArray :: !(CompactArray Char Word8)"
           , "  }"
           , ""
           ]

-------------------------------------------
-- Embed class and associated functions

class (Ord a, Enum a, Bounded a, Show a) => Embed a where
    mkHex :: a -> String

instance Embed Word8 where
    mkHex = showHex'

instance Embed Word16 where
    mkHex = repDualByte

instance Embed Char where
    mkHex = repDualByte

-- this is used for the indices of the compressed array.
instance Embed Int where
    mkHex = repDualByte

showHex' :: Integral a => a -> String
showHex' s = "\\x" ++ showHex s ""

repDualByte :: Enum c => c -> String
repDualByte c
    | n >= 2^(16::Int) = error "value is too high!"
    -- NOTE : this assumes little-endian architecture.  But we're only using this on Windows,
    -- so it's probably OK.
    | otherwise = showHex' (n `mod` 256) ++ showHex' (n `div` 256)
  where
    n = fromEnum c