author	Judah Jacobson <judah.jacobson@gmail.com>	2009-09-13 02:21:26 +0000
committer	Judah Jacobson <judah.jacobson@gmail.com>	2009-09-13 02:21:26 +0000
commit	b63b596e116ab4bb753a340f1338cbe9572cea0d (patch)
tree	e016418002fa954ca2bb593c80251f98721a9b35 /libraries/base/codepages
parent	bf7ad38e1e473a3b61b2402bcd7f481bbf3ca423 (diff)
download	haskell-b63b596e116ab4bb753a340f1338cbe9572cea0d.tar.gz
On Windows, use the console code page for text file encoding/decoding.
We keep all of the code page tables in the module GHC.IO.Encoding.CodePage.Table. That file was generated automatically by running codepages/MakeTable.hs; more details are in the comments at the start of that script.

Storing the lookup tables adds about 40KB to each statically linked executable; this increases the size of a "hello world" program by only about 7%.

Currently we do not support double-byte encodings (Chinese/Japanese/Korean), since including those code pages would increase the table size to 400KB. It will be straightforward to implement them once the work on library DLLs is finished.
Diffstat (limited to 'libraries/base/codepages')
-rw-r--r--	libraries/base/codepages/MakeTable.hs	252
1 file changed, 252 insertions, 0 deletions
diff --git a/libraries/base/codepages/MakeTable.hs b/libraries/base/codepages/MakeTable.hs
new file mode 100644
index 0000000000..af123df4dd
--- /dev/null
+++ b/libraries/base/codepages/MakeTable.hs
@@ -0,0 +1,252 @@
+{--
+This is a script to generate the necessary tables to support Windows code page
+encoding/decoding.
+
+License: see libraries/base/LICENSE
+
+The code page tables are available from:
+http://www.unicode.org/Public/MAPPINGS/
+
+To run this script, use e.g.
+runghc MakeTable.hs <module-name> <output-file> <codepage-dir>/*.TXT
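+
+For instance (the output file and mapping directory here are illustrative):
+
+runghc MakeTable.hs GHC.IO.Encoding.CodePage.Table Table.hs mappings/*.TXT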
+
+Currently, this script only supports single-byte encodings, since the lookup
+tables required for the CJK double-byte codepages are too large to be
+statically linked into every executable. We plan to add support for them once
+GHC is able to produce Windows DLLs.
+
+--}
+
+module Main where
+
+import System.FilePath
+import qualified Data.Map as Map
+import System.IO
+import Data.Maybe (mapMaybe)
+import Data.List (intersperse)
+import Data.Word
+import Numeric
+import Control.Monad.State
+import System.Environment
+import Control.Exception(evaluate)
+
+main :: IO ()
+main = do
+    moduleName:outFile:files <- getArgs
+    sbes <- mapM readMapAndIx files
+    withBinaryFile outFile WriteMode $ flip hPutStr
+        $ unlines $ makeTableFile moduleName files sbes
+  where
+    readMapAndIx f = do
+        m <- readMap f
+        return (codePageNum f, m)
+
+-- filenames are assumed to be of the form "CP1250.TXT"
+codePageNum :: FilePath -> Int
+codePageNum = read . drop 2 . takeBaseName
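+-- For example, codePageNum "CP1250.TXT" == 1250.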
+
+readMap :: (Ord a, Enum a) => FilePath -> IO (Map.Map a Char)
+readMap f = withBinaryFile f ReadMode $ \h -> do
+    contents <- hGetContents h
+    let ms = Map.fromList $ mapMaybe parseLine $ lines contents
+    evaluate $ Map.size ms -- force the whole (lazily read) map before the handle is closed
+    return ms
+
+parseLine :: Enum a => String -> Maybe (a,Char)
+parseLine s = case words s of
+    ('#':_):_         -> Nothing
+    bs:"#DBCS":_      -> Just (readHex' bs, toEnum 0xDC00)
+    bs:"#UNDEFINED":_ -> Just (readHex' bs, toEnum 0)
+    bs:cs:('#':_):_   -> Just (readHex' bs, readCharHex cs)
+    _                 -> Nothing
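+
+-- For reference, the unicode.org mapping files consist of tab-separated lines
+-- such as
+--   0x41  0x0041  #LATIN CAPITAL LETTER A
+-- plus '#'-prefixed comment lines; using 'words' makes the exact whitespace
+-- irrelevant here.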
+
+readHex' :: Enum a => String -> a
+readHex' ('0':'x':s) = case readHex s of
+    [(n,"")] -> toEnum n -- explicitly call toEnum to catch overflow errors.
+    _        -> error $ "Can't read hex: " ++ show s
+readHex' s = error $ "Can't read hex: " ++ show s
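+-- For example, readHex' "0x00FF" == (255 :: Int).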
+
+readCharHex :: String -> Char
+readCharHex s = if c > fromEnum (maxBound :: Word16)
+                    then error "Can't handle non-BMP character."
+                    else toEnum c
+  where c = readHex' s
+
+
+-------------------------------------------
+-- Writing out the main data values.
+
+makeTableFile :: String -> [FilePath] -> [(Int,Map.Map Word8 Char)] -> [String]
+makeTableFile moduleName files maps = concat
+    [ languageDirectives, firstComment files, header,
+      theImports, theTypes, blockSizeText, tablePart ]
+  where
+    header = [ "module " ++ moduleName ++ " where"
+             , ""
+             ]
+    tablePart = [ "codePageMap :: [(Word32, CodePageArrays)]"
+                , "codePageMap = ["
+                ] ++ (intersperse "\n ," $ map mkTableEntry maps)
+                  ++ [" ]"]
+    mkTableEntry (i,m) = " (" ++ show i ++ ", " ++ makeSBE m ++ " )"
+    blockSizeText = ["blockBitSize :: Int", "blockBitSize = " ++ show blockBitSize]
+
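+-- Putting the pieces together, the generated module has this overall shape
+-- (abridged):
+--
+--   {-# LANGUAGE MagicHash #-}
+--   -- Do not edit this file directly! ...
+--   module <module-name> where
+--   import GHC.Prim
+--   ...
+--   blockBitSize :: Int
+--   blockBitSize = 6
+--   codePageMap :: [(Word32, CodePageArrays)]
+--   codePageMap = [ (1250, SingleByteCP { ... })
+--   ...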
+
+makeSBE :: Map.Map Word8 Char -> String
+makeSBE m = unlines
+    [ "SingleByteCP {"
+    , " decoderArray = " ++ mkConvArray es
+    , " , encoderArray = " ++ mkCompactArray (swapMap m)
+    , " }"
+    ]
+  where
+    es = [Map.findWithDefault '\0' x m | x <- [minBound..maxBound]]
+
+swapMap :: (Ord a, Ord b, Enum a, Enum b) => Map.Map a b -> Map.Map b a
+swapMap = Map.insert (toEnum 0) (toEnum 0) . Map.fromList . map swap . Map.toList
+  where
+    swap (x,y) = (y,x)
+
+
+mkConvArray :: Embed a => [a] -> String
+mkConvArray xs = "ConvArray \"" ++ concatMap mkHex xs ++ "\"#"
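+-- For example, mkConvArray ([0x41, 0x42] :: [Word8]) produces the source text
+--   ConvArray "\x41\x42"#
+-- i.e. a primitive string literal holding the raw bytes of the table.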
+
+
+-------------------------------------------
+-- Compact arrays
+--
+-- The decoding map (from Word8 to Char) can be implemented with a simple array
+-- of 256 Word16's. Bytes which do not belong to the code page are mapped to
+-- '\0'.
+--
+-- However, a naive table mapping Char to Word8 would require 2^16 Word8's. We
+-- can use much less space with the right data structure, since at most 256 of
+-- those entries are nonzero.
+--
+-- We use "compact arrays", as described in "Unicode Demystified" by Richard
+-- Gillam.
+--
+-- Fix a block size S which is a power of two. We compress an array of N
+-- entries (where N>>S) as follows. First, split the array into blocks of size
+-- S, then remove all repeated blocks to form the "value" array. Then construct
+-- a separate "index" array which maps the position of blocks in the old array
+-- to a position in the value array.
+--
+-- For example, assume that S=32 and we have six blocks ABABCA, each with 32
+-- elements.
+--
+-- Then the compressed table consists of two arrays:
+-- 1) An array "values", concatenating the unique blocks ABC
+-- 2) An array "indices" which equals [0,1,0,1,2,0].
+--
+-- To look up '\100', first calculate divMod 100 32 = (3,4). Since
+-- indices[3]=1, we look at the second unique block B; thus the encoded byte is
+-- B[4].
+--
+-- The upshot of this representation is that the lookup is very quick as it only
+-- requires two array accesses plus some bit masking/shifting.
+
+-- From testing, this is an optimal size.
+blockBitSize :: Int
+blockBitSize = 6
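+
+-- To make the lookup concrete, here is a small sketch (illustrative only;
+-- nothing in this script uses it) of encoding against the two arrays, with
+-- plain lists standing in for the generated ConvArrays and with the unscaled
+-- block indices as returned by 'compress'. (mkCompactArray below stores the
+-- indices pre-multiplied by the block size, which presumably saves the
+-- multiplication at lookup time.)
+demoLookup :: [Int] -> [Word8] -> Int -> Word8
+demoLookup indices values i = values !! (indices !! block * blockSize + offset)
+  where
+    blockSize = 2 ^ blockBitSize
+    (block, offset) = i `divMod` blockSize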
+
+mkCompactArray :: (Embed a, Embed b) => Map.Map a b -> String
+mkCompactArray m = unlines
+    [ ""
+    , " CompactArray {"
+    , " encoderIndices = " ++ mkConvArray is'
+    , " , encoderValues = "
+          ++ mkConvArray (concat $ Map.elems vs)
+    , " , encoderMax = " ++ show (fst $ Map.findMax m)
+    , " }"
+    ]
+  where
+    blockSize = 2 ^ blockBitSize
+    (is,(vs,_)) = compress blockSize m
+    is' = map (* blockSize) is
+
+type CompressState b = (Map.Map Int [b], Map.Map [b] Int)
+-- each entry in the list corresponds to a block of size n.
+compress :: (Bounded a, Enum a, Ord a, Bounded b, Ord b)
+         => Int -> Map.Map a b -> ([Int], CompressState b)
+compress n ms = runState (mapM lookupOrAdd chunks) (Map.empty, Map.empty)
+  where
+    chunks = mkChunks $ map (\i -> Map.findWithDefault minBound i ms)
+                            [minBound..fst (Map.findMax ms)]
+    mkChunks [] = []
+    mkChunks xs = take n xs : mkChunks (drop n xs)
+    lookupOrAdd xs = do
+        (m,rm) <- get
+        case Map.lookup xs rm of
+            Just i  -> return i
+            Nothing -> do
+                let i = if Map.null m
+                            then 0
+                            else 1 + fst (Map.findMax m)
+                put (Map.insert i xs m, Map.insert xs i rm)
+                return i
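+
+-- For example, with a block size of 2:
+--
+--   fst (compress 2 (Map.fromList (zip [0..4 :: Word8] "ababc"))) == [0,0,1]
+--
+-- since the chunked input ["ab","ab","c"] contains only two distinct blocks.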
+
+-------------------------------------------
+-- Static parts of the generated module.
+
+languageDirectives :: [String]
+languageDirectives = ["{-# LANGUAGE MagicHash #-}"]
+
+
+firstComment :: [FilePath] -> [String]
+firstComment files = map ("-- " ++) $
+    [ "Do not edit this file directly!"
+    , "It was generated by the MakeTable.hs script using the following files:"
+    ] ++ map takeFileName files
+
+theImports :: [String]
+theImports = map ("import " ++)
+    ["GHC.Prim", "GHC.Base", "GHC.Word", "GHC.Num"]
+
+theTypes :: [String]
+theTypes = [ "data ConvArray a = ConvArray Addr#"
+           , "data CompactArray a b = CompactArray {"
+           , " encoderMax :: !a,"
+           , " encoderIndices :: !(ConvArray Int),"
+           , " encoderValues :: !(ConvArray b)"
+           , " }"
+           , ""
+           , "data CodePageArrays = SingleByteCP {"
+           , " decoderArray :: !(ConvArray Char),"
+           , " encoderArray :: !(CompactArray Char Word8)"
+           , " }"
+           , ""
+           ]
+
+-------------------------------------------
+-- Embed class and associated functions
+
+class (Ord a, Enum a, Bounded a, Show a) => Embed a where
+    mkHex :: a -> String
+
+instance Embed Word8 where
+    mkHex = showHex'
+
+instance Embed Word16 where
+    mkHex = repDualByte
+
+instance Embed Char where
+    mkHex = repDualByte
+
+-- This instance is used for the indices of the compressed array.
+instance Embed Int where
+    mkHex = repDualByte
+
+showHex' :: Integral a => a -> String
+showHex' s = "\\x" ++ showHex s ""
+
+repDualByte :: Enum c => c -> String
+repDualByte c
+    | n >= 2^(16::Int) = error "value is too high!"
+    -- NOTE: this assumes a little-endian byte order; but we only use this on
+    -- Windows, so it's probably OK.
+    | otherwise = showHex' (n `mod` 256) ++ showHex' (n `div` 256)
+  where
+    n = fromEnum c
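+
+-- For example, repDualByte 'A' yields the source text \x41\x0 (low byte
+-- first, then the high byte).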
+
+