diff options
author | Judah Jacobson <judah.jacobson@gmail.com> | 2009-09-13 02:21:26 +0000 |
---|---|---|
committer | Judah Jacobson <judah.jacobson@gmail.com> | 2009-09-13 02:21:26 +0000 |
commit | b63b596e116ab4bb753a340f1338cbe9572cea0d (patch) | |
tree | e016418002fa954ca2bb593c80251f98721a9b35 /libraries/base/codepages | |
parent | bf7ad38e1e473a3b61b2402bcd7f481bbf3ca423 (diff) | |
download | haskell-b63b596e116ab4bb753a340f1338cbe9572cea0d.tar.gz |
On Windows, use the console code page for text file encoding/decoding.
We keep all of the code page tables in the module
GHC.IO.Encoding.CodePage.Table. That file was generated automatically
by running codepages/MakeTable.hs; more details are in the comments at the
start of that script.
Storing the lookup tables adds about 40KB to each statically linked executable;
this only increases the size of a "hello world" program by about 7%.
Currently we do not support double-byte encodings (Chinese/Japanese/Korean), since
including those codepages would increase the table size to 400KB. It will be
straightforward to implement them once the work on library DLLs is finished.
Diffstat (limited to 'libraries/base/codepages')
-rw-r--r-- | libraries/base/codepages/MakeTable.hs | 252 |
1 files changed, 252 insertions, 0 deletions
diff --git a/libraries/base/codepages/MakeTable.hs b/libraries/base/codepages/MakeTable.hs new file mode 100644 index 0000000000..af123df4dd --- /dev/null +++ b/libraries/base/codepages/MakeTable.hs @@ -0,0 +1,252 @@ +{-- +This is a script to generate the necessary tables to support Windows code page +encoding/decoding. + +License: see libraries/base/LICENSE + +The code page tables are available from : +http://www.unicode.org/Public/MAPPINGS/ + +To run this script, use e.g. +runghc MakeTable.hs <module-name> <output-file> <codepage-dir>/*.TXT + +Currently, this script only supports single-byte encodings, since the lookup +tables required for the CJK double-byte codepages are too large to be +statically linked into every executable. We plan to add support for them once +GHC is able to produce Windows DLLs. + +--} + +module Main where + +import System.FilePath +import qualified Data.Map as Map +import System.IO +import Data.Maybe (mapMaybe) +import Data.List (intersperse) +import Data.Word +import Numeric +import Control.Monad.State +import System.Environment +import Control.Exception(evaluate) + +main :: IO () +main = do + moduleName:outFile:files <- getArgs + sbes <- mapM readMapAndIx files + withBinaryFile outFile WriteMode $ flip hPutStr + $ unlines $ makeTableFile moduleName files sbes + where + readMapAndIx f = do + m <- readMap f + return (codePageNum f, m) + +-- filenames are assumed to be of the form "CP1250.TXT" +codePageNum :: FilePath -> Int +codePageNum = read . drop 2 . takeBaseName + +readMap :: (Ord a, Enum a) => FilePath -> IO (Map.Map a Char) +readMap f = withBinaryFile f ReadMode $ \h -> do + contents <- hGetContents h + let ms = Map.fromList $ mapMaybe parseLine $ lines contents + evaluate $ Map.size ms + return ms + +parseLine :: Enum a => String -> Maybe (a,Char) +parseLine s = case words s of + ('#':_):_ -> Nothing + bs:"#DBCS":_ -> Just (readHex' bs, toEnum 0xDC00) + bs:"#UNDEFINED":_ -> Just (readHex' bs, toEnum 0) + bs:cs:('#':_):_ -> Just (readHex' bs, readCharHex cs) + _ -> Nothing + +readHex' :: Enum a => String -> a +readHex' ('0':'x':s) = case readHex s of + [(n,"")] -> toEnum n -- explicitly call toEnum to catch overflow errors. + _ -> error $ "Can't read hex: " ++ show s +readHex' s = error $ "Can't read hex: " ++ show s + +readCharHex :: String -> Char +readCharHex s = if c > fromEnum (maxBound :: Word16) + then error "Can't handle non-BMP character." + else toEnum c + where c = readHex' s + + +------------------------------------------- +-- Writing out the main data values. + +makeTableFile :: String -> [FilePath] -> [(Int,Map.Map Word8 Char)] -> [String] +makeTableFile moduleName files maps = concat + [ languageDirectives, firstComment files, header, + theImports, theTypes, blockSizeText, tablePart] + where + header = [ "module " ++ moduleName ++ " where" + , "" + ] + tablePart = [ "codePageMap :: [(Word32, CodePageArrays)]" + , "codePageMap = [" + ] ++ (intersperse "\n ," $ map mkTableEntry maps) + ++ [" ]"] + mkTableEntry (i,m) = " (" ++ show i ++ ", " ++ makeSBE m ++ " )" + blockSizeText = ["blockBitSize :: Int", "blockBitSize = " ++ show blockBitSize] + + +makeSBE :: Map.Map Word8 Char -> String +makeSBE m = unlines + [ "SingleByteCP {" + , " decoderArray = " ++ mkConvArray es + , " , encoderArray = " ++ mkCompactArray (swapMap m) + , " }" + ] + where + es = [Map.findWithDefault '\0' x m | x <- [minBound..maxBound]] + +swapMap :: (Ord a, Ord b, Enum a, Enum b) => Map.Map a b -> Map.Map b a +swapMap = Map.insert (toEnum 0) (toEnum 0) . Map.fromList . map swap . Map.toList + where + swap (x,y) = (y,x) + + +mkConvArray :: Embed a => [a] -> String +mkConvArray xs = "ConvArray \"" ++ concatMap mkHex xs ++ "\"#" + + +------------------------------------------- +-- Compact arrays +-- +-- The decoding map (from Word8 to Char) can be implemented with a simple array +-- of 256 Word16's. Bytes which do not belong to the code page are mapped to +-- '\0'. +-- +-- However, a naive table mapping Char to Word8 would require 2^16 Word8's. We +-- can use much less space with the right data structure, since at most 256 of +-- those entries are nonzero. +-- +-- We use "compact arrays", as described in "Unicode Demystified" by Richard +-- Gillam. +-- +-- Fix a block size S which is a power of two. We compress an array of N +-- entries (where N>>S) as follows. First, split the array into blocks of size +-- S, then remove all repeate blocks to form the "value" array. Then construct +-- a separate "index" array which maps the position of blocks in the old array +-- to a position in the value array. +-- +-- For example, assume that S=32 we have six blocks ABABCA, each with 32 +-- elements. +-- +-- Then the compressed table consists of two arrays: +-- 1) An array "values", concatenating the unique blocks ABC +-- 2) An array "indices" which equals [0,1,0,1,2,0]. +-- +-- To look up '\100', first calculate divMod 100 32 = (3,4). Since +-- indices[3]=1, we look at the second unique block B; thus the encoded byte is +-- B[4]. +-- +-- The upshot of this representation is that the lookup is very quick as it only +-- requires two array accesses plus some bit masking/shifting. + +-- From testing, this is an optimal size. +blockBitSize :: Int +blockBitSize = 6 + +mkCompactArray :: (Embed a, Embed b) => Map.Map a b -> String +mkCompactArray m = unlines [ + "" + , " CompactArray {" + , " encoderIndices = " ++ mkConvArray is' + , " , encoderValues = " + ++ mkConvArray (concat $ Map.elems vs) + , " , encoderMax = " ++ show (fst $ Map.findMax m) + , " }" + ] + where + blockSize = 2 ^ blockBitSize + (is,(vs,_)) = compress blockSize $ m + is' = map (* blockSize) is + +type CompressState b = (Map.Map Int [b], Map.Map [b] Int) +-- each entry in the list corresponds to a block of size n. +compress :: (Bounded a, Enum a, Ord a, Bounded b, Ord b) => Int -> Map.Map a b + -> ([Int], CompressState b) +compress n ms = runState (mapM lookupOrAdd chunks) (Map.empty, Map.empty) + where + chunks = mkChunks $ map (\i -> Map.findWithDefault minBound i ms) + $ [minBound..fst (Map.findMax ms)] + mkChunks [] = [] + mkChunks xs = take n xs : mkChunks (drop n xs) + lookupOrAdd xs = do + (m,rm) <- get + case Map.lookup xs rm of + Just i -> return i + Nothing -> do + let i = if Map.null m + then 0 + else 1 + fst (Map.findMax m) + put (Map.insert i xs m, Map.insert xs i rm) + return i + +------------------------------------------- +-- Static parts of the generated module. + +languageDirectives :: [String] +languageDirectives = ["{-# LANGUAGE MagicHash #-}"] + + +firstComment :: [FilePath] -> [String] +firstComment files = map ("-- " ++) $ + [ "Do not edit this file directly!" + , "It was generated by the MakeTable.hs script using the following files:" + ] ++ map takeFileName files + +theImports :: [String] +theImports = map ("import " ++ ) + ["GHC.Prim", "GHC.Base", "GHC.Word", "GHC.Num"] + +theTypes :: [String] +theTypes = [ "data ConvArray a = ConvArray Addr#" + , "data CompactArray a b = CompactArray {" + , " encoderMax :: !a," + , " encoderIndices :: !(ConvArray Int)," + , " encoderValues :: !(ConvArray b)" + , " }" + , "" + , "data CodePageArrays = SingleByteCP {" + , " decoderArray :: !(ConvArray Char)," + , " encoderArray :: !(CompactArray Char Word8)" + , " }" + , "" + ] + +------------------------------------------- +-- Embed class and associated functions + +class (Ord a, Enum a, Bounded a, Show a) => Embed a where + mkHex :: a -> String + +instance Embed Word8 where + mkHex = showHex' + +instance Embed Word16 where + mkHex = repDualByte + +instance Embed Char where + mkHex = repDualByte + +-- this is used for the indices of the compressed array. +instance Embed Int where + mkHex = repDualByte + +showHex' :: Integral a => a -> String +showHex' s = "\\x" ++ showHex s "" + +repDualByte :: Enum c => c -> String +repDualByte c + | n >= 2^(16::Int) = error "value is too high!" + -- NOTE : this assumes little-endian architecture. But we're only using this on Windows, + -- so it's probably OK. + | otherwise = showHex' (n `mod` 256) ++ showHex' (n `div` 256) + where + n = fromEnum c + + |