author	Judah Jacobson <judah.jacobson@gmail.com>	2009-09-13 02:21:26 +0000
committer	Judah Jacobson <judah.jacobson@gmail.com>	2009-09-13 02:21:26 +0000
commit	b63b596e116ab4bb753a340f1338cbe9572cea0d (patch)
tree	e016418002fa954ca2bb593c80251f98721a9b35 /libraries/base/codepages
parent	bf7ad38e1e473a3b61b2402bcd7f481bbf3ca423 (diff)
download	haskell-b63b596e116ab4bb753a340f1338cbe9572cea0d.tar.gz
On Windows, use the console code page for text file encoding/decoding.
We keep all of the code page tables in the module GHC.IO.Encoding.CodePage.Table. That file was generated automatically by running codepages/MakeTable.hs; more details are in the comments at the start of that script.

Storing the lookup tables adds about 40KB to each statically linked executable; this increases the size of a "hello world" program by only about 7%.

Currently we do not support double-byte encodings (Chinese/Japanese/Korean), since including those code pages would increase the table size to 400KB. It will be straightforward to implement them once the work on library DLLs is finished.
Diffstat (limited to 'libraries/base/codepages')
-rw-r--r--	libraries/base/codepages/MakeTable.hs	252
1 file changed, 252 insertions, 0 deletions
diff --git a/libraries/base/codepages/MakeTable.hs b/libraries/base/codepages/MakeTable.hs
new file mode 100644
index 0000000000..af123df4dd
--- /dev/null
+++ b/libraries/base/codepages/MakeTable.hs
@@ -0,0 +1,252 @@
+{--
+This is a script to generate the necessary tables to support Windows code page
+encoding/decoding.
+
+License: see libraries/base/LICENSE
+
+The code page tables are available from:
+http://www.unicode.org/Public/MAPPINGS/
+
+To run this script, use e.g.
+runghc MakeTable.hs <module-name> <output-file> <codepage-dir>/*.TXT
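+
+For instance (the output file and mapping directory here are illustrative):
+
+runghc MakeTable.hs GHC.IO.Encoding.CodePage.Table Table.hs mappings/*.TXT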
+
+Currently, this script only supports single-byte encodings, since the lookup
+tables required for the CJK double-byte codepages are too large to be
+statically linked into every executable. We plan to add support for them once
+GHC is able to produce Windows DLLs.
+
+--}
+
+module Main where
+
+import System.FilePath
+import qualified Data.Map as Map
+import System.IO
+import Data.Maybe (mapMaybe)
+import Data.List (intersperse)
+import Data.Word
+import Numeric
+import Control.Monad.State
+import System.Environment
+import Control.Exception(evaluate)
+
+main :: IO ()
+main = do
+    moduleName:outFile:files <- getArgs
+    sbes <- mapM readMapAndIx files
+    withBinaryFile outFile WriteMode $ flip hPutStr
+        $ unlines $ makeTableFile moduleName files sbes
+  where
+    readMapAndIx f = do
+        m <- readMap f
+        return (codePageNum f, m)
+
+-- filenames are assumed to be of the form "CP1250.TXT"
+codePageNum :: FilePath -> Int
+codePageNum = read . drop 2 . takeBaseName
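+-- For example, codePageNum "CP1250.TXT" == 1250.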
+
+readMap :: (Ord a, Enum a) => FilePath -> IO (Map.Map a Char)
+readMap f = withBinaryFile f ReadMode $ \h -> do
+    contents <- hGetContents h
+    let ms = Map.fromList $ mapMaybe parseLine $ lines contents
+    evaluate $ Map.size ms -- force the whole (lazily read) map before the handle is closed
+    return ms
+
+parseLine :: Enum a => String -> Maybe (a,Char)
+parseLine s = case words s of
+    ('#':_):_         -> Nothing
+    bs:"#DBCS":_      -> Just (readHex' bs, toEnum 0xDC00)
+    bs:"#UNDEFINED":_ -> Just (readHex' bs, toEnum 0)
+    bs:cs:('#':_):_   -> Just (readHex' bs, readCharHex cs)
+    _                 -> Nothing
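+
+-- For reference, the unicode.org mapping files consist of tab-separated lines
+-- such as
+--   0x41  0x0041  #LATIN CAPITAL LETTER A
+-- plus '#'-prefixed comment lines; using 'words' makes the exact whitespace
+-- irrelevant here.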
+
+readHex' :: Enum a => String -> a
+readHex' ('0':'x':s) = case readHex s of
+    [(n,"")] -> toEnum n -- explicitly call toEnum to catch overflow errors.
+    _        -> error $ "Can't read hex: " ++ show s
+readHex' s = error $ "Can't read hex: " ++ show s
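+-- For example, readHex' "0x00FF" == (255 :: Int).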
+
+readCharHex :: String -> Char
+readCharHex s = if c > fromEnum (maxBound :: Word16)
+                    then error "Can't handle non-BMP character."
+                    else toEnum c
+  where c = readHex' s
+
+
+-------------------------------------------
+-- Writing out the main data values.
+
+makeTableFile :: String -> [FilePath] -> [(Int,Map.Map Word8 Char)] -> [String]
+makeTableFile moduleName files maps = concat
+    [ languageDirectives, firstComment files, header,
+      theImports, theTypes, blockSizeText, tablePart ]
+  where
+    header = [ "module " ++ moduleName ++ " where"
+             , ""
+             ]
+    tablePart = [ "codePageMap :: [(Word32, CodePageArrays)]"
+                , "codePageMap = ["
+                ] ++ (intersperse "\n ," $ map mkTableEntry maps)
+                  ++ [" ]"]
+    mkTableEntry (i,m) = " (" ++ show i ++ ", " ++ makeSBE m ++ " )"
+    blockSizeText = ["blockBitSize :: Int", "blockBitSize = " ++ show blockBitSize]
+
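+-- Putting the pieces together, the generated module has this overall shape
+-- (abridged):
+--
+--   {-# LANGUAGE MagicHash #-}
+--   -- Do not edit this file directly! ...
+--   module <module-name> where
+--   import GHC.Prim
+--   ...
+--   blockBitSize :: Int
+--   blockBitSize = 6
+--   codePageMap :: [(Word32, CodePageArrays)]
+--   codePageMap = [ (1250, SingleByteCP { ... })
+--   ...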
+
+makeSBE :: Map.Map Word8 Char -> String
+makeSBE m = unlines
+    [ "SingleByteCP {"
+    , " decoderArray = " ++ mkConvArray es
+    , " , encoderArray = " ++ mkCompactArray (swapMap m)
+    , " }"
+    ]
+  where
+    es = [Map.findWithDefault '\0' x m | x <- [minBound..maxBound]]
+
+swapMap :: (Ord a, Ord b, Enum a, Enum b) => Map.Map a b -> Map.Map b a
+swapMap = Map.insert (toEnum 0) (toEnum 0) . Map.fromList . map swap . Map.toList
+  where
+    swap (x,y) = (y,x)
+
+
+mkConvArray :: Embed a => [a] -> String
+mkConvArray xs = "ConvArray \"" ++ concatMap mkHex xs ++ "\"#"
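+-- For example, mkConvArray ([0x41, 0x42] :: [Word8]) produces the source text
+--   ConvArray "\x41\x42"#
+-- i.e. a primitive string literal holding the raw bytes of the table.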
+
+
+-------------------------------------------
+-- Compact arrays
+--
+-- The decoding map (from Word8 to Char) can be implemented with a simple array
+-- of 256 Word16's. Bytes which do not belong to the code page are mapped to
+-- '\0'.
+--
+-- However, a naive table mapping Char to Word8 would require 2^16 Word8's. We
+-- can use much less space with the right data structure, since at most 256 of
+-- those entries are nonzero.
+--
+-- We use "compact arrays", as described in "Unicode Demystified" by Richard
+-- Gillam.
+--
+-- Fix a block size S which is a power of two. We compress an array of N
+-- entries (where N>>S) as follows. First, split the array into blocks of size
+-- S, then remove all repeated blocks to form the "value" array. Then construct
+-- a separate "index" array which maps the position of blocks in the old array
+-- to a position in the value array.
+--
+-- For example, assume that S=32 and we have six blocks ABABCA, each with 32
+-- elements.
+--
+-- Then the compressed table consists of two arrays:
+-- 1) An array "values", concatenating the unique blocks ABC
+-- 2) An array "indices" which equals [0,1,0,1,2,0].
+--
+-- To look up '\100', first calculate divMod 100 32 = (3,4). Since
+-- indices[3]=1, we look at the second unique block B; thus the encoded byte is
+-- B[4].
+--
+-- The upshot of this representation is that the lookup is very quick as it only
+-- requires two array accesses plus some bit masking/shifting.
+
+-- From testing, this is an optimal size.
+blockBitSize :: Int
+blockBitSize = 6
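+
+-- To make the lookup concrete, here is a small sketch (illustrative only;
+-- nothing in this script uses it) of encoding against the two arrays, with
+-- plain lists standing in for the generated ConvArrays and with the unscaled
+-- block indices as returned by 'compress'. (mkCompactArray below stores the
+-- indices pre-multiplied by the block size, which presumably saves the
+-- multiplication at lookup time.)
+demoLookup :: [Int] -> [Word8] -> Int -> Word8
+demoLookup indices values i = values !! (indices !! block * blockSize + offset)
+  where
+    blockSize = 2 ^ blockBitSize
+    (block, offset) = i `divMod` blockSize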
+
+mkCompactArray :: (Embed a, Embed b) => Map.Map a b -> String
+mkCompactArray m = unlines
+    [ ""
+    , " CompactArray {"
+    , " encoderIndices = " ++ mkConvArray is'
+    , " , encoderValues = "
+          ++ mkConvArray (concat $ Map.elems vs)
+    , " , encoderMax = " ++ show (fst $ Map.findMax m)
+    , " }"
+    ]
+  where
+    blockSize = 2 ^ blockBitSize
+    (is,(vs,_)) = compress blockSize m
+    is' = map (* blockSize) is
+
+type CompressState b = (Map.Map Int [b], Map.Map [b] Int)
+-- each entry in the list corresponds to a block of size n.
+compress :: (Bounded a, Enum a, Ord a, Bounded b, Ord b)
+         => Int -> Map.Map a b -> ([Int], CompressState b)
+compress n ms = runState (mapM lookupOrAdd chunks) (Map.empty, Map.empty)
+  where
+    chunks = mkChunks $ map (\i -> Map.findWithDefault minBound i ms)
+                            [minBound..fst (Map.findMax ms)]
+    mkChunks [] = []
+    mkChunks xs = take n xs : mkChunks (drop n xs)
+    lookupOrAdd xs = do
+        (m,rm) <- get
+        case Map.lookup xs rm of
+            Just i  -> return i
+            Nothing -> do
+                let i = if Map.null m
+                            then 0
+                            else 1 + fst (Map.findMax m)
+                put (Map.insert i xs m, Map.insert xs i rm)
+                return i
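+
+-- For example, with a block size of 2:
+--
+--   fst (compress 2 (Map.fromList (zip [0..4 :: Word8] "ababc"))) == [0,0,1]
+--
+-- since the chunked input ["ab","ab","c"] contains only two distinct blocks.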
+
+-------------------------------------------
+-- Static parts of the generated module.
+
+languageDirectives :: [String]
+languageDirectives = ["{-# LANGUAGE MagicHash #-}"]
+
+
+firstComment :: [FilePath] -> [String]
+firstComment files = map ("-- " ++) $
+    [ "Do not edit this file directly!"
+    , "It was generated by the MakeTable.hs script using the following files:"
+    ] ++ map takeFileName files
+
+theImports :: [String]
+theImports = map ("import " ++)
+    ["GHC.Prim", "GHC.Base", "GHC.Word", "GHC.Num"]
+
+theTypes :: [String]
+theTypes = [ "data ConvArray a = ConvArray Addr#"
+           , "data CompactArray a b = CompactArray {"
+           , " encoderMax :: !a,"
+           , " encoderIndices :: !(ConvArray Int),"
+           , " encoderValues :: !(ConvArray b)"
+           , " }"
+           , ""
+           , "data CodePageArrays = SingleByteCP {"
+           , " decoderArray :: !(ConvArray Char),"
+           , " encoderArray :: !(CompactArray Char Word8)"
+           , " }"
+           , ""
+           ]
+
+-------------------------------------------
+-- Embed class and associated functions
+
+class (Ord a, Enum a, Bounded a, Show a) => Embed a where
+    mkHex :: a -> String
+
+instance Embed Word8 where
+    mkHex = showHex'
+
+instance Embed Word16 where
+    mkHex = repDualByte
+
+instance Embed Char where
+    mkHex = repDualByte
+
+-- This instance is used for the indices of the compressed array.
+instance Embed Int where
+    mkHex = repDualByte
+
+showHex' :: Integral a => a -> String
+showHex' s = "\\x" ++ showHex s ""
+
+repDualByte :: Enum c => c -> String
+repDualByte c
+    | n >= 2^(16::Int) = error "value is too high!"
+    -- NOTE: this assumes a little-endian byte order; but we only use this on
+    -- Windows, so it's probably OK.
+    | otherwise = showHex' (n `mod` 256) ++ showHex' (n `div` 256)
+  where
+    n = fromEnum c
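+
+-- For example, repDualByte 'A' yields the source text \x41\x0 (low byte
+-- first, then the high byte).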
+
+