[project @ 2006-01-06 16:30:17 by simonmar]

Add support for UTF-8 source files GHC finally has support for full Unicode in source files. Source files are now assumed to be UTF-8 encoded, and the full range of Unicode characters can be used, with classifications recognised using the implementation from Data.Char. This incedentally means that only the stage2 compiler will recognise Unicode in source files, because I was too lazy to port the unicode classifier code into libcompat. Additionally, the following synonyms for keywords are now recognised: forall symbol (U+2200) forall right arrow (U+2192) -> left arrow (U+2190) <- horizontal ellipsis (U+22EF) .. there are probably more things we could add here. This will break some source files if Latin-1 characters are being used. In most cases this should result in a UTF-8 decoding error. Later on if we want to support more encodings (perhaps with a pragma to specify the encoding), I plan to do it by recoding into UTF-8 before parsing. Internally, there were some pretty big changes: - FastStrings are now stored in UTF-8 - Z-encoding has been moved right to the back end. Previously we used to Z-encode every identifier on the way in for simplicity, and only decode when we needed to show something to the user. Instead, we now keep every string in its UTF-8 encoding, and Z-encode right before printing it out. To avoid Z-encoding the same string multiple times, the Z-encoding is cached inside the FastString the first time it is requested. This speeds up the compiler - I've measured some definite improvement in parsing at least, and I expect compilations overall to be faster too. It also cleans up a lot of cruft from the OccName interface. Z-encoding is nicely hidden inside the Outputable instance for Names & OccNames now. - StringBuffers are UTF-8 too, and are now represented as ForeignPtrs. - I've put together some test cases, not by any means exhaustive, but there are some interesting UTF-8 decoding error cases that aren't obvious. Also, take a look at unicode001.hs for a demo.
author: simonmar <unknown> 2006-01-06 16:30:19 +0000
committer: simonmar <unknown> 2006-01-06 16:30:19 +0000
commit: 9d7da331989abcd1844e9d03b8d1e4163796fa85 (patch)
tree: 8efa2e6fdcf8bfee777ae6477a686d0594c5ff76 /ghc/compiler/utils
parent: 2a2efb720c0fdc06fe749f96f284b00b30f8f3f7 (diff)
download: haskell-9d7da331989abcd1844e9d03b8d1e4163796fa85.tar.gz
9 files changed, 886 insertions, 888 deletions
diff --git a/ghc/compiler/utils/Binary.hs b/ghc/compiler/utils/Binary.hs
index 1902ff1f66..7b40bd279d 100644
--- a/ghc/compiler/utils/Binary.hs
+++ b/ghc/compiler/utils/Binary.hs
@@ -58,26 +58,7 @@ import UniqFM
 import FastMutInt
 import PackageConfig		( PackageId, packageIdFS, fsToPackageId )
 
-#if __GLASGOW_HASKELL__ < 503
-import DATA_IOREF
-import DATA_BITS
-import DATA_INT
-import DATA_WORD
-import Char
-import Monad
-import Exception
-import GlaExts hiding (ByteArray, newByteArray, freezeByteArray)
-import Array
-import IO
-import PrelIOBase		( IOError(..), IOErrorType(..)
-#if __GLASGOW_HASKELL__ > 411
-				, IOException(..)
-#endif
-				)
-import PrelReal			( Ratio(..) )
-import PrelIOBase	 	( IO(..) )
-import IOExts			( openFileEx, IOModeEx(..) )
-#else
+import Foreign
 import Data.Array.IO
 import Data.Array
 import Data.Bits
@@ -102,44 +83,12 @@ import GHC.Handle		( openFileEx, IOModeEx(..) )
 #else
 import System.IO		( openBinaryFile )
 #endif
-#endif
 
 #if __GLASGOW_HASKELL__ < 601
 openBinaryFile f mode = openFileEx f (BinaryMode mode)
 #endif
 
-#if __GLASGOW_HASKELL__ < 503
-type BinArray = MutableByteArray RealWorld Int
-newArray_ bounds     = stToIO (newCharArray bounds)
-unsafeWrite arr ix e = stToIO (writeWord8Array arr ix e)
-unsafeRead  arr ix   = stToIO (readWord8Array arr ix)
-#if __GLASGOW_HASKELL__ < 411
-newByteArray#        = newCharArray#
-#endif
-hPutArray h arr sz   = hPutBufBAFull h arr sz
-hGetArray h sz       = hGetBufBAFull h sz
-
-mkIOError :: IOErrorType -> String -> Maybe Handle -> Maybe FilePath -> Exception
-mkIOError t location maybe_hdl maybe_filename
-  = IOException (IOError maybe_hdl t location ""
-#if __GLASGOW_HASKELL__ > 411
-		         maybe_filename
-#endif
-  		)
-
-eofErrorType = EOF
-
-#ifndef SIZEOF_HSINT
-#define SIZEOF_HSINT  INT_SIZE_IN_BYTES
-#endif
-
-#ifndef SIZEOF_HSWORD
-#define SIZEOF_HSWORD WORD_SIZE_IN_BYTES
-#endif
-
-#else
 type BinArray = IOUArray Int Word8
-#endif
 
 ---------------------------------------------------------------
 --		BinHandle
@@ -741,13 +690,17 @@ constructDictionary j fm = array (0,j-1) (eltsUFM fm)
 --		Reading and writing FastStrings
 ---------------------------------------------------------
 
-putFS bh (FastString id l ba) = do
-  put_ bh (I# l)
-  putByteArray bh ba l
-putFS bh s = error ("Binary.put_(FastString): " ++ unpackFS s)
-	-- Note: the length of the FastString is *not* the same as
-	-- the size of the ByteArray: the latter is rounded up to a
-	-- multiple of the word size.
+putFS bh (FastString id l _ buf _) = do
+  put_ bh l
+  withForeignPtr buf $ \ptr -> 
+    let 
+	go n | n == l    = return ()
+	     | otherwise = do
+		b <- peekElemOff ptr n
+		putByte bh b
+		go (n+1)
+   in 
+   go 0
   
 {- -- possible faster version, not quite there yet:
 getFS bh@BinMem{} = do
@@ -757,16 +710,24 @@ getFS bh@BinMem{} = do
   return $! (mkFastSubStringBA# arr off l)
 -}
 getFS bh = do
-  (I# l) <- get bh
-  (BA ba) <- getByteArray bh (I# l)
-  return $! (mkFastSubStringBA# ba 0# l)
+  l <- get bh
+  fp <- mallocForeignPtrBytes l
+  withForeignPtr fp $ \ptr -> do
+  let 
+	go n | n == l = mkFastStringForeignPtr ptr fp l
+	     | otherwise = do
+		b <- getByte bh
+		pokeElemOff ptr n b
+		go (n+1)
+  --
+  go 0
 
 instance Binary PackageId where
   put_ bh pid = put_ bh (packageIdFS pid)
   get bh = do { fs <- get bh; return (fsToPackageId fs) }
 
 instance Binary FastString where
-  put_ bh f@(FastString id l ba) =
+  put_ bh f@(FastString id l _ fp _) =
     case getUserData bh of { 
 	UserData { ud_next = j_r, ud_map = out_r, ud_dict = dict} -> do
     out <- readIORef out_r
diff --git a/ghc/compiler/utils/BufWrite.hs b/ghc/compiler/utils/BufWrite.hs
index 6d00e46634..b15089ead3 100644
--- a/ghc/compiler/utils/BufWrite.hs
+++ b/ghc/compiler/utils/BufWrite.hs
@@ -31,17 +31,11 @@ import Char		( ord )
 import Foreign
 import IO
 
-#if __GLASGOW_HASKELL__ < 503
-import PrelIOBase	( IO(..) )
-import IOExts		( hPutBufFull )
-#else
 import GHC.IOBase	( IO(..) )
 import System.IO	( hPutBuf )
-#endif
-
-import GLAEXTS		( touch#, byteArrayContents#, Int(..), Int#, Addr# )
+import GHC.Ptr		( Ptr(..) )
 
-import PrimPacked	( Ptr(..) )
+import GLAEXTS		( Int(..), Int#, Addr# )
 
 -- -----------------------------------------------------------------------------
 
@@ -88,22 +82,17 @@ bPutStr b@(BufHandle buf r hdl) str = do
 		loop cs (i+1)
   
 bPutFS :: BufHandle -> FastString -> IO ()
-bPutFS b@(BufHandle buf r hdl) fs@(FastString _ len# arr#) = do
-  let len = I# len#
+bPutFS b@(BufHandle buf r hdl) fs@(FastString _ len _ fp _) =
+ withForeignPtr fp $ \ptr -> do
   i <- readFastMutInt r
   if (i + len) >= buf_size
 	then do hPutBuf hdl buf i
 		writeFastMutInt r 0
 		if (len >= buf_size) 
-		    then do
-			 let a# = byteArrayContents# arr#
-			 hPutBuf hdl (Ptr a#) len
-			 touch fs
+		    then hPutBuf hdl ptr len
 		    else bPutFS b fs
 	else do
-		let a# = byteArrayContents# arr#
-		copyBytes (buf `plusPtr` i) (Ptr a#) len
-		touch fs
+		copyBytes (buf `plusPtr` i) ptr len
 		writeFastMutInt r (i+len)
 bPutFS _ _ = panic "bPutFS"
 
@@ -128,8 +117,6 @@ bFlush b@(BufHandle buf r hdl) = do
   free buf
   return ()
 
-touch r = IO $ \s -> case touch# r s of s -> (# s, () #)
-
 #if 0
 myPutBuf s hdl buf i = 
   modifyIOError (\e -> ioeSetErrorString e (ioeGetErrorString e ++ ':':s ++ " (" ++ show buf ++ "," ++ show i ++ ")")) $
diff --git a/ghc/compiler/utils/Encoding.hs b/ghc/compiler/utils/Encoding.hs
new file mode 100644
index 0000000000..d15c0216ae
--- /dev/null
+++ b/ghc/compiler/utils/Encoding.hs
@@ -0,0 +1,386 @@
+{-# OPTIONS_GHC -O #-}
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow, 1997-2003
+--
+-- Character encodings 
+--
+-- -----------------------------------------------------------------------------
+
+module Encoding ( 
+	-- * UTF-8
+	utf8DecodeChar#,
+	utf8PrevChar,
+	utf8CharStart,
+	utf8DecodeChar,
+	utf8DecodeString,
+	utf8EncodeChar,
+	utf8EncodeString,
+	utf8EncodedLength,
+	countUTF8Chars,
+
+	-- * Latin-1
+	latin1DecodeChar,
+	latin1EncodeChar,
+
+	-- * Z-encoding
+	zEncodeString,
+	zDecodeString
+  ) where
+
+#define COMPILING_FAST_STRING
+#include "HsVersions.h"
+import Foreign
+import Data.Char	( ord, chr, isDigit, digitToInt, isHexDigit )
+import Numeric		( showHex )
+
+import GHC.Ptr		( Ptr(..) )
+import GHC.Base
+
+-- -----------------------------------------------------------------------------
+-- Latin-1
+
+latin1DecodeChar ptr = do
+  w <- peek ptr
+  return (unsafeChr (fromIntegral w), ptr `plusPtr` 1)
+
+latin1EncodeChar c ptr = do
+  poke ptr (fromIntegral (ord c))
+  return (ptr `plusPtr` 1)
+
+-- -----------------------------------------------------------------------------
+-- UTF-8
+
+-- We can't write the decoder as efficiently as we'd like without
+-- resorting to unboxed extensions, unfortunately.  I tried to write
+-- an IO version of this function, but GHC can't eliminate boxed
+-- results from an IO-returning function.
+--
+-- We assume we can ignore overflow when parsing a multibyte character here.
+-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
+-- before decoding them (see StringBuffer.hs).
+
+{-# INLINE utf8DecodeChar# #-}
+utf8DecodeChar# :: Addr# -> (# Char#, Addr# #)
+utf8DecodeChar# a# =
+  let ch0 = word2Int# (indexWord8OffAddr# a# 0#) in
+  case () of 
+    _ | ch0 <=# 0x7F# -> (# chr# ch0, a# `plusAddr#` 1# #)
+
+      | ch0 >=# 0xC0# && ch0 <=# 0xDF# ->
+	let ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
+	if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else
+	(# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
+	   	  (ch1 -# 0x80#)),
+	   a# `plusAddr#` 2# #)
+
+      | ch0 >=# 0xE0# && ch0 <=# 0xEF# ->
+	let ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
+	if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else
+	let ch2 = word2Int# (indexWord8OffAddr# a# 2#) in
+	if ch2 <# 0x80# || ch2 >=# 0xC0# then fail 2# else
+	(# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
+	   	 ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
+	    	  (ch2 -# 0x80#)),
+	   a# `plusAddr#` 3# #)
+
+     | ch0 >=# 0xF0# && ch0 <=# 0xF8# ->
+	let ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
+	if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else
+	let ch2 = word2Int# (indexWord8OffAddr# a# 2#) in
+	if ch2 <# 0x80# || ch2 >=# 0xC0# then fail 2# else
+	let ch3 = word2Int# (indexWord8OffAddr# a# 3#) in
+	if ch3 <# 0x80# || ch3 >=# 0xC0# then fail 3# else
+	(# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
+		 ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
+		 ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
+		  (ch3 -# 0x80#)),
+	   a# `plusAddr#` 4# #)
+
+      | otherwise -> fail 1#
+  where
+	-- all invalid sequences end up here:
+	fail n = (# '\0'#, a# `plusAddr#` n #)
+	-- '\xFFFD' would be the usual replacement character, but
+	-- that's a valid symbol in Haskell, so will result in a
+	-- confusing parse error later on.  Instead we use '\0' which
+	-- will signal a lexer error immediately.
+
+utf8DecodeChar :: Ptr Word8 -> (Char, Ptr Word8)
+utf8DecodeChar (Ptr a#) = ( C# c#, Ptr b# )
+  where (# c#, b# #) = utf8DecodeChar# a#
+
+-- UTF-8 is cleverly designed so that we can always figure out where
+-- the start of the current character is, given any position in a
+-- stream.  This function finds the start of the previous character,
+-- assuming there *is* a previous character.
+utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
+utf8PrevChar p = utf8CharStart (p `plusPtr` (-1))
+
+utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
+utf8CharStart p = go p
+ where go p = do w <- peek p
+		 if (w .&. 0xC0) == 0x80
+			then go (p `plusPtr` (-1))
+			else return p
+
+utf8DecodeString :: Ptr Word8 -> Int -> IO [Char]
+STRICT2(utf8DecodeString)
+utf8DecodeString (Ptr a#) (I# len#)
+  = unpack a#
+  where
+    end# = addr2Int# (a# `plusAddr#` len#)
+
+    unpack p#
+	| addr2Int# p# >=# end# = return []
+	| otherwise  =
+	case utf8DecodeChar# p# of
+	   (# c#, q# #) -> do
+		chs <- unpack q#
+		return (C# c# : chs)
+
+countUTF8Chars :: Ptr Word8 -> Int -> IO Int
+countUTF8Chars ptr bytes = go ptr 0
+  where
+	end = ptr `plusPtr` bytes
+
+	STRICT2(go)
+	go ptr n 
+	   | ptr >= end = return n
+	   | otherwise  = do
+		case utf8DecodeChar# (unPtr ptr) of
+		  (# c, a #) -> go (Ptr a) (n+1)
+
+unPtr (Ptr a) = a
+
+utf8EncodeChar c ptr =
+  let x = ord c in
+  case () of
+    _ | x > 0 && x <= 0x007f -> do
+	  poke ptr (fromIntegral x)
+	  return (ptr `plusPtr` 1)
+	-- NB. '\0' is encoded as '\xC0\x80', not '\0'.  This is so that we
+	-- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).
+      | x <= 0x07ff -> do
+	  poke ptr (fromIntegral (0xC0 .|. ((x `shiftR` 6) .&. 0x1F)))
+	  pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x .&. 0x3F)))
+	  return (ptr `plusPtr` 2)
+      | x <= 0xffff -> do
+	  poke ptr (fromIntegral (0xE0 .|. (x `shiftR` 12) .&. 0x0F))
+	  pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x `shiftR` 6) .&. 0x3F))
+	  pokeElemOff ptr 2 (fromIntegral (0x80 .|. (x .&. 0x3F)))
+	  return (ptr `plusPtr` 3)
+      | otherwise -> do
+	  poke ptr (fromIntegral (0xF0 .|. (x `shiftR` 18)))
+	  pokeElemOff ptr 1 (fromIntegral (0x80 .|. ((x `shiftR` 12) .&. 0x3F)))
+	  pokeElemOff ptr 2 (fromIntegral (0x80 .|. ((x `shiftR` 6) .&. 0x3F)))
+	  pokeElemOff ptr 3 (fromIntegral (0x80 .|. (x .&. 0x3F)))
+	  return (ptr `plusPtr` 4)
+
+utf8EncodeString :: Ptr Word8 -> String -> IO ()
+utf8EncodeString ptr str = go ptr str
+  where STRICT2(go)
+	go ptr [] = return ()
+	go ptr (c:cs) = do
+	  ptr' <- utf8EncodeChar c ptr
+	  go ptr' cs
+
+utf8EncodedLength :: String -> Int
+utf8EncodedLength str = go 0 str
+  where STRICT2(go)
+	go n [] = n
+        go n (c:cs)
+	  | ord c > 0 && ord c <= 0x007f = go (n+1) cs
+	  | ord c <= 0x07ff = go (n+2) cs
+	  | ord c <= 0xffff = go (n+3) cs	
+	  | otherwise       = go (n+4) cs	
+
+-- -----------------------------------------------------------------------------
+-- The Z-encoding
+
+{-
+This is the main name-encoding and decoding function.  It encodes any
+string into a string that is acceptable as a C name.  This is the name
+by which things are known right through the compiler.
+
+The basic encoding scheme is this.  
+
+* Tuples (,,,) are coded as Z3T
+
+* Alphabetic characters (upper and lower) and digits
+	all translate to themselves; 
+	except 'Z', which translates to 'ZZ'
+	and    'z', which translates to 'zz'
+  We need both so that we can preserve the variable/tycon distinction
+
+* Most other printable characters translate to 'zx' or 'Zx' for some
+	alphabetic character x
+
+* The others translate as 'znnnU' where 'nnn' is the decimal number
+        of the character
+
+	Before		After
+	--------------------------
+	Trak		Trak
+	foo_wib		foozuwib
+	>		zg
+	>1		zg1
+	foo#		foozh
+	foo##		foozhzh
+	foo##1		foozhzh1
+	fooZ		fooZZ	
+	:+		ZCzp
+	()		Z0T	0-tuple
+	(,,,,)		Z5T	5-tuple  
+	(# #)           Z1H     unboxed 1-tuple	(note the space)
+	(#,,,,#)	Z5H	unboxed 5-tuple
+		(NB: There is no Z1T nor Z0H.)
+-}
+
+type UserString = String	-- As the user typed it
+type EncodedString = String	-- Encoded form
+
+
+zEncodeString :: UserString -> EncodedString
+zEncodeString cs = case maybe_tuple cs of
+		Just n  -> n		-- Tuples go to Z2T etc
+		Nothing -> go cs
+	  where
+		go []     = []
+		go (c:cs) = encode_ch c ++ go cs
+
+unencodedChar :: Char -> Bool	-- True for chars that don't need encoding
+unencodedChar 'Z' = False
+unencodedChar 'z' = False
+unencodedChar c   =  c >= 'a' && c <= 'z'
+	          || c >= 'A' && c <= 'Z'
+		  || c >= '0' && c <= '9'
+
+encode_ch :: Char -> EncodedString
+encode_ch c | unencodedChar c = [c]	-- Common case first
+
+-- Constructors
+encode_ch '('  = "ZL"	-- Needed for things like (,), and (->)
+encode_ch ')'  = "ZR"	-- For symmetry with (
+encode_ch '['  = "ZM"
+encode_ch ']'  = "ZN"
+encode_ch ':'  = "ZC"
+encode_ch 'Z'  = "ZZ"
+
+-- Variables
+encode_ch 'z'  = "zz"
+encode_ch '&'  = "za"
+encode_ch '|'  = "zb"
+encode_ch '^'  = "zc"
+encode_ch '$'  = "zd"
+encode_ch '='  = "ze"
+encode_ch '>'  = "zg"
+encode_ch '#'  = "zh"
+encode_ch '.'  = "zi"
+encode_ch '<'  = "zl"
+encode_ch '-'  = "zm"
+encode_ch '!'  = "zn"
+encode_ch '+'  = "zp"
+encode_ch '\'' = "zq"
+encode_ch '\\' = "zr"
+encode_ch '/'  = "zs"
+encode_ch '*'  = "zt"
+encode_ch '_'  = "zu"
+encode_ch '%'  = "zv"
+encode_ch c    = 'z' : if isDigit (head hex_str) then hex_str
+					         else '0':hex_str
+  where hex_str = showHex (ord c) "U"
+  -- ToDo: we could improve the encoding here in various ways.
+  -- eg. strings of unicode characters come out as 'z1234Uz5678U', we
+  -- could remove the 'U' in the middle (the 'z' works as a separator).
+
+zDecodeString :: EncodedString -> UserString
+zDecodeString [] = []
+zDecodeString ('Z' : d : rest) 
+  | isDigit d = decode_tuple   d rest
+  | otherwise = decode_upper   d : zDecodeString rest
+zDecodeString ('z' : d : rest)
+  | isDigit d = decode_num_esc d rest
+  | otherwise = decode_lower   d : zDecodeString rest
+zDecodeString (c   : rest) = c : zDecodeString rest
+
+decode_upper, decode_lower :: Char -> Char
+
+decode_upper 'L' = '('
+decode_upper 'R' = ')'
+decode_upper 'M' = '['
+decode_upper 'N' = ']'
+decode_upper 'C' = ':'
+decode_upper 'Z' = 'Z'
+decode_upper ch  = {-pprTrace "decode_upper" (char ch)-} ch
+	     	
+decode_lower 'z' = 'z'
+decode_lower 'a' = '&'
+decode_lower 'b' = '|'
+decode_lower 'c' = '^'
+decode_lower 'd' = '$'
+decode_lower 'e' = '='
+decode_lower 'g' = '>'
+decode_lower 'h' = '#'
+decode_lower 'i' = '.'
+decode_lower 'l' = '<'
+decode_lower 'm' = '-'
+decode_lower 'n' = '!'
+decode_lower 'p' = '+'
+decode_lower 'q' = '\''
+decode_lower 'r' = '\\'
+decode_lower 's' = '/'
+decode_lower 't' = '*'
+decode_lower 'u' = '_'
+decode_lower 'v' = '%'
+decode_lower ch  = {-pprTrace "decode_lower" (char ch)-} ch
+
+-- Characters not having a specific code are coded as z224U (in hex)
+decode_num_esc d rest
+  = go (digitToInt d) rest
+  where
+    go n (c : rest) | isHexDigit c = go (16*n + digitToInt c) rest
+    go n ('U' : rest)           = chr n : zDecodeString rest
+    go n other = error ("decode_num_esc: " ++ show n ++  ' ':other)
+
+decode_tuple :: Char -> EncodedString -> UserString
+decode_tuple d rest
+  = go (digitToInt d) rest
+  where
+	-- NB. recurse back to zDecodeString after decoding the tuple, because
+	-- the tuple might be embedded in a longer name.
+    go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest
+    go 0 ('T':rest)	= "()" ++ zDecodeString rest
+    go n ('T':rest)	= '(' : replicate (n-1) ',' ++ ")" ++ zDecodeString rest
+    go 1 ('H':rest)	= "(# #)" ++ zDecodeString rest
+    go n ('H':rest)	= '(' : '#' : replicate (n-1) ',' ++ "#)" ++ zDecodeString rest
+    go n other = error ("decode_tuple: " ++ show n ++ ' ':other)
+
+{-
+Tuples are encoded as
+	Z3T or Z3H
+for 3-tuples or unboxed 3-tuples respectively.  No other encoding starts 
+	Z<digit>
+
+* "(# #)" is the tycon for an unboxed 1-tuple (not 0-tuple)
+  There are no unboxed 0-tuples.  
+
+* "()" is the tycon for a boxed 0-tuple.
+  There are no boxed 1-tuples.
+-}
+
+maybe_tuple :: UserString -> Maybe EncodedString
+
+maybe_tuple "(# #)" = Just("Z1H")
+maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of
+				 (n, '#' : ')' : cs) -> Just ('Z' : shows (n+1) "H")
+				 other		     -> Nothing
+maybe_tuple "()" = Just("Z0T")
+maybe_tuple ('(' : cs)       = case count_commas (0::Int) cs of
+				 (n, ')' : cs) -> Just ('Z' : shows (n+1) "T")
+				 other	       -> Nothing
+maybe_tuple other    	     = Nothing
+
+count_commas :: Int -> String -> (Int, String)
+count_commas n (',' : cs) = count_commas (n+1) cs
+count_commas n cs	  = (n,cs)
diff --git a/ghc/compiler/utils/FastString.lhs b/ghc/compiler/utils/FastString.lhs
index 52512d3e20..2558c5630a 100644
--- a/ghc/compiler/utils/FastString.lhs
+++ b/ghc/compiler/utils/FastString.lhs
@@ -1,8 +1,10 @@
 %
-% (c) The GRASP/AQUA Project, Glasgow University, 1997-1998
+% (c) The University of Glasgow, 1997-2006
 %
-\section{Fast strings}
+\begin{code}
+{-# OPTIONS -fglasgow-exts -O #-}
 
+{-
 FastString:	A compact, hash-consed, representation of character strings.
 		Comparison is O(1), and you can get a Unique from them.
 		Generated by the FSLIT macro
@@ -15,40 +17,46 @@ LitString:	Just a wrapper for the Addr# of a C string (Ptr CChar).
 		Turn into SDoc with Outputable.ptext
 
 Use LitString unless you want the facilities of FastString
-
-\begin{code}
+-}
 module FastString
        (
+	-- * FastStrings
 	FastString(..),     -- not abstract, for now.
 
-        mkFastString,       -- :: String -> FastString
-        mkFastStringNarrow, -- :: String -> FastString
-        mkFastSubString,    -- :: Addr -> Int -> Int -> FastString
-
-	mkFastString#,      -- :: Addr# -> FastString
-        mkFastSubStringBA#, -- :: ByteArray# -> Int# -> Int# -> FastString
-
-        mkFastStringInt,    -- :: [Int] -> FastString
-
-        uniqueOfFS,	    -- :: FastString -> Int#
-	lengthFS,	    -- :: FastString -> Int
-	nullFastString,     -- :: FastString -> Bool
+	-- ** Construction
+        mkFastString,
+	mkFastStringBytes,
+	mkFastStringForeignPtr,
+	mkFastString#,
+	mkZFastString,
+	mkZFastStringBytes,
 
+	-- ** Deconstruction
 	unpackFS,	    -- :: FastString -> String
-	unpackIntFS,	    -- :: FastString -> [Int]
-	appendFS,	    -- :: FastString -> FastString -> FastString
-        headFS,		    -- :: FastString -> Char
-        headIntFS,	    -- :: FastString -> Int
-        tailFS,		    -- :: FastString -> FastString
-	concatFS,	    -- :: [FastString] -> FastString
-        consFS,             -- :: Char -> FastString -> FastString
-	indexFS,	    -- :: FastString -> Int -> Char
-	nilFS,		    -- :: FastString
-
-        hPutFS,		    -- :: Handle -> FastString -> IO ()
-
+	bytesFS,	    -- :: FastString -> [Word8]
+
+	-- ** Encoding
+	isZEncoded,
+	zEncodeFS,
+
+	-- ** Operations
+        uniqueOfFS,
+	lengthFS,
+	nullFS,
+	appendFS,
+        headFS,
+        tailFS,
+	concatFS,
+        consFS,
+	nilFS,
+
+	-- ** Outputing
+        hPutFS,
+
+	-- * LitStrings
 	LitString, 
-	mkLitString#	    -- :: Addr# -> LitString
+	mkLitString#,
+	strLength
        ) where
 
 -- This #define suppresses the "import FastString" that
@@ -56,64 +64,49 @@ module FastString
 #define COMPILING_FAST_STRING
 #include "HsVersions.h"
 
-#if __GLASGOW_HASKELL__ < 503
-import PrelIOBase	( IO(..) )
-#else
-import GHC.IOBase	( IO(..) )
-#endif
+import Encoding
 
-import PrimPacked
+import Foreign
+import Foreign.C
 import GLAEXTS
 import UNSAFE_IO	( unsafePerformIO )
 import MONAD_ST		( stToIO )
 import DATA_IOREF	( IORef, newIORef, readIORef, writeIORef )
+import System.IO	( hPutBuf )
 
-#if __GLASGOW_HASKELL__ < 503
-import PrelArr		( STArray(..), newSTArray )
-#else
 import GHC.Arr		( STArray(..), newSTArray )
-#endif
-
-#if __GLASGOW_HASKELL__ >= 504
-import GHC.IOBase
-import GHC.Handle
-import Foreign.C
-#else
-import IOExts		( hPutBufBAFull )
-#endif
+import GHC.IOBase	( IO(..) )
 
 import IO
-import Char             ( chr, ord )
 
 #define hASH_TBL_SIZE  4091
-\end{code} 
 
-@FastString@s are packed representations of strings
-with a unique id for fast comparisons. The unique id
-is assigned when creating the @FastString@, using
-a hash table to map from the character string representation
-to the unique ID.
 
-\begin{code}
-data FastString
-  = FastString   -- packed repr. on the heap.
-      Int#       -- unique id
-		 --  0 => string literal, comparison
-		 --  will
-      Int#       -- length
-      ByteArray# -- stuff
-
-  | UnicodeStr   -- if contains characters outside '\1'..'\xFF'
-      Int#       -- unique id
-      [Int]      -- character numbers
+{-|
+A 'FastString' is an array of bytes, hashed to support fast O(1)
+comparison.  It is also associated with a character encoding, so that
+we know how to convert a 'FastString' to the local encoding, or to the
+Z-encoding used by the compiler internally.
 
-instance Eq FastString where
-	-- shortcut for real FastStrings
-  (FastString u1 _ _) == (FastString u2 _ _) = u1 ==# u2
-  a == b = case cmpFS a b of { LT -> False; EQ -> True;  GT -> False }
+'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
+-}
+
+data FastString = FastString {
+      uniq    :: {-# UNPACK #-} !Int,       -- unique id
+      n_bytes :: {-# UNPACK #-} !Int,       -- number of bytes
+      n_chars :: {-# UNPACK #-} !Int,	  -- number of chars
+      buf     :: {-# UNPACK #-} !(ForeignPtr Word8),
+      enc     :: FSEncoding
+  }
+
+data FSEncoding
+  = ZEncoded
+ 	-- including strings that don't need any encoding
+  | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))
+	-- A UTF-8 string with a memoized Z-encoding
 
-  (FastString u1 _ _) /= (FastString u2 _ _) = u1 /=# u2
-  a /= b = case cmpFS a b of { LT -> True;  EQ -> False; GT -> True  }
+instance Eq FastString where
+  f1 == f2  =  uniq f1 == uniq f2
 
 instance Ord FastString where
 	-- Compares lexicographically, not by unique
@@ -130,360 +123,311 @@ instance Ord FastString where
 instance Show FastString where
    show fs = show (unpackFS fs)
 
-lengthFS :: FastString -> Int
-lengthFS (FastString _ l# _) = I# l#
-lengthFS (UnicodeStr _ s) = length s
-
-nullFastString :: FastString -> Bool
-nullFastString (FastString _ l# _) = l# ==# 0#
-nullFastString (UnicodeStr _ []) = True
-nullFastString (UnicodeStr _ (_:_)) = False
-
-unpackFS :: FastString -> String
-unpackFS (FastString _ l# ba#) = unpackNBytesBA (BA ba#) (I# l#)
-unpackFS (UnicodeStr _ s) = map chr s
-
-unpackIntFS :: FastString -> [Int]
-unpackIntFS (UnicodeStr _ s) = s
-unpackIntFS fs = map ord (unpackFS fs)
-
-appendFS :: FastString -> FastString -> FastString
-appendFS fs1 fs2 = mkFastStringInt (unpackIntFS fs1 ++ unpackIntFS fs2)
-
-concatFS :: [FastString] -> FastString
-concatFS ls = mkFastStringInt (concat (map unpackIntFS ls)) -- ToDo: do better
-
-headFS :: FastString -> Char
-headFS (FastString _ l# ba#) = 
- if l# ># 0# then C# (indexCharArray# ba# 0#) else error ("headFS: empty FS")
-headFS (UnicodeStr _ (c:_)) = chr c
-headFS (UnicodeStr _ []) = error ("headFS: empty FS")
-
-headIntFS :: FastString -> Int
-headIntFS (UnicodeStr _ (c:_)) = c
-headIntFS fs = ord (headFS fs)
-
-indexFS :: FastString -> Int -> Char
-indexFS f i@(I# i#) =
- case f of
-   FastString _ l# ba#
-     | l# ># 0# && l# ># i#  -> C# (indexCharArray# ba# i#)
-     | otherwise	     -> error (msg (I# l#))
-   UnicodeStr _ s	     -> chr (s!!i)
- where
-  msg l =  "indexFS: out of range: " ++ show (l,i)
-
-tailFS :: FastString -> FastString
-tailFS (FastString _ l# ba#) = mkFastSubStringBA# ba# 1# (l# -# 1#)
-tailFS fs = mkFastStringInt (tail (unpackIntFS fs))
-
-consFS :: Char -> FastString -> FastString
-consFS c fs = mkFastStringInt (ord c : unpackIntFS fs)
+cmpFS :: FastString -> FastString -> Ordering
+cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
+  if u1 == u2 then EQ else
+  let l = if l1 <= l2 then l1 else l2 in
+  inlinePerformIO $
+    withForeignPtr buf1 $ \p1 ->
+    withForeignPtr buf2 $ \p2 -> do
+      res <- memcmp p1 p2 l
+      case () of
+       _ | res <  0  -> return LT
+	 | res == 0  -> if l1 == l2 then return EQ 
+				    else if l1 < l2 then return LT
+						    else return GT
+ 	 | otherwise -> return GT
 
-uniqueOfFS :: FastString -> Int#
-uniqueOfFS (FastString u# _ _) = u#
-uniqueOfFS (UnicodeStr u# _) = u#
+#ifndef __HADDOCK__
+foreign import ccall unsafe "ghc_memcmp" 
+  memcmp :: Ptr a -> Ptr b -> Int -> IO Int
+#endif
 
-nilFS = mkFastString ""
-\end{code}
+-- -----------------------------------------------------------------------------
+-- Construction
 
+{-
 Internally, the compiler will maintain a fast string symbol
 table, providing sharing and fast comparison. Creation of
 new @FastString@s then covertly does a lookup, re-using the
 @FastString@ if there was a hit.
+-}
 
-Caution: mkFastStringUnicode assumes that if the string is in the
-table, it sits under the UnicodeStr constructor. Other mkFastString
-variants analogously assume the FastString constructor.
-
-\begin{code}
 data FastStringTable = 
  FastStringTable
-    Int#
+    {-# UNPACK #-} !Int
     (MutableArray# RealWorld [FastString])
 
-type FastStringTableVar = IORef FastStringTable
-
-string_table :: FastStringTableVar
+string_table :: IORef FastStringTable
 string_table = 
- unsafePerformIO (
-   stToIO (newSTArray (0::Int,hASH_TBL_SIZE) [])
-	>>= \ (STArray _ _ arr#) ->
-   newIORef (FastStringTable 0# arr#))
-
-lookupTbl :: FastStringTable -> Int# -> IO [FastString]
-lookupTbl (FastStringTable _ arr#) i# =
-  IO ( \ s# ->
-  readArray# arr# i# s#)
-
-updTbl :: FastStringTableVar -> FastStringTable -> Int# -> [FastString] -> IO ()
-updTbl fs_table_var (FastStringTable uid# arr#) i# ls =
- IO (\ s# -> case writeArray# arr# i# ls s# of { s2# -> 
-	(# s2#, () #) }) >>
- writeIORef fs_table_var (FastStringTable (uid# +# 1#) arr#)
+ unsafePerformIO $ do
+   (STArray _ _ arr#) <- stToIO (newSTArray (0::Int,hASH_TBL_SIZE) [])
+   newIORef (FastStringTable 0 arr#)
+
+lookupTbl :: FastStringTable -> Int -> IO [FastString]
+lookupTbl (FastStringTable _ arr#) (I# i#) =
+  IO $ \ s# -> readArray# arr# i# s#
+
+updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO ()
+updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do
+  (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
+  writeIORef fs_table_var (FastStringTable (uid+1) arr#)
 
 mkFastString# :: Addr# -> FastString
-mkFastString# a# =
- case strLength (Ptr a#) of { (I# len#) -> mkFastStringLen# a# len# }
+mkFastString# a# = mkFastStringBytes ptr (strLength ptr)
+  where ptr = Ptr a#
 
-mkFastStringLen# :: Addr# -> Int# -> FastString
-mkFastStringLen# a# len# =
- unsafePerformIO  (
-  readIORef string_table	>>= \ ft@(FastStringTable uid# tbl#) ->
+mkFastStringBytes :: Ptr Word8 -> Int -> FastString
+mkFastStringBytes ptr len = unsafePerformIO $ do
+  ft@(FastStringTable uid tbl#) <- readIORef string_table
   let
-   h = hashStr a# len#
-  in
---  _trace ("hashed: "++show (I# h)) $
-  lookupTbl ft h	>>= \ lookup_result ->
+   h = hashStr ptr len
+   add_it ls = do
+	fs <- copyNewFastString uid ptr len
+	updTbl string_table ft h (fs:ls)
+	{- _trace ("new: " ++ show f_str)   $ -}
+	return fs
+  --
+  lookup_result <- lookupTbl ft h
   case lookup_result of
-    [] -> 
-       -- no match, add it to table by copying out the
-       -- the string into a ByteArray
-       -- _trace "empty bucket" $
-       case copyPrefixStr a# (I# len#) of
-	 BA barr# ->  
-	   let f_str = FastString uid# len# barr# in
-           updTbl string_table ft h [f_str] >>
-           ({- _trace ("new: " ++ show f_str)   $ -} return f_str)
-    ls -> 
-       -- non-empty `bucket', scan the list looking
-       -- entry with same length and compare byte by byte.
-       -- _trace ("non-empty bucket"++show ls) $
-       case bucket_match ls len# a# of
-	 Nothing -> 
-           case copyPrefixStr a# (I# len#) of
-	     BA barr# ->  
-              let f_str = FastString uid# len# barr# in
-              updTbl string_table ft h (f_str:ls) >>
-	      ( {- _trace ("new: " ++ show f_str)  $ -} return f_str)
-	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v)
-  where
-   bucket_match [] _ _ = Nothing
-   bucket_match (v@(FastString _ l# ba#):ls) len# a# =
-      if len# ==# l# && eqStrPrefix a# ba# l# then
-	 Just v
-      else
-	 bucket_match ls len# a#
-   bucket_match (UnicodeStr _ _ : ls) len# a# =
-      bucket_match ls len# a#
-
-mkFastSubStringBA# :: ByteArray# -> Int# -> Int# -> FastString
-mkFastSubStringBA# barr# start# len# =
- unsafePerformIO  (
-  readIORef string_table	>>= \ ft@(FastStringTable uid# tbl#) ->
+    [] -> add_it []
+    ls -> do
+       b <- bucket_match ls len ptr
+       case b of
+	 Nothing -> add_it ls
+	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v
+
+mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
+mkZFastStringBytes ptr len = unsafePerformIO $ do
+  ft@(FastStringTable uid tbl#) <- readIORef string_table
   let
-   h = hashSubStrBA barr# start# len#
-  in
---  _trace ("hashed(b): "++show (I# h)) $
-  lookupTbl ft h		>>= \ lookup_result ->
+   h = hashStr ptr len
+   add_it ls = do
+	fs <- copyNewZFastString uid ptr len
+	updTbl string_table ft h (fs:ls)
+	{- _trace ("new: " ++ show f_str)   $ -}
+	return fs
+  --
+  lookup_result <- lookupTbl ft h
   case lookup_result of
-    [] -> 
-       -- no match, add it to table by copying out the
-       -- the string into a ByteArray
-       -- _trace "empty bucket(b)" $
-       case copySubStrBA (BA barr#) (I# start#) (I# len#) of
-         BA ba# ->  
-          let f_str = FastString uid# len# ba# in
-          updTbl string_table ft h [f_str]     >>
-          -- _trace ("new(b): " ++ show f_str)   $
-	  return f_str
-    ls ->
-       -- non-empty `bucket', scan the list looking
-       -- entry with same length and compare byte by byte. 
-       -- _trace ("non-empty bucket(b)"++show ls) $
-       case bucket_match ls start# len# barr# of
-	 Nothing ->
-          case copySubStrBA (BA barr#) (I# start#) (I# len#) of
-            BA ba# ->  
-              let f_str = FastString uid# len# ba# in
-              updTbl string_table ft h (f_str:ls) >>
-	      -- _trace ("new(b): " ++ show f_str)   $
-	      return f_str
-	 Just v  -> 
-              -- _trace ("re-use(b): "++show v) $
-	      return v
-  )
- where
-   bucket_match [] _ _ _ = Nothing
-   bucket_match (v:ls) start# len# ba# =
-    case v of
-     FastString _ l# barr# ->
-      if len# ==# l# && eqStrPrefixBA barr# ba# start# len# then
-	 Just v
-      else
-	 bucket_match ls start# len# ba#
-     UnicodeStr _ _ -> bucket_match ls start# len# ba#
-
-mkFastStringUnicode :: [Int] -> FastString
-mkFastStringUnicode s =
- unsafePerformIO  (
-  readIORef string_table	>>= \ ft@(FastStringTable uid# tbl#) ->
+    [] -> add_it []
+    ls -> do
+       b <- bucket_match ls len ptr
+       case b of
+	 Nothing -> add_it ls
+	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v
+
+-- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
+-- between this and 'mkFastStringBytes' is that we don't have to copy
+-- the bytes if the string is new to the table.
+mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
+mkFastStringForeignPtr ptr fp len = do
+  ft@(FastStringTable uid tbl#) <- readIORef string_table
+--  _trace ("hashed: "++show (I# h)) $
   let
-   h = hashUnicode s 0#
-  in
---  _trace ("hashed(b): "++show (I# h)) $
-  lookupTbl ft h		>>= \ lookup_result ->
+    h = hashStr ptr len
+    add_it ls = do
+	fs <- mkNewFastString uid ptr fp len
+	updTbl string_table ft h (fs:ls)
+	{- _trace ("new: " ++ show f_str)   $ -}
+	return fs
+  --
+  lookup_result <- lookupTbl ft h
+  case lookup_result of
+    [] -> add_it []
+    ls -> do
+       b <- bucket_match ls len ptr
+       case b of
+	 Nothing -> add_it ls
+	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v
+
+mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
+mkZFastStringForeignPtr ptr fp len = do
+  ft@(FastStringTable uid tbl#) <- readIORef string_table
+--  _trace ("hashed: "++show (I# h)) $
+  let
+    h = hashStr ptr len
+    add_it ls = do
+	fs <- mkNewZFastString uid ptr fp len
+	updTbl string_table ft h (fs:ls)
+	{- _trace ("new: " ++ show f_str)   $ -}
+	return fs
+  --
+  lookup_result <- lookupTbl ft h
   case lookup_result of
-    [] -> 
-       -- no match, add it to table by copying out the
-       -- the string into a [Int]
-          let f_str = UnicodeStr uid# s in
-          updTbl string_table ft h [f_str]     >>
-          -- _trace ("new(b): " ++ show f_str)   $
-	  return f_str
-    ls -> 
-       -- non-empty `bucket', scan the list looking
-       -- entry with same length and compare byte by byte. 
-       -- _trace ("non-empty bucket(b)"++show ls) $
-       case bucket_match ls of
-	 Nothing -> 
-              let f_str = UnicodeStr uid# s in
-              updTbl string_table ft h (f_str:ls) >>
-	      -- _trace ("new(b): " ++ show f_str)   $
-	      return f_str
-	 Just v  -> 
-              -- _trace ("re-use(b): "++show v) $
-	      return v
-  )
- where
-   bucket_match [] = Nothing
-   bucket_match (v@(UnicodeStr _ s'):ls) =
-       if s' == s then Just v else bucket_match ls
-   bucket_match (FastString _ _ _ : ls) = bucket_match ls
-
-mkFastStringNarrow :: String -> FastString
-mkFastStringNarrow str =
- case packString str of { (I# len#, BA frozen#) -> 
-    mkFastSubStringBA# frozen# 0# len#
- }
- {- 0-indexed array, len# == index to one beyond end of string,
-    i.e., (0,1) => empty string.    -}
+    [] -> add_it []
+    ls -> do
+       b <- bucket_match ls len ptr
+       case b of
+	 Nothing -> add_it ls
+	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v
 
-mkFastString :: String -> FastString
-mkFastString str = if all good str
-    then mkFastStringNarrow str
-    else mkFastStringUnicode (map ord str)
-    where
-    good c = c >= '\1' && c <= '\xFF'
-
-mkFastStringInt :: [Int] -> FastString
-mkFastStringInt str = if all good str
-    then mkFastStringNarrow (map chr str)
-    else mkFastStringUnicode str
-    where
-    good c = c >= 1 && c <= 0xFF
-
-mkFastSubString :: Addr# -> Int -> Int -> FastString
-mkFastSubString a# (I# start#) (I# len#) =
- mkFastStringLen# (a# `plusAddr#` start#) len#
-\end{code}
 
-\begin{code}
-hashStr  :: Addr# -> Int# -> Int#
+-- | Creates a UTF-8 encoded 'FastString' from a 'String'
+mkFastString :: String -> FastString
+mkFastString str = 
+  inlinePerformIO $ do
+    let l = utf8EncodedLength str
+    buf <- mallocForeignPtrBytes l
+    withForeignPtr buf $ \ptr -> do
+      utf8EncodeString ptr str
+      mkFastStringForeignPtr ptr buf l 
+
+
+-- | Creates a Z-encoded 'FastString' from a 'String'
+mkZFastString :: String -> FastString
+mkZFastString str = 
+  inlinePerformIO $ do
+    let l = Prelude.length str
+    buf <- mallocForeignPtrBytes l
+    withForeignPtr buf $ \ptr -> do
+      pokeCAString (castPtr ptr) str
+      mkZFastStringForeignPtr ptr buf l 
+
+bucket_match [] _ _ = return Nothing
+bucket_match (v@(FastString _ l _ buf _):ls) len ptr
+      | len == l  =  do
+	 b <- cmpStringPrefix ptr buf len
+	 if b then return (Just v)
+	      else bucket_match ls len ptr
+      | otherwise = 
+	 bucket_match ls len ptr
+
+mkNewFastString uid ptr fp len = do
+  ref <- newIORef Nothing
+  n_chars <- countUTF8Chars ptr len
+  return (FastString uid len n_chars fp (UTF8Encoded ref))
+
+mkNewZFastString uid ptr fp len = do
+  return (FastString uid len len fp ZEncoded)
+
+
+copyNewFastString uid ptr len = do
+  fp <- copyBytesToForeignPtr ptr len
+  ref <- newIORef Nothing
+  n_chars <- countUTF8Chars ptr len
+  return (FastString uid len n_chars fp (UTF8Encoded ref))
+
+copyNewZFastString uid ptr len = do
+  fp <- copyBytesToForeignPtr ptr len
+  return (FastString uid len len fp ZEncoded)
+
+
+copyBytesToForeignPtr ptr len = do
+  fp <- mallocForeignPtrBytes len
+  withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
+  return fp
+
+cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
+cmpStringPrefix ptr fp len =
+  withForeignPtr fp $ \ptr' -> do
+    r <- memcmp ptr ptr' len
+    return (r == 0)
+
+
+hashStr  :: Ptr Word8 -> Int -> Int
  -- use the Addr to produce a hash value between 0 & m (inclusive)
-hashStr a# len# = loop 0# 0#
+hashStr (Ptr a#) (I# len#) = loop 0# 0#
    where 
-    loop h n | n ==# len# = h
+    loop h n | n ==# len# = I# h
 	     | otherwise  = loop h2 (n +# 1#)
 	  where c = ord# (indexCharOffAddr# a# n)
 		h2 = (c +# (h *# 128#)) `remInt#` hASH_TBL_SIZE#
 
-hashSubStrBA  :: ByteArray# -> Int# -> Int# -> Int#
- -- use the byte array to produce a hash value between 0 & m (inclusive)
-hashSubStrBA ba# start# len# = loop 0# 0#
-   where 
-    loop h n | n ==# len# = h
-	     | otherwise  = loop h2 (n +# 1#)
-	  where c = ord# (indexCharArray# ba# (start# +# n))
-		h2 = (c +# (h *# 128#)) `remInt#` hASH_TBL_SIZE#
+-- -----------------------------------------------------------------------------
+-- Operations
 
-hashUnicode :: [Int] -> Int# -> Int#
-hashUnicode [] h = h
-hashUnicode (I# c : cs) h = hashUnicode cs ((c +# (h *# 128#)) `remInt#` hASH_TBL_SIZE#)
-\end{code}
+-- | Returns the length of the 'FastString' in characters
+lengthFS :: FastString -> Int
+lengthFS f = n_chars f
 
-\begin{code}
-cmpFS :: FastString -> FastString -> Ordering
-cmpFS (UnicodeStr u1# s1) (UnicodeStr u2# s2) = if u1# ==# u2# then EQ
-    else compare s1 s2
-cmpFS (UnicodeStr _ s1) s2 = compare s1 (unpackIntFS s2)
-cmpFS s1 (UnicodeStr _ s2) = compare (unpackIntFS s1) s2
-cmpFS (FastString u1# l1# b1#) (FastString u2# l2# b2#) =
-  if u1# ==# u2# then EQ else
-  let l# = if l1# <=# l2# then l1# else l2# in
-  unsafePerformIO (
-    memcmp b1# b2# l# >>= \ (I# res) ->
-    return (
-    if      res <#  0# then LT
-    else if res ==# 0# then 
-	if l1# ==# l2# then EQ
-	else if l1# <# l2# then LT else GT
-    else		    GT
-    ))
+-- | Returns 'True' if the 'FastString' is Z-encoded
+isZEncoded :: FastString -> Bool
+isZEncoded fs | ZEncoded <- enc fs = True
+		| otherwise          = False
 
-#ifndef __HADDOCK__
-foreign import ccall unsafe "ghc_memcmp" 
-  memcmp :: ByteArray# -> ByteArray# -> Int# -> IO Int
-#endif
+-- | Returns 'True' if the 'FastString' is empty
+nullFS :: FastString -> Bool
+nullFS f  =  n_bytes f == 0
+
+-- | unpacks and decodes the FastString
+unpackFS :: FastString -> String
+unpackFS (FastString _ n_bytes _ buf enc) = 
+  inlinePerformIO $ withForeignPtr buf $ \ptr ->
+    case enc of
+	ZEncoded      -> peekCAStringLen (castPtr ptr,n_bytes)
+	UTF8Encoded _ -> utf8DecodeString ptr n_bytes
+
+bytesFS :: FastString -> [Word8]
+bytesFS (FastString _ n_bytes _ buf enc) = 
+  inlinePerformIO $ withForeignPtr buf $ \ptr ->
+    peekArray n_bytes ptr
+
+-- | returns a Z-encoded version of a 'FastString'.  This might be the
+-- original, if it was already Z-encoded.  The first time this
+-- function is applied to a particular 'FastString', the results are
+-- memoized.
+--
+zEncodeFS :: FastString -> FastString
+zEncodeFS fs@(FastString uid n_bytes _ fp enc) =
+  case enc of
+    ZEncoded -> fs
+    UTF8Encoded ref ->
+      inlinePerformIO $ do
+        m <- readIORef ref
+        case m of
+	  Just fs -> return fs
+	  Nothing -> do
+            let efs = mkZFastString (zEncodeString (unpackFS fs))
+	    writeIORef ref (Just efs)
+	    return efs
+
+appendFS :: FastString -> FastString -> FastString
+appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2)
+
+concatFS :: [FastString] -> FastString
+concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better
+
+headFS :: FastString -> Char
+headFS (FastString _ n_bytes _ buf enc) = 
+  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
+    case enc of
+      ZEncoded -> do 
+	 w <- peek (castPtr ptr)
+	 return (castCCharToChar w)
+      UTF8Encoded _ -> 
+	 return (fst (utf8DecodeChar ptr))
+
+tailFS :: FastString -> FastString
+tailFS (FastString _ n_bytes _ buf enc) = 
+  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
+    case enc of
+      ZEncoded -> do
+	return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
+      UTF8Encoded _ -> do
+	 let (_,ptr') = utf8DecodeChar ptr
+	 let off = ptr' `minusPtr` ptr
+	 return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)
+
+consFS :: Char -> FastString -> FastString
+consFS c fs = mkFastString (c : unpackFS fs)
+
+uniqueOfFS :: FastString -> Int#
+uniqueOfFS (FastString (I# u#) _ _ _ _) = u#
+
+nilFS = mkFastString ""
 
 -- -----------------------------------------------------------------------------
 -- Outputting 'FastString's
 
-#if __GLASGOW_HASKELL__ >= 504
-
--- this is our own version of hPutBuf for FastStrings, because in
--- 5.04+ we don't have mutable byte arrays and therefore hPutBufBA.
--- The closest is hPutArray in Data.Array.IO, but that does some extra
--- range checks that we want to avoid here.
-
-foreign import ccall unsafe "__hscore_memcpy_dst_off"
-   memcpy_baoff_ba :: RawBuffer -> Int -> RawBuffer -> CSize -> IO (Ptr ())
-
-hPutFS handle (FastString _ l# ba#)
-  | l# ==# 0#  = return ()
-  | otherwise
-   = do wantWritableHandle "hPutFS" handle $ 
-          \ handle_@Handle__{ haFD=fd, haBuffer=ref, haIsStream=stream } -> do
-
-          old_buf@Buffer{ bufBuf=old_raw, bufRPtr=r, bufWPtr=w, bufSize=size }
-	    <- readIORef ref
-
-	  let count = I# l#
-	      raw = unsafeCoerce# ba# :: MutableByteArray# RealWorld
-
-          -- enough room in handle buffer?
-          if (size - w > count)
-		-- There's enough room in the buffer:
-		-- just copy the data in and update bufWPtr.
-	    then do memcpy_baoff_ba old_raw w raw (fromIntegral count)
-		    writeIORef ref old_buf{ bufWPtr = w + count }
-		    return ()
-
-		-- else, we have to flush
-	    else do flushed_buf <- flushWriteBuffer fd stream old_buf
-		    writeIORef ref flushed_buf
-		    let this_buf = 
-			    Buffer{ bufBuf=raw, bufState=WriteBuffer, 
-				    bufRPtr=0, bufWPtr=count, bufSize=count }
-		    flushWriteBuffer fd stream this_buf
-		    return ()
-
-#else
-
-hPutFS :: Handle -> FastString -> IO ()
-hPutFS handle (FastString _ l# ba#)
-  | l# ==# 0#  = return ()
-  | otherwise  = do mba <- stToIO $ unsafeThawByteArray (ByteArray (bot::Int) bot ba#)
-                    hPutBufBAFull  handle mba (I# l#)
- where
-  bot = error "hPutFS.ba"
-
-#endif
+-- |Outputs a 'FastString' with /no decoding at all/, that is, you
+-- get the actual bytes in the 'FastString' written to the 'Handle'.
+hPutFS handle (FastString _ len _ fp _)
+  | len == 0  = return ()
+  | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len
 
--- ONLY here for debugging the NCG (so -ddump-stix works for string
--- literals); no idea if this is really necessary.  JRS, 010131
-hPutFS handle (UnicodeStr _ is) 
-  = hPutStr handle ("(UnicodeStr " ++ show is ++ ")")
+-- ToDo: we'll probably want an hPutFSLocal, or something, to output
+-- in the current locale's encoding (for error messages and suchlike).
 
 -- -----------------------------------------------------------------------------
 -- LitStrings, here for convenience only.
@@ -492,4 +436,24 @@ type LitString = Ptr ()
 
 mkLitString# :: Addr# -> LitString
 mkLitString# a# = Ptr a#
+
+foreign import ccall unsafe "ghc_strlen" 
+  strLength :: Ptr () -> Int
+
+-- -----------------------------------------------------------------------------
+-- under the carpet
+
+-- Just like unsafePerformIO, but we inline it.
+{-# INLINE inlinePerformIO #-}
+inlinePerformIO :: IO a -> a
+inlinePerformIO (IO m) = case m realWorld# of (# _, r #)   -> r
+
+pokeCAString :: Ptr CChar -> String -> IO ()
+pokeCAString ptr str =
+  let
+	go [] n     = pokeElemOff ptr n 0
+    	go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
+  in
+  go str 0
+
 \end{code}
diff --git a/ghc/compiler/utils/FastTypes.lhs b/ghc/compiler/utils/FastTypes.lhs
index 9f9d9038f1..bb92c8c02f 100644
--- a/ghc/compiler/utils/FastTypes.lhs
+++ b/ghc/compiler/utils/FastTypes.lhs
@@ -9,7 +9,7 @@ module FastTypes (
     (+#), (-#), (*#), quotFastInt, negateFastInt,
     (==#), (<#), (<=#), (>=#), (>#),
 
-    FastBool, fastBool, isFastTrue, fastOr
+    FastBool, fastBool, isFastTrue, fastOr, fastAnd
   ) where
 
 #include "HsVersions.h"
diff --git a/ghc/compiler/utils/Pretty.lhs b/ghc/compiler/utils/Pretty.lhs
index 916755e902..ec8f1e75ad 100644
--- a/ghc/compiler/utils/Pretty.lhs
+++ b/ghc/compiler/utils/Pretty.lhs
@@ -179,26 +179,16 @@ module Pretty (
 
 import BufWrite
 import FastString
-import PrimPacked 	( strLength )
 
 import GLAEXTS
 
 import Numeric (fromRat)
 import IO
 
-#if __GLASGOW_HASKELL__ < 503
-import IOExts		( hPutBufFull )
-#else
 import System.IO	( hPutBuf )
-#endif
 
-#if __GLASGOW_HASKELL__ < 503
-import PrelBase		( unpackCString# )
-#else
 import GHC.Base		( unpackCString# )
-#endif
-
-import PrimPacked	( Ptr(..) )
+import GHC.Ptr		( Ptr(..) )
 
 -- Don't import Util( assertPanic ) because it makes a loop in the module structure
 
diff --git a/ghc/compiler/utils/PrimPacked.lhs b/ghc/compiler/utils/PrimPacked.lhs
deleted file mode 100644
index f2d034dcee..0000000000
--- a/ghc/compiler/utils/PrimPacked.lhs
+++ /dev/null
@@ -1,265 +0,0 @@
-%
-% (c) The GRASP/AQUA Project, Glasgow University, 1997-1998
-%
-\section{Basic ops on packed representations}
-
-Some basic operations for working on packed representations of series
-of bytes (character strings). Used by the interface lexer input
-subsystem, mostly.
-
-\begin{code}
-{-# OPTIONS -optc-DNON_POSIX_SOURCE #-}
-
-module PrimPacked (
-	Ptr(..), nullPtr, plusAddr#,
-	BA(..),
-	packString, 	   -- :: String -> (Int, BA)
-	unpackNBytesBA,    -- :: BA -> Int -> [Char]
-        strLength,	   -- :: Ptr CChar -> Int
-        copyPrefixStr,	   -- :: Addr# -> Int -> BA
-        copySubStrBA,	   -- :: BA -> Int -> Int -> BA
-        eqStrPrefix,	   -- :: Addr# -> ByteArray# -> Int# -> Bool
-        eqStrPrefixBA,	   -- :: ByteArray# -> ByteArray# -> Int# -> Int# -> Bool
- ) where
-
--- This #define suppresses the "import FastString" that
--- HsVersions otherwise produces
-#define COMPILING_FAST_STRING
-#include "HsVersions.h"
-
-import GLAEXTS
-import UNSAFE_IO	( unsafePerformIO )
-
-import MONAD_ST
-import Foreign
-
-#if __GLASGOW_HASKELL__ < 503
-import PrelST
-#else
-import GHC.ST
-#endif
-
-#if __GLASGOW_HASKELL__ >= 504
-import GHC.Ptr	( Ptr(..) )
-#elif __GLASGOW_HASKELL__ >= 500
-import Ptr	( Ptr(..) )
-#endif
-
-#if __GLASGOW_HASKELL__ < 504
-import PrelIOBase	( IO(..) )
-#else
-import GHC.IOBase	( IO(..) )
-#endif
-\end{code}
-
-Compatibility: 4.08 didn't have the Ptr type.
-
-\begin{code}
-#if __GLASGOW_HASKELL__ <= 408
-data Ptr a = Ptr Addr# deriving (Eq, Ord)
-
-nullPtr :: Ptr a
-nullPtr = Ptr (int2Addr# 0#)
-#endif
-
-#if __GLASGOW_HASKELL__ <= 500
--- plusAddr# is a primop in GHC > 5.00
-plusAddr# :: Addr# -> Int# -> Addr#
-plusAddr# a# i# = int2Addr# (addr2Int# a# +# i#)
-#endif
-\end{code}
-
-Wrapper types for bytearrays
-
-\begin{code}
-data BA    = BA  ByteArray#
-data MBA s = MBA (MutableByteArray# s)
-\end{code}
-
-\begin{code}
-packString :: String -> (Int, BA)
-packString str = (l, arr)
- where
-  l@(I# length#) = length str
-
-  arr = runST (do
-    ch_array <- new_ps_array length#
-      -- fill in packed string from "str"
-    fill_in ch_array 0# str
-      -- freeze the puppy:
-    freeze_ps_array ch_array length#
-   )
-
-  fill_in :: MBA s -> Int# -> [Char] -> ST s ()
-  fill_in arr_in# idx [] =
-   return ()
-  fill_in arr_in# idx (C# c : cs) =
-   write_ps_array arr_in# idx c	 >>
-   fill_in arr_in# (idx +# 1#) cs
-\end{code}
-
-Unpacking a string
-
-\begin{code}
-unpackNBytesBA :: BA -> Int -> [Char]
-unpackNBytesBA (BA bytes) (I# len)
- = unpack 0#
- where
-    unpack nh
-      | nh >=# len  = []
-      | otherwise   = C# ch : unpack (nh +# 1#)
-      where
-	ch = indexCharArray# bytes nh
-\end{code}
-
-Copying a char string prefix into a byte array.
-
-\begin{code}
-copyPrefixStr :: Addr# -> Int -> BA
-copyPrefixStr a# len@(I# length#) = copy' length#
- where
-   copy' length# = runST (do
-     {- allocate an array that will hold the string
-     -}
-     ch_array <- new_ps_array length#
-     {- Revert back to Haskell-only solution for the moment.
-     	_ccall_ memcpy ch_array (A# a) len        >>=  \ () ->
-     	write_ps_array ch_array length# (chr# 0#) >>
-     -}
-     -- fill in packed string from "addr"
-     fill_in ch_array 0#
-     -- freeze the puppy:
-     freeze_ps_array ch_array length#
-    )
-
-   fill_in :: MBA s -> Int# -> ST s ()
-   fill_in arr_in# idx
-      | idx ==# length#
-      = return ()
-      | otherwise
-      = case (indexCharOffAddr# a# idx) of { ch ->
-	write_ps_array arr_in# idx ch >>
-	fill_in arr_in# (idx +# 1#) }
-\end{code}
-
-Copying out a substring, assume a 0-indexed string:
-(and positive lengths, thank you).
-
-\begin{code}
-#ifdef UNUSED
-copySubStr :: Addr# -> Int -> Int -> BA
-copySubStr a# (I# start#) length =
-  copyPrefixStr (a# `plusAddr#` start#)  length
-#endif
-
-copySubStrBA :: BA -> Int -> Int -> BA
-copySubStrBA (BA barr#) (I# start#) len@(I# length#) = ba
- where
-  ba = runST (do
-     -- allocate an array that will hold the string
-    ch_array <- new_ps_array length#
-     -- fill in packed string from "addr"
-    fill_in ch_array 0#
-     -- freeze the puppy:
-    freeze_ps_array ch_array length#
-   )
-
-  fill_in :: MBA s -> Int# -> ST s ()
-  fill_in arr_in# idx
-      | idx ==# length#
-      = return ()
-      | otherwise
-      = case (indexCharArray# barr# (start# +# idx)) of { ch ->
-	write_ps_array arr_in# idx ch >>
-	fill_in arr_in# (idx +# 1#) }
-\end{code}
-
-(Very :-) ``Specialised'' versions of some CharArray things...
-[Copied from PackBase; no real reason -- UGH]
-
-\begin{code}
-new_ps_array	:: Int# -> ST s (MBA s)
-write_ps_array	:: MBA s -> Int# -> Char# -> ST s () 
-freeze_ps_array :: MBA s -> Int# -> ST s BA
-
-#if __GLASGOW_HASKELL__ < 411
-#define NEW_BYTE_ARRAY newCharArray#
-#else 
-#define NEW_BYTE_ARRAY newPinnedByteArray#
-#endif
-
-new_ps_array size = ST $ \ s ->
-    case (NEW_BYTE_ARRAY size s)  of { (# s2#, barr# #) ->
-    (# s2#, MBA barr# #) }
-
-write_ps_array (MBA barr#) n ch = ST $ \ s# ->
-    case writeCharArray# barr# n ch s#	of { s2#   ->
-    (# s2#, () #) }
-
--- same as unsafeFreezeByteArray
-freeze_ps_array (MBA arr#) len# = ST $ \ s# ->
-    case unsafeFreezeByteArray# arr# s# of { (# s2#, frozen# #) ->
-    (# s2#, BA frozen# #) }
-\end{code}
-
-
-Compare two equal-length strings for equality:
-
-\begin{code}
-eqStrPrefix :: Addr# -> ByteArray# -> Int# -> Bool
-eqStrPrefix a# barr# len# = 
-  inlinePerformIO $ do
-   x <- memcmp_ba a# barr# (I# len#)
-   return (x == 0)
-
-#ifdef UNUSED
-eqCharStrPrefix :: Addr# -> Addr# -> Int# -> Bool
-eqCharStrPrefix a1# a2# len# = 
-  inlinePerformIO $ do
-   x <- memcmp a1# a2# (I# len#)
-   return (x == 0)
-#endif
-
-eqStrPrefixBA :: ByteArray# -> ByteArray# -> Int# -> Int# -> Bool
-eqStrPrefixBA b1# b2# start# len# = 
-  inlinePerformIO $ do
-    x <- memcmp_baoff_ba b2# (I# start#) b1# (I# len#)
-    return (x == 0)
-
-#ifdef UNUSED
-eqCharStrPrefixBA :: Addr# -> ByteArray# -> Int# -> Int# -> Bool
-eqCharStrPrefixBA a# b2# start# len# = 
-  inlinePerformIO $ do
-    x <- memcmp_baoff b2# (I# start#) a# (I# len#) 
-    return (x == 0)
-#endif
-\end{code}
-
-\begin{code}
--- Just like unsafePerformIO, but we inline it.  This is safe when
--- there are no side effects, and improves performance.
-{-# INLINE inlinePerformIO #-}
-inlinePerformIO :: IO a -> a
-inlinePerformIO (IO m) = case m realWorld# of (# _, r #)   -> r
-
-#if __GLASGOW_HASKELL__ <= 408
-strLength (Ptr a#) = ghc_strlen a#
-foreign import ccall unsafe "ghc_strlen" 
-  ghc_strlen :: Addr# -> Int
-#else
-foreign import ccall unsafe "ghc_strlen" 
-  strLength :: Ptr () -> Int
-#endif
-
-foreign import ccall unsafe "ghc_memcmp"
-  memcmp :: Addr# -> Addr# -> Int -> IO Int
-
-foreign import ccall unsafe "ghc_memcmp" 
-  memcmp_ba :: Addr# -> ByteArray# -> Int -> IO Int
-
-foreign import ccall unsafe "ghc_memcmp_off"
-  memcmp_baoff :: ByteArray# -> Int -> Addr# -> Int -> IO Int
-
-foreign import ccall unsafe "ghc_memcmp_off"
-  memcmp_baoff_ba :: ByteArray# -> Int -> ByteArray# -> Int -> IO Int
-\end{code}
diff --git a/ghc/compiler/utils/StringBuffer.lhs b/ghc/compiler/utils/StringBuffer.lhs
index e53dbc89ce..e2eed889f2 100644
--- a/ghc/compiler/utils/StringBuffer.lhs
+++ b/ghc/compiler/utils/StringBuffer.lhs
@@ -6,27 +6,32 @@
 Buffers for scanning string input stored in external arrays.
 
 \begin{code}
+{-# OPTIONS_GHC -O #-}
+-- always optimise this module, it's critical
+
 module StringBuffer
        (
         StringBuffer(..),
 	-- non-abstract for vs\/HaskellService
 
 	 -- * Creation\/destruction
-        hGetStringBuffer,     -- :: FilePath     -> IO StringBuffer
-	stringToStringBuffer, -- :: String       -> IO StringBuffer
+        hGetStringBuffer,
+	stringToStringBuffer,
 
-         -- * Lookup
-	currentChar,       -- :: StringBuffer -> Char
-	prevChar,          -- :: StringBuffer -> Char -> Char
-	lookAhead,         -- :: StringBuffer -> Int  -> Char
-	atEnd,		   -- :: StringBuffer -> Bool
+	-- * Inspection
+	nextChar,
+	currentChar,
+	prevChar,
+	atEnd,
 
-	-- * Moving
-	stepOn, stepOnBy,
+	-- * Moving and comparison
+	stepOn,
+	offsetBytes,
+	byteDiff,
 
         -- * Conversion
-        lexemeToString,     -- :: StringBuffer -> Int -> String
-        lexemeToFastString, -- :: StringBuffer -> Int -> FastString
+        lexemeToString,
+        lexemeToFastString,
 
 	 -- * Parsing integers
 	parseInteger,
@@ -34,22 +39,19 @@ module StringBuffer
 
 #include "HsVersions.h"
 
-import FastString
-import Panic
+import Encoding
+import FastString	(FastString,mkFastString,mkFastStringBytes)
 
 import GLAEXTS
 
 import Foreign
 
-#if __GLASGOW_HASKELL__ < 503
-import PrelIOBase
-import PrelHandle
-#else
-import GHC.IOBase
-import GHC.IO		( slurpFile )
-#endif
+import GHC.IOBase		( IO(..) )
+import GHC.Base			( unsafeChr )
+
+import System.IO		( hGetBuf )
 
-import IO			( openFile, hFileSize, IOMode(ReadMode),
+import IO			( hFileSize, IOMode(ReadMode),
 				  hClose )
 #if __GLASGOW_HASKELL__ >= 601
 import System.IO		( openBinaryFile )
@@ -57,37 +59,35 @@ import System.IO		( openBinaryFile )
 import IOExts                   ( openFileEx, IOModeEx(..) )
 #endif
 
-#if __GLASGOW_HASKELL__ < 503
-import IArray			( listArray )
-import ArrayBase		( UArray(..) )
-import MutableArray
-import IOExts			( hGetBufBA )
-#else
-import Data.Array.IArray	( listArray )
-import Data.Array.MArray 	( unsafeFreeze, newArray_ )
-import Data.Array.Base		( UArray(..)  )
-import Data.Array.IO		( IOArray, hGetArray )
-#endif
-
-import Char			( ord )
-
 #if __GLASGOW_HASKELL__ < 601
 openBinaryFile fp mode = openFileEx fp (BinaryMode mode)
 #endif
+
 -- -----------------------------------------------------------------------------
 -- The StringBuffer type
 
--- A StringBuffer is a ByteArray# with a pointer into it.  We also cache
--- the length of the ByteArray# for speed.
-
+-- |A StringBuffer is an internal pointer to a sized chunk of bytes.
+-- The bytes are intended to be *immutable*.  There are pure
+-- operations to read the contents of a StringBuffer.
+--
+-- A StringBuffer may have a finalizer, depending on how it was
+-- obtained.
+--
 data StringBuffer
- = StringBuffer
-     ByteArray#
-     Int#         -- length
-     Int#         -- current pos
+ = StringBuffer {
+     buf :: {-# UNPACK #-} !(ForeignPtr Word8),
+     len :: {-# UNPACK #-} !Int, 	-- length
+     cur :: {-# UNPACK #-} !Int		-- current pos
+  }
+	-- The buffer is assumed to be UTF-8 encoded, and furthermore
+	-- we add three '\0' bytes to the end as sentinels so that the
+	-- decoder doesn't have to check for overflow at every single byte
+	-- of a multibyte sequence.
 
 instance Show StringBuffer where
-	showsPrec _ s = showString "<stringbuffer>"
+	showsPrec _ s = showString "<stringbuffer(" 
+		      . shows (len s) . showString "," . shows (cur s)
+		      . showString ">"
 
 -- -----------------------------------------------------------------------------
 -- Creation / Destruction
@@ -95,97 +95,108 @@ instance Show StringBuffer where
 hGetStringBuffer :: FilePath -> IO StringBuffer
 hGetStringBuffer fname = do
    h <- openBinaryFile fname ReadMode
-   size <- hFileSize h
-   let size_i@(I# sz#) = fromIntegral size
-#if __GLASGOW_HASKELL__ < 503
-   arr <- stToIO (newCharArray (0,size_i-1))
-   r <- hGetBufBA h arr size_i
-#else
-   arr <- newArray_ (0,size_i-1)
-   r <- if size_i == 0 then return 0 else hGetArray h arr size_i
-#endif
-   hClose h
-   if (r /= size_i)
+   size_i <- hFileSize h
+   let size = fromIntegral size_i
+   buf <- mallocForeignPtrArray (size+3)
+   withForeignPtr buf $ \ptr -> do
+     r <- if size == 0 then return 0 else hGetBuf h ptr size
+     hClose h
+     if (r /= size)
 	then ioError (userError "short read of file")
 	else do
-#if __GLASGOW_HASKELL__ < 503
-   frozen <- stToIO (unsafeFreezeByteArray arr)
-   case frozen of
-      ByteArray _ _ bytearr# -> return (StringBuffer bytearr# sz# 0#)
-#else
-   frozen <- unsafeFreeze arr
-   case frozen of
-      UArray _ _ bytearr# -> return (StringBuffer bytearr# sz# 0#)
-#endif
+	  pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
+		 -- sentinels for UTF-8 decoding
+	  return (StringBuffer buf size 0)
 
-#if __GLASGOW_HASKELL__ >= 502
+stringToStringBuffer :: String -> IO StringBuffer
 stringToStringBuffer str = do
-  let size@(I# sz#) = length str
-      arr = listArray (0,size-1) (map (fromIntegral.ord) str)
-		 :: UArray Int Word8
-  case arr of
-	UArray _ _ bytearr# -> return (StringBuffer bytearr# sz# 0#)
-#else
-stringToStringBuffer = panic "stringToStringBuffer: not implemented"
-#endif
+  let size = utf8EncodedLength str
+  buf <- mallocForeignPtrArray (size+3)
+  withForeignPtr buf $ \ptr -> do
+    utf8EncodeString ptr str
+    pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
+	 -- sentinels for UTF-8 decoding
+  return (StringBuffer buf size 0)
 
 -- -----------------------------------------------------------------------------
--- Lookup
-
-currentChar  :: StringBuffer -> Char
-currentChar (StringBuffer arr# l# current#) =
-  ASSERT(current# <# l#)
-  C# (indexCharArray# arr# current#)
+-- Grab a character
+
+-- Getting our fingers dirty a little here, but this is performance-critical
+{-# INLINE nextChar #-}
+nextChar :: StringBuffer -> (Char,StringBuffer)
+nextChar (StringBuffer buf len (I# cur#)) =
+  inlinePerformIO $ do
+    withForeignPtr buf $ \(Ptr a#) -> do
+	case utf8DecodeChar# (a# `plusAddr#` cur#) of
+	  (# c#, b# #) ->
+	     let cur' = I# (b# `minusAddr#` a#) in
+	     return (C# c#, StringBuffer buf len cur')
+
+currentChar :: StringBuffer -> Char
+currentChar = fst . nextChar
 
 prevChar :: StringBuffer -> Char -> Char
-prevChar (StringBuffer _ _ 0#) deflt = deflt
-prevChar s deflt = lookAhead s (-1)
-
-lookAhead :: StringBuffer -> Int  -> Char
-lookAhead (StringBuffer arr# l# c#) (I# i#) =
-  ASSERT(off <# l#  && off >=# 0#)
-  C# (indexCharArray# arr# off)
- where 
-   off = c# +# i#
+prevChar (StringBuffer buf len 0)   deflt = deflt
+prevChar (StringBuffer buf len cur) deflt = 
+  inlinePerformIO $ do
+    withForeignPtr buf $ \p -> do
+      p' <- utf8PrevChar (p `plusPtr` cur)
+      return (fst (utf8DecodeChar p'))
 
 -- -----------------------------------------------------------------------------
 -- Moving
 
 stepOn :: StringBuffer -> StringBuffer
-stepOn s = stepOnBy 1 s
+stepOn s = snd (nextChar s)
+
+offsetBytes :: Int -> StringBuffer -> StringBuffer
+offsetBytes i s = s { cur = cur s + i }
 
-stepOnBy :: Int -> StringBuffer -> StringBuffer
-stepOnBy (I# i#) (StringBuffer fo# l# c#) = StringBuffer fo# l# (c# +# i#)
+byteDiff :: StringBuffer -> StringBuffer -> Int
+byteDiff s1 s2 = cur s2 - cur s1
 
 atEnd :: StringBuffer -> Bool
-atEnd (StringBuffer _ l# c#) = l# ==# c#
+atEnd (StringBuffer _ l c) = l == c
 
 -- -----------------------------------------------------------------------------
 -- Conversion
 
-lexemeToString :: StringBuffer -> Int -> String
+lexemeToString :: StringBuffer -> Int {-bytes-} -> String
 lexemeToString _ 0 = ""
-lexemeToString (StringBuffer arr# _ current#) (I# len#) = unpack current#
- where
-    end = current# +# len#
+lexemeToString (StringBuffer buf _ cur) bytes =
+  inlinePerformIO $ 
+    withForeignPtr buf $ \ptr -> 
+      utf8DecodeString (ptr `plusPtr` cur) bytes
 
-    unpack nh
-      | nh >=# end  = []
-      | otherwise   = C# ch : unpack (nh +# 1#)
-      where
-	ch = indexCharArray# arr# nh
-
-lexemeToFastString :: StringBuffer -> Int -> FastString
+lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
 lexemeToFastString _ 0 = mkFastString ""
-lexemeToFastString (StringBuffer fo _ current#) (I# len) =
-    mkFastSubStringBA# fo current# len
+lexemeToFastString (StringBuffer buf _ cur) len =
+   inlinePerformIO $
+     withForeignPtr buf $ \ptr ->
+       return $! mkFastStringBytes (ptr `plusPtr` cur) len
 
 -- -----------------------------------------------------------------------------
 -- Parsing integer strings in various bases
 
+byteOff :: StringBuffer -> Int -> Char
+byteOff (StringBuffer buf _ cur) i = 
+  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
+    w <- peek (ptr `plusPtr` (cur+i))
+    return (unsafeChr (fromIntegral (w::Word8)))
+
+-- | XXX assumes ASCII digits only
 parseInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
 parseInteger buf len radix to_int 
   = go 0 0
   where go i x | i == len  = x
-	       | otherwise = go (i+1) (x * radix + toInteger (to_int (lookAhead buf i)))
+	       | otherwise = go (i+1) (x * radix + toInteger (to_int (byteOff buf i)))
+
+-- -----------------------------------------------------------------------------
+-- under the carpet
+
+-- Just like unsafePerformIO, but we inline it.
+{-# INLINE inlinePerformIO #-}
+inlinePerformIO :: IO a -> a
+inlinePerformIO (IO m) = case m realWorld# of (# _, r #)   -> r
+
 \end{code}
diff --git a/ghc/compiler/utils/UnicodeUtil.lhs b/ghc/compiler/utils/UnicodeUtil.lhs
deleted file mode 100644
index 56e95a5434..0000000000
--- a/ghc/compiler/utils/UnicodeUtil.lhs
+++ /dev/null
@@ -1,36 +0,0 @@
-Various Unicode-related utilities.
-
-\begin{code}
-module UnicodeUtil(
-    stringToUtf8, intsToUtf8
-  ) where
-
-#include "HsVersions.h"
-
-import Panic ( panic )
-import Char  ( chr, ord )
-\end{code}
-
-\begin{code}
-stringToUtf8 :: String -> String
-stringToUtf8 s = intsToUtf8 (map ord s)
-
-intsToUtf8 :: [Int] -> String
-intsToUtf8 []       = ""
-intsToUtf8 (c:s)
-    | c >= 1 && c <= 0x7F = chr c : intsToUtf8 s
-    | c < 0           = panic ("charToUtf8 ("++show c++")")
-    | c <= 0x7FF      = chr (0xC0 + c `div`       0x40           ) :
-                        chr (0x80 + c                  `mod` 0x40) :
-                        intsToUtf8 s
-    | c <= 0xFFFF     = chr (0xE0 + c `div`     0x1000           ) :
-                        chr (0x80 + c `div`       0x40 `mod` 0x40) :
-                        chr (0x80 + c                  `mod` 0x40) :
-                        intsToUtf8 s
-    | c <= 0x10FFFF   = chr (0xF0 + c `div`    0x40000           ) :
-                        chr (0x80 + c `div`     0x1000 `mod` 0x40) :
-                        chr (0x80 + c `div`       0x40 `mod` 0x40) :
-                        chr (0x80 + c                  `mod` 0x40) :
-                        intsToUtf8 s
-    | otherwise       = panic ("charToUtf8 "++show c)
-\end{code}
author	simonmar <unknown>	2006-01-06 16:30:19 +0000
committer	simonmar <unknown>	2006-01-06 16:30:19 +0000
commit	9d7da331989abcd1844e9d03b8d1e4163796fa85 (patch)
tree	8efa2e6fdcf8bfee777ae6477a686d0594c5ff76 /ghc/compiler/utils
parent	2a2efb720c0fdc06fe749f96f284b00b30f8f3f7 (diff)
download	haskell-9d7da331989abcd1844e9d03b8d1e4163796fa85.tar.gz