diff options
Diffstat (limited to 'compiler/utils/FastString.lhs')
-rw-r--r-- | compiler/utils/FastString.lhs | 499 |
1 files changed, 499 insertions, 0 deletions
diff --git a/compiler/utils/FastString.lhs b/compiler/utils/FastString.lhs new file mode 100644 index 0000000000..ea307799c4 --- /dev/null +++ b/compiler/utils/FastString.lhs @@ -0,0 +1,499 @@ +% +% (c) The University of Glasgow, 1997-2006 +% +\begin{code} +{- +FastString: A compact, hash-consed, representation of character strings. + Comparison is O(1), and you can get a Unique from them. + Generated by the FSLIT macro + Turn into SDoc with Outputable.ftext + +LitString: Just a wrapper for the Addr# of a C string (Ptr CChar). + Practically no operations + Outputing them is fast + Generated by the SLIT macro + Turn into SDoc with Outputable.ptext + +Use LitString unless you want the facilities of FastString +-} +module FastString + ( + -- * FastStrings + FastString(..), -- not abstract, for now. + + -- ** Construction + mkFastString, + mkFastStringBytes, + mkFastStringForeignPtr, + mkFastString#, + mkZFastString, + mkZFastStringBytes, + + -- ** Deconstruction + unpackFS, -- :: FastString -> String + bytesFS, -- :: FastString -> [Word8] + + -- ** Encoding + isZEncoded, + zEncodeFS, + + -- ** Operations + uniqueOfFS, + lengthFS, + nullFS, + appendFS, + headFS, + tailFS, + concatFS, + consFS, + nilFS, + + -- ** Outputing + hPutFS, + + -- ** Internal + getFastStringTable, + hasZEncoding, + + -- * LitStrings + LitString, + mkLitString#, + strLength + ) where + +-- This #define suppresses the "import FastString" that +-- HsVersions otherwise produces +#define COMPILING_FAST_STRING +#include "HsVersions.h" + +import Encoding + +import Foreign +import Foreign.C +import GHC.Exts +import System.IO.Unsafe ( unsafePerformIO ) +import Control.Monad.ST ( stToIO ) +import Data.IORef ( IORef, newIORef, readIORef, writeIORef ) +import System.IO ( hPutBuf ) +import Data.Maybe ( isJust ) + +import GHC.Arr ( STArray(..), newSTArray ) +import GHC.IOBase ( IO(..) ) +import GHC.Ptr ( Ptr(..) ) + +#define hASH_TBL_SIZE 4091 + + +{-| +A 'FastString' is an array of bytes, hashed to support fast O(1) +comparison. It is also associated with a character encoding, so that +we know how to convert a 'FastString' to the local encoding, or to the +Z-encoding used by the compiler internally. + +'FastString's support a memoized conversion to the Z-encoding via zEncodeFS. +-} + +data FastString = FastString { + uniq :: {-# UNPACK #-} !Int, -- unique id + n_bytes :: {-# UNPACK #-} !Int, -- number of bytes + n_chars :: {-# UNPACK #-} !Int, -- number of chars + buf :: {-# UNPACK #-} !(ForeignPtr Word8), + enc :: FSEncoding + } + +data FSEncoding + = ZEncoded + -- including strings that don't need any encoding + | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString)) + -- A UTF-8 string with a memoized Z-encoding + +instance Eq FastString where + f1 == f2 = uniq f1 == uniq f2 + +instance Ord FastString where + -- Compares lexicographically, not by unique + a <= b = case cmpFS a b of { LT -> True; EQ -> True; GT -> False } + a < b = case cmpFS a b of { LT -> True; EQ -> False; GT -> False } + a >= b = case cmpFS a b of { LT -> False; EQ -> True; GT -> True } + a > b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True } + max x y | x >= y = x + | otherwise = y + min x y | x <= y = x + | otherwise = y + compare a b = cmpFS a b + +instance Show FastString where + show fs = show (unpackFS fs) + +cmpFS :: FastString -> FastString -> Ordering +cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) = + if u1 == u2 then EQ else + let l = if l1 <= l2 then l1 else l2 in + inlinePerformIO $ + withForeignPtr buf1 $ \p1 -> + withForeignPtr buf2 $ \p2 -> do + res <- memcmp p1 p2 l + case () of + _ | res < 0 -> return LT + | res == 0 -> if l1 == l2 then return EQ + else if l1 < l2 then return LT + else return GT + | otherwise -> return GT + +#ifndef __HADDOCK__ +foreign import ccall unsafe "ghc_memcmp" + memcmp :: Ptr a -> Ptr b -> Int -> IO Int +#endif + +-- ----------------------------------------------------------------------------- +-- Construction + +{- +Internally, the compiler will maintain a fast string symbol +table, providing sharing and fast comparison. Creation of +new @FastString@s then covertly does a lookup, re-using the +@FastString@ if there was a hit. +-} + +data FastStringTable = + FastStringTable + {-# UNPACK #-} !Int + (MutableArray# RealWorld [FastString]) + +string_table :: IORef FastStringTable +string_table = + unsafePerformIO $ do + (STArray _ _ arr#) <- stToIO (newSTArray (0::Int,hASH_TBL_SIZE) []) + newIORef (FastStringTable 0 arr#) + +lookupTbl :: FastStringTable -> Int -> IO [FastString] +lookupTbl (FastStringTable _ arr#) (I# i#) = + IO $ \ s# -> readArray# arr# i# s# + +updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO () +updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do + (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) }) + writeIORef fs_table_var (FastStringTable (uid+1) arr#) + +mkFastString# :: Addr# -> FastString +mkFastString# a# = mkFastStringBytes ptr (strLength ptr) + where ptr = Ptr a# + +mkFastStringBytes :: Ptr Word8 -> Int -> FastString +mkFastStringBytes ptr len = unsafePerformIO $ do + ft@(FastStringTable uid tbl#) <- readIORef string_table + let + h = hashStr ptr len + add_it ls = do + fs <- copyNewFastString uid ptr len + updTbl string_table ft h (fs:ls) + {- _trace ("new: " ++ show f_str) $ -} + return fs + -- + lookup_result <- lookupTbl ft h + case lookup_result of + [] -> add_it [] + ls -> do + b <- bucket_match ls len ptr + case b of + Nothing -> add_it ls + Just v -> {- _trace ("re-use: "++show v) $ -} return v + +mkZFastStringBytes :: Ptr Word8 -> Int -> FastString +mkZFastStringBytes ptr len = unsafePerformIO $ do + ft@(FastStringTable uid tbl#) <- readIORef string_table + let + h = hashStr ptr len + add_it ls = do + fs <- copyNewZFastString uid ptr len + updTbl string_table ft h (fs:ls) + {- _trace ("new: " ++ show f_str) $ -} + return fs + -- + lookup_result <- lookupTbl ft h + case lookup_result of + [] -> add_it [] + ls -> do + b <- bucket_match ls len ptr + case b of + Nothing -> add_it ls + Just v -> {- _trace ("re-use: "++show v) $ -} return v + +-- | Create a 'FastString' from an existing 'ForeignPtr'; the difference +-- between this and 'mkFastStringBytes' is that we don't have to copy +-- the bytes if the string is new to the table. +mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString +mkFastStringForeignPtr ptr fp len = do + ft@(FastStringTable uid tbl#) <- readIORef string_table +-- _trace ("hashed: "++show (I# h)) $ + let + h = hashStr ptr len + add_it ls = do + fs <- mkNewFastString uid ptr fp len + updTbl string_table ft h (fs:ls) + {- _trace ("new: " ++ show f_str) $ -} + return fs + -- + lookup_result <- lookupTbl ft h + case lookup_result of + [] -> add_it [] + ls -> do + b <- bucket_match ls len ptr + case b of + Nothing -> add_it ls + Just v -> {- _trace ("re-use: "++show v) $ -} return v + +mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString +mkZFastStringForeignPtr ptr fp len = do + ft@(FastStringTable uid tbl#) <- readIORef string_table +-- _trace ("hashed: "++show (I# h)) $ + let + h = hashStr ptr len + add_it ls = do + fs <- mkNewZFastString uid ptr fp len + updTbl string_table ft h (fs:ls) + {- _trace ("new: " ++ show f_str) $ -} + return fs + -- + lookup_result <- lookupTbl ft h + case lookup_result of + [] -> add_it [] + ls -> do + b <- bucket_match ls len ptr + case b of + Nothing -> add_it ls + Just v -> {- _trace ("re-use: "++show v) $ -} return v + + +-- | Creates a UTF-8 encoded 'FastString' from a 'String' +mkFastString :: String -> FastString +mkFastString str = + inlinePerformIO $ do + let l = utf8EncodedLength str + buf <- mallocForeignPtrBytes l + withForeignPtr buf $ \ptr -> do + utf8EncodeString ptr str + mkFastStringForeignPtr ptr buf l + + +-- | Creates a Z-encoded 'FastString' from a 'String' +mkZFastString :: String -> FastString +mkZFastString str = + inlinePerformIO $ do + let l = Prelude.length str + buf <- mallocForeignPtrBytes l + withForeignPtr buf $ \ptr -> do + pokeCAString (castPtr ptr) str + mkZFastStringForeignPtr ptr buf l + +bucket_match [] _ _ = return Nothing +bucket_match (v@(FastString _ l _ buf _):ls) len ptr + | len == l = do + b <- cmpStringPrefix ptr buf len + if b then return (Just v) + else bucket_match ls len ptr + | otherwise = + bucket_match ls len ptr + +mkNewFastString uid ptr fp len = do + ref <- newIORef Nothing + n_chars <- countUTF8Chars ptr len + return (FastString uid len n_chars fp (UTF8Encoded ref)) + +mkNewZFastString uid ptr fp len = do + return (FastString uid len len fp ZEncoded) + + +copyNewFastString uid ptr len = do + fp <- copyBytesToForeignPtr ptr len + ref <- newIORef Nothing + n_chars <- countUTF8Chars ptr len + return (FastString uid len n_chars fp (UTF8Encoded ref)) + +copyNewZFastString uid ptr len = do + fp <- copyBytesToForeignPtr ptr len + return (FastString uid len len fp ZEncoded) + + +copyBytesToForeignPtr ptr len = do + fp <- mallocForeignPtrBytes len + withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len + return fp + +cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool +cmpStringPrefix ptr fp len = + withForeignPtr fp $ \ptr' -> do + r <- memcmp ptr ptr' len + return (r == 0) + + +hashStr :: Ptr Word8 -> Int -> Int + -- use the Addr to produce a hash value between 0 & m (inclusive) +hashStr (Ptr a#) (I# len#) = loop 0# 0# + where + loop h n | n ==# len# = I# h + | otherwise = loop h2 (n +# 1#) + where c = ord# (indexCharOffAddr# a# n) + h2 = (c +# (h *# 128#)) `remInt#` hASH_TBL_SIZE# + +-- ----------------------------------------------------------------------------- +-- Operations + +-- | Returns the length of the 'FastString' in characters +lengthFS :: FastString -> Int +lengthFS f = n_chars f + +-- | Returns 'True' if the 'FastString' is Z-encoded +isZEncoded :: FastString -> Bool +isZEncoded fs | ZEncoded <- enc fs = True + | otherwise = False + +-- | Returns 'True' if this 'FastString' is not Z-encoded but already has +-- a Z-encoding cached (used in producing stats). +hasZEncoding :: FastString -> Bool +hasZEncoding fs@(FastString uid n_bytes _ fp enc) = + case enc of + ZEncoded -> False + UTF8Encoded ref -> + inlinePerformIO $ do + m <- readIORef ref + return (isJust m) + +-- | Returns 'True' if the 'FastString' is empty +nullFS :: FastString -> Bool +nullFS f = n_bytes f == 0 + +-- | unpacks and decodes the FastString +unpackFS :: FastString -> String +unpackFS (FastString _ n_bytes _ buf enc) = + inlinePerformIO $ withForeignPtr buf $ \ptr -> + case enc of + ZEncoded -> peekCAStringLen (castPtr ptr,n_bytes) + UTF8Encoded _ -> utf8DecodeString ptr n_bytes + +bytesFS :: FastString -> [Word8] +bytesFS (FastString _ n_bytes _ buf enc) = + inlinePerformIO $ withForeignPtr buf $ \ptr -> + peekArray n_bytes ptr + +-- | returns a Z-encoded version of a 'FastString'. This might be the +-- original, if it was already Z-encoded. The first time this +-- function is applied to a particular 'FastString', the results are +-- memoized. +-- +zEncodeFS :: FastString -> FastString +zEncodeFS fs@(FastString uid n_bytes _ fp enc) = + case enc of + ZEncoded -> fs + UTF8Encoded ref -> + inlinePerformIO $ do + m <- readIORef ref + case m of + Just fs -> return fs + Nothing -> do + let efs = mkZFastString (zEncodeString (unpackFS fs)) + writeIORef ref (Just efs) + return efs + +appendFS :: FastString -> FastString -> FastString +appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2) + +concatFS :: [FastString] -> FastString +concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better + +headFS :: FastString -> Char +headFS (FastString _ n_bytes _ buf enc) = + inlinePerformIO $ withForeignPtr buf $ \ptr -> do + case enc of + ZEncoded -> do + w <- peek (castPtr ptr) + return (castCCharToChar w) + UTF8Encoded _ -> + return (fst (utf8DecodeChar ptr)) + +tailFS :: FastString -> FastString +tailFS (FastString _ n_bytes _ buf enc) = + inlinePerformIO $ withForeignPtr buf $ \ptr -> do + case enc of + ZEncoded -> do + return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1) + UTF8Encoded _ -> do + let (_,ptr') = utf8DecodeChar ptr + let off = ptr' `minusPtr` ptr + return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off) + +consFS :: Char -> FastString -> FastString +consFS c fs = mkFastString (c : unpackFS fs) + +uniqueOfFS :: FastString -> Int# +uniqueOfFS (FastString (I# u#) _ _ _ _) = u# + +nilFS = mkFastString "" + +-- ----------------------------------------------------------------------------- +-- Stats + +getFastStringTable :: IO [[FastString]] +getFastStringTable = do + tbl <- readIORef string_table + buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE] + return buckets + +-- ----------------------------------------------------------------------------- +-- Outputting 'FastString's + +-- |Outputs a 'FastString' with /no decoding at all/, that is, you +-- get the actual bytes in the 'FastString' written to the 'Handle'. +hPutFS handle (FastString _ len _ fp _) + | len == 0 = return () + | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len + +-- ToDo: we'll probably want an hPutFSLocal, or something, to output +-- in the current locale's encoding (for error messages and suchlike). + +-- ----------------------------------------------------------------------------- +-- LitStrings, here for convenience only. + +type LitString = Ptr () + +mkLitString# :: Addr# -> LitString +mkLitString# a# = Ptr a# + +foreign import ccall unsafe "ghc_strlen" + strLength :: Ptr () -> Int + +-- ----------------------------------------------------------------------------- +-- under the carpet + +-- Just like unsafePerformIO, but we inline it. +{-# INLINE inlinePerformIO #-} +inlinePerformIO :: IO a -> a +inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r + +-- NB. does *not* add a '\0'-terminator. +pokeCAString :: Ptr CChar -> String -> IO () +pokeCAString ptr str = + let + go [] n = return () + go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1) + in + go str 0 + +#if __GLASGOW_HASKELL__ < 600 + +mallocForeignPtrBytes :: Int -> IO (ForeignPtr a) +mallocForeignPtrBytes n = do + r <- mallocBytes n + newForeignPtr r (finalizerFree r) + +foreign import ccall unsafe "stdlib.h free" + finalizerFree :: Ptr a -> IO () + +peekCAStringLen = peekCStringLen + +#elif __GLASGOW_HASKELL__ <= 602 + +peekCAStringLen = peekCStringLen + +#endif +\end{code} |