% % (c) The University of Glasgow, 1997-2006 % \begin{code} {-# LANGUAGE BangPatterns #-} {-# OPTIONS -fno-warn-unused-imports #-} -- XXX GHC 6.9 seems to be confused by unpackCString# being used only in -- a RULE {-# OPTIONS_GHC -O -funbox-strict-fields #-} -- We always optimise this, otherwise performance of a non-optimised -- compiler is severely affected -- | -- There are two principal string types used internally by GHC: -- -- 'FastString': -- * A compact, hash-consed, representation of character strings. -- * Comparison is O(1), and you can get a 'Unique.Unique' from them. -- * Generated by 'fsLit'. -- * Turn into 'Outputable.SDoc' with 'Outputable.ftext'. -- -- 'LitString': -- * Just a wrapper for the @Addr#@ of a C string (@Ptr CChar@). -- * Practically no operations. -- * Outputing them is fast. -- * Generated by 'sLit'. -- * Turn into 'Outputable.SDoc' with 'Outputable.ptext' -- -- Use 'LitString' unless you want the facilities of 'FastString'. module FastString ( -- * FastStrings FastString(..), -- not abstract, for now. -- ** Construction fsLit, mkFastString, mkFastStringBytes, mkFastStringByteList, mkFastStringForeignPtr, #if defined(__GLASGOW_HASKELL__) mkFastString#, #endif mkZFastString, mkZFastStringBytes, -- ** Deconstruction unpackFS, -- :: FastString -> String bytesFS, -- :: FastString -> [Word8] -- ** Encoding isZEncoded, zEncodeFS, -- ** Operations uniqueOfFS, lengthFS, nullFS, appendFS, headFS, tailFS, concatFS, consFS, nilFS, -- ** Outputing hPutFS, -- ** Internal getFastStringTable, hasZEncoding, -- * LitStrings LitString, -- ** Construction sLit, #if defined(__GLASGOW_HASKELL__) mkLitString#, #endif mkLitString, -- ** Deconstruction unpackLitString, -- ** Operations lengthLS ) where #include "HsVersions.h" import Encoding import FastTypes import FastFunctions import Panic import Util import Foreign.C import GHC.Exts import System.IO import System.IO.Unsafe ( unsafePerformIO ) import Data.Data import Data.IORef ( IORef, newIORef, readIORef, writeIORef ) import Data.Maybe ( isJust ) import Data.Char ( ord ) import GHC.IO ( IO(..) ) import GHC.Ptr ( Ptr(..) ) #if __GLASGOW_HASKELL__ >= 701 import Foreign.Safe #else import Foreign hiding ( unsafePerformIO ) #endif #if defined(__GLASGOW_HASKELL__) import GHC.Base ( unpackCString# ) #endif #define hASH_TBL_SIZE 4091 #define hASH_TBL_SIZE_UNBOXED 4091# {-| A 'FastString' is an array of bytes, hashed to support fast O(1) comparison. It is also associated with a character encoding, so that we know how to convert a 'FastString' to the local encoding, or to the Z-encoding used by the compiler internally. 'FastString's support a memoized conversion to the Z-encoding via zEncodeFS. -} data FastString = FastString { uniq :: {-# UNPACK #-} !Int, -- unique id n_bytes :: {-# UNPACK #-} !Int, -- number of bytes n_chars :: {-# UNPACK #-} !Int, -- number of chars buf :: {-# UNPACK #-} !(ForeignPtr Word8), enc :: FSEncoding } deriving Typeable data FSEncoding -- including strings that don't need any encoding = ZEncoded -- A UTF-8 string with a memoized Z-encoding | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString)) instance Eq FastString where f1 == f2 = uniq f1 == uniq f2 instance Ord FastString where -- Compares lexicographically, not by unique a <= b = case cmpFS a b of { LT -> True; EQ -> True; GT -> False } a < b = case cmpFS a b of { LT -> True; EQ -> False; GT -> False } a >= b = case cmpFS a b of { LT -> False; EQ -> True; GT -> True } a > b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True } max x y | x >= y = x | otherwise = y min x y | x <= y = x | otherwise = y compare a b = cmpFS a b instance Show FastString where show fs = show (unpackFS fs) instance Data FastString where -- don't traverse? toConstr _ = abstractConstr "FastString" gunfold _ _ = error "gunfold" dataTypeOf _ = mkNoRepType "FastString" cmpFS :: FastString -> FastString -> Ordering cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) = if u1 == u2 then EQ else case unsafeMemcmp buf1 buf2 (min l1 l2) `compare` 0 of LT -> LT EQ -> compare l1 l2 GT -> GT unsafeMemcmp :: ForeignPtr a -> ForeignPtr b -> Int -> Int unsafeMemcmp buf1 buf2 l = inlinePerformIO $ withForeignPtr buf1 $ \p1 -> withForeignPtr buf2 $ \p2 -> memcmp p1 p2 l #ifndef __HADDOCK__ foreign import ccall unsafe "ghc_memcmp" memcmp :: Ptr a -> Ptr b -> Int -> IO Int #endif -- ----------------------------------------------------------------------------- -- Construction {- Internally, the compiler will maintain a fast string symbol table, providing sharing and fast comparison. Creation of new @FastString@s then covertly does a lookup, re-using the @FastString@ if there was a hit. -} data FastStringTable = FastStringTable {-# UNPACK #-} !Int (MutableArray# RealWorld [FastString]) {-# NOINLINE string_table #-} string_table :: IORef FastStringTable string_table = unsafePerformIO $ do tab <- IO $ \s1# -> case newArray# hASH_TBL_SIZE_UNBOXED [] s1# of (# s2#, arr# #) -> (# s2#, FastStringTable 0 arr# #) newIORef tab lookupTbl :: FastStringTable -> Int -> IO [FastString] lookupTbl (FastStringTable _ arr#) (I# i#) = IO $ \ s# -> readArray# arr# i# s# updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO () updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) }) writeIORef fs_table_var (FastStringTable (uid+1) arr#) mkFastString# :: Addr# -> FastString mkFastString# a# = mkFastStringBytes ptr (ptrStrLength ptr) where ptr = Ptr a# mkFastStringBytes :: Ptr Word8 -> Int -> FastString mkFastStringBytes ptr len = unsafePerformIO $ do ft@(FastStringTable uid _) <- readIORef string_table let h = hashStr ptr len add_it ls = do fs <- copyNewFastString uid ptr len updTbl string_table ft h (fs:ls) {- _trace ("new: " ++ show f_str) $ -} return fs -- lookup_result <- lookupTbl ft h case lookup_result of [] -> add_it [] ls -> do b <- bucket_match ls len ptr case b of Nothing -> add_it ls Just v -> {- _trace ("re-use: "++show v) $ -} return v mkZFastStringBytes :: Ptr Word8 -> Int -> FastString mkZFastStringBytes ptr len = unsafePerformIO $ do ft@(FastStringTable uid _) <- readIORef string_table let h = hashStr ptr len add_it ls = do fs <- copyNewZFastString uid ptr len updTbl string_table ft h (fs:ls) {- _trace ("new: " ++ show f_str) $ -} return fs -- lookup_result <- lookupTbl ft h case lookup_result of [] -> add_it [] ls -> do b <- bucket_match ls len ptr case b of Nothing -> add_it ls Just v -> {- _trace ("re-use: "++show v) $ -} return v -- | Create a 'FastString' from an existing 'ForeignPtr'; the difference -- between this and 'mkFastStringBytes' is that we don't have to copy -- the bytes if the string is new to the table. mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString mkFastStringForeignPtr ptr fp len = do ft@(FastStringTable uid _) <- readIORef string_table -- _trace ("hashed: "++show (I# h)) $ let h = hashStr ptr len add_it ls = do fs <- mkNewFastString uid ptr fp len updTbl string_table ft h (fs:ls) {- _trace ("new: " ++ show f_str) $ -} return fs -- lookup_result <- lookupTbl ft h case lookup_result of [] -> add_it [] ls -> do b <- bucket_match ls len ptr case b of Nothing -> add_it ls Just v -> {- _trace ("re-use: "++show v) $ -} return v mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString mkZFastStringForeignPtr ptr fp len = do ft@(FastStringTable uid _) <- readIORef string_table -- _trace ("hashed: "++show (I# h)) $ let h = hashStr ptr len add_it ls = do fs <- mkNewZFastString uid ptr fp len updTbl string_table ft h (fs:ls) {- _trace ("new: " ++ show f_str) $ -} return fs -- lookup_result <- lookupTbl ft h case lookup_result of [] -> add_it [] ls -> do b <- bucket_match ls len ptr case b of Nothing -> add_it ls Just v -> {- _trace ("re-use: "++show v) $ -} return v -- | Creates a UTF-8 encoded 'FastString' from a 'String' mkFastString :: String -> FastString mkFastString str = inlinePerformIO $ do let l = utf8EncodedLength str buf <- mallocForeignPtrBytes l withForeignPtr buf $ \ptr -> do utf8EncodeString ptr str mkFastStringForeignPtr ptr buf l -- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@ mkFastStringByteList :: [Word8] -> FastString mkFastStringByteList str = inlinePerformIO $ do let l = Prelude.length str buf <- mallocForeignPtrBytes l withForeignPtr buf $ \ptr -> do pokeArray (castPtr ptr) str mkFastStringForeignPtr ptr buf l -- | Creates a Z-encoded 'FastString' from a 'String' mkZFastString :: String -> FastString mkZFastString str = inlinePerformIO $ do let l = Prelude.length str buf <- mallocForeignPtrBytes l withForeignPtr buf $ \ptr -> do pokeCAString (castPtr ptr) str mkZFastStringForeignPtr ptr buf l bucket_match :: [FastString] -> Int -> Ptr Word8 -> IO (Maybe FastString) bucket_match [] _ _ = return Nothing bucket_match (v@(FastString _ l _ buf _):ls) len ptr | len == l = do b <- cmpStringPrefix ptr buf len if b then return (Just v) else bucket_match ls len ptr | otherwise = bucket_match ls len ptr mkNewFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString mkNewFastString uid ptr fp len = do ref <- newIORef Nothing n_chars <- countUTF8Chars ptr len return (FastString uid len n_chars fp (UTF8Encoded ref)) mkNewZFastString :: Int -> Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString mkNewZFastString uid _ fp len = do return (FastString uid len len fp ZEncoded) copyNewFastString :: Int -> Ptr Word8 -> Int -> IO FastString copyNewFastString uid ptr len = do fp <- copyBytesToForeignPtr ptr len ref <- newIORef Nothing n_chars <- countUTF8Chars ptr len return (FastString uid len n_chars fp (UTF8Encoded ref)) copyNewZFastString :: Int -> Ptr Word8 -> Int -> IO FastString copyNewZFastString uid ptr len = do fp <- copyBytesToForeignPtr ptr len return (FastString uid len len fp ZEncoded) copyBytesToForeignPtr :: Ptr Word8 -> Int -> IO (ForeignPtr Word8) copyBytesToForeignPtr ptr len = do fp <- mallocForeignPtrBytes len withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len return fp cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool cmpStringPrefix ptr fp len = withForeignPtr fp $ \ptr' -> do r <- memcmp ptr ptr' len return (r == 0) hashStr :: Ptr Word8 -> Int -> Int -- use the Addr to produce a hash value between 0 & m (inclusive) hashStr (Ptr a#) (I# len#) = loop 0# 0# where loop h n | n GHC.Exts.==# len# = I# h | otherwise = loop h2 (n GHC.Exts.+# 1#) where !c = ord# (indexCharOffAddr# a# n) !h2 = (c GHC.Exts.+# (h GHC.Exts.*# 128#)) `remInt#` hASH_TBL_SIZE# -- ----------------------------------------------------------------------------- -- Operations -- | Returns the length of the 'FastString' in characters lengthFS :: FastString -> Int lengthFS f = n_chars f -- | Returns @True@ if the 'FastString' is Z-encoded isZEncoded :: FastString -> Bool isZEncoded fs | ZEncoded <- enc fs = True | otherwise = False -- | Returns @True@ if this 'FastString' is not Z-encoded but already has -- a Z-encoding cached (used in producing stats). hasZEncoding :: FastString -> Bool hasZEncoding (FastString _ _ _ _ enc) = case enc of ZEncoded -> False UTF8Encoded ref -> inlinePerformIO $ do m <- readIORef ref return (isJust m) -- | Returns @True@ if the 'FastString' is empty nullFS :: FastString -> Bool nullFS f = n_bytes f == 0 -- | Unpacks and decodes the FastString unpackFS :: FastString -> String unpackFS (FastString _ n_bytes _ buf enc) = inlinePerformIO $ withForeignPtr buf $ \ptr -> case enc of ZEncoded -> peekCAStringLen (castPtr ptr,n_bytes) UTF8Encoded _ -> utf8DecodeString ptr n_bytes bytesFS :: FastString -> [Word8] bytesFS (FastString _ n_bytes _ buf _) = inlinePerformIO $ withForeignPtr buf $ \ptr -> peekArray n_bytes ptr -- | Returns a Z-encoded version of a 'FastString'. This might be the -- original, if it was already Z-encoded. The first time this -- function is applied to a particular 'FastString', the results are -- memoized. -- zEncodeFS :: FastString -> FastString zEncodeFS fs@(FastString _ _ _ _ enc) = case enc of ZEncoded -> fs UTF8Encoded ref -> inlinePerformIO $ do m <- readIORef ref case m of Just fs -> return fs Nothing -> do let efs = mkZFastString (zEncodeString (unpackFS fs)) writeIORef ref (Just efs) return efs appendFS :: FastString -> FastString -> FastString appendFS fs1 fs2 = inlinePerformIO $ do r <- mallocForeignPtrBytes len withForeignPtr r $ \ r' -> do withForeignPtr (buf fs1) $ \ fs1Ptr -> do withForeignPtr (buf fs2) $ \ fs2Ptr -> do copyBytes r' fs1Ptr len1 copyBytes (advancePtr r' len1) fs2Ptr len2 mkFastStringForeignPtr r' r len where len = len1 + len2 len1 = lengthFS fs1 len2 = lengthFS fs2 concatFS :: [FastString] -> FastString concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better headFS :: FastString -> Char headFS (FastString _ 0 _ _ _) = panic "headFS: Empty FastString" headFS (FastString _ _ _ buf enc) = inlinePerformIO $ withForeignPtr buf $ \ptr -> do case enc of ZEncoded -> do w <- peek (castPtr ptr) return (castCCharToChar w) UTF8Encoded _ -> return (fst (utf8DecodeChar ptr)) tailFS :: FastString -> FastString tailFS (FastString _ 0 _ _ _) = panic "tailFS: Empty FastString" tailFS (FastString _ n_bytes _ buf enc) = inlinePerformIO $ withForeignPtr buf $ \ptr -> do case enc of ZEncoded -> do return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1) UTF8Encoded _ -> do let (_,ptr') = utf8DecodeChar ptr let off = ptr' `minusPtr` ptr return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off) consFS :: Char -> FastString -> FastString consFS c fs = mkFastString (c : unpackFS fs) uniqueOfFS :: FastString -> FastInt uniqueOfFS (FastString u _ _ _ _) = iUnbox u nilFS :: FastString nilFS = mkFastString "" -- ----------------------------------------------------------------------------- -- Stats getFastStringTable :: IO [[FastString]] getFastStringTable = do tbl <- readIORef string_table buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE] return buckets -- ----------------------------------------------------------------------------- -- Outputting 'FastString's -- |Outputs a 'FastString' with /no decoding at all/, that is, you -- get the actual bytes in the 'FastString' written to the 'Handle'. hPutFS :: Handle -> FastString -> IO () hPutFS handle (FastString _ len _ fp _) | len == 0 = return () | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len -- ToDo: we'll probably want an hPutFSLocal, or something, to output -- in the current locale's encoding (for error messages and suchlike). -- ----------------------------------------------------------------------------- -- LitStrings, here for convenience only. -- hmm, not unboxed (or rather FastPtr), interesting --a.k.a. Ptr CChar, Ptr Word8, Ptr (), hmph. We don't --really care about C types in naming, where we can help it. type LitString = Ptr Word8 --Why do we recalculate length every time it's requested? --If it's commonly needed, we should perhaps have --data LitString = LitString {-#UNPACK#-}!(FastPtr Word8) {-#UNPACK#-}!FastInt #if defined(__GLASGOW_HASKELL__) mkLitString# :: Addr# -> LitString mkLitString# a# = Ptr a# #endif --can/should we use FastTypes here? --Is this likely to be memory-preserving if only used on constant strings? --should we inline it? If lucky, that would make a CAF that wouldn't --be computationally repeated... although admittedly we're not --really intending to use mkLitString when __GLASGOW_HASKELL__... --(I wonder, is unicode / multi-byte characters allowed in LitStrings -- at all?) {-# INLINE mkLitString #-} mkLitString :: String -> LitString mkLitString s = unsafePerformIO (do p <- mallocBytes (length s + 1) let loop :: Int -> String -> IO () loop n cs | n `seq` null cs = pokeByteOff p n (0 :: Word8) loop n (c:cs) = do pokeByteOff p n (fromIntegral (ord c) :: Word8) loop (1+n) cs -- XXX GHC isn't smart enough to know that we have already covered -- this case. loop _ [] = panic "mkLitString" loop 0 s return p ) unpackLitString :: LitString -> String unpackLitString p_ = case pUnbox p_ of p -> unpack (_ILIT(0)) where unpack n = case indexWord8OffFastPtrAsFastChar p n of ch -> if ch `eqFastChar` _CLIT('\0') then [] else cBox ch : unpack (n +# _ILIT(1)) lengthLS :: LitString -> Int lengthLS = ptrStrLength -- for now, use a simple String representation --no, let's not do that right now - it's work in other places #if 0 type LitString = String mkLitString :: String -> LitString mkLitString = id unpackLitString :: LitString -> String unpackLitString = id lengthLS :: LitString -> Int lengthLS = length #endif -- ----------------------------------------------------------------------------- -- under the carpet foreign import ccall unsafe "ghc_strlen" ptrStrLength :: Ptr Word8 -> Int -- NB. does *not* add a '\0'-terminator. -- We only use CChar here to be parallel to the imported -- peekC(A)StringLen. pokeCAString :: Ptr CChar -> String -> IO () pokeCAString ptr str = let go [] _ = return () go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1) in go str 0 {-# NOINLINE sLit #-} sLit :: String -> LitString sLit x = mkLitString x {-# NOINLINE fsLit #-} fsLit :: String -> FastString fsLit x = mkFastString x {-# RULES "slit" forall x . sLit (unpackCString# x) = mkLitString# x #-} {-# RULES "fslit" forall x . fsLit (unpackCString# x) = mkFastString# x #-} \end{code}