summaryrefslogtreecommitdiff
path: root/libraries
diff options
context:
space:
mode:
Diffstat (limited to 'libraries')
-rw-r--r--libraries/ghc-boot/GHC/Data/ShortText.hs112
-rw-r--r--libraries/ghc-boot/GHC/Unit/Database.hs66
-rw-r--r--libraries/ghc-boot/GHC/Utils/Encoding.hs529
-rw-r--r--libraries/ghc-boot/ghc-boot.cabal.in3
4 files changed, 678 insertions, 32 deletions
diff --git a/libraries/ghc-boot/GHC/Data/ShortText.hs b/libraries/ghc-boot/GHC/Data/ShortText.hs
new file mode 100644
index 0000000000..f51d79864b
--- /dev/null
+++ b/libraries/ghc-boot/GHC/Data/ShortText.hs
@@ -0,0 +1,112 @@
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, GeneralizedNewtypeDeriving, DerivingStrategies #-}
+{-# OPTIONS_GHC -O2 -funbox-strict-fields #-}
+
+-- |
+-- An Unicode string for internal GHC use. Meant to replace String
+-- in places where being a lazy linked is not very useful and a more
+-- memory efficient data structure is desirable.
+
+-- Very similar to FastString, but not hash-consed and with some extra instances and
+-- functions for serialisation and I/O. Should be imported qualified.
+
+module GHC.Data.ShortText (
+ -- * ShortText
+ ShortText(..),
+ -- ** Conversion to and from String
+ pack,
+ unpack,
+ -- ** Operations
+ codepointLength,
+ byteLength,
+ GHC.Data.ShortText.null,
+ splitFilePath,
+ GHC.Data.ShortText.head,
+ stripPrefix
+ ) where
+
+import Prelude
+
+import Control.Monad (guard)
+import Control.DeepSeq as DeepSeq
+import Data.Binary
+import qualified Data.ByteString.Char8 as B8
+import qualified Data.ByteString.Short.Internal as SBS
+import GHC.Exts
+import GHC.IO
+import GHC.Utils.Encoding
+import System.FilePath (isPathSeparator)
+
+{-| A 'ShortText' is a modified UTF-8 encoded string meant for short strings like
+file paths, module descriptions, etc.
+-}
+newtype ShortText = ShortText { contents :: SBS.ShortByteString
+ }
+ deriving stock (Show)
+ deriving newtype (Eq, Ord, Binary, Semigroup, Monoid, NFData)
+
+-- We don't want to derive this one from ShortByteString since that one won't handle
+-- UTF-8 characters correctly.
+instance IsString ShortText where
+ fromString = pack
+
+-- | /O(n)/ Returns the length of the 'ShortText' in characters.
+codepointLength :: ShortText -> Int
+codepointLength st = unsafeDupablePerformIO $ countUTF8Chars (contents st)
+-- | /O(1)/ Returns the length of the 'ShortText' in bytes.
+byteLength :: ShortText -> Int
+byteLength st = SBS.length $ contents st
+
+-- | /O(n)/ Convert a 'String' into a 'ShortText'.
+pack :: String -> ShortText
+pack s = unsafeDupablePerformIO $ ShortText <$> utf8EncodeShortByteString s
+
+-- | /O(n)/ Convert a 'ShortText' into a 'String'.
+unpack :: ShortText -> String
+unpack st = utf8DecodeShortByteString $ contents st
+
+-- | /O(1)/ Test whether the 'ShortText' is the empty string.
+null :: ShortText -> Bool
+null st = SBS.null $ contents st
+
+-- | /O(n)/ Split a 'ShortText' representing a file path into its components by separating
+-- on the file separator characters for this platform.
+splitFilePath :: ShortText -> [ShortText]
+-- This seems dangerous, but since the path separators are in the ASCII set they map down
+-- to a single byte when encoded in UTF-8 and so this should work even when casting to ByteString.
+-- We DeepSeq.force the resulting list so that we can be sure that no references to the
+-- bytestring in `st'` remain in unevaluated thunks, which might prevent `st'` from being
+-- collected by the GC.
+splitFilePath st = DeepSeq.force $ map (ShortText . SBS.toShort) $ B8.splitWith isPathSeparator st'
+ where st' = SBS.fromShort $ contents st
+
+-- | /O(1)/ Returns the first UTF-8 codepoint in the 'ShortText'. Depending on the string in
+-- question, this may or may not be the actual first character in the string due to Unicode
+-- non-printable characters.
+head :: ShortText -> Char
+head st
+ | SBS.null $ contents st = error "head: Empty ShortText"
+ | otherwise = Prelude.head $ unpack st
+
+-- | /O(n)/ The 'stripPrefix' function takes two 'ShortText's and returns 'Just' the remainder of
+-- the second iff the first is its prefix, and otherwise Nothing.
+stripPrefix :: ShortText -> ShortText -> Maybe ShortText
+stripPrefix prefix st = do
+ let !(SBS.SBS prefixBA) = contents prefix
+ let !(SBS.SBS stBA) = contents st
+ let prefixLength = sizeofByteArray# prefixBA
+ let stLength = sizeofByteArray# stBA
+ -- If the length of 'st' is not >= than the length of 'prefix', it is impossible for 'prefix'
+ -- to be the prefix of `st`.
+ guard $ (I# stLength) >= (I# prefixLength)
+ -- 'prefix' is a prefix of 'st' if the first <length of prefix> bytes of 'st'
+ -- are equal to 'prefix'
+ guard $ I# (compareByteArrays# prefixBA 0# stBA 0# prefixLength) == 0
+ -- Allocate a new ByteArray# and copy the remainder of the 'st' into it
+ unsafeDupablePerformIO $ do
+ let newBAsize = (stLength -# prefixLength)
+ newSBS <- IO $ \s0 ->
+ let !(# s1, ba #) = newByteArray# newBAsize s0
+ s2 = copyByteArray# stBA prefixLength ba 0# newBAsize s1
+ !(# s3, fba #) = unsafeFreezeByteArray# ba s2
+ in (# s3, SBS.SBS fba #)
+ return . Just . ShortText $ newSBS
diff --git a/libraries/ghc-boot/GHC/Unit/Database.hs b/libraries/ghc-boot/GHC/Unit/Database.hs
index 8ed139adf3..cdef39e362 100644
--- a/libraries/ghc-boot/GHC/Unit/Database.hs
+++ b/libraries/ghc-boot/GHC/Unit/Database.hs
@@ -12,6 +12,7 @@
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE OverloadedStrings #-}
-----------------------------------------------------------------------------
-- |
@@ -82,16 +83,16 @@ import Data.Bifunctor
import Data.Binary as Bin
import Data.Binary.Put as Bin
import Data.Binary.Get as Bin
+import Data.List (intersperse)
import Control.Exception as Exception
import Control.Monad (when)
import System.FilePath as FilePath
-import qualified System.FilePath.Posix as FilePath.Posix
import System.IO
import System.IO.Error
import GHC.IO.Exception (IOErrorType(InappropriateType))
+import qualified GHC.Data.ShortText as ST
import GHC.IO.Handle.Lock
import System.Directory
-import Data.List (stripPrefix)
-- | @ghc-boot@'s UnitInfo, serialized to the database.
type DbUnitInfo = GenericUnitInfo BS.ByteString BS.ByteString BS.ByteString BS.ByteString BS.ByteString DbModule
@@ -142,28 +143,28 @@ data GenericUnitInfo compid srcpkgid srcpkgname uid modulename mod = GenericUnit
-- components that can be registered in a database and used by other
-- modules.
- , unitAbiHash :: String
+ , unitAbiHash :: ST.ShortText
-- ^ ABI hash used to avoid mixing up units compiled with different
-- dependencies, compiler, options, etc.
, unitDepends :: [uid]
-- ^ Identifiers of the units this one depends on
- , unitAbiDepends :: [(uid, String)]
+ , unitAbiDepends :: [(uid, ST.ShortText)]
-- ^ Like 'unitDepends', but each dependency is annotated with the ABI hash
-- we expect the dependency to respect.
- , unitImportDirs :: [FilePath]
+ , unitImportDirs :: [FilePathST]
-- ^ Directories containing module interfaces
- , unitLibraries :: [String]
+ , unitLibraries :: [ST.ShortText]
-- ^ Names of the Haskell libraries provided by this unit
- , unitExtDepLibsSys :: [String]
+ , unitExtDepLibsSys :: [ST.ShortText]
-- ^ Names of the external system libraries that this unit depends on. See
-- also `unitExtDepLibsGhc` field.
- , unitExtDepLibsGhc :: [String]
+ , unitExtDepLibsGhc :: [ST.ShortText]
-- ^ Because of slight differences between the GHC dynamic linker (in
-- GHC.Runtime.Linker) and the
-- native system linker, some packages have to link with a different list
@@ -174,46 +175,46 @@ data GenericUnitInfo compid srcpkgid srcpkgname uid modulename mod = GenericUnit
-- If this field is set, then we use that instead of the
-- `unitExtDepLibsSys` field.
- , unitLibraryDirs :: [FilePath]
+ , unitLibraryDirs :: [FilePathST]
-- ^ Directories containing libraries provided by this unit. See also
-- `unitLibraryDynDirs`.
--
-- It seems to be used to store paths to external library dependencies
-- too.
- , unitLibraryDynDirs :: [FilePath]
+ , unitLibraryDynDirs :: [FilePathST]
-- ^ Directories containing the dynamic libraries provided by this unit.
-- See also `unitLibraryDirs`.
--
-- It seems to be used to store paths to external dynamic library
-- dependencies too.
- , unitExtDepFrameworks :: [String]
+ , unitExtDepFrameworks :: [ST.ShortText]
-- ^ Names of the external MacOS frameworks that this unit depends on.
- , unitExtDepFrameworkDirs :: [FilePath]
+ , unitExtDepFrameworkDirs :: [FilePathST]
-- ^ Directories containing MacOS frameworks that this unit depends
-- on.
- , unitLinkerOptions :: [String]
+ , unitLinkerOptions :: [ST.ShortText]
-- ^ Linker (e.g. ld) command line options
- , unitCcOptions :: [String]
+ , unitCcOptions :: [ST.ShortText]
-- ^ C compiler options that needs to be passed to the C compiler when we
-- compile some C code against this unit.
- , unitIncludes :: [String]
+ , unitIncludes :: [ST.ShortText]
-- ^ C header files that are required by this unit (provided by this unit
-- or external)
- , unitIncludeDirs :: [FilePath]
+ , unitIncludeDirs :: [FilePathST]
-- ^ Directories containing C header files that this unit depends
-- on.
- , unitHaddockInterfaces :: [FilePath]
+ , unitHaddockInterfaces :: [FilePathST]
-- ^ Paths to Haddock interface files for this unit
- , unitHaddockHTMLs :: [FilePath]
+ , unitHaddockHTMLs :: [FilePathST]
-- ^ Paths to Haddock directories containing HTML files
, unitExposedModules :: [(modulename, Maybe mod)]
@@ -242,6 +243,8 @@ data GenericUnitInfo compid srcpkgid srcpkgname uid modulename mod = GenericUnit
}
deriving (Eq, Show)
+type FilePathST = ST.ShortText
+
-- | Convert between GenericUnitInfo instances
mapGenericUnitInfo
:: (uid1 -> uid2)
@@ -646,12 +649,12 @@ instance Binary DbInstUnitId where
-- Also perform a similar substitution for the older GHC-specific
-- "$topdir" variable. The "topdir" is the location of the ghc
-- installation (obtained from the -B option).
-mkMungePathUrl :: FilePath -> FilePath -> (FilePath -> FilePath, FilePath -> FilePath)
+mkMungePathUrl :: FilePathST -> FilePathST -> (FilePathST -> FilePathST, FilePathST -> FilePathST)
mkMungePathUrl top_dir pkgroot = (munge_path, munge_url)
where
munge_path p
- | Just p' <- stripVarPrefix "${pkgroot}" p = pkgroot ++ p'
- | Just p' <- stripVarPrefix "$topdir" p = top_dir ++ p'
+ | Just p' <- stripVarPrefix "${pkgroot}" p = mappend pkgroot p'
+ | Just p' <- stripVarPrefix "$topdir" p = mappend top_dir p'
| otherwise = p
munge_url p
@@ -659,20 +662,19 @@ mkMungePathUrl top_dir pkgroot = (munge_path, munge_url)
| Just p' <- stripVarPrefix "$httptopdir" p = toUrlPath top_dir p'
| otherwise = p
- toUrlPath r p = "file:///"
- -- URLs always use posix style '/' separators:
- ++ FilePath.Posix.joinPath
- (r : -- We need to drop a leading "/" or "\\"
- -- if there is one:
- dropWhile (all isPathSeparator)
- (FilePath.splitDirectories p))
+ toUrlPath r p = mconcat $ "file:///" : (intersperse "/" (r : (splitDirectories p)))
+ -- URLs always use posix style '/' separators
+
+ -- We need to drop a leading "/" or "\\" if there is one:
+ splitDirectories :: FilePathST -> [FilePathST]
+ splitDirectories p = filter (not . ST.null) $ ST.splitFilePath p
-- We could drop the separator here, and then use </> above. However,
-- by leaving it in and using ++ we keep the same path separator
-- rather than letting FilePath change it to use \ as the separator
- stripVarPrefix var path = case stripPrefix var path of
- Just [] -> Just []
- Just cs@(c : _) | isPathSeparator c -> Just cs
+ stripVarPrefix var path = case ST.stripPrefix var path of
+ Just "" -> Just ""
+ Just cs | isPathSeparator (ST.head cs) -> Just cs
_ -> Nothing
@@ -684,7 +686,7 @@ mkMungePathUrl top_dir pkgroot = (munge_path, munge_url)
-- Also perform a similar substitution for the older GHC-specific
-- "$topdir" variable. The "topdir" is the location of the ghc
-- installation (obtained from the -B option).
-mungeUnitInfoPaths :: FilePath -> FilePath -> GenericUnitInfo a b c d e f -> GenericUnitInfo a b c d e f
+mungeUnitInfoPaths :: FilePathST -> FilePathST -> GenericUnitInfo a b c d e f -> GenericUnitInfo a b c d e f
mungeUnitInfoPaths top_dir pkgroot pkg =
-- TODO: similar code is duplicated in utils/ghc-pkg/Main.hs
pkg
diff --git a/libraries/ghc-boot/GHC/Utils/Encoding.hs b/libraries/ghc-boot/GHC/Utils/Encoding.hs
new file mode 100644
index 0000000000..0f84be189b
--- /dev/null
+++ b/libraries/ghc-boot/GHC/Utils/Encoding.hs
@@ -0,0 +1,529 @@
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples #-}
+{-# OPTIONS_GHC -O2 -fno-warn-name-shadowing #-}
+-- We always optimise this, otherwise performance of a non-optimised
+-- compiler is severely affected. This module used to live in the `ghc`
+-- package but has been moved to `ghc-boot` because the definition
+-- of the package database (needed in both ghc and in ghc-pkg) lives in
+-- `ghc-boot` and uses ShortText, which in turn depends on this module.
+
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow, 1997-2006
+--
+-- Character encodings
+--
+-- -----------------------------------------------------------------------------
+
+module GHC.Utils.Encoding (
+ -- * UTF-8
+ utf8DecodeCharAddr#,
+ utf8PrevChar,
+ utf8CharStart,
+ utf8DecodeChar,
+ utf8DecodeByteString,
+ utf8DecodeShortByteString,
+ utf8CompareShortByteString,
+ utf8DecodeStringLazy,
+ utf8EncodeChar,
+ utf8EncodeString,
+ utf8EncodeShortByteString,
+ utf8EncodedLength,
+ countUTF8Chars,
+
+ -- * Z-encoding
+ zEncodeString,
+ zDecodeString,
+
+ -- * Base62-encoding
+ toBase62,
+ toBase62Padded
+ ) where
+
+import Prelude
+
+import Foreign
+import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
+import Data.Char
+import qualified Data.Char as Char
+import Numeric
+import GHC.IO
+import GHC.ST
+
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Internal as BS
+import Data.ByteString.Short.Internal (ShortByteString(..))
+
+import GHC.Exts
+
+-- -----------------------------------------------------------------------------
+-- UTF-8
+
+-- We can't write the decoder as efficiently as we'd like without
+-- resorting to unboxed extensions, unfortunately. I tried to write
+-- an IO version of this function, but GHC can't eliminate boxed
+-- results from an IO-returning function.
+--
+-- We assume we can ignore overflow when parsing a multibyte character here.
+-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
+-- before decoding them (see "GHC.Data.StringBuffer").
+
+{-# INLINE utf8DecodeChar# #-}
+utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
+utf8DecodeChar# indexWord8# =
+ let !ch0 = word2Int# (indexWord8# 0#) in
+ case () of
+ _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
+
+ | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
+ let !ch1 = word2Int# (indexWord8# 1#) in
+ if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+ (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
+ (ch1 -# 0x80#)),
+ 2# #)
+
+ | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
+ let !ch1 = word2Int# (indexWord8# 1#) in
+ if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+ let !ch2 = word2Int# (indexWord8# 2#) in
+ if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+ (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
+ ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +#
+ (ch2 -# 0x80#)),
+ 3# #)
+
+ | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
+ let !ch1 = word2Int# (indexWord8# 1#) in
+ if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+ let !ch2 = word2Int# (indexWord8# 2#) in
+ if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+ let !ch3 = word2Int# (indexWord8# 3#) in
+ if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
+ (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
+ ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
+ ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +#
+ (ch3 -# 0x80#)),
+ 4# #)
+
+ | otherwise -> fail 1#
+ where
+ -- all invalid sequences end up here:
+ fail :: Int# -> (# Char#, Int# #)
+ fail nBytes# = (# '\0'#, nBytes# #)
+ -- '\xFFFD' would be the usual replacement character, but
+ -- that's a valid symbol in Haskell, so will result in a
+ -- confusing parse error later on. Instead we use '\0' which
+ -- will signal a lexer error immediately.
+
+utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharAddr# a# off# =
+ utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
+
+utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharByteArray# ba# off# =
+ utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
+
+utf8DecodeChar :: Ptr Word8 -> (Char, Int)
+utf8DecodeChar !(Ptr a#) =
+ case utf8DecodeCharAddr# a# 0# of
+ (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
+
+-- UTF-8 is cleverly designed so that we can always figure out where
+-- the start of the current character is, given any position in a
+-- stream. This function finds the start of the previous character,
+-- assuming there *is* a previous character.
+utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
+utf8PrevChar p = utf8CharStart (p `plusPtr` (-1))
+
+utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
+utf8CharStart p = go p
+ where go p = do w <- peek p
+ if w >= 0x80 && w < 0xC0
+ then go (p `plusPtr` (-1))
+ else return p
+
+{-# INLINE utf8DecodeLazy# #-}
+utf8DecodeLazy# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
+utf8DecodeLazy# retain decodeChar# len#
+ = unpack 0#
+ where
+ unpack i#
+ | isTrue# (i# >=# len#) = retain >> return []
+ | otherwise =
+ case decodeChar# i# of
+ (# c#, nBytes# #) -> do
+ rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
+ return (C# c# : rest)
+
+utf8DecodeByteString :: ByteString -> [Char]
+utf8DecodeByteString (BS.PS fptr offset len)
+ = utf8DecodeStringLazy fptr offset len
+
+utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
+utf8DecodeStringLazy fp offset (I# len#)
+ = unsafeDupablePerformIO $ do
+ let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
+ utf8DecodeLazy# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
+-- Note that since utf8DecodeLazy# returns a thunk the lifetime of the
+-- ForeignPtr actually needs to be longer than the lexical lifetime
+-- withForeignPtr would provide here. That's why we use touchForeignPtr to
+-- keep the fp alive until the last character has actually been decoded.
+
+utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
+utf8CompareShortByteString (SBS a1) (SBS a2) = go 0# 0#
+ -- UTF-8 has the property that sorting by bytes values also sorts by
+ -- code-points.
+ -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property
+ -- doesn't hold and we must explicitly check this case here.
+ -- Note that decoding every code point would also work but it would be much
+ -- more costly.
+ where
+ !sz1 = sizeofByteArray# a1
+ !sz2 = sizeofByteArray# a2
+ go off1 off2
+ | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ
+ | isTrue# (off1 >=# sz1) = LT
+ | isTrue# (off2 >=# sz2) = GT
+ | otherwise =
+ let !b1_1 = indexWord8Array# a1 off1
+ !b2_1 = indexWord8Array# a2 off2
+ in case b1_1 of
+ 0xC0## -> case b2_1 of
+ 0xC0## -> go (off1 +# 1#) (off2 +# 1#)
+ _ -> case indexWord8Array# a1 (off1 +# 1#) of
+ 0x80## -> LT
+ _ -> go (off1 +# 1#) (off2 +# 1#)
+ _ -> case b2_1 of
+ 0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
+ 0x80## -> GT
+ _ -> go (off1 +# 1#) (off2 +# 1#)
+ _ | isTrue# (b1_1 `gtWord#` b2_1) -> GT
+ | isTrue# (b1_1 `ltWord#` b2_1) -> LT
+ | otherwise -> go (off1 +# 1#) (off2 +# 1#)
+
+utf8DecodeShortByteString :: ShortByteString -> [Char]
+utf8DecodeShortByteString (SBS ba#)
+ = unsafeDupablePerformIO $
+ let len# = sizeofByteArray# ba# in
+ utf8DecodeLazy# (return ()) (utf8DecodeCharByteArray# ba#) len#
+
+countUTF8Chars :: ShortByteString -> IO Int
+countUTF8Chars (SBS ba) = go 0# 0#
+ where
+ len# = sizeofByteArray# ba
+ go i# n#
+ | isTrue# (i# >=# len#) =
+ return (I# n#)
+ | otherwise = do
+ case utf8DecodeCharByteArray# ba i# of
+ (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
+
+{-# INLINE utf8EncodeChar #-}
+utf8EncodeChar :: (Int# -> Word# -> State# s -> State# s)
+ -> Char -> ST s Int
+utf8EncodeChar write# c =
+ let x = ord c in
+ case () of
+ _ | x > 0 && x <= 0x007f -> do
+ write 0 x
+ return 1
+ -- NB. '\0' is encoded as '\xC0\x80', not '\0'. This is so that we
+ -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).
+ | x <= 0x07ff -> do
+ write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
+ write 1 (0x80 .|. (x .&. 0x3F))
+ return 2
+ | x <= 0xffff -> do
+ write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
+ write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
+ write 2 (0x80 .|. (x .&. 0x3F))
+ return 3
+ | otherwise -> do
+ write 0 (0xF0 .|. (x `shiftR` 18))
+ write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
+ write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
+ write 3 (0x80 .|. (x .&. 0x3F))
+ return 4
+ where
+ {-# INLINE write #-}
+ write (I# off#) (I# c#) = ST $ \s ->
+ case write# off# (int2Word# c#) s of
+ s -> (# s, () #)
+
+utf8EncodeString :: Ptr Word8 -> String -> IO ()
+utf8EncodeString (Ptr a#) str = go a# str
+ where go !_ [] = return ()
+ go a# (c:cs) = do
+ I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
+ go (a# `plusAddr#` off#) cs
+
+utf8EncodeShortByteString :: String -> IO ShortByteString
+utf8EncodeShortByteString str = IO $ \s ->
+ case utf8EncodedLength str of { I# len# ->
+ case newByteArray# len# s of { (# s, mba# #) ->
+ case go mba# 0# str of { ST f_go ->
+ case f_go s of { (# s, () #) ->
+ case unsafeFreezeByteArray# mba# s of { (# s, ba# #) ->
+ (# s, SBS ba# #) }}}}}
+ where
+ go _ _ [] = return ()
+ go mba# i# (c:cs) = do
+ I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
+ go mba# (i# +# off#) cs
+
+utf8EncodedLength :: String -> Int
+utf8EncodedLength str = go 0 str
+ where go !n [] = n
+ go n (c:cs)
+ | ord c > 0 && ord c <= 0x007f = go (n+1) cs
+ | ord c <= 0x07ff = go (n+2) cs
+ | ord c <= 0xffff = go (n+3) cs
+ | otherwise = go (n+4) cs
+
+-- -----------------------------------------------------------------------------
+-- Note [Z-Encoding]
+-- ~~~~~~~~~~~~~~~~~
+
+{-
+This is the main name-encoding and decoding function. It encodes any
+string into a string that is acceptable as a C name. This is done
+right before we emit a symbol name into the compiled C or asm code.
+Z-encoding of strings is cached in the FastString interface, so we
+never encode the same string more than once.
+
+The basic encoding scheme is this.
+
+* Tuples (,,,) are coded as Z3T
+
+* Alphabetic characters (upper and lower) and digits
+ all translate to themselves;
+ except 'Z', which translates to 'ZZ'
+ and 'z', which translates to 'zz'
+ We need both so that we can preserve the variable/tycon distinction
+
+* Most other printable characters translate to 'zx' or 'Zx' for some
+ alphabetic character x
+
+* The others translate as 'znnnU' where 'nnn' is the decimal number
+ of the character
+
+ Before After
+ --------------------------
+ Trak Trak
+ foo_wib foozuwib
+ > zg
+ >1 zg1
+ foo# foozh
+ foo## foozhzh
+ foo##1 foozhzh1
+ fooZ fooZZ
+ :+ ZCzp
+ () Z0T 0-tuple
+ (,,,,) Z5T 5-tuple
+ (# #) Z1H unboxed 1-tuple (note the space)
+ (#,,,,#) Z5H unboxed 5-tuple
+ (NB: There is no Z1T nor Z0H.)
+-}
+
+type UserString = String -- As the user typed it
+type EncodedString = String -- Encoded form
+
+
+zEncodeString :: UserString -> EncodedString
+zEncodeString cs = case maybe_tuple cs of
+ Just n -> n -- Tuples go to Z2T etc
+ Nothing -> go cs
+ where
+ go [] = []
+ go (c:cs) = encode_digit_ch c ++ go' cs
+ go' [] = []
+ go' (c:cs) = encode_ch c ++ go' cs
+
+unencodedChar :: Char -> Bool -- True for chars that don't need encoding
+unencodedChar 'Z' = False
+unencodedChar 'z' = False
+unencodedChar c = c >= 'a' && c <= 'z'
+ || c >= 'A' && c <= 'Z'
+ || c >= '0' && c <= '9'
+
+-- If a digit is at the start of a symbol then we need to encode it.
+-- Otherwise package names like 9pH-0.1 give linker errors.
+encode_digit_ch :: Char -> EncodedString
+encode_digit_ch c | c >= '0' && c <= '9' = encode_as_unicode_char c
+encode_digit_ch c | otherwise = encode_ch c
+
+encode_ch :: Char -> EncodedString
+encode_ch c | unencodedChar c = [c] -- Common case first
+
+-- Constructors
+encode_ch '(' = "ZL" -- Needed for things like (,), and (->)
+encode_ch ')' = "ZR" -- For symmetry with (
+encode_ch '[' = "ZM"
+encode_ch ']' = "ZN"
+encode_ch ':' = "ZC"
+encode_ch 'Z' = "ZZ"
+
+-- Variables
+encode_ch 'z' = "zz"
+encode_ch '&' = "za"
+encode_ch '|' = "zb"
+encode_ch '^' = "zc"
+encode_ch '$' = "zd"
+encode_ch '=' = "ze"
+encode_ch '>' = "zg"
+encode_ch '#' = "zh"
+encode_ch '.' = "zi"
+encode_ch '<' = "zl"
+encode_ch '-' = "zm"
+encode_ch '!' = "zn"
+encode_ch '+' = "zp"
+encode_ch '\'' = "zq"
+encode_ch '\\' = "zr"
+encode_ch '/' = "zs"
+encode_ch '*' = "zt"
+encode_ch '_' = "zu"
+encode_ch '%' = "zv"
+encode_ch c = encode_as_unicode_char c
+
+encode_as_unicode_char :: Char -> EncodedString
+encode_as_unicode_char c = 'z' : if isDigit (head hex_str) then hex_str
+ else '0':hex_str
+ where hex_str = showHex (ord c) "U"
+ -- ToDo: we could improve the encoding here in various ways.
+ -- eg. strings of unicode characters come out as 'z1234Uz5678U', we
+ -- could remove the 'U' in the middle (the 'z' works as a separator).
+
+zDecodeString :: EncodedString -> UserString
+zDecodeString [] = []
+zDecodeString ('Z' : d : rest)
+ | isDigit d = decode_tuple d rest
+ | otherwise = decode_upper d : zDecodeString rest
+zDecodeString ('z' : d : rest)
+ | isDigit d = decode_num_esc d rest
+ | otherwise = decode_lower d : zDecodeString rest
+zDecodeString (c : rest) = c : zDecodeString rest
+
+decode_upper, decode_lower :: Char -> Char
+
+decode_upper 'L' = '('
+decode_upper 'R' = ')'
+decode_upper 'M' = '['
+decode_upper 'N' = ']'
+decode_upper 'C' = ':'
+decode_upper 'Z' = 'Z'
+decode_upper ch = {-pprTrace "decode_upper" (char ch)-} ch
+
+decode_lower 'z' = 'z'
+decode_lower 'a' = '&'
+decode_lower 'b' = '|'
+decode_lower 'c' = '^'
+decode_lower 'd' = '$'
+decode_lower 'e' = '='
+decode_lower 'g' = '>'
+decode_lower 'h' = '#'
+decode_lower 'i' = '.'
+decode_lower 'l' = '<'
+decode_lower 'm' = '-'
+decode_lower 'n' = '!'
+decode_lower 'p' = '+'
+decode_lower 'q' = '\''
+decode_lower 'r' = '\\'
+decode_lower 's' = '/'
+decode_lower 't' = '*'
+decode_lower 'u' = '_'
+decode_lower 'v' = '%'
+decode_lower ch = {-pprTrace "decode_lower" (char ch)-} ch
+
+-- Characters not having a specific code are coded as z224U (in hex)
+decode_num_esc :: Char -> EncodedString -> UserString
+decode_num_esc d rest
+ = go (digitToInt d) rest
+ where
+ go n (c : rest) | isHexDigit c = go (16*n + digitToInt c) rest
+ go n ('U' : rest) = chr n : zDecodeString rest
+ go n other = error ("decode_num_esc: " ++ show n ++ ' ':other)
+
+decode_tuple :: Char -> EncodedString -> UserString
+decode_tuple d rest
+ = go (digitToInt d) rest
+ where
+ -- NB. recurse back to zDecodeString after decoding the tuple, because
+ -- the tuple might be embedded in a longer name.
+ go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest
+ go 0 ('T':rest) = "()" ++ zDecodeString rest
+ go n ('T':rest) = '(' : replicate (n-1) ',' ++ ")" ++ zDecodeString rest
+ go 1 ('H':rest) = "(# #)" ++ zDecodeString rest
+ go n ('H':rest) = '(' : '#' : replicate (n-1) ',' ++ "#)" ++ zDecodeString rest
+ go n other = error ("decode_tuple: " ++ show n ++ ' ':other)
+
+{-
+Tuples are encoded as
+ Z3T or Z3H
+for 3-tuples or unboxed 3-tuples respectively. No other encoding starts
+ Z<digit>
+
+* "(# #)" is the tycon for an unboxed 1-tuple (not 0-tuple)
+ There are no unboxed 0-tuples.
+
+* "()" is the tycon for a boxed 0-tuple.
+ There are no boxed 1-tuples.
+-}
+
+maybe_tuple :: UserString -> Maybe EncodedString
+
+maybe_tuple "(# #)" = Just("Z1H")
+maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of
+ (n, '#' : ')' : _) -> Just ('Z' : shows (n+1) "H")
+ _ -> Nothing
+maybe_tuple "()" = Just("Z0T")
+maybe_tuple ('(' : cs) = case count_commas (0::Int) cs of
+ (n, ')' : _) -> Just ('Z' : shows (n+1) "T")
+ _ -> Nothing
+maybe_tuple _ = Nothing
+
+count_commas :: Int -> String -> (Int, String)
+count_commas n (',' : cs) = count_commas (n+1) cs
+count_commas n cs = (n,cs)
+
+
+{-
+************************************************************************
+* *
+ Base 62
+* *
+************************************************************************
+
+Note [Base 62 encoding 128-bit integers]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Instead of base-62 encoding a single 128-bit integer
+(ceil(21.49) characters), we'll base-62 a pair of 64-bit integers
+(2 * ceil(10.75) characters). Luckily for us, it's the same number of
+characters!
+-}
+
+--------------------------------------------------------------------------
+-- Base 62
+
+-- The base-62 code is based off of 'locators'
+-- ((c) Operational Dynamics Consulting, BSD3 licensed)
+
+-- | Size of a 64-bit word when written as a base-62 string
+word64Base62Len :: Int
+word64Base62Len = 11
+
+-- | Converts a 64-bit word into a base-62 string
+toBase62Padded :: Word64 -> String
+toBase62Padded w = pad ++ str
+ where
+ pad = replicate len '0'
+ len = word64Base62Len - length str -- 11 == ceil(64 / lg 62)
+ str = toBase62 w
+
+toBase62 :: Word64 -> String
+toBase62 w = showIntAtBase 62 represent w ""
+ where
+ represent :: Int -> Char
+ represent x
+ | x < 10 = Char.chr (48 + x)
+ | x < 36 = Char.chr (65 + x - 10)
+ | x < 62 = Char.chr (97 + x - 36)
+ | otherwise = error "represent (base 62): impossible!"
diff --git a/libraries/ghc-boot/ghc-boot.cabal.in b/libraries/ghc-boot/ghc-boot.cabal.in
index 1d70a65646..c58b6893eb 100644
--- a/libraries/ghc-boot/ghc-boot.cabal.in
+++ b/libraries/ghc-boot/ghc-boot.cabal.in
@@ -38,6 +38,8 @@ Library
exposed-modules:
GHC.BaseDir
+ GHC.Data.ShortText
+ GHC.Utils.Encoding
GHC.LanguageExtensions
GHC.Unit.Database
GHC.Serialized
@@ -68,4 +70,5 @@ Library
containers >= 0.5 && < 0.7,
directory >= 1.2 && < 1.4,
filepath >= 1.3 && < 1.5,
+ deepseq >= 1.4 && < 1.5,
ghc-boot-th == @ProjectVersionMunged@