From 7fdcce6d4d13a10a1b2336c1d40482c64dba664d Mon Sep 17 00:00:00 2001
From: Wander Hillen <wjw.hillen@gmail.com>
Date: Fri, 25 Sep 2020 11:41:06 +0200
Subject: Initial ShortText code and conversion of package db code

Metric Decrease:
    Naperian
    T10421
    T10421a
    T10547
    T12150
    T12234
    T12425
    T13035
    T18140
    T18304
    T5837
    T6048
    T13253-spj
    T18282
    T18223
    T3064
    T9961
Metric Increase
    T13701

HFSKJH
---
 libraries/ghc-boot/GHC/Data/ShortText.hs | 112 +++++++
 libraries/ghc-boot/GHC/Unit/Database.hs  |  66 ++--
 libraries/ghc-boot/GHC/Utils/Encoding.hs | 529 +++++++++++++++++++++++++++++++
 libraries/ghc-boot/ghc-boot.cabal.in     |   3 +
 4 files changed, 678 insertions(+), 32 deletions(-)
 create mode 100644 libraries/ghc-boot/GHC/Data/ShortText.hs
 create mode 100644 libraries/ghc-boot/GHC/Utils/Encoding.hs

(limited to 'libraries')

diff --git a/libraries/ghc-boot/GHC/Data/ShortText.hs b/libraries/ghc-boot/GHC/Data/ShortText.hs
new file mode 100644
index 0000000000..f51d79864b
--- /dev/null
+++ b/libraries/ghc-boot/GHC/Data/ShortText.hs
@@ -0,0 +1,112 @@
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, GeneralizedNewtypeDeriving, DerivingStrategies #-}
+{-# OPTIONS_GHC -O2 -funbox-strict-fields #-}
+
+-- |
+-- An Unicode string for internal GHC use. Meant to replace String
+-- in places where being a lazy linked is not very useful and a more
+-- memory efficient data structure is desirable.
+
+-- Very similar to FastString, but not hash-consed and with some extra instances and
+-- functions for serialisation and I/O. Should be imported qualified.
+
+module GHC.Data.ShortText (
+        -- * ShortText
+        ShortText(..),
+        -- ** Conversion to and from String
+        pack,
+        unpack,
+        -- ** Operations
+        codepointLength,
+        byteLength,
+        GHC.Data.ShortText.null,
+        splitFilePath,
+        GHC.Data.ShortText.head,
+        stripPrefix
+  ) where
+
+import Prelude
+
+import Control.Monad (guard)
+import Control.DeepSeq as DeepSeq
+import Data.Binary
+import qualified Data.ByteString.Char8 as B8
+import qualified Data.ByteString.Short.Internal as SBS
+import GHC.Exts
+import GHC.IO
+import GHC.Utils.Encoding
+import System.FilePath (isPathSeparator)
+
+{-| A 'ShortText' is a modified UTF-8 encoded string meant for short strings like
+file paths, module descriptions, etc.
+-}
+newtype ShortText = ShortText { contents :: SBS.ShortByteString
+                              }
+                              deriving stock (Show)
+                              deriving newtype (Eq, Ord, Binary, Semigroup, Monoid, NFData)
+
+-- We don't want to derive this one from ShortByteString since that one won't handle
+-- UTF-8 characters correctly.
+instance IsString ShortText where
+  fromString = pack
+
+-- | /O(n)/ Returns the length of the 'ShortText' in characters.
+codepointLength :: ShortText -> Int
+codepointLength st = unsafeDupablePerformIO $ countUTF8Chars (contents st)
+-- | /O(1)/ Returns the length of the 'ShortText' in bytes.
+byteLength :: ShortText -> Int
+byteLength st = SBS.length $ contents st
+
+-- | /O(n)/ Convert a 'String' into a 'ShortText'.
+pack :: String -> ShortText
+pack s = unsafeDupablePerformIO $ ShortText <$> utf8EncodeShortByteString s
+
+-- | /O(n)/ Convert a 'ShortText' into a 'String'.
+unpack :: ShortText -> String
+unpack st = utf8DecodeShortByteString $ contents st
+
+-- | /O(1)/ Test whether the 'ShortText' is the empty string.
+null :: ShortText -> Bool
+null st = SBS.null $ contents st
+
+-- | /O(n)/ Split a 'ShortText' representing a file path into its components by separating
+-- on the file separator characters for this platform.
+splitFilePath :: ShortText -> [ShortText]
+-- This seems dangerous, but since the path separators are in the ASCII set they map down
+-- to a single byte when encoded in UTF-8 and so this should work even when casting to ByteString.
+-- We DeepSeq.force the resulting list so that we can be sure that no references to the
+-- bytestring in `st'` remain in unevaluated thunks, which might prevent `st'` from being
+-- collected by the GC.
+splitFilePath st = DeepSeq.force $ map (ShortText . SBS.toShort) $ B8.splitWith isPathSeparator st'
+  where st' = SBS.fromShort $ contents st
+
+-- | /O(1)/ Returns the first UTF-8 codepoint in the 'ShortText'. Depending on the string in
+-- question, this may or may not be the actual first character in the string due to Unicode
+-- non-printable characters.
+head :: ShortText -> Char
+head st
+  | SBS.null $ contents st = error "head: Empty ShortText"
+  | otherwise              = Prelude.head $ unpack st
+
+-- | /O(n)/ The 'stripPrefix' function takes two 'ShortText's and returns 'Just' the remainder of
+-- the second iff the first is its prefix, and otherwise Nothing.
+stripPrefix :: ShortText -> ShortText -> Maybe ShortText
+stripPrefix prefix st = do
+  let !(SBS.SBS prefixBA) = contents prefix
+  let !(SBS.SBS stBA)     = contents st
+  let prefixLength        = sizeofByteArray# prefixBA
+  let stLength            = sizeofByteArray# stBA
+  -- If the length of 'st' is not >= than the length of 'prefix', it is impossible for 'prefix'
+  -- to be the prefix of `st`.
+  guard $ (I# stLength) >= (I# prefixLength)
+  -- 'prefix' is a prefix of 'st' if the first <length of prefix> bytes of 'st'
+  -- are equal to 'prefix'
+  guard $ I# (compareByteArrays# prefixBA 0# stBA 0# prefixLength) == 0
+  -- Allocate a new ByteArray# and copy the remainder of the 'st' into it
+  unsafeDupablePerformIO $ do
+    let newBAsize = (stLength -# prefixLength)
+    newSBS <- IO $ \s0 ->
+      let !(# s1, ba #)  = newByteArray# newBAsize s0
+          s2             = copyByteArray# stBA prefixLength ba 0# newBAsize s1
+          !(# s3, fba #) = unsafeFreezeByteArray# ba s2
+      in  (# s3, SBS.SBS fba #)
+    return . Just . ShortText $ newSBS
diff --git a/libraries/ghc-boot/GHC/Unit/Database.hs b/libraries/ghc-boot/GHC/Unit/Database.hs
index 8ed139adf3..cdef39e362 100644
--- a/libraries/ghc-boot/GHC/Unit/Database.hs
+++ b/libraries/ghc-boot/GHC/Unit/Database.hs
@@ -12,6 +12,7 @@
 {-# LANGUAGE TypeSynonymInstances #-}
 {-# LANGUAGE ExplicitNamespaces #-}
 {-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE OverloadedStrings #-}
 
 -----------------------------------------------------------------------------
 -- |
@@ -82,16 +83,16 @@ import Data.Bifunctor
 import Data.Binary as Bin
 import Data.Binary.Put as Bin
 import Data.Binary.Get as Bin
+import Data.List (intersperse)
 import Control.Exception as Exception
 import Control.Monad (when)
 import System.FilePath as FilePath
-import qualified System.FilePath.Posix as FilePath.Posix
 import System.IO
 import System.IO.Error
 import GHC.IO.Exception (IOErrorType(InappropriateType))
+import qualified GHC.Data.ShortText as ST
 import GHC.IO.Handle.Lock
 import System.Directory
-import Data.List (stripPrefix)
 
 -- | @ghc-boot@'s UnitInfo, serialized to the database.
 type DbUnitInfo      = GenericUnitInfo BS.ByteString BS.ByteString BS.ByteString BS.ByteString BS.ByteString DbModule
@@ -142,28 +143,28 @@ data GenericUnitInfo compid srcpkgid srcpkgname uid modulename mod = GenericUnit
       -- components that can be registered in a database and used by other
       -- modules.
 
-   , unitAbiHash        :: String
+   , unitAbiHash        :: ST.ShortText
       -- ^ ABI hash used to avoid mixing up units compiled with different
       -- dependencies, compiler, options, etc.
 
    , unitDepends        :: [uid]
       -- ^ Identifiers of the units this one depends on
 
-   , unitAbiDepends     :: [(uid, String)]
+   , unitAbiDepends     :: [(uid, ST.ShortText)]
      -- ^ Like 'unitDepends', but each dependency is annotated with the ABI hash
      -- we expect the dependency to respect.
 
-   , unitImportDirs     :: [FilePath]
+   , unitImportDirs     :: [FilePathST]
       -- ^ Directories containing module interfaces
 
-   , unitLibraries      :: [String]
+   , unitLibraries      :: [ST.ShortText]
       -- ^ Names of the Haskell libraries provided by this unit
 
-   , unitExtDepLibsSys  :: [String]
+   , unitExtDepLibsSys  :: [ST.ShortText]
       -- ^ Names of the external system libraries that this unit depends on. See
       -- also `unitExtDepLibsGhc` field.
 
-   , unitExtDepLibsGhc  :: [String]
+   , unitExtDepLibsGhc  :: [ST.ShortText]
       -- ^ Because of slight differences between the GHC dynamic linker (in
       -- GHC.Runtime.Linker) and the
       -- native system linker, some packages have to link with a different list
@@ -174,46 +175,46 @@ data GenericUnitInfo compid srcpkgid srcpkgname uid modulename mod = GenericUnit
       -- If this field is set, then we use that instead of the
       -- `unitExtDepLibsSys` field.
 
-   , unitLibraryDirs    :: [FilePath]
+   , unitLibraryDirs    :: [FilePathST]
       -- ^ Directories containing libraries provided by this unit. See also
       -- `unitLibraryDynDirs`.
       --
       -- It seems to be used to store paths to external library dependencies
       -- too.
 
-   , unitLibraryDynDirs :: [FilePath]
+   , unitLibraryDynDirs :: [FilePathST]
       -- ^ Directories containing the dynamic libraries provided by this unit.
       -- See also `unitLibraryDirs`.
       --
       -- It seems to be used to store paths to external dynamic library
       -- dependencies too.
 
-   , unitExtDepFrameworks :: [String]
+   , unitExtDepFrameworks :: [ST.ShortText]
       -- ^ Names of the external MacOS frameworks that this unit depends on.
 
-   , unitExtDepFrameworkDirs :: [FilePath]
+   , unitExtDepFrameworkDirs :: [FilePathST]
       -- ^ Directories containing MacOS frameworks that this unit depends
       -- on.
 
-   , unitLinkerOptions  :: [String]
+   , unitLinkerOptions  :: [ST.ShortText]
       -- ^ Linker (e.g. ld) command line options
 
-   , unitCcOptions      :: [String]
+   , unitCcOptions      :: [ST.ShortText]
       -- ^ C compiler options that needs to be passed to the C compiler when we
       -- compile some C code against this unit.
 
-   , unitIncludes       :: [String]
+   , unitIncludes       :: [ST.ShortText]
       -- ^ C header files that are required by this unit (provided by this unit
       -- or external)
 
-   , unitIncludeDirs    :: [FilePath]
+   , unitIncludeDirs    :: [FilePathST]
       -- ^ Directories containing C header files that this unit depends
       -- on.
 
-   , unitHaddockInterfaces :: [FilePath]
+   , unitHaddockInterfaces :: [FilePathST]
       -- ^ Paths to Haddock interface files for this unit
 
-   , unitHaddockHTMLs   :: [FilePath]
+   , unitHaddockHTMLs   :: [FilePathST]
       -- ^ Paths to Haddock directories containing HTML files
 
    , unitExposedModules :: [(modulename, Maybe mod)]
@@ -242,6 +243,8 @@ data GenericUnitInfo compid srcpkgid srcpkgname uid modulename mod = GenericUnit
    }
    deriving (Eq, Show)
 
+type FilePathST = ST.ShortText
+
 -- | Convert between GenericUnitInfo instances
 mapGenericUnitInfo
    :: (uid1 -> uid2)
@@ -646,12 +649,12 @@ instance Binary DbInstUnitId where
 -- Also perform a similar substitution for the older GHC-specific
 -- "$topdir" variable. The "topdir" is the location of the ghc
 -- installation (obtained from the -B option).
-mkMungePathUrl :: FilePath -> FilePath -> (FilePath -> FilePath, FilePath -> FilePath)
+mkMungePathUrl :: FilePathST -> FilePathST -> (FilePathST -> FilePathST, FilePathST -> FilePathST)
 mkMungePathUrl top_dir pkgroot = (munge_path, munge_url)
    where
     munge_path p
-      | Just p' <- stripVarPrefix "${pkgroot}" p = pkgroot ++ p'
-      | Just p' <- stripVarPrefix "$topdir"    p = top_dir ++ p'
+      | Just p' <- stripVarPrefix "${pkgroot}" p = mappend pkgroot p'
+      | Just p' <- stripVarPrefix "$topdir"    p = mappend top_dir p'
       | otherwise                                = p
 
     munge_url p
@@ -659,20 +662,19 @@ mkMungePathUrl top_dir pkgroot = (munge_path, munge_url)
       | Just p' <- stripVarPrefix "$httptopdir"   p = toUrlPath top_dir p'
       | otherwise                                   = p
 
-    toUrlPath r p = "file:///"
-                 -- URLs always use posix style '/' separators:
-                 ++ FilePath.Posix.joinPath
-                        (r : -- We need to drop a leading "/" or "\\"
-                             -- if there is one:
-                             dropWhile (all isPathSeparator)
-                                       (FilePath.splitDirectories p))
+    toUrlPath r p = mconcat $ "file:///" : (intersperse "/" (r : (splitDirectories p)))
+                                          -- URLs always use posix style '/' separators
+
+    -- We need to drop a leading "/" or "\\" if there is one:
+    splitDirectories :: FilePathST -> [FilePathST]
+    splitDirectories p  = filter (not . ST.null) $ ST.splitFilePath p
 
     -- We could drop the separator here, and then use </> above. However,
     -- by leaving it in and using ++ we keep the same path separator
     -- rather than letting FilePath change it to use \ as the separator
-    stripVarPrefix var path = case stripPrefix var path of
-                              Just [] -> Just []
-                              Just cs@(c : _) | isPathSeparator c -> Just cs
+    stripVarPrefix var path = case ST.stripPrefix var path of
+                              Just "" -> Just ""
+                              Just cs | isPathSeparator (ST.head cs) -> Just cs
                               _ -> Nothing
 
 
@@ -684,7 +686,7 @@ mkMungePathUrl top_dir pkgroot = (munge_path, munge_url)
 -- Also perform a similar substitution for the older GHC-specific
 -- "$topdir" variable. The "topdir" is the location of the ghc
 -- installation (obtained from the -B option).
-mungeUnitInfoPaths :: FilePath -> FilePath -> GenericUnitInfo a b c d e f -> GenericUnitInfo a b c d e f
+mungeUnitInfoPaths :: FilePathST -> FilePathST -> GenericUnitInfo a b c d e f -> GenericUnitInfo a b c d e f
 mungeUnitInfoPaths top_dir pkgroot pkg =
    -- TODO: similar code is duplicated in utils/ghc-pkg/Main.hs
     pkg
diff --git a/libraries/ghc-boot/GHC/Utils/Encoding.hs b/libraries/ghc-boot/GHC/Utils/Encoding.hs
new file mode 100644
index 0000000000..0f84be189b
--- /dev/null
+++ b/libraries/ghc-boot/GHC/Utils/Encoding.hs
@@ -0,0 +1,529 @@
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples #-}
+{-# OPTIONS_GHC -O2 -fno-warn-name-shadowing #-}
+-- We always optimise this, otherwise performance of a non-optimised
+-- compiler is severely affected. This module used to live in the `ghc`
+-- package but has been moved to `ghc-boot` because the definition
+-- of the package database (needed in both ghc and in ghc-pkg) lives in
+-- `ghc-boot` and uses ShortText, which in turn depends on this module.
+
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow, 1997-2006
+--
+-- Character encodings
+--
+-- -----------------------------------------------------------------------------
+
+module GHC.Utils.Encoding (
+        -- * UTF-8
+        utf8DecodeCharAddr#,
+        utf8PrevChar,
+        utf8CharStart,
+        utf8DecodeChar,
+        utf8DecodeByteString,
+        utf8DecodeShortByteString,
+        utf8CompareShortByteString,
+        utf8DecodeStringLazy,
+        utf8EncodeChar,
+        utf8EncodeString,
+        utf8EncodeShortByteString,
+        utf8EncodedLength,
+        countUTF8Chars,
+
+        -- * Z-encoding
+        zEncodeString,
+        zDecodeString,
+
+        -- * Base62-encoding
+        toBase62,
+        toBase62Padded
+  ) where
+
+import Prelude
+
+import Foreign
+import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
+import Data.Char
+import qualified Data.Char as Char
+import Numeric
+import GHC.IO
+import GHC.ST
+
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Internal as BS
+import Data.ByteString.Short.Internal (ShortByteString(..))
+
+import GHC.Exts
+
+-- -----------------------------------------------------------------------------
+-- UTF-8
+
+-- We can't write the decoder as efficiently as we'd like without
+-- resorting to unboxed extensions, unfortunately.  I tried to write
+-- an IO version of this function, but GHC can't eliminate boxed
+-- results from an IO-returning function.
+--
+-- We assume we can ignore overflow when parsing a multibyte character here.
+-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
+-- before decoding them (see "GHC.Data.StringBuffer").
+
+{-# INLINE utf8DecodeChar# #-}
+utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
+utf8DecodeChar# indexWord8# =
+  let !ch0 = word2Int# (indexWord8# 0#) in
+  case () of
+    _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
+
+      | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
+        let !ch1 = word2Int# (indexWord8# 1#) in
+        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+        (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
+                  (ch1 -# 0x80#)),
+           2# #)
+
+      | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
+        let !ch1 = word2Int# (indexWord8# 1#) in
+        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+        let !ch2 = word2Int# (indexWord8# 2#) in
+        if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+        (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
+                 ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
+                  (ch2 -# 0x80#)),
+           3# #)
+
+     | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
+        let !ch1 = word2Int# (indexWord8# 1#) in
+        if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+        let !ch2 = word2Int# (indexWord8# 2#) in
+        if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+        let !ch3 = word2Int# (indexWord8# 3#) in
+        if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
+        (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
+                 ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
+                 ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
+                  (ch3 -# 0x80#)),
+           4# #)
+
+      | otherwise -> fail 1#
+  where
+        -- all invalid sequences end up here:
+        fail :: Int# -> (# Char#, Int# #)
+        fail nBytes# = (# '\0'#, nBytes# #)
+        -- '\xFFFD' would be the usual replacement character, but
+        -- that's a valid symbol in Haskell, so will result in a
+        -- confusing parse error later on.  Instead we use '\0' which
+        -- will signal a lexer error immediately.
+
+utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharAddr# a# off# =
+    utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
+
+utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharByteArray# ba# off# =
+    utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
+
+utf8DecodeChar :: Ptr Word8 -> (Char, Int)
+utf8DecodeChar !(Ptr a#) =
+  case utf8DecodeCharAddr# a# 0# of
+    (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
+
+-- UTF-8 is cleverly designed so that we can always figure out where
+-- the start of the current character is, given any position in a
+-- stream.  This function finds the start of the previous character,
+-- assuming there *is* a previous character.
+utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
+utf8PrevChar p = utf8CharStart (p `plusPtr` (-1))
+
+utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
+utf8CharStart p = go p
+ where go p = do w <- peek p
+                 if w >= 0x80 && w < 0xC0
+                        then go (p `plusPtr` (-1))
+                        else return p
+
+{-# INLINE utf8DecodeLazy# #-}
+utf8DecodeLazy# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
+utf8DecodeLazy# retain decodeChar# len#
+  = unpack 0#
+  where
+    unpack i#
+        | isTrue# (i# >=# len#) = retain >> return []
+        | otherwise =
+            case decodeChar# i# of
+              (# c#, nBytes# #) -> do
+                rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
+                return (C# c# : rest)
+
+utf8DecodeByteString :: ByteString -> [Char]
+utf8DecodeByteString (BS.PS fptr offset len)
+  = utf8DecodeStringLazy fptr offset len
+
+utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
+utf8DecodeStringLazy fp offset (I# len#)
+  = unsafeDupablePerformIO $ do
+      let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
+      utf8DecodeLazy# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
+-- Note that since utf8DecodeLazy# returns a thunk the lifetime of the
+-- ForeignPtr actually needs to be longer than the lexical lifetime
+-- withForeignPtr would provide here. That's why we use touchForeignPtr to
+-- keep the fp alive until the last character has actually been decoded.
+
+utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
+utf8CompareShortByteString (SBS a1) (SBS a2) = go 0# 0#
+   -- UTF-8 has the property that sorting by bytes values also sorts by
+   -- code-points.
+   -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property
+   -- doesn't hold and we must explicitly check this case here.
+   -- Note that decoding every code point would also work but it would be much
+   -- more costly.
+   where
+       !sz1 = sizeofByteArray# a1
+       !sz2 = sizeofByteArray# a2
+       go off1 off2
+         | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ
+         | isTrue# (off1 >=# sz1)                          = LT
+         | isTrue# (off2 >=# sz2)                          = GT
+         | otherwise =
+               let !b1_1 = indexWord8Array# a1 off1
+                   !b2_1 = indexWord8Array# a2 off2
+               in case b1_1 of
+                  0xC0## -> case b2_1 of
+                     0xC0## -> go (off1 +# 1#) (off2 +# 1#)
+                     _      -> case indexWord8Array# a1 (off1 +# 1#) of
+                        0x80## -> LT
+                        _      -> go (off1 +# 1#) (off2 +# 1#)
+                  _      -> case b2_1 of
+                     0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
+                        0x80## -> GT
+                        _      -> go (off1 +# 1#) (off2 +# 1#)
+                     _   | isTrue# (b1_1 `gtWord#` b2_1) -> GT
+                         | isTrue# (b1_1 `ltWord#` b2_1) -> LT
+                         | otherwise                     -> go (off1 +# 1#) (off2 +# 1#)
+
+utf8DecodeShortByteString :: ShortByteString -> [Char]
+utf8DecodeShortByteString (SBS ba#)
+  = unsafeDupablePerformIO $
+      let len# = sizeofByteArray# ba# in
+      utf8DecodeLazy# (return ()) (utf8DecodeCharByteArray# ba#) len#
+
+countUTF8Chars :: ShortByteString -> IO Int
+countUTF8Chars (SBS ba) = go 0# 0#
+  where
+    len# = sizeofByteArray# ba
+    go i# n#
+      | isTrue# (i# >=# len#) =
+          return (I# n#)
+      | otherwise = do
+          case utf8DecodeCharByteArray# ba i# of
+            (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
+
+{-# INLINE utf8EncodeChar #-}
+utf8EncodeChar :: (Int# -> Word# -> State# s -> State# s)
+               -> Char -> ST s Int
+utf8EncodeChar write# c =
+  let x = ord c in
+  case () of
+    _ | x > 0 && x <= 0x007f -> do
+          write 0 x
+          return 1
+        -- NB. '\0' is encoded as '\xC0\x80', not '\0'.  This is so that we
+        -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).
+      | x <= 0x07ff -> do
+          write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
+          write 1 (0x80 .|. (x .&. 0x3F))
+          return 2
+      | x <= 0xffff -> do
+          write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
+          write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
+          write 2 (0x80 .|. (x .&. 0x3F))
+          return 3
+      | otherwise -> do
+          write 0 (0xF0 .|. (x `shiftR` 18))
+          write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
+          write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
+          write 3 (0x80 .|. (x .&. 0x3F))
+          return 4
+  where
+    {-# INLINE write #-}
+    write (I# off#) (I# c#) = ST $ \s ->
+      case write# off# (int2Word# c#) s of
+        s -> (# s, () #)
+
+utf8EncodeString :: Ptr Word8 -> String -> IO ()
+utf8EncodeString (Ptr a#) str = go a# str
+  where go !_   []   = return ()
+        go a# (c:cs) = do
+          I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
+          go (a# `plusAddr#` off#) cs
+
+utf8EncodeShortByteString :: String -> IO ShortByteString
+utf8EncodeShortByteString str = IO $ \s ->
+  case utf8EncodedLength str         of { I# len# ->
+  case newByteArray# len# s          of { (# s, mba# #) ->
+  case go mba# 0# str                of { ST f_go ->
+  case f_go s                        of { (# s, () #) ->
+  case unsafeFreezeByteArray# mba# s of { (# s, ba# #) ->
+  (# s, SBS ba# #) }}}}}
+  where
+    go _ _ [] = return ()
+    go mba# i# (c:cs) = do
+      I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
+      go mba# (i# +# off#) cs
+
+utf8EncodedLength :: String -> Int
+utf8EncodedLength str = go 0 str
+  where go !n [] = n
+        go n (c:cs)
+          | ord c > 0 && ord c <= 0x007f = go (n+1) cs
+          | ord c <= 0x07ff = go (n+2) cs
+          | ord c <= 0xffff = go (n+3) cs
+          | otherwise       = go (n+4) cs
+
+-- -----------------------------------------------------------------------------
+-- Note [Z-Encoding]
+-- ~~~~~~~~~~~~~~~~~
+
+{-
+This is the main name-encoding and decoding function.  It encodes any
+string into a string that is acceptable as a C name.  This is done
+right before we emit a symbol name into the compiled C or asm code.
+Z-encoding of strings is cached in the FastString interface, so we
+never encode the same string more than once.
+
+The basic encoding scheme is this.
+
+* Tuples (,,,) are coded as Z3T
+
+* Alphabetic characters (upper and lower) and digits
+        all translate to themselves;
+        except 'Z', which translates to 'ZZ'
+        and    'z', which translates to 'zz'
+  We need both so that we can preserve the variable/tycon distinction
+
+* Most other printable characters translate to 'zx' or 'Zx' for some
+        alphabetic character x
+
+* The others translate as 'znnnU' where 'nnn' is the decimal number
+        of the character
+
+        Before          After
+        --------------------------
+        Trak            Trak
+        foo_wib         foozuwib
+        >               zg
+        >1              zg1
+        foo#            foozh
+        foo##           foozhzh
+        foo##1          foozhzh1
+        fooZ            fooZZ
+        :+              ZCzp
+        ()              Z0T     0-tuple
+        (,,,,)          Z5T     5-tuple
+        (# #)           Z1H     unboxed 1-tuple (note the space)
+        (#,,,,#)        Z5H     unboxed 5-tuple
+                (NB: There is no Z1T nor Z0H.)
+-}
+
+type UserString = String        -- As the user typed it
+type EncodedString = String     -- Encoded form
+
+
+zEncodeString :: UserString -> EncodedString
+zEncodeString cs = case maybe_tuple cs of
+                Just n  -> n            -- Tuples go to Z2T etc
+                Nothing -> go cs
+          where
+                go []     = []
+                go (c:cs) = encode_digit_ch c ++ go' cs
+                go' []     = []
+                go' (c:cs) = encode_ch c ++ go' cs
+
+unencodedChar :: Char -> Bool   -- True for chars that don't need encoding
+unencodedChar 'Z' = False
+unencodedChar 'z' = False
+unencodedChar c   =  c >= 'a' && c <= 'z'
+                  || c >= 'A' && c <= 'Z'
+                  || c >= '0' && c <= '9'
+
+-- If a digit is at the start of a symbol then we need to encode it.
+-- Otherwise package names like 9pH-0.1 give linker errors.
+encode_digit_ch :: Char -> EncodedString
+encode_digit_ch c | c >= '0' && c <= '9' = encode_as_unicode_char c
+encode_digit_ch c | otherwise            = encode_ch c
+
+encode_ch :: Char -> EncodedString
+encode_ch c | unencodedChar c = [c]     -- Common case first
+
+-- Constructors
+encode_ch '('  = "ZL"   -- Needed for things like (,), and (->)
+encode_ch ')'  = "ZR"   -- For symmetry with (
+encode_ch '['  = "ZM"
+encode_ch ']'  = "ZN"
+encode_ch ':'  = "ZC"
+encode_ch 'Z'  = "ZZ"
+
+-- Variables
+encode_ch 'z'  = "zz"
+encode_ch '&'  = "za"
+encode_ch '|'  = "zb"
+encode_ch '^'  = "zc"
+encode_ch '$'  = "zd"
+encode_ch '='  = "ze"
+encode_ch '>'  = "zg"
+encode_ch '#'  = "zh"
+encode_ch '.'  = "zi"
+encode_ch '<'  = "zl"
+encode_ch '-'  = "zm"
+encode_ch '!'  = "zn"
+encode_ch '+'  = "zp"
+encode_ch '\'' = "zq"
+encode_ch '\\' = "zr"
+encode_ch '/'  = "zs"
+encode_ch '*'  = "zt"
+encode_ch '_'  = "zu"
+encode_ch '%'  = "zv"
+encode_ch c    = encode_as_unicode_char c
+
+encode_as_unicode_char :: Char -> EncodedString
+encode_as_unicode_char c = 'z' : if isDigit (head hex_str) then hex_str
+                                                           else '0':hex_str
+  where hex_str = showHex (ord c) "U"
+  -- ToDo: we could improve the encoding here in various ways.
+  -- eg. strings of unicode characters come out as 'z1234Uz5678U', we
+  -- could remove the 'U' in the middle (the 'z' works as a separator).
+
+zDecodeString :: EncodedString -> UserString
+zDecodeString [] = []
+zDecodeString ('Z' : d : rest)
+  | isDigit d = decode_tuple   d rest
+  | otherwise = decode_upper   d : zDecodeString rest
+zDecodeString ('z' : d : rest)
+  | isDigit d = decode_num_esc d rest
+  | otherwise = decode_lower   d : zDecodeString rest
+zDecodeString (c   : rest) = c : zDecodeString rest
+
+decode_upper, decode_lower :: Char -> Char
+
+decode_upper 'L' = '('
+decode_upper 'R' = ')'
+decode_upper 'M' = '['
+decode_upper 'N' = ']'
+decode_upper 'C' = ':'
+decode_upper 'Z' = 'Z'
+decode_upper ch  = {-pprTrace "decode_upper" (char ch)-} ch
+
+decode_lower 'z' = 'z'
+decode_lower 'a' = '&'
+decode_lower 'b' = '|'
+decode_lower 'c' = '^'
+decode_lower 'd' = '$'
+decode_lower 'e' = '='
+decode_lower 'g' = '>'
+decode_lower 'h' = '#'
+decode_lower 'i' = '.'
+decode_lower 'l' = '<'
+decode_lower 'm' = '-'
+decode_lower 'n' = '!'
+decode_lower 'p' = '+'
+decode_lower 'q' = '\''
+decode_lower 'r' = '\\'
+decode_lower 's' = '/'
+decode_lower 't' = '*'
+decode_lower 'u' = '_'
+decode_lower 'v' = '%'
+decode_lower ch  = {-pprTrace "decode_lower" (char ch)-} ch
+
+-- Characters not having a specific code are coded as z224U (in hex)
+decode_num_esc :: Char -> EncodedString -> UserString
+decode_num_esc d rest
+  = go (digitToInt d) rest
+  where
+    go n (c : rest) | isHexDigit c = go (16*n + digitToInt c) rest
+    go n ('U' : rest)           = chr n : zDecodeString rest
+    go n other = error ("decode_num_esc: " ++ show n ++  ' ':other)
+
+decode_tuple :: Char -> EncodedString -> UserString
+decode_tuple d rest
+  = go (digitToInt d) rest
+  where
+        -- NB. recurse back to zDecodeString after decoding the tuple, because
+        -- the tuple might be embedded in a longer name.
+    go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest
+    go 0 ('T':rest)     = "()" ++ zDecodeString rest
+    go n ('T':rest)     = '(' : replicate (n-1) ',' ++ ")" ++ zDecodeString rest
+    go 1 ('H':rest)     = "(# #)" ++ zDecodeString rest
+    go n ('H':rest)     = '(' : '#' : replicate (n-1) ',' ++ "#)" ++ zDecodeString rest
+    go n other = error ("decode_tuple: " ++ show n ++ ' ':other)
+
+{-
+Tuples are encoded as
+        Z3T or Z3H
+for 3-tuples or unboxed 3-tuples respectively.  No other encoding starts
+        Z<digit>
+
+* "(# #)" is the tycon for an unboxed 1-tuple (not 0-tuple)
+  There are no unboxed 0-tuples.
+
+* "()" is the tycon for a boxed 0-tuple.
+  There are no boxed 1-tuples.
+-}
+
+maybe_tuple :: UserString -> Maybe EncodedString
+
+maybe_tuple "(# #)" = Just("Z1H")
+maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of
+                                 (n, '#' : ')' : _) -> Just ('Z' : shows (n+1) "H")
+                                 _                  -> Nothing
+maybe_tuple "()" = Just("Z0T")
+maybe_tuple ('(' : cs)       = case count_commas (0::Int) cs of
+                                 (n, ')' : _) -> Just ('Z' : shows (n+1) "T")
+                                 _            -> Nothing
+maybe_tuple _                = Nothing
+
+count_commas :: Int -> String -> (Int, String)
+count_commas n (',' : cs) = count_commas (n+1) cs
+count_commas n cs         = (n,cs)
+
+
+{-
+************************************************************************
+*                                                                      *
+                        Base 62
+*                                                                      *
+************************************************************************
+
+Note [Base 62 encoding 128-bit integers]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Instead of base-62 encoding a single 128-bit integer
+(ceil(21.49) characters), we'll base-62 a pair of 64-bit integers
+(2 * ceil(10.75) characters).  Luckily for us, it's the same number of
+characters!
+-}
+
+--------------------------------------------------------------------------
+-- Base 62
+
+-- The base-62 code is based off of 'locators'
+-- ((c) Operational Dynamics Consulting, BSD3 licensed)
+
+-- | Size of a 64-bit word when written as a base-62 string
+word64Base62Len :: Int
+word64Base62Len = 11
+
+-- | Converts a 64-bit word into a base-62 string
+toBase62Padded :: Word64 -> String
+toBase62Padded w = pad ++ str
+  where
+    pad = replicate len '0'
+    len = word64Base62Len - length str -- 11 == ceil(64 / lg 62)
+    str = toBase62 w
+
+toBase62 :: Word64 -> String
+toBase62 w = showIntAtBase 62 represent w ""
+  where
+    represent :: Int -> Char
+    represent x
+        | x < 10 = Char.chr (48 + x)
+        | x < 36 = Char.chr (65 + x - 10)
+        | x < 62 = Char.chr (97 + x - 36)
+        | otherwise = error "represent (base 62): impossible!"
diff --git a/libraries/ghc-boot/ghc-boot.cabal.in b/libraries/ghc-boot/ghc-boot.cabal.in
index 1d70a65646..c58b6893eb 100644
--- a/libraries/ghc-boot/ghc-boot.cabal.in
+++ b/libraries/ghc-boot/ghc-boot.cabal.in
@@ -38,6 +38,8 @@ Library
 
     exposed-modules:
             GHC.BaseDir
+            GHC.Data.ShortText
+            GHC.Utils.Encoding
             GHC.LanguageExtensions
             GHC.Unit.Database
             GHC.Serialized
@@ -68,4 +70,5 @@ Library
                    containers >= 0.5 && < 0.7,
                    directory  >= 1.2 && < 1.4,
                    filepath   >= 1.3 && < 1.5,
+                   deepseq    >= 1.4 && < 1.5,
                    ghc-boot-th == @ProjectVersionMunged@
-- 
cgit v1.2.1