Initial ShortText code and conversion of package db code

Metric Decrease: Naperian T10421 T10421a T10547 T12150 T12234 T12425 T13035 T18140 T18304 T5837 T6048 T13253-spj T18282 T18223 T3064 T9961 Metric Increase T13701 HFSKJH
author: Wander Hillen <wjw.hillen@gmail.com> 2020-09-25 11:41:06 +0200
committer: Marge Bot <ben+marge-bot@smart-cactus.org> 2020-10-13 00:12:47 -0400
commit: 7fdcce6d4d13a10a1b2336c1d40482c64dba664d (patch)
tree: 66dfd350e5abef849793060d745d1a1df64e47df /libraries/ghc-boot/GHC/Data/ShortText.hs
parent: 9bbc84d20d0f50901351246cbe97c45234ca7d95 (diff)
download: haskell-7fdcce6d4d13a10a1b2336c1d40482c64dba664d.tar.gz
1 files changed, 112 insertions, 0 deletions
diff --git a/libraries/ghc-boot/GHC/Data/ShortText.hs b/libraries/ghc-boot/GHC/Data/ShortText.hs
new file mode 100644
index 0000000000..f51d79864b
--- /dev/null
+++ b/libraries/ghc-boot/GHC/Data/ShortText.hs
@@ -0,0 +1,112 @@
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, GeneralizedNewtypeDeriving, DerivingStrategies #-}
+{-# OPTIONS_GHC -O2 -funbox-strict-fields #-}
+
+-- |
+-- An Unicode string for internal GHC use. Meant to replace String
+-- in places where being a lazy linked is not very useful and a more
+-- memory efficient data structure is desirable.
+
+-- Very similar to FastString, but not hash-consed and with some extra instances and
+-- functions for serialisation and I/O. Should be imported qualified.
+
+module GHC.Data.ShortText (
+        -- * ShortText
+        ShortText(..),
+        -- ** Conversion to and from String
+        pack,
+        unpack,
+        -- ** Operations
+        codepointLength,
+        byteLength,
+        GHC.Data.ShortText.null,
+        splitFilePath,
+        GHC.Data.ShortText.head,
+        stripPrefix
+  ) where
+
+import Prelude
+
+import Control.Monad (guard)
+import Control.DeepSeq as DeepSeq
+import Data.Binary
+import qualified Data.ByteString.Char8 as B8
+import qualified Data.ByteString.Short.Internal as SBS
+import GHC.Exts
+import GHC.IO
+import GHC.Utils.Encoding
+import System.FilePath (isPathSeparator)
+
+{-| A 'ShortText' is a modified UTF-8 encoded string meant for short strings like
+file paths, module descriptions, etc.
+-}
+newtype ShortText = ShortText { contents :: SBS.ShortByteString
+                              }
+                              deriving stock (Show)
+                              deriving newtype (Eq, Ord, Binary, Semigroup, Monoid, NFData)
+
+-- We don't want to derive this one from ShortByteString since that one won't handle
+-- UTF-8 characters correctly.
+instance IsString ShortText where
+  fromString = pack
+
+-- | /O(n)/ Returns the length of the 'ShortText' in characters.
+codepointLength :: ShortText -> Int
+codepointLength st = unsafeDupablePerformIO $ countUTF8Chars (contents st)
+-- | /O(1)/ Returns the length of the 'ShortText' in bytes.
+byteLength :: ShortText -> Int
+byteLength st = SBS.length $ contents st
+
+-- | /O(n)/ Convert a 'String' into a 'ShortText'.
+pack :: String -> ShortText
+pack s = unsafeDupablePerformIO $ ShortText <$> utf8EncodeShortByteString s
+
+-- | /O(n)/ Convert a 'ShortText' into a 'String'.
+unpack :: ShortText -> String
+unpack st = utf8DecodeShortByteString $ contents st
+
+-- | /O(1)/ Test whether the 'ShortText' is the empty string.
+null :: ShortText -> Bool
+null st = SBS.null $ contents st
+
+-- | /O(n)/ Split a 'ShortText' representing a file path into its components by separating
+-- on the file separator characters for this platform.
+splitFilePath :: ShortText -> [ShortText]
+-- This seems dangerous, but since the path separators are in the ASCII set they map down
+-- to a single byte when encoded in UTF-8 and so this should work even when casting to ByteString.
+-- We DeepSeq.force the resulting list so that we can be sure that no references to the
+-- bytestring in `st'` remain in unevaluated thunks, which might prevent `st'` from being
+-- collected by the GC.
+splitFilePath st = DeepSeq.force $ map (ShortText . SBS.toShort) $ B8.splitWith isPathSeparator st'
+  where st' = SBS.fromShort $ contents st
+
+-- | /O(1)/ Returns the first UTF-8 codepoint in the 'ShortText'. Depending on the string in
+-- question, this may or may not be the actual first character in the string due to Unicode
+-- non-printable characters.
+head :: ShortText -> Char
+head st
+  | SBS.null $ contents st = error "head: Empty ShortText"
+  | otherwise              = Prelude.head $ unpack st
+
+-- | /O(n)/ The 'stripPrefix' function takes two 'ShortText's and returns 'Just' the remainder of
+-- the second iff the first is its prefix, and otherwise Nothing.
+stripPrefix :: ShortText -> ShortText -> Maybe ShortText
+stripPrefix prefix st = do
+  let !(SBS.SBS prefixBA) = contents prefix
+  let !(SBS.SBS stBA)     = contents st
+  let prefixLength        = sizeofByteArray# prefixBA
+  let stLength            = sizeofByteArray# stBA
+  -- If the length of 'st' is not >= than the length of 'prefix', it is impossible for 'prefix'
+  -- to be the prefix of `st`.
+  guard $ (I# stLength) >= (I# prefixLength)
+  -- 'prefix' is a prefix of 'st' if the first <length of prefix> bytes of 'st'
+  -- are equal to 'prefix'
+  guard $ I# (compareByteArrays# prefixBA 0# stBA 0# prefixLength) == 0
+  -- Allocate a new ByteArray# and copy the remainder of the 'st' into it
+  unsafeDupablePerformIO $ do
+    let newBAsize = (stLength -# prefixLength)
+    newSBS <- IO $ \s0 ->
+      let !(# s1, ba #)  = newByteArray# newBAsize s0
+          s2             = copyByteArray# stBA prefixLength ba 0# newBAsize s1
+          !(# s3, fba #) = unsafeFreezeByteArray# ba s2
+      in  (# s3, SBS.SBS fba #)
+    return . Just . ShortText $ newSBS
author	Wander Hillen <wjw.hillen@gmail.com>	2020-09-25 11:41:06 +0200
committer	Marge Bot <ben+marge-bot@smart-cactus.org>	2020-10-13 00:12:47 -0400
commit	7fdcce6d4d13a10a1b2336c1d40482c64dba664d (patch)
tree	66dfd350e5abef849793060d745d1a1df64e47df /libraries/ghc-boot/GHC/Data/ShortText.hs
parent	9bbc84d20d0f50901351246cbe97c45234ca7d95 (diff)
download	haskell-7fdcce6d4d13a10a1b2336c1d40482c64dba664d.tar.gz