summaryrefslogtreecommitdiff
path: root/libraries/ghc-boot/GHC/Data/ShortText.hs
diff options
context:
space:
mode:
authorWander Hillen <wjw.hillen@gmail.com>2020-09-25 11:41:06 +0200
committerMarge Bot <ben+marge-bot@smart-cactus.org>2020-10-13 00:12:47 -0400
commit7fdcce6d4d13a10a1b2336c1d40482c64dba664d (patch)
tree66dfd350e5abef849793060d745d1a1df64e47df /libraries/ghc-boot/GHC/Data/ShortText.hs
parent9bbc84d20d0f50901351246cbe97c45234ca7d95 (diff)
downloadhaskell-7fdcce6d4d13a10a1b2336c1d40482c64dba664d.tar.gz
Initial ShortText code and conversion of package db code
Metric Decrease: Naperian T10421 T10421a T10547 T12150 T12234 T12425 T13035 T18140 T18304 T5837 T6048 T13253-spj T18282 T18223 T3064 T9961 Metric Increase T13701 HFSKJH
Diffstat (limited to 'libraries/ghc-boot/GHC/Data/ShortText.hs')
-rw-r--r--libraries/ghc-boot/GHC/Data/ShortText.hs112
1 files changed, 112 insertions, 0 deletions
diff --git a/libraries/ghc-boot/GHC/Data/ShortText.hs b/libraries/ghc-boot/GHC/Data/ShortText.hs
new file mode 100644
index 0000000000..f51d79864b
--- /dev/null
+++ b/libraries/ghc-boot/GHC/Data/ShortText.hs
@@ -0,0 +1,112 @@
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, GeneralizedNewtypeDeriving, DerivingStrategies #-}
+{-# OPTIONS_GHC -O2 -funbox-strict-fields #-}
+
+-- |
+-- An Unicode string for internal GHC use. Meant to replace String
+-- in places where being a lazy linked is not very useful and a more
+-- memory efficient data structure is desirable.
+
+-- Very similar to FastString, but not hash-consed and with some extra instances and
+-- functions for serialisation and I/O. Should be imported qualified.
+
+module GHC.Data.ShortText (
+ -- * ShortText
+ ShortText(..),
+ -- ** Conversion to and from String
+ pack,
+ unpack,
+ -- ** Operations
+ codepointLength,
+ byteLength,
+ GHC.Data.ShortText.null,
+ splitFilePath,
+ GHC.Data.ShortText.head,
+ stripPrefix
+ ) where
+
+import Prelude
+
+import Control.Monad (guard)
+import Control.DeepSeq as DeepSeq
+import Data.Binary
+import qualified Data.ByteString.Char8 as B8
+import qualified Data.ByteString.Short.Internal as SBS
+import GHC.Exts
+import GHC.IO
+import GHC.Utils.Encoding
+import System.FilePath (isPathSeparator)
+
+{-| A 'ShortText' is a modified UTF-8 encoded string meant for short strings like
+file paths, module descriptions, etc.
+-}
+newtype ShortText = ShortText { contents :: SBS.ShortByteString
+ }
+ deriving stock (Show)
+ deriving newtype (Eq, Ord, Binary, Semigroup, Monoid, NFData)
+
+-- We don't want to derive this one from ShortByteString since that one won't handle
+-- UTF-8 characters correctly.
+instance IsString ShortText where
+ fromString = pack
+
+-- | /O(n)/ Returns the length of the 'ShortText' in characters.
+codepointLength :: ShortText -> Int
+codepointLength st = unsafeDupablePerformIO $ countUTF8Chars (contents st)
+-- | /O(1)/ Returns the length of the 'ShortText' in bytes.
+byteLength :: ShortText -> Int
+byteLength st = SBS.length $ contents st
+
+-- | /O(n)/ Convert a 'String' into a 'ShortText'.
+pack :: String -> ShortText
+pack s = unsafeDupablePerformIO $ ShortText <$> utf8EncodeShortByteString s
+
+-- | /O(n)/ Convert a 'ShortText' into a 'String'.
+unpack :: ShortText -> String
+unpack st = utf8DecodeShortByteString $ contents st
+
+-- | /O(1)/ Test whether the 'ShortText' is the empty string.
+null :: ShortText -> Bool
+null st = SBS.null $ contents st
+
+-- | /O(n)/ Split a 'ShortText' representing a file path into its components by separating
+-- on the file separator characters for this platform.
+splitFilePath :: ShortText -> [ShortText]
+-- This seems dangerous, but since the path separators are in the ASCII set they map down
+-- to a single byte when encoded in UTF-8 and so this should work even when casting to ByteString.
+-- We DeepSeq.force the resulting list so that we can be sure that no references to the
+-- bytestring in `st'` remain in unevaluated thunks, which might prevent `st'` from being
+-- collected by the GC.
+splitFilePath st = DeepSeq.force $ map (ShortText . SBS.toShort) $ B8.splitWith isPathSeparator st'
+ where st' = SBS.fromShort $ contents st
+
+-- | /O(1)/ Returns the first UTF-8 codepoint in the 'ShortText'. Depending on the string in
+-- question, this may or may not be the actual first character in the string due to Unicode
+-- non-printable characters.
+head :: ShortText -> Char
+head st
+ | SBS.null $ contents st = error "head: Empty ShortText"
+ | otherwise = Prelude.head $ unpack st
+
+-- | /O(n)/ The 'stripPrefix' function takes two 'ShortText's and returns 'Just' the remainder of
+-- the second iff the first is its prefix, and otherwise Nothing.
+stripPrefix :: ShortText -> ShortText -> Maybe ShortText
+stripPrefix prefix st = do
+ let !(SBS.SBS prefixBA) = contents prefix
+ let !(SBS.SBS stBA) = contents st
+ let prefixLength = sizeofByteArray# prefixBA
+ let stLength = sizeofByteArray# stBA
+ -- If the length of 'st' is not >= than the length of 'prefix', it is impossible for 'prefix'
+ -- to be the prefix of `st`.
+ guard $ (I# stLength) >= (I# prefixLength)
+ -- 'prefix' is a prefix of 'st' if the first <length of prefix> bytes of 'st'
+ -- are equal to 'prefix'
+ guard $ I# (compareByteArrays# prefixBA 0# stBA 0# prefixLength) == 0
+ -- Allocate a new ByteArray# and copy the remainder of the 'st' into it
+ unsafeDupablePerformIO $ do
+ let newBAsize = (stLength -# prefixLength)
+ newSBS <- IO $ \s0 ->
+ let !(# s1, ba #) = newByteArray# newBAsize s0
+ s2 = copyByteArray# stBA prefixLength ba 0# newBAsize s1
+ !(# s3, fba #) = unsafeFreezeByteArray# ba s2
+ in (# s3, SBS.SBS fba #)
+ return . Just . ShortText $ newSBS