diff options
author | Wander Hillen <wjw.hillen@gmail.com> | 2020-09-25 11:41:06 +0200 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2020-10-13 00:12:47 -0400 |
commit | 7fdcce6d4d13a10a1b2336c1d40482c64dba664d (patch) | |
tree | 66dfd350e5abef849793060d745d1a1df64e47df /libraries/ghc-boot/GHC/Data/ShortText.hs | |
parent | 9bbc84d20d0f50901351246cbe97c45234ca7d95 (diff) | |
download | haskell-7fdcce6d4d13a10a1b2336c1d40482c64dba664d.tar.gz |
Initial ShortText code and conversion of package db code
Metric Decrease:
Naperian
T10421
T10421a
T10547
T12150
T12234
T12425
T13035
T18140
T18304
T5837
T6048
T13253-spj
T18282
T18223
T3064
T9961
Metric Increase
T13701
HFSKJH
Diffstat (limited to 'libraries/ghc-boot/GHC/Data/ShortText.hs')
-rw-r--r-- | libraries/ghc-boot/GHC/Data/ShortText.hs | 112 |
1 files changed, 112 insertions, 0 deletions
diff --git a/libraries/ghc-boot/GHC/Data/ShortText.hs b/libraries/ghc-boot/GHC/Data/ShortText.hs new file mode 100644 index 0000000000..f51d79864b --- /dev/null +++ b/libraries/ghc-boot/GHC/Data/ShortText.hs @@ -0,0 +1,112 @@ +{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, GeneralizedNewtypeDeriving, DerivingStrategies #-} +{-# OPTIONS_GHC -O2 -funbox-strict-fields #-} + +-- | +-- An Unicode string for internal GHC use. Meant to replace String +-- in places where being a lazy linked is not very useful and a more +-- memory efficient data structure is desirable. + +-- Very similar to FastString, but not hash-consed and with some extra instances and +-- functions for serialisation and I/O. Should be imported qualified. + +module GHC.Data.ShortText ( + -- * ShortText + ShortText(..), + -- ** Conversion to and from String + pack, + unpack, + -- ** Operations + codepointLength, + byteLength, + GHC.Data.ShortText.null, + splitFilePath, + GHC.Data.ShortText.head, + stripPrefix + ) where + +import Prelude + +import Control.Monad (guard) +import Control.DeepSeq as DeepSeq +import Data.Binary +import qualified Data.ByteString.Char8 as B8 +import qualified Data.ByteString.Short.Internal as SBS +import GHC.Exts +import GHC.IO +import GHC.Utils.Encoding +import System.FilePath (isPathSeparator) + +{-| A 'ShortText' is a modified UTF-8 encoded string meant for short strings like +file paths, module descriptions, etc. +-} +newtype ShortText = ShortText { contents :: SBS.ShortByteString + } + deriving stock (Show) + deriving newtype (Eq, Ord, Binary, Semigroup, Monoid, NFData) + +-- We don't want to derive this one from ShortByteString since that one won't handle +-- UTF-8 characters correctly. +instance IsString ShortText where + fromString = pack + +-- | /O(n)/ Returns the length of the 'ShortText' in characters. +codepointLength :: ShortText -> Int +codepointLength st = unsafeDupablePerformIO $ countUTF8Chars (contents st) +-- | /O(1)/ Returns the length of the 'ShortText' in bytes. +byteLength :: ShortText -> Int +byteLength st = SBS.length $ contents st + +-- | /O(n)/ Convert a 'String' into a 'ShortText'. +pack :: String -> ShortText +pack s = unsafeDupablePerformIO $ ShortText <$> utf8EncodeShortByteString s + +-- | /O(n)/ Convert a 'ShortText' into a 'String'. +unpack :: ShortText -> String +unpack st = utf8DecodeShortByteString $ contents st + +-- | /O(1)/ Test whether the 'ShortText' is the empty string. +null :: ShortText -> Bool +null st = SBS.null $ contents st + +-- | /O(n)/ Split a 'ShortText' representing a file path into its components by separating +-- on the file separator characters for this platform. +splitFilePath :: ShortText -> [ShortText] +-- This seems dangerous, but since the path separators are in the ASCII set they map down +-- to a single byte when encoded in UTF-8 and so this should work even when casting to ByteString. +-- We DeepSeq.force the resulting list so that we can be sure that no references to the +-- bytestring in `st'` remain in unevaluated thunks, which might prevent `st'` from being +-- collected by the GC. +splitFilePath st = DeepSeq.force $ map (ShortText . SBS.toShort) $ B8.splitWith isPathSeparator st' + where st' = SBS.fromShort $ contents st + +-- | /O(1)/ Returns the first UTF-8 codepoint in the 'ShortText'. Depending on the string in +-- question, this may or may not be the actual first character in the string due to Unicode +-- non-printable characters. +head :: ShortText -> Char +head st + | SBS.null $ contents st = error "head: Empty ShortText" + | otherwise = Prelude.head $ unpack st + +-- | /O(n)/ The 'stripPrefix' function takes two 'ShortText's and returns 'Just' the remainder of +-- the second iff the first is its prefix, and otherwise Nothing. +stripPrefix :: ShortText -> ShortText -> Maybe ShortText +stripPrefix prefix st = do + let !(SBS.SBS prefixBA) = contents prefix + let !(SBS.SBS stBA) = contents st + let prefixLength = sizeofByteArray# prefixBA + let stLength = sizeofByteArray# stBA + -- If the length of 'st' is not >= than the length of 'prefix', it is impossible for 'prefix' + -- to be the prefix of `st`. + guard $ (I# stLength) >= (I# prefixLength) + -- 'prefix' is a prefix of 'st' if the first <length of prefix> bytes of 'st' + -- are equal to 'prefix' + guard $ I# (compareByteArrays# prefixBA 0# stBA 0# prefixLength) == 0 + -- Allocate a new ByteArray# and copy the remainder of the 'st' into it + unsafeDupablePerformIO $ do + let newBAsize = (stLength -# prefixLength) + newSBS <- IO $ \s0 -> + let !(# s1, ba #) = newByteArray# newBAsize s0 + s2 = copyByteArray# stBA prefixLength ba 0# newBAsize s1 + !(# s3, fba #) = unsafeFreezeByteArray# ba s2 + in (# s3, SBS.SBS fba #) + return . Just . ShortText $ newSBS |