Modules: Utils and Data (#13009)

Update Haddock submodule Metric Increase: haddock.compiler
author: Sylvain Henry <sylvain@haskus.fr> 2020-04-20 16:54:38 +0200
committer: Marge Bot <ben+marge-bot@smart-cactus.org> 2020-04-26 13:55:14 -0400
commit: af332442123878c1b61d236dce46418efcbe8750 (patch)
tree: ec4b332843cdd4fedb4aa60b11b7b8dba82a0764 /compiler/GHC/Data/StringBuffer.hs
parent: b0fbfc7582fb81314dc28a056536737fb5eeaa6e (diff)
download: haskell-af332442123878c1b61d236dce46418efcbe8750.tar.gz
1 files changed, 334 insertions, 0 deletions
diff --git a/compiler/GHC/Data/StringBuffer.hs b/compiler/GHC/Data/StringBuffer.hs
new file mode 100644
index 0000000000..8ac5d1ae07
--- /dev/null
+++ b/compiler/GHC/Data/StringBuffer.hs
@@ -0,0 +1,334 @@
+{-
+(c) The University of Glasgow 2006
+(c) The University of Glasgow, 1997-2006
+
+
+Buffers for scanning string input stored in external arrays.
+-}
+
+{-# LANGUAGE BangPatterns, CPP, MagicHash, UnboxedTuples #-}
+{-# OPTIONS_GHC -O2 #-}
+-- We always optimise this, otherwise performance of a non-optimised
+-- compiler is severely affected
+
+module GHC.Data.StringBuffer
+       (
+        StringBuffer(..),
+        -- non-abstract for vs\/HaskellService
+
+         -- * Creation\/destruction
+        hGetStringBuffer,
+        hGetStringBufferBlock,
+        hPutStringBuffer,
+        appendStringBuffers,
+        stringToStringBuffer,
+
+        -- * Inspection
+        nextChar,
+        currentChar,
+        prevChar,
+        atEnd,
+
+        -- * Moving and comparison
+        stepOn,
+        offsetBytes,
+        byteDiff,
+        atLine,
+
+        -- * Conversion
+        lexemeToString,
+        lexemeToFastString,
+        decodePrevNChars,
+
+         -- * Parsing integers
+        parseUnsignedInteger,
+       ) where
+
+#include "HsVersions.h"
+
+import GHC.Prelude
+
+import GHC.Utils.Encoding
+import GHC.Data.FastString
+import GHC.Utils.IO.Unsafe
+import GHC.Utils.Panic.Plain
+import GHC.Utils.Misc
+
+import Data.Maybe
+import Control.Exception
+import System.IO
+import System.IO.Unsafe         ( unsafePerformIO )
+import GHC.IO.Encoding.UTF8     ( mkUTF8 )
+import GHC.IO.Encoding.Failure  ( CodingFailureMode(IgnoreCodingFailure) )
+
+import GHC.Exts
+
+import Foreign
+
+-- -----------------------------------------------------------------------------
+-- The StringBuffer type
+
+-- |A StringBuffer is an internal pointer to a sized chunk of bytes.
+-- The bytes are intended to be *immutable*.  There are pure
+-- operations to read the contents of a StringBuffer.
+--
+-- A StringBuffer may have a finalizer, depending on how it was
+-- obtained.
+--
+data StringBuffer
+ = StringBuffer {
+     buf :: {-# UNPACK #-} !(ForeignPtr Word8),
+     len :: {-# UNPACK #-} !Int,        -- length
+     cur :: {-# UNPACK #-} !Int         -- current pos
+  }
+  -- The buffer is assumed to be UTF-8 encoded, and furthermore
+  -- we add three @\'\\0\'@ bytes to the end as sentinels so that the
+  -- decoder doesn't have to check for overflow at every single byte
+  -- of a multibyte sequence.
+
+instance Show StringBuffer where
+        showsPrec _ s = showString "<stringbuffer("
+                      . shows (len s) . showString "," . shows (cur s)
+                      . showString ")>"
+
+-- -----------------------------------------------------------------------------
+-- Creation / Destruction
+
+-- | Read a file into a 'StringBuffer'.  The resulting buffer is automatically
+-- managed by the garbage collector.
+hGetStringBuffer :: FilePath -> IO StringBuffer
+hGetStringBuffer fname = do
+   h <- openBinaryFile fname ReadMode
+   size_i <- hFileSize h
+   offset_i <- skipBOM h size_i 0  -- offset is 0 initially
+   let size = fromIntegral $ size_i - offset_i
+   buf <- mallocForeignPtrArray (size+3)
+   withForeignPtr buf $ \ptr -> do
+     r <- if size == 0 then return 0 else hGetBuf h ptr size
+     hClose h
+     if (r /= size)
+        then ioError (userError "short read of file")
+        else newUTF8StringBuffer buf ptr size
+
+hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
+hGetStringBufferBlock handle wanted
+    = do size_i <- hFileSize handle
+         offset_i <- hTell handle >>= skipBOM handle size_i
+         let size = min wanted (fromIntegral $ size_i-offset_i)
+         buf <- mallocForeignPtrArray (size+3)
+         withForeignPtr buf $ \ptr ->
+             do r <- if size == 0 then return 0 else hGetBuf handle ptr size
+                if r /= size
+                   then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
+                   else newUTF8StringBuffer buf ptr size
+
+hPutStringBuffer :: Handle -> StringBuffer -> IO ()
+hPutStringBuffer hdl (StringBuffer buf len cur)
+    = do withForeignPtr (plusForeignPtr buf cur) $ \ptr ->
+             hPutBuf hdl ptr len
+
+-- | Skip the byte-order mark if there is one (see #1744 and #6016),
+-- and return the new position of the handle in bytes.
+--
+-- This is better than treating #FEFF as whitespace,
+-- because that would mess up layout.  We don't have a concept
+-- of zero-width whitespace in Haskell: all whitespace codepoints
+-- have a width of one column.
+skipBOM :: Handle -> Integer -> Integer -> IO Integer
+skipBOM h size offset =
+  -- Only skip BOM at the beginning of a file.
+  if size > 0 && offset == 0
+    then do
+      -- Validate assumption that handle is in binary mode.
+      ASSERTM( hGetEncoding h >>= return . isNothing )
+      -- Temporarily select utf8 encoding with error ignoring,
+      -- to make `hLookAhead` and `hGetChar` return full Unicode characters.
+      bracket_ (hSetEncoding h safeEncoding) (hSetBinaryMode h True) $ do
+        c <- hLookAhead h
+        if c == '\xfeff'
+          then hGetChar h >> hTell h
+          else return offset
+    else return offset
+  where
+    safeEncoding = mkUTF8 IgnoreCodingFailure
+
+newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
+newUTF8StringBuffer buf ptr size = do
+  pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
+  -- sentinels for UTF-8 decoding
+  return $ StringBuffer buf size 0
+
+appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
+appendStringBuffers sb1 sb2
+    = do newBuf <- mallocForeignPtrArray (size+3)
+         withForeignPtr newBuf $ \ptr ->
+          withForeignPtr (buf sb1) $ \sb1Ptr ->
+           withForeignPtr (buf sb2) $ \sb2Ptr ->
+             do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
+                copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
+                pokeArray (ptr `advancePtr` size) [0,0,0]
+                return (StringBuffer newBuf size 0)
+    where sb1_len = calcLen sb1
+          sb2_len = calcLen sb2
+          calcLen sb = len sb - cur sb
+          size =  sb1_len + sb2_len
+
+-- | Encode a 'String' into a 'StringBuffer' as UTF-8.  The resulting buffer
+-- is automatically managed by the garbage collector.
+stringToStringBuffer :: String -> StringBuffer
+stringToStringBuffer str =
+ unsafePerformIO $ do
+  let size = utf8EncodedLength str
+  buf <- mallocForeignPtrArray (size+3)
+  withForeignPtr buf $ \ptr -> do
+    utf8EncodeString ptr str
+    pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
+    -- sentinels for UTF-8 decoding
+  return (StringBuffer buf size 0)
+
+-- -----------------------------------------------------------------------------
+-- Grab a character
+
+-- | Return the first UTF-8 character of a nonempty 'StringBuffer' and as well
+-- the remaining portion (analogous to 'Data.List.uncons').  __Warning:__ The
+-- behavior is undefined if the 'StringBuffer' is empty.  The result shares
+-- the same buffer as the original.  Similar to 'utf8DecodeChar', if the
+-- character cannot be decoded as UTF-8, @\'\\0\'@ is returned.
+{-# INLINE nextChar #-}
+nextChar :: StringBuffer -> (Char,StringBuffer)
+nextChar (StringBuffer buf len (I# cur#)) =
+  -- Getting our fingers dirty a little here, but this is performance-critical
+  inlinePerformIO $ do
+    withForeignPtr buf $ \(Ptr a#) -> do
+        case utf8DecodeChar# (a# `plusAddr#` cur#) of
+          (# c#, nBytes# #) ->
+             let cur' = I# (cur# +# nBytes#) in
+             return (C# c#, StringBuffer buf len cur')
+
+-- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
+-- to 'Data.List.head').  __Warning:__ The behavior is undefined if the
+-- 'StringBuffer' is empty.  Similar to 'utf8DecodeChar', if the character
+-- cannot be decoded as UTF-8, @\'\\0\'@ is returned.
+currentChar :: StringBuffer -> Char
+currentChar = fst . nextChar
+
+prevChar :: StringBuffer -> Char -> Char
+prevChar (StringBuffer _   _   0)   deflt = deflt
+prevChar (StringBuffer buf _   cur) _     =
+  inlinePerformIO $ do
+    withForeignPtr buf $ \p -> do
+      p' <- utf8PrevChar (p `plusPtr` cur)
+      return (fst (utf8DecodeChar p'))
+
+-- -----------------------------------------------------------------------------
+-- Moving
+
+-- | Return a 'StringBuffer' with the first UTF-8 character removed (analogous
+-- to 'Data.List.tail').  __Warning:__ The behavior is undefined if the
+-- 'StringBuffer' is empty.  The result shares the same buffer as the
+-- original.
+stepOn :: StringBuffer -> StringBuffer
+stepOn s = snd (nextChar s)
+
+-- | Return a 'StringBuffer' with the first @n@ bytes removed.  __Warning:__
+-- If there aren't enough characters, the returned 'StringBuffer' will be
+-- invalid and any use of it may lead to undefined behavior.  The result
+-- shares the same buffer as the original.
+offsetBytes :: Int                      -- ^ @n@, the number of bytes
+            -> StringBuffer
+            -> StringBuffer
+offsetBytes i s = s { cur = cur s + i }
+
+-- | Compute the difference in offset between two 'StringBuffer's that share
+-- the same buffer.  __Warning:__ The behavior is undefined if the
+-- 'StringBuffer's use separate buffers.
+byteDiff :: StringBuffer -> StringBuffer -> Int
+byteDiff s1 s2 = cur s2 - cur s1
+
+-- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
+atEnd :: StringBuffer -> Bool
+atEnd (StringBuffer _ l c) = l == c
+
+-- | Computes a 'StringBuffer' which points to the first character of the
+-- wanted line. Lines begin at 1.
+atLine :: Int -> StringBuffer -> Maybe StringBuffer
+atLine line sb@(StringBuffer buf len _) =
+  inlinePerformIO $
+    withForeignPtr buf $ \p -> do
+      p' <- skipToLine line len p
+      if p' == nullPtr
+        then return Nothing
+        else
+          let
+            delta = p' `minusPtr` p
+          in return $ Just (sb { cur = delta
+                               , len = len - delta
+                               })
+
+skipToLine :: Int -> Int -> Ptr Word8 -> IO (Ptr Word8)
+skipToLine !line !len !op0 = go 1 op0
+  where
+    !opend = op0 `plusPtr` len
+
+    go !i_line !op
+      | op >= opend    = pure nullPtr
+      | i_line == line = pure op
+      | otherwise      = do
+          w <- peek op :: IO Word8
+          case w of
+            10 -> go (i_line + 1) (plusPtr op 1)
+            13 -> do
+              -- this is safe because a 'StringBuffer' is
+              -- guaranteed to have 3 bytes sentinel values.
+              w' <- peek (plusPtr op 1) :: IO Word8
+              case w' of
+                10 -> go (i_line + 1) (plusPtr op 2)
+                _  -> go (i_line + 1) (plusPtr op 1)
+            _  -> go i_line (plusPtr op 1)
+
+-- -----------------------------------------------------------------------------
+-- Conversion
+
+-- | Decode the first @n@ bytes of a 'StringBuffer' as UTF-8 into a 'String'.
+-- Similar to 'utf8DecodeChar', if the character cannot be decoded as UTF-8,
+-- they will be replaced with @\'\\0\'@.
+lexemeToString :: StringBuffer
+               -> Int                   -- ^ @n@, the number of bytes
+               -> String
+lexemeToString _ 0 = ""
+lexemeToString (StringBuffer buf _ cur) bytes =
+  utf8DecodeStringLazy buf cur bytes
+
+lexemeToFastString :: StringBuffer
+                   -> Int               -- ^ @n@, the number of bytes
+                   -> FastString
+lexemeToFastString _ 0 = nilFS
+lexemeToFastString (StringBuffer buf _ cur) len =
+   inlinePerformIO $
+     withForeignPtr buf $ \ptr ->
+       return $! mkFastStringBytes (ptr `plusPtr` cur) len
+
+-- | Return the previous @n@ characters (or fewer if we are less than @n@
+-- characters into the buffer.
+decodePrevNChars :: Int -> StringBuffer -> String
+decodePrevNChars n (StringBuffer buf _ cur) =
+    inlinePerformIO $ withForeignPtr buf $ \p0 ->
+      go p0 n "" (p0 `plusPtr` (cur - 1))
+  where
+    go :: Ptr Word8 -> Int -> String -> Ptr Word8 -> IO String
+    go buf0 n acc p | n == 0 || buf0 >= p = return acc
+    go buf0 n acc p = do
+        p' <- utf8PrevChar p
+        let (c,_) = utf8DecodeChar p'
+        go buf0 (n - 1) (c:acc) p'
+
+-- -----------------------------------------------------------------------------
+-- Parsing integer strings in various bases
+parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
+parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
+  = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
+    go i x | i == len  = x
+           | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
+               '_'  -> go (i + 1) x    -- skip "_" (#14473)
+               char -> go (i + 1) (x * radix + toInteger (char_to_int char))
+  in go 0 0
author	Sylvain Henry <sylvain@haskus.fr>	2020-04-20 16:54:38 +0200
committer	Marge Bot <ben+marge-bot@smart-cactus.org>	2020-04-26 13:55:14 -0400
commit	af332442123878c1b61d236dce46418efcbe8750 (patch)
tree	ec4b332843cdd4fedb4aa60b11b7b8dba82a0764 /compiler/GHC/Data/StringBuffer.hs
parent	b0fbfc7582fb81314dc28a056536737fb5eeaa6e (diff)
download	haskell-af332442123878c1b61d236dce46418efcbe8750.tar.gz