diff options
-rw-r--r-- | compiler/utils/StringBuffer.lhs | 45 | ||||
-rw-r--r-- | testsuite/.gitignore | 2 | ||||
-rw-r--r-- | testsuite/tests/parser/unicode/T6016.hs | 34 | ||||
-rw-r--r-- | testsuite/tests/parser/unicode/all.T | 1 |
4 files changed, 69 insertions, 13 deletions
diff --git a/compiler/utils/StringBuffer.lhs b/compiler/utils/StringBuffer.lhs index a54f45ffff..50d8443b05 100644 --- a/compiler/utils/StringBuffer.lhs +++ b/compiler/utils/StringBuffer.lhs @@ -47,9 +47,12 @@ import Encoding import FastString import FastTypes import FastFunctions +import Outputable +import Util -import System.IO ( hGetBuf, hFileSize,IOMode(ReadMode), hClose - , Handle, hTell, openBinaryFile ) +import Data.Maybe +import Control.Exception +import System.IO import System.IO.Unsafe ( unsafePerformIO ) import GHC.Exts @@ -89,7 +92,8 @@ hGetStringBuffer :: FilePath -> IO StringBuffer hGetStringBuffer fname = do h <- openBinaryFile fname ReadMode size_i <- hFileSize h - let size = fromIntegral size_i + offset_i <- skipBOM h size_i 0 -- offset is 0 initially + let size = fromIntegral $ size_i - offset_i buf <- mallocForeignPtrArray (size+3) withForeignPtr buf $ \ptr -> do r <- if size == 0 then return 0 else hGetBuf h ptr size @@ -101,7 +105,7 @@ hGetStringBuffer fname = do hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer hGetStringBufferBlock handle wanted = do size_i <- hFileSize handle - offset_i <- hTell handle + offset_i <- hTell handle >>= skipBOM handle size_i let size = min wanted (fromIntegral $ size_i-offset_i) buf <- mallocForeignPtrArray (size+3) withForeignPtr buf $ \ptr -> @@ -110,19 +114,34 @@ hGetStringBufferBlock handle wanted then ioError (userError $ "short read of file: "++show(r,size,size_i,handle)) else newUTF8StringBuffer buf ptr size +-- | Skip the byte-order mark if there is one (see #1744 and #6016), +-- and return the new position of the handle in bytes. +-- +-- This is better than treating #FEFF as whitespace, +-- because that would mess up layout. We don't have a concept +-- of zero-width whitespace in Haskell: all whitespace codepoints +-- have a width of one column. +skipBOM :: Handle -> Integer -> Integer -> IO Integer +skipBOM h size offset = + -- Only skip BOM at the beginning of a file. + if size > 0 && offset == 0 + then do + -- Validate assumption that handle is in binary mode. + ASSERTM( hGetEncoding h >>= return . isNothing ) + -- Temporarily select text mode to make `hLookAhead` and + -- `hGetChar` return full Unicode characters. + bracket_ (hSetBinaryMode h False) (hSetBinaryMode h True) $ do + c <- hLookAhead h + if c == '\xfeff' + then hGetChar h >> hTell h + else return offset + else return offset + newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer newUTF8StringBuffer buf ptr size = do pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0] -- sentinels for UTF-8 decoding - let - sb0 = StringBuffer buf size 0 - (first_char, sb1) = nextChar sb0 - -- skip the byte-order mark if there is one (see #1744) - -- This is better than treating #FEFF as whitespace, - -- because that would mess up layout. We don't have a concept - -- of zero-width whitespace in Haskell: all whitespace codepoints - -- have a width of one column. - return (if first_char == '\xfeff' then sb1 else sb0) + return $ StringBuffer buf size 0 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer appendStringBuffers sb1 sb2 diff --git a/testsuite/.gitignore b/testsuite/.gitignore index 591545cdc3..4f8ac870e6 100644 --- a/testsuite/.gitignore +++ b/testsuite/.gitignore @@ -1074,6 +1074,8 @@ mk/ghcconfig_*_inplace_bin_ghc-stage2.exe.mk /tests/parser/should_run/readRun004 /tests/parser/unicode/1744 /tests/parser/unicode/T1744 +/tests/parser/unicode/T6016 +/tests/parser/unicode/T6016-twoBOMs /tests/parser/unicode/utf8_024 /tests/patsyn/should_run/bidir-explicit /tests/patsyn/should_run/bidir-explicit-scope diff --git a/testsuite/tests/parser/unicode/T6016.hs b/testsuite/tests/parser/unicode/T6016.hs new file mode 100644 index 0000000000..5783a72843 --- /dev/null +++ b/testsuite/tests/parser/unicode/T6016.hs @@ -0,0 +1,34 @@ +module Main where + +import Control.Exception +import Data.Char +import System.IO + +import StringBuffer + +twoBOMs = "T6016-twoBOMs" + +ignoreFirstBOM = do + -- StringBuffer should not contain initial byte-order mark. + -- + -- Just skipping over it, but leaving it in the Stringbuffer, is not + -- sufficient. The Lexer calls prevChar when a regular expression + -- starts with '^' (which is a shorthand for '\n^'). It would never + -- match on the first line, since instead of '\n', prevChar would + -- still return '\xfeff'. + s <- hGetStringBuffer twoBOMs + assert (prevChar s '\n' == '\n') return () + +dontIgnoreSecondBOM = do + -- U+FEFF is considered a BOM only if it appears as the first + -- character of a file. + h <- openBinaryFile twoBOMs ReadMode + hSeek h AbsoluteSeek 3 + s <- hGetStringBufferBlock h 3 + hClose h + assert (currentChar s == '\xfeff') return () + +main = do + writeFile twoBOMs "\xfeff\xfeff" + ignoreFirstBOM + dontIgnoreSecondBOM diff --git a/testsuite/tests/parser/unicode/all.T b/testsuite/tests/parser/unicode/all.T index a8e19ebec9..2ff7edf927 100644 --- a/testsuite/tests/parser/unicode/all.T +++ b/testsuite/tests/parser/unicode/all.T @@ -20,4 +20,5 @@ test('T1744', normal, compile_and_run, ['']) test('T1103', normal, compile, ['']) test('T2302', only_ways(['normal']), compile_fail, ['']) test('T4373', normal, compile, ['']) +test('T6016', extra_clean('T6016-twoBOMs'), compile_and_run, ['-package ghc']) test('T7671', normal, compile, ['']) |