diff options
author | Alexis King <lexi.lambda@gmail.com> | 2022-04-26 12:30:47 -0500 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2022-05-04 09:58:14 -0400 |
commit | 85bc73bd8fab17ad80f925b45e8b4b059278ba6c (patch) | |
tree | d51782347d003da9dd4b81a6a5826b8fe305cf0b | |
parent | 948c7e40b29d3a9c71f2a968f90944319b1b03c2 (diff) | |
download | haskell-85bc73bd8fab17ad80f925b45e8b4b059278ba6c.tar.gz |
genprimopcode: Support Unicode properly
-rw-r--r-- | hadrian/src/Builder.hs | 19 | ||||
-rw-r--r-- | hadrian/src/Hadrian/Utilities.hs | 22 | ||||
-rw-r--r-- | utils/genprimopcode/Lexer.x | 1 | ||||
-rw-r--r-- | utils/genprimopcode/ParserM.hs | 55 |
4 files changed, 74 insertions, 23 deletions
diff --git a/hadrian/src/Builder.hs b/hadrian/src/Builder.hs index e29bf4316c..160eb44f7a 100644 --- a/hadrian/src/Builder.hs +++ b/hadrian/src/Builder.hs @@ -307,7 +307,8 @@ instance H.Builder Builder where -- Capture stdout and write it to the output file. captureStdout = do Stdout stdout <- cmd' [path] buildArgs - writeFileChanged output stdout + -- see Note [Capture stdout as a ByteString] + writeFileChangedBS output stdout case builder of Ar Pack _ -> do useTempFile <- flag ArSupportsAtFile @@ -332,7 +333,8 @@ instance H.Builder Builder where GenPrimopCode -> do stdin <- readFile' input Stdout stdout <- cmd' (Stdin stdin) [path] buildArgs - writeFileChanged output stdout + -- see Note [Capture stdout as a ByteString] + writeFileChangedBS output stdout GhcPkg Copy _ -> do Stdout pkgDesc <- cmd' [path] @@ -509,6 +511,19 @@ applyPatch dir patch = do -- tell if an Exit or ExitCode value is returned in `r`. So we use our own -- HasExit type class to provide the `hasExit` predicate that tells us if we -- should throw an exception as `cmd` would do in case of failure or not. +-- +-- Note [Capture stdout as a ByteString] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- As of shake-0.19.6, capturing a process stdout as a `String` using `Stdout` +-- mangles the encoding if it some other use of `Stdout` also captures it as a +-- `ByteString`; see <https://github.com/ndmitchell/shake/issues/828>. This +-- can cause us real problems, since `cmd'` (see Note [cmd wrapper]) *always* +-- captures stdout as a `ByteString`. +-- +-- Fortunately, a simple workaround is to avoid capturing stdout as a `String` +-- in the first place. It’s usually unnecessary (and is in fact pointless work), +-- as most of the time the captured output is immediately written to a file, so +-- we can just treat it as an opaque binary stream. -- | Wrapper for Shake's 'cmd' diff --git a/hadrian/src/Hadrian/Utilities.hs b/hadrian/src/Hadrian/Utilities.hs index e2fb221940..40b7835e55 100644 --- a/hadrian/src/Hadrian/Utilities.hs +++ b/hadrian/src/Hadrian/Utilities.hs @@ -18,7 +18,7 @@ module Hadrian.Utilities ( -- * File system operations copyFile, copyFileUntracked, createFileLink, fixFile, makeExecutable, moveFile, removeFile, createDirectory, copyDirectory, - moveDirectory, removeDirectory, removeFile_, + moveDirectory, removeDirectory, removeFile_, writeFileChangedBS, -- * Diagnostic info Colour (..), ANSIColour (..), putColoured, shouldUseColor, @@ -49,6 +49,7 @@ import Development.Shake.Classes import Development.Shake.FilePath import System.Environment (lookupEnv) +import qualified Data.ByteString as BS import qualified Control.Exception.Base as IO import qualified Data.HashMap.Strict as Map import qualified System.Directory.Extra as IO @@ -394,6 +395,25 @@ removeDirectory dir = do putProgressInfo $ "| Remove directory " ++ dir liftIO . whenM (IO.doesDirectoryExist dir) $ IO.removeDirectoryRecursive dir +-- | Like Shake's 'writeFileChanged', but accepts a 'ByteString'. +writeFileChangedBS :: FilePath -> BS.ByteString -> Action () +writeFileChangedBS name new = do + liftIO $ IO.createDirectoryIfMissing True $ takeDirectory name + exists <- liftIO $ IO.doesFileExist name + if exists + then do + old <- liftIO $ BS.readFile name + when (old /= new) $ do + liftIO $ removeFile_ name + do_write + else + do_write + where + do_write = do + putProgressInfo $ "| Write file " ++ name + liftIO $ BS.writeFile name new + + -- | Terminal output colours data Colour = Dull ANSIColour -- ^ 8-bit ANSI colours diff --git a/utils/genprimopcode/Lexer.x b/utils/genprimopcode/Lexer.x index 13c776fe4a..3f2432daab 100644 --- a/utils/genprimopcode/Lexer.x +++ b/utils/genprimopcode/Lexer.x @@ -77,4 +77,3 @@ get_tok = ParserM $ \i st -> lex_tok :: (Token -> ParserM a) -> ParserM a lex_tok cont = get_tok >>= cont } - diff --git a/utils/genprimopcode/ParserM.hs b/utils/genprimopcode/ParserM.hs index 63804d7654..3583dbfae5 100644 --- a/utils/genprimopcode/ParserM.hs +++ b/utils/genprimopcode/ParserM.hs @@ -1,6 +1,8 @@ +{-# LANGUAGE BinaryLiterals #-} + module ParserM ( -- Parser Monad - ParserM(..), AlexInput, run_parser, + ParserM(..), AlexInput(..), run_parser, -- Parser state St, StartCode, start_code, set_start_code, @@ -12,7 +14,7 @@ module ParserM ( -- Positions get_pos, show_pos, -- Input - alexGetChar, alexGetByte, alexInputPrevChar, input, position, + alexGetByte, alexInputPrevChar, -- Other happyError ) where @@ -23,8 +25,9 @@ import Prelude hiding (fail) import Control.Monad.Fail (MonadFail (..)) import Control.Monad (ap, liftM) -import Data.Word (Word8) +import Data.Bits ((.&.), (.|.), shiftR) import Data.Char (ord) +import Data.Word (Word8) -- Parser Monad newtype ParserM a = ParserM (AlexInput -> St -> Either String (AlexInput, St, a)) @@ -49,7 +52,7 @@ instance MonadFail ParserM where run_parser :: ParserM a -> (String -> Either String a) run_parser (ParserM f) - = \s -> case f (AlexInput init_pos s) init_state of + = \s -> case f (AlexInput init_pos [] s) init_state of Left es -> Left es Right (_, _, x) -> Right x @@ -144,7 +147,7 @@ mkTv f str = ParserM (\i st -> Right (i, st, f str)) data Pos = Pos !Int{- Line -} !Int{- Column -} get_pos :: ParserM Pos -get_pos = ParserM $ \i@(AlexInput p _) st -> Right (i, st, p) +get_pos = ParserM $ \i@(AlexInput p _ _) st -> Right (i, st, p) alexMove :: Pos -> Char -> Pos alexMove (Pos l _) '\n' = Pos (l+1) 1 @@ -152,25 +155,40 @@ alexMove (Pos l c) '\t' = Pos l ((c+8) `div` 8 * 8) alexMove (Pos l c) _ = Pos l (c+1) init_pos :: Pos -init_pos = Pos 1 1 +init_pos = Pos 1 0 show_pos :: Pos -> String show_pos (Pos l c) = "line " ++ show l ++ ", column " ++ show c -- Input -data AlexInput = AlexInput {position :: !Pos, input :: String} - --- alexGetByte is for Alex >= 3.0, alexGetChar for earlier --- XXX no UTF-8; we should do this properly sometime -alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) -alexGetByte (AlexInput p (x:xs)) = Just (fromIntegral (ord x), - AlexInput (alexMove p x) xs) -alexGetByte (AlexInput _ []) = Nothing - -alexGetChar :: AlexInput -> Maybe (Char,AlexInput) -alexGetChar (AlexInput p (x:xs)) = Just (x, AlexInput (alexMove p x) xs) -alexGetChar (AlexInput _ []) = Nothing +data AlexInput = AlexInput + { position :: !Pos + , char_bytes :: [Word8] + , input :: String + } + +alexGetByte :: AlexInput -> Maybe (Word8, AlexInput) +alexGetByte (AlexInput p (w:ws) cs) + = Just (w, AlexInput p ws cs) +alexGetByte (AlexInput p [] (c:cs)) + = alexGetByte (AlexInput (alexMove p c) (utf8_encode c) cs) +alexGetByte (AlexInput _ [] []) + = Nothing + +-- annoyingly, this doesn't seem to exist anywhere else as a standalone function +utf8_encode :: Char -> [Word8] +utf8_encode c = case ord c of + n | n < 0x80 -> [ fromIntegral n ] + | n < 0x800 -> [ fromIntegral $ 0b11000000 .|. (n `shiftR` 6) + , fromIntegral $ 0b10000000 .|. (n .&. 0b111111) ] + | n < 0x10000 -> [ fromIntegral $ 0b11100000 .|. (n `shiftR` 12) + , fromIntegral $ 0b10000000 .|. ((n `shiftR` 6) .&. 0b111111) + , fromIntegral $ 0b10000000 .|. (n .&. 0b111111) ] + | otherwise -> [ fromIntegral $ 0b11110000 .|. (n `shiftR` 18) + , fromIntegral $ 0b10000000 .|. ((n `shiftR` 12) .&. 0b111111) + , fromIntegral $ 0b10000000 .|. ((n `shiftR` 6) .&. 0b111111) + , fromIntegral $ 0b10000000 .|. (n .&. 0b111111) ] alexInputPrevChar :: AlexInput -> Char alexInputPrevChar _ = error "Lexer doesn't implement alexInputPrevChar" @@ -178,4 +196,3 @@ alexInputPrevChar _ = error "Lexer doesn't implement alexInputPrevChar" happyError :: ParserM a happyError = do p <- get_pos fail $ "Parse error at " ++ show_pos p - |