summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexis King <lexi.lambda@gmail.com>2022-04-26 12:30:47 -0500
committerMarge Bot <ben+marge-bot@smart-cactus.org>2022-05-04 09:58:14 -0400
commit85bc73bd8fab17ad80f925b45e8b4b059278ba6c (patch)
treed51782347d003da9dd4b81a6a5826b8fe305cf0b
parent948c7e40b29d3a9c71f2a968f90944319b1b03c2 (diff)
downloadhaskell-85bc73bd8fab17ad80f925b45e8b4b059278ba6c.tar.gz
genprimopcode: Support Unicode properly
-rw-r--r--hadrian/src/Builder.hs19
-rw-r--r--hadrian/src/Hadrian/Utilities.hs22
-rw-r--r--utils/genprimopcode/Lexer.x1
-rw-r--r--utils/genprimopcode/ParserM.hs55
4 files changed, 74 insertions, 23 deletions
diff --git a/hadrian/src/Builder.hs b/hadrian/src/Builder.hs
index e29bf4316c..160eb44f7a 100644
--- a/hadrian/src/Builder.hs
+++ b/hadrian/src/Builder.hs
@@ -307,7 +307,8 @@ instance H.Builder Builder where
-- Capture stdout and write it to the output file.
captureStdout = do
Stdout stdout <- cmd' [path] buildArgs
- writeFileChanged output stdout
+ -- see Note [Capture stdout as a ByteString]
+ writeFileChangedBS output stdout
case builder of
Ar Pack _ -> do
useTempFile <- flag ArSupportsAtFile
@@ -332,7 +333,8 @@ instance H.Builder Builder where
GenPrimopCode -> do
stdin <- readFile' input
Stdout stdout <- cmd' (Stdin stdin) [path] buildArgs
- writeFileChanged output stdout
+ -- see Note [Capture stdout as a ByteString]
+ writeFileChangedBS output stdout
GhcPkg Copy _ -> do
Stdout pkgDesc <- cmd' [path]
@@ -509,6 +511,19 @@ applyPatch dir patch = do
-- tell if an Exit or ExitCode value is returned in `r`. So we use our own
-- HasExit type class to provide the `hasExit` predicate that tells us if we
-- should throw an exception as `cmd` would do in case of failure or not.
+--
+-- Note [Capture stdout as a ByteString]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-- As of shake-0.19.6, capturing a process stdout as a `String` using `Stdout`
+-- mangles the encoding if it some other use of `Stdout` also captures it as a
+-- `ByteString`; see <https://github.com/ndmitchell/shake/issues/828>. This
+-- can cause us real problems, since `cmd'` (see Note [cmd wrapper]) *always*
+-- captures stdout as a `ByteString`.
+--
+-- Fortunately, a simple workaround is to avoid capturing stdout as a `String`
+-- in the first place. It’s usually unnecessary (and is in fact pointless work),
+-- as most of the time the captured output is immediately written to a file, so
+-- we can just treat it as an opaque binary stream.
-- | Wrapper for Shake's 'cmd'
diff --git a/hadrian/src/Hadrian/Utilities.hs b/hadrian/src/Hadrian/Utilities.hs
index e2fb221940..40b7835e55 100644
--- a/hadrian/src/Hadrian/Utilities.hs
+++ b/hadrian/src/Hadrian/Utilities.hs
@@ -18,7 +18,7 @@ module Hadrian.Utilities (
-- * File system operations
copyFile, copyFileUntracked, createFileLink, fixFile,
makeExecutable, moveFile, removeFile, createDirectory, copyDirectory,
- moveDirectory, removeDirectory, removeFile_,
+ moveDirectory, removeDirectory, removeFile_, writeFileChangedBS,
-- * Diagnostic info
Colour (..), ANSIColour (..), putColoured, shouldUseColor,
@@ -49,6 +49,7 @@ import Development.Shake.Classes
import Development.Shake.FilePath
import System.Environment (lookupEnv)
+import qualified Data.ByteString as BS
import qualified Control.Exception.Base as IO
import qualified Data.HashMap.Strict as Map
import qualified System.Directory.Extra as IO
@@ -394,6 +395,25 @@ removeDirectory dir = do
putProgressInfo $ "| Remove directory " ++ dir
liftIO . whenM (IO.doesDirectoryExist dir) $ IO.removeDirectoryRecursive dir
+-- | Like Shake's 'writeFileChanged', but accepts a 'ByteString'.
+writeFileChangedBS :: FilePath -> BS.ByteString -> Action ()
+writeFileChangedBS name new = do
+ liftIO $ IO.createDirectoryIfMissing True $ takeDirectory name
+ exists <- liftIO $ IO.doesFileExist name
+ if exists
+ then do
+ old <- liftIO $ BS.readFile name
+ when (old /= new) $ do
+ liftIO $ removeFile_ name
+ do_write
+ else
+ do_write
+ where
+ do_write = do
+ putProgressInfo $ "| Write file " ++ name
+ liftIO $ BS.writeFile name new
+
+
-- | Terminal output colours
data Colour
= Dull ANSIColour -- ^ 8-bit ANSI colours
diff --git a/utils/genprimopcode/Lexer.x b/utils/genprimopcode/Lexer.x
index 13c776fe4a..3f2432daab 100644
--- a/utils/genprimopcode/Lexer.x
+++ b/utils/genprimopcode/Lexer.x
@@ -77,4 +77,3 @@ get_tok = ParserM $ \i st ->
lex_tok :: (Token -> ParserM a) -> ParserM a
lex_tok cont = get_tok >>= cont
}
-
diff --git a/utils/genprimopcode/ParserM.hs b/utils/genprimopcode/ParserM.hs
index 63804d7654..3583dbfae5 100644
--- a/utils/genprimopcode/ParserM.hs
+++ b/utils/genprimopcode/ParserM.hs
@@ -1,6 +1,8 @@
+{-# LANGUAGE BinaryLiterals #-}
+
module ParserM (
-- Parser Monad
- ParserM(..), AlexInput, run_parser,
+ ParserM(..), AlexInput(..), run_parser,
-- Parser state
St,
StartCode, start_code, set_start_code,
@@ -12,7 +14,7 @@ module ParserM (
-- Positions
get_pos, show_pos,
-- Input
- alexGetChar, alexGetByte, alexInputPrevChar, input, position,
+ alexGetByte, alexInputPrevChar,
-- Other
happyError
) where
@@ -23,8 +25,9 @@ import Prelude hiding (fail)
import Control.Monad.Fail (MonadFail (..))
import Control.Monad (ap, liftM)
-import Data.Word (Word8)
+import Data.Bits ((.&.), (.|.), shiftR)
import Data.Char (ord)
+import Data.Word (Word8)
-- Parser Monad
newtype ParserM a = ParserM (AlexInput -> St -> Either String (AlexInput, St, a))
@@ -49,7 +52,7 @@ instance MonadFail ParserM where
run_parser :: ParserM a -> (String -> Either String a)
run_parser (ParserM f)
- = \s -> case f (AlexInput init_pos s) init_state of
+ = \s -> case f (AlexInput init_pos [] s) init_state of
Left es -> Left es
Right (_, _, x) -> Right x
@@ -144,7 +147,7 @@ mkTv f str = ParserM (\i st -> Right (i, st, f str))
data Pos = Pos !Int{- Line -} !Int{- Column -}
get_pos :: ParserM Pos
-get_pos = ParserM $ \i@(AlexInput p _) st -> Right (i, st, p)
+get_pos = ParserM $ \i@(AlexInput p _ _) st -> Right (i, st, p)
alexMove :: Pos -> Char -> Pos
alexMove (Pos l _) '\n' = Pos (l+1) 1
@@ -152,25 +155,40 @@ alexMove (Pos l c) '\t' = Pos l ((c+8) `div` 8 * 8)
alexMove (Pos l c) _ = Pos l (c+1)
init_pos :: Pos
-init_pos = Pos 1 1
+init_pos = Pos 1 0
show_pos :: Pos -> String
show_pos (Pos l c) = "line " ++ show l ++ ", column " ++ show c
-- Input
-data AlexInput = AlexInput {position :: !Pos, input :: String}
-
--- alexGetByte is for Alex >= 3.0, alexGetChar for earlier
--- XXX no UTF-8; we should do this properly sometime
-alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
-alexGetByte (AlexInput p (x:xs)) = Just (fromIntegral (ord x),
- AlexInput (alexMove p x) xs)
-alexGetByte (AlexInput _ []) = Nothing
-
-alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
-alexGetChar (AlexInput p (x:xs)) = Just (x, AlexInput (alexMove p x) xs)
-alexGetChar (AlexInput _ []) = Nothing
+data AlexInput = AlexInput
+ { position :: !Pos
+ , char_bytes :: [Word8]
+ , input :: String
+ }
+
+alexGetByte :: AlexInput -> Maybe (Word8, AlexInput)
+alexGetByte (AlexInput p (w:ws) cs)
+ = Just (w, AlexInput p ws cs)
+alexGetByte (AlexInput p [] (c:cs))
+ = alexGetByte (AlexInput (alexMove p c) (utf8_encode c) cs)
+alexGetByte (AlexInput _ [] [])
+ = Nothing
+
+-- annoyingly, this doesn't seem to exist anywhere else as a standalone function
+utf8_encode :: Char -> [Word8]
+utf8_encode c = case ord c of
+ n | n < 0x80 -> [ fromIntegral n ]
+ | n < 0x800 -> [ fromIntegral $ 0b11000000 .|. (n `shiftR` 6)
+ , fromIntegral $ 0b10000000 .|. (n .&. 0b111111) ]
+ | n < 0x10000 -> [ fromIntegral $ 0b11100000 .|. (n `shiftR` 12)
+ , fromIntegral $ 0b10000000 .|. ((n `shiftR` 6) .&. 0b111111)
+ , fromIntegral $ 0b10000000 .|. (n .&. 0b111111) ]
+ | otherwise -> [ fromIntegral $ 0b11110000 .|. (n `shiftR` 18)
+ , fromIntegral $ 0b10000000 .|. ((n `shiftR` 12) .&. 0b111111)
+ , fromIntegral $ 0b10000000 .|. ((n `shiftR` 6) .&. 0b111111)
+ , fromIntegral $ 0b10000000 .|. (n .&. 0b111111) ]
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar _ = error "Lexer doesn't implement alexInputPrevChar"
@@ -178,4 +196,3 @@ alexInputPrevChar _ = error "Lexer doesn't implement alexInputPrevChar"
happyError :: ParserM a
happyError = do p <- get_pos
fail $ "Parse error at " ++ show_pos p
-