diff options
Diffstat (limited to 'compiler/parser/Lexer.x')
-rw-r--r-- | compiler/parser/Lexer.x | 545 |
1 files changed, 358 insertions, 187 deletions
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x index 936948b40f..bceb48bf48 100644 --- a/compiler/parser/Lexer.x +++ b/compiler/parser/Lexer.x @@ -68,6 +68,7 @@ module Lexer ( explicitNamespacesEnabled, patternSynonymsEnabled, sccProfilingOn, hpcEnabled, + starIsTypeEnabled, addWarning, lexTokenStream, addAnnotation,AddAnn,addAnnsAt,mkParensApiAnn, @@ -75,11 +76,11 @@ module Lexer ( moveAnnotations ) where +import GhcPrelude + -- base import Control.Monad -#if __GLASGOW_HASKELL__ > 710 -import Control.Monad.Fail -#endif +import Control.Monad.Fail as MonadFail import Data.Bits import Data.Char import Data.List @@ -105,7 +106,7 @@ import Outputable import StringBuffer import FastString import UniqFM -import Util ( readRational ) +import Util ( readRational, readHexRational ) -- compiler/main import ErrUtils @@ -129,38 +130,38 @@ import ApiAnnotation -- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs -- Any changes here should likely be reflected there. -$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetByte. +$unispace = \x05 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $nl = [\n\r\f] $whitechar = [$nl\v\ $unispace] $white_no_nl = $whitechar # \n -- TODO #8424 $tab = \t $ascdigit = 0-9 -$unidigit = \x03 -- Trick Alex into handling Unicode. See alexGetByte. +$unidigit = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $decdigit = $ascdigit -- for now, should really be $digit (ToDo) $digit = [$ascdigit $unidigit] $special = [\(\)\,\;\[\]\`\{\}] $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:] -$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetByte. +$unisymbol = \x04 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $symbol = [$ascsymbol $unisymbol] # [$special \_\"\'] -$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetByte. +$unilarge = \x01 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $asclarge = [A-Z] $large = [$asclarge $unilarge] -$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetByte. +$unismall = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $ascsmall = [a-z] $small = [$ascsmall $unismall \_] -$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetByte. +$unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $graphic = [$small $large $symbol $digit $special $unigraphic \"\'] $binit = 0-1 $octit = 0-7 $hexit = [$decdigit A-F a-f] -$uniidchar = \x07 -- Trick Alex into handling Unicode. See alexGetByte. +$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $idchar = [$small $large $digit $uniidchar \'] $pragmachar = [$small $large $digit] @@ -177,11 +178,14 @@ $docsym = [\| \^ \* \$] @varsym = ($symbol # \:) $symbol* -- variable (operator) symbol @consym = \: $symbol* -- constructor (operator) symbol -@decimal = $decdigit+ -@binary = $binit+ -@octal = $octit+ -@hexadecimal = $hexit+ -@exponent = [eE] [\-\+]? @decimal +-- See Note [Lexing NumericUnderscores extension] and #14473 +@numspc = _* -- numeric spacer (#14473) +@decimal = $decdigit(@numspc $decdigit)* +@binary = $binit(@numspc $binit)* +@octal = $octit(@numspc $octit)* +@hexadecimal = $hexit(@numspc $hexit)* +@exponent = @numspc [eE] [\-\+]? @decimal +@bin_exponent = @numspc [pP] [\-\+]? @decimal @qual = (@conid \.)+ @qvarid = @qual @varid @@ -189,7 +193,8 @@ $docsym = [\| \^ \* \$] @qvarsym = @qual @varsym @qconsym = @qual @consym -@floating_point = @decimal \. @decimal @exponent? | @decimal @exponent +@floating_point = @numspc @decimal \. @decimal @exponent? | @numspc @decimal @exponent +@hex_floating_point = @numspc @hexadecimal \. @hexadecimal @bin_exponent? | @numspc @hexadecimal @bin_exponent -- normal signed numerical literals can only be explicitly negative, -- not explicitly positive (contrast @exponent) @@ -307,15 +312,18 @@ $tab { warnTab } -- single-line line pragmas, of the form -- # <line> "<file>" <extra-stuff> \n -<line_prag1> @decimal { setLine line_prag1a } -<line_prag1a> \" [$graphic \ ]* \" { setFile line_prag1b } -<line_prag1b> .* { pop } +<line_prag1> { + @decimal $white_no_nl+ \" [$graphic \ ]* \" { setLineAndFile line_prag1a } + () { failLinePrag1 } +} +<line_prag1a> .* { popLinePrag1 } -- Haskell-style line pragmas, of the form -- {-# LINE <line> "<file>" #-} -<line_prag2> @decimal { setLine line_prag2a } -<line_prag2a> \" [$graphic \ ]* \" { setFile line_prag2b } -<line_prag2b> "#-}"|"-}" { pop } +<line_prag2> { + @decimal $white_no_nl+ \" [$graphic \ ]* \" { setLineAndFile line_prag2a } +} +<line_prag2a> "#-}"|"-}" { pop } -- NOTE: accept -} at the end of a LINE pragma, for compatibility -- with older versions of GHC which generated these. @@ -367,11 +375,6 @@ $tab { warnTab } -- "special" symbols <0> { - "[:" / { ifExtension parrEnabled } { token ITopabrack } - ":]" / { ifExtension parrEnabled } { token ITcpabrack } -} - -<0> { "[|" / { ifExtension thQuotesEnabled } { token (ITopenExpQuote NoE NormalSyntax) } "[||" / { ifExtension thQuotesEnabled } { token (ITopenTExpQuote NoE) } @@ -483,21 +486,34 @@ $tab { warnTab } -- For the normal boxed literals we need to be careful -- when trying to be close to Haskell98 + +-- Note [Lexing NumericUnderscores extension] (#14473) +-- +-- NumericUnderscores extension allows underscores in numeric literals. +-- Multiple underscores are represented with @numspc macro. +-- To be simpler, we have only the definitions with underscores. +-- And then we have a separate function (tok_integral and tok_frac) +-- that validates the literals. +-- If extensions are not enabled, check that there are no underscores. +-- <0> { -- Normal integral literals (:: Num a => a, from Integer) @decimal { tok_num positive 0 0 decimal } - 0[bB] @binary / { ifExtension binaryLiteralsEnabled } { tok_num positive 2 2 binary } - 0[oO] @octal { tok_num positive 2 2 octal } - 0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal } + 0[bB] @numspc @binary / { ifExtension binaryLiteralsEnabled } { tok_num positive 2 2 binary } + 0[oO] @numspc @octal { tok_num positive 2 2 octal } + 0[xX] @numspc @hexadecimal { tok_num positive 2 2 hexadecimal } @negative @decimal / { ifExtension negativeLiteralsEnabled } { tok_num negative 1 1 decimal } - @negative 0[bB] @binary / { ifExtension negativeLiteralsEnabled `alexAndPred` - ifExtension binaryLiteralsEnabled } { tok_num negative 3 3 binary } - @negative 0[oO] @octal / { ifExtension negativeLiteralsEnabled } { tok_num negative 3 3 octal } - @negative 0[xX] @hexadecimal / { ifExtension negativeLiteralsEnabled } { tok_num negative 3 3 hexadecimal } + @negative 0[bB] @numspc @binary / { ifExtension negativeLiteralsEnabled `alexAndPred` + ifExtension binaryLiteralsEnabled } { tok_num negative 3 3 binary } + @negative 0[oO] @numspc @octal / { ifExtension negativeLiteralsEnabled } { tok_num negative 3 3 octal } + @negative 0[xX] @numspc @hexadecimal / { ifExtension negativeLiteralsEnabled } { tok_num negative 3 3 hexadecimal } -- Normal rational literals (:: Fractional a => a, from Rational) - @floating_point { strtoken tok_float } - @negative @floating_point / { ifExtension negativeLiteralsEnabled } { strtoken tok_float } + @floating_point { tok_frac 0 tok_float } + @negative @floating_point / { ifExtension negativeLiteralsEnabled } { tok_frac 0 tok_float } + 0[xX] @numspc @hex_floating_point / { ifExtension hexFloatLiteralsEnabled } { tok_frac 0 tok_hex_float } + @negative 0[xX] @numspc @hex_floating_point / { ifExtension hexFloatLiteralsEnabled `alexAndPred` + ifExtension negativeLiteralsEnabled } { tok_frac 0 tok_hex_float } } <0> { @@ -505,26 +521,26 @@ $tab { warnTab } -- It's simpler (and faster?) to give separate cases to the negatives, -- especially considering octal/hexadecimal prefixes. @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal } - 0[bB] @binary \# / { ifExtension magicHashEnabled `alexAndPred` + 0[bB] @numspc @binary \# / { ifExtension magicHashEnabled `alexAndPred` ifExtension binaryLiteralsEnabled } { tok_primint positive 2 3 binary } - 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal } - 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal } + 0[oO] @numspc @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal } + 0[xX] @numspc @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal } @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal } - @negative 0[bB] @binary \# / { ifExtension magicHashEnabled `alexAndPred` - ifExtension binaryLiteralsEnabled } { tok_primint negative 3 4 binary } - @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal } - @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal } + @negative 0[bB] @numspc @binary \# / { ifExtension magicHashEnabled `alexAndPred` + ifExtension binaryLiteralsEnabled } { tok_primint negative 3 4 binary } + @negative 0[oO] @numspc @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal } + @negative 0[xX] @numspc @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal } @decimal \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal } - 0[bB] @binary \# \# / { ifExtension magicHashEnabled `alexAndPred` + 0[bB] @numspc @binary \# \# / { ifExtension magicHashEnabled `alexAndPred` ifExtension binaryLiteralsEnabled } { tok_primword 2 4 binary } - 0[oO] @octal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal } - 0[xX] @hexadecimal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal } + 0[oO] @numspc @octal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal } + 0[xX] @numspc @hexadecimal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal } -- Unboxed floats and doubles (:: Float#, :: Double#) -- prim_{float,double} work with signed literals - @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat } - @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble } + @signed @floating_point \# / { ifExtension magicHashEnabled } { tok_frac 1 tok_primfloat } + @signed @floating_point \# \# / { ifExtension magicHashEnabled } { tok_frac 2 tok_primdouble } } -- Strings and chars are lexed by hand-written code. The reason is @@ -620,6 +636,7 @@ data Token | ITstatic | ITstock | ITanyclass + | ITvia -- Backpack tokens | ITunit @@ -635,7 +652,8 @@ data Token | ITrules_prag SourceText | ITwarning_prag SourceText | ITdeprecated_prag SourceText - | ITline_prag + | ITline_prag SourceText -- not usually produced, see 'use_pos_prags' + | ITcolumn_prag SourceText -- not usually produced, see 'use_pos_prags' | ITscc_prag SourceText | ITgenerated_prag SourceText | ITcore_prag SourceText -- hdaume: core annotations @@ -647,15 +665,13 @@ data Token | IToptions_prag String | ITinclude_prag String | ITlanguage_prag - | ITvect_prag SourceText - | ITvect_scalar_prag SourceText - | ITnovect_prag SourceText | ITminimal_prag SourceText | IToverlappable_prag SourceText -- instance overlap mode | IToverlapping_prag SourceText -- instance overlap mode | IToverlaps_prag SourceText -- instance overlap mode | ITincoherent_prag SourceText -- instance overlap mode | ITctype SourceText + | ITcomment_line_prag -- See Note [Nested comment line pragmas] | ITdotdot -- reserved symbols | ITcolon @@ -668,10 +684,10 @@ data Token | ITrarrow IsUnicodeSyntax | ITat | ITtilde - | ITtildehsh | ITdarrow IsUnicodeSyntax | ITminus | ITbang + | ITstar IsUnicodeSyntax | ITdot | ITbiglam -- GHC-extension symbols @@ -820,6 +836,7 @@ reservedWordsFM = listToUFM $ ( "static", ITstatic, 0 ), ( "stock", ITstock, 0 ), ( "anyclass", ITanyclass, 0 ), + ( "via", ITvia, 0 ), ( "group", ITgroup, xbit TransformComprehensionsBit), ( "by", ITby, xbit TransformComprehensionsBit), ( "using", ITusing, xbit TransformComprehensionsBit), @@ -878,11 +895,12 @@ reservedSymsFM = listToUFM $ ,("->", ITrarrow NormalSyntax, always) ,("@", ITat, always) ,("~", ITtilde, always) - ,("~#", ITtildehsh, magicHashEnabled) ,("=>", ITdarrow NormalSyntax, always) ,("-", ITminus, always) ,("!", ITbang, always) + ,("*", ITstar NormalSyntax, starIsTypeEnabled) + -- For 'forall a . t' ,(".", ITdot, always) -- \i -> explicitForallEnabled i || inRulePrag i) @@ -905,6 +923,8 @@ reservedSymsFM = listToUFM $ \i -> unicodeSyntaxEnabled i && arrowsEnabled i) ,("⤜", ITRarrowtail UnicodeSyntax, \i -> unicodeSyntaxEnabled i && arrowsEnabled i) + ,("★", ITstar UnicodeSyntax, + \i -> unicodeSyntaxEnabled i && starIsTypeEnabled i) -- ToDo: ideally, → and ∷ should be "specials", so that they cannot -- form part of a large operator. This would let us have a better @@ -938,17 +958,26 @@ strtoken :: (String -> Token) -> Action strtoken f span buf len = return (L span $! (f $! lexemeToString buf len)) -init_strtoken :: Int -> (String -> Token) -> Action --- like strtoken, but drops the last N character(s) -init_strtoken drop f span buf len = - return (L span $! (f $! lexemeToString buf (len-drop))) - begin :: Int -> Action begin code _span _str _len = do pushLexState code; lexToken pop :: Action pop _span _buf _len = do _ <- popLexState lexToken +-- See Note [Nested comment line pragmas] +failLinePrag1 :: Action +failLinePrag1 span _buf _len = do + b <- extension inNestedComment + if b then return (L span ITcomment_line_prag) + else lexError "lexical error in pragma" + +-- See Note [Nested comment line pragmas] +popLinePrag1 :: Action +popLinePrag1 span _buf _len = do + b <- extension inNestedComment + if b then return (L span ITcomment_line_prag) else do + _ <- popLexState + lexToken hopefully_open_brace :: Action hopefully_open_brace span buf len @@ -1088,6 +1117,12 @@ nested_comment cont span buf len = do Nothing -> errBrace input span Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input Just (_,_) -> go ('\123':commentAcc) n input + -- See Note [Nested comment line pragmas] + Just ('\n',input) -> case alexGetChar' input of + Nothing -> errBrace input span + Just ('#',_) -> do (parsedAcc,input) <- parseNestedPragma input + go (parsedAcc ++ '\n':commentAcc) n input + Just (_,_) -> go ('\n':commentAcc) n input Just (c,input) -> go (c:commentAcc) n input nested_doc_comment :: Action @@ -1107,8 +1142,60 @@ nested_doc_comment span buf _len = withLexedDocType (go "") let cont = do input <- getInput; go commentAcc input docType False nested_comment cont span buf _len Just (_,_) -> go ('\123':commentAcc) input docType False + -- See Note [Nested comment line pragmas] + Just ('\n',input) -> case alexGetChar' input of + Nothing -> errBrace input span + Just ('#',_) -> do (parsedAcc,input) <- parseNestedPragma input + go (parsedAcc ++ '\n':commentAcc) input docType False + Just (_,_) -> go ('\n':commentAcc) input docType False Just (c,input) -> go (c:commentAcc) input docType False +-- See Note [Nested comment line pragmas] +parseNestedPragma :: AlexInput -> P (String,AlexInput) +parseNestedPragma input@(AI _ buf) = do + origInput <- getInput + setInput input + setExts (.|. xbit InNestedCommentBit) + pushLexState bol + lt <- lexToken + _ <- popLexState + setExts (.&. complement (xbit InNestedCommentBit)) + postInput@(AI _ postBuf) <- getInput + setInput origInput + case unLoc lt of + ITcomment_line_prag -> do + let bytes = byteDiff buf postBuf + diff = lexemeToString buf bytes + return (reverse diff, postInput) + lt' -> panic ("parseNestedPragma: unexpected token" ++ (show lt')) + +{- +Note [Nested comment line pragmas] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We used to ignore cpp-preprocessor-generated #line pragmas if they were inside +nested comments. + +Now, when parsing a nested comment, if we encounter a line starting with '#' we +call parseNestedPragma, which executes the following: +1. Save the current lexer input (loc, buf) for later +2. Set the current lexer input to the beginning of the line starting with '#' +3. Turn the 'InNestedComment' extension on +4. Push the 'bol' lexer state +5. Lex a token. Due to (2), (3), and (4), this should always lex a single line + or less and return the ITcomment_line_prag token. This may set source line + and file location if a #line pragma is successfully parsed +6. Restore lexer input and state to what they were before we did all this +7. Return control to the function parsing a nested comment, informing it of + what the lexer parsed + +Regarding (5) above: +Every exit from the 'bol' lexer state (do_bol, popLinePrag1, failLinePrag1) +checks if the 'InNestedComment' extension is set. If it is, that function will +return control to parseNestedPragma by returning the ITcomment_line_prag token. + +See #314 for more background on the bug this fixes. +-} + withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (RealLocated Token)) -> P (RealLocated Token) withLexedDocType lexDocComment = do @@ -1135,6 +1222,27 @@ rulePrag span buf len = do let !src = lexemeToString buf len return (L span (ITrules_prag (SourceText src))) +-- When 'use_pos_prags' is not set, it is expected that we emit a token instead +-- of updating the position in 'PState' +linePrag :: Action +linePrag span buf len = do + ps <- getPState + if use_pos_prags ps + then begin line_prag2 span buf len + else let !src = lexemeToString buf len + in return (L span (ITline_prag (SourceText src))) + +-- When 'use_pos_prags' is not set, it is expected that we emit a token instead +-- of updating the position in 'PState' +columnPrag :: Action +columnPrag span buf len = do + ps <- getPState + let !src = lexemeToString buf len + if use_pos_prags ps + then begin column_prag span buf len + else let !src = lexemeToString buf len + in return (L span (ITcolumn_prag (SourceText src))) + endPrag :: Action endPrag span _buf _len = do setExts (.&. complement (xbit InRulePragBit)) @@ -1210,15 +1318,14 @@ varid :: Action varid span buf len = case lookupUFM reservedWordsFM fs of Just (ITcase, _) -> do - lambdaCase <- extension lambdaCaseEnabled - keyword <- if lambdaCase - then do - lastTk <- getLastTk - return $ case lastTk of - Just ITlam -> ITlcase - _ -> ITcase - else - return ITcase + lastTk <- getLastTk + keyword <- case lastTk of + Just ITlam -> do + lambdaCase <- extension lambdaCaseEnabled + if lambdaCase + then return ITlcase + else failMsgP "Illegal lambda-case (use -XLambdaCase)" + _ -> return ITcase maybe_layout keyword return $ L span keyword Just (ITstatic, _) -> do @@ -1272,8 +1379,12 @@ tok_integral :: (SourceText -> Integer -> Token) -> Int -> Int -> (Integer, (Char -> Int)) -> Action -tok_integral itint transint transbuf translen (radix,char_to_int) span buf len - = return $ L span $ itint (SourceText $ lexemeToString buf len) +tok_integral itint transint transbuf translen (radix,char_to_int) span buf len = do + numericUnderscores <- extension numericUnderscoresEnabled -- #14473 + let src = lexemeToString buf len + if (not numericUnderscores) && ('_' `elem` src) + then failMsgP "Use NumericUnderscores to allow underscores in integer literals" + else return $ L span $ itint (SourceText src) $! transint $ parseUnsignedInteger (offsetBytes transbuf buf) (subtract translen len) radix char_to_int @@ -1305,15 +1416,32 @@ octal = (8,octDecDigit) hexadecimal = (16,hexDigit) -- readRational can understand negative rationals, exponents, everything. +tok_frac :: Int -> (String -> Token) -> Action +tok_frac drop f span buf len = do + numericUnderscores <- extension numericUnderscoresEnabled -- #14473 + let src = lexemeToString buf (len-drop) + if (not numericUnderscores) && ('_' `elem` src) + then failMsgP "Use NumericUnderscores to allow underscores in floating literals" + else return (L span $! (f $! src)) + tok_float, tok_primfloat, tok_primdouble :: String -> Token -tok_float str = ITrational $! readFractionalLit str -tok_primfloat str = ITprimfloat $! readFractionalLit str -tok_primdouble str = ITprimdouble $! readFractionalLit str +tok_float str = ITrational $! readFractionalLit str +tok_hex_float str = ITrational $! readHexFractionalLit str +tok_primfloat str = ITprimfloat $! readFractionalLit str +tok_primdouble str = ITprimdouble $! readFractionalLit str readFractionalLit :: String -> FractionalLit readFractionalLit str = ((FL $! (SourceText str)) $! is_neg) $! readRational str where is_neg = case str of ('-':_) -> True _ -> False +readHexFractionalLit :: String -> FractionalLit +readHexFractionalLit str = + FL { fl_text = SourceText str + , fl_neg = case str of + '-' : _ -> True + _ -> False + , fl_value = readHexRational str + } -- ----------------------------------------------------------------------------- -- Layout processing @@ -1321,20 +1449,23 @@ readFractionalLit str = ((FL $! (SourceText str)) $! is_neg) $! readRational str -- we're at the first token on a line, insert layout tokens if necessary do_bol :: Action do_bol span _str _len = do - (pos, gen_semic) <- getOffside - case pos of - LT -> do - --trace "layout: inserting '}'" $ do - popContext - -- do NOT pop the lex state, we might have a ';' to insert - return (L span ITvccurly) - EQ | gen_semic -> do - --trace "layout: inserting ';'" $ do - _ <- popLexState - return (L span ITsemi) - _ -> do - _ <- popLexState - lexToken + -- See Note [Nested comment line pragmas] + b <- extension inNestedComment + if b then return (L span ITcomment_line_prag) else do + (pos, gen_semic) <- getOffside + case pos of + LT -> do + --trace "layout: inserting '}'" $ do + popContext + -- do NOT pop the lex state, we might have a ';' to insert + return (L span ITvccurly) + EQ | gen_semic -> do + --trace "layout: inserting ';'" $ do + _ <- popLexState + return (L span ITsemi) + _ -> do + _ <- popLexState + lexToken -- certain keywords put us in the "layout" state, where we might -- add an opening curly brace. @@ -1394,29 +1525,13 @@ do_layout_left span _buf _len = do -- ----------------------------------------------------------------------------- -- LINE pragmas -setLine :: Int -> Action -setLine code span buf len = do - let line = parseUnsignedInteger buf len 10 octDecDigit - setSrcLoc (mkRealSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1) - -- subtract one: the line number refers to the *following* line - _ <- popLexState - pushLexState code - lexToken - -setColumn :: Action -setColumn span buf len = do - let column = - case reads (lexemeToString buf len) of - [(column, _)] -> column - _ -> error "setColumn: expected integer" -- shouldn't happen - setSrcLoc (mkRealSrcLoc (srcSpanFile span) (srcSpanEndLine span) - (fromIntegral (column :: Integer))) - _ <- popLexState - lexToken - -setFile :: Int -> Action -setFile code span buf len = do - let file = mkFastString (go (lexemeToString (stepOn buf) (len-2))) +setLineAndFile :: Int -> Action +setLineAndFile code span buf len = do + let src = lexemeToString buf (len - 1) -- drop trailing quotation mark + linenumLen = length $ head $ words src + linenum = parseUnsignedInteger buf linenumLen 10 octDecDigit + file = mkFastString $ go $ drop 1 $ dropWhile (/= '"') src + -- skip everything through first quotation mark to get to the filename where go ('\\':c:cs) = c : go cs go (c:cs) = c : go cs go [] = [] @@ -1430,12 +1545,24 @@ setFile code span buf len = do -- filenames and it does not remove duplicate -- backslashes after the drive letter (should it?). setAlrLastLoc $ alrInitialLoc file - setSrcLoc (mkRealSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span)) + setSrcLoc (mkRealSrcLoc file (fromIntegral linenum - 1) (srcSpanEndCol span)) + -- subtract one: the line number refers to the *following* line addSrcFile file _ <- popLexState pushLexState code lexToken +setColumn :: Action +setColumn span buf len = do + let column = + case reads (lexemeToString buf len) of + [(column, _)] -> column + _ -> error "setColumn: expected integer" -- shouldn't happen + setSrcLoc (mkRealSrcLoc (srcSpanFile span) (srcSpanEndLine span) + (fromIntegral (column :: Integer))) + _ <- popLexState + lexToken + alrInitialLoc :: FastString -> RealSrcSpan alrInitialLoc file = mkRealSrcSpan loc loc where -- This is a hack to ensure that the first line in a file @@ -1859,6 +1986,10 @@ data PState = PState { -- token doesn't need to close anything: alr_justClosedExplicitLetBlock :: Bool, + -- If this is enabled, '{-# LINE ... -#}' and '{-# COLUMN ... #-}' + -- update the 'loc' field. Otherwise, those pragmas are lexed as tokens. + use_pos_prags :: Bool, + -- The next three are used to implement Annotations giving the -- locations of 'noise' tokens in the source, so that users of -- the GHC API can do source to source conversions. @@ -1892,12 +2023,10 @@ instance Applicative P where instance Monad P where (>>=) = thenP - fail = failP + fail = MonadFail.fail -#if __GLASGOW_HASKELL__ > 710 instance MonadFail P where fail = failP -#endif returnP :: a -> P a returnP a = a `seq` (P $ \s -> POk s a) @@ -1970,27 +2099,29 @@ getLastTk = P $ \s@(PState { last_tk = last_tk }) -> POk s last_tk data AlexInput = AI RealSrcLoc StringBuffer -alexInputPrevChar :: AlexInput -> Char -alexInputPrevChar (AI _ buf) = prevChar buf '\n' +{- +Note [Unicode in Alex] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Although newer versions of Alex support unicode, this grammar is processed with +the old style '--latin1' behaviour. This means that when implementing the +functions --- backwards compatibility for Alex 2.x -alexGetChar :: AlexInput -> Maybe (Char,AlexInput) -alexGetChar inp = case alexGetByte inp of - Nothing -> Nothing - Just (b,i) -> c `seq` Just (c,i) - where c = chr $ fromIntegral b + alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) + alexInputPrevChar :: AlexInput -> Char -alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) -alexGetByte (AI loc s) - | atEnd s = Nothing - | otherwise = byte `seq` loc' `seq` s' `seq` - --trace (show (ord c)) $ - Just (byte, (AI loc' s')) - where (c,s') = nextChar s - loc' = advanceSrcLoc loc c - byte = fromIntegral $ ord adj_c +which Alex uses to take apart our 'AlexInput', we must - non_graphic = '\x00' + * return a latin1 character in the 'Word8' that 'alexGetByte' expects + * return a latin1 character in 'alexInputPrevChar'. + +We handle this in 'adjustChar' by squishing entire classes of unicode +characters into single bytes. +-} + +{-# INLINE adjustChar #-} +adjustChar :: Char -> Word8 +adjustChar c = fromIntegral $ ord adj_c + where non_graphic = '\x00' upper = '\x01' lower = '\x02' digit = '\x03' @@ -2036,6 +2167,32 @@ alexGetByte (AI loc s) Space -> space _other -> non_graphic +-- Getting the previous 'Char' isn't enough here - we need to convert it into +-- the same format that 'alexGetByte' would have produced. +-- +-- See Note [Unicode in Alex] and #13986. +alexInputPrevChar :: AlexInput -> Char +alexInputPrevChar (AI _ buf) = chr (fromIntegral (adjustChar pc)) + where pc = prevChar buf '\n' + +-- backwards compatibility for Alex 2.x +alexGetChar :: AlexInput -> Maybe (Char,AlexInput) +alexGetChar inp = case alexGetByte inp of + Nothing -> Nothing + Just (b,i) -> c `seq` Just (c,i) + where c = chr $ fromIntegral b + +-- See Note [Unicode in Alex] +alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) +alexGetByte (AI loc s) + | atEnd s = Nothing + | otherwise = byte `seq` loc' `seq` s' `seq` + --trace (show (ord c)) $ + Just (byte, (AI loc' s')) + where (c,s') = nextChar s + loc' = advanceSrcLoc loc c + byte = adjustChar c + -- This version does not squash unicode characters, it is used when -- lexing strings. alexGetChar' :: AlexInput -> Maybe (Char,AlexInput) @@ -2146,7 +2303,6 @@ data ExtBits = FfiBit | InterruptibleFfiBit | CApiFfiBit - | ParrBit | ArrowsBit | ThBit | ThQuotesBit @@ -2166,6 +2322,7 @@ data ExtBits | TransformComprehensionsBit | QqBit -- enable quasiquoting | InRulePragBit + | InNestedCommentBit -- See Note [Nested comment line pragmas] | RawTokenStreamBit -- producing a token stream with all comments included | SccProfilingOnBit | HpcBit @@ -2178,15 +2335,16 @@ data ExtBits | LambdaCaseBit | BinaryLiteralsBit | NegativeLiteralsBit + | HexFloatLiteralsBit | TypeApplicationsBit | StaticPointersBit + | NumericUnderscoresBit + | StarIsTypeBit deriving Enum always :: ExtsBitmap -> Bool always _ = True -parrEnabled :: ExtsBitmap -> Bool -parrEnabled = xtest ParrBit arrowsEnabled :: ExtsBitmap -> Bool arrowsEnabled = xtest ArrowsBit thEnabled :: ExtsBitmap -> Bool @@ -2217,6 +2375,8 @@ qqEnabled :: ExtsBitmap -> Bool qqEnabled = xtest QqBit inRulePrag :: ExtsBitmap -> Bool inRulePrag = xtest InRulePragBit +inNestedComment :: ExtsBitmap -> Bool +inNestedComment = xtest InNestedCommentBit rawTokenStreamEnabled :: ExtsBitmap -> Bool rawTokenStreamEnabled = xtest RawTokenStreamBit alternativeLayoutRule :: ExtsBitmap -> Bool @@ -2240,12 +2400,18 @@ binaryLiteralsEnabled :: ExtsBitmap -> Bool binaryLiteralsEnabled = xtest BinaryLiteralsBit negativeLiteralsEnabled :: ExtsBitmap -> Bool negativeLiteralsEnabled = xtest NegativeLiteralsBit +hexFloatLiteralsEnabled :: ExtsBitmap -> Bool +hexFloatLiteralsEnabled = xtest HexFloatLiteralsBit patternSynonymsEnabled :: ExtsBitmap -> Bool patternSynonymsEnabled = xtest PatternSynonymsBit typeApplicationEnabled :: ExtsBitmap -> Bool typeApplicationEnabled = xtest TypeApplicationsBit staticPointersEnabled :: ExtsBitmap -> Bool staticPointersEnabled = xtest StaticPointersBit +numericUnderscoresEnabled :: ExtsBitmap -> Bool +numericUnderscoresEnabled = xtest NumericUnderscoresBit +starIsTypeEnabled :: ExtsBitmap -> Bool +starIsTypeEnabled = xtest StarIsTypeBit -- PState for parsing options pragmas -- @@ -2264,46 +2430,55 @@ mkParserFlags flags = , pExtsBitmap = bitmap } where - bitmap = FfiBit `setBitIf` xopt LangExt.ForeignFunctionInterface flags - .|. InterruptibleFfiBit `setBitIf` xopt LangExt.InterruptibleFFI flags - .|. CApiFfiBit `setBitIf` xopt LangExt.CApiFFI flags - .|. ParrBit `setBitIf` xopt LangExt.ParallelArrays flags - .|. ArrowsBit `setBitIf` xopt LangExt.Arrows flags - .|. ThBit `setBitIf` xopt LangExt.TemplateHaskell flags - .|. ThQuotesBit `setBitIf` xopt LangExt.TemplateHaskellQuotes flags - .|. QqBit `setBitIf` xopt LangExt.QuasiQuotes flags - .|. IpBit `setBitIf` xopt LangExt.ImplicitParams flags - .|. OverloadedLabelsBit `setBitIf` xopt LangExt.OverloadedLabels flags - .|. ExplicitForallBit `setBitIf` xopt LangExt.ExplicitForAll flags - .|. BangPatBit `setBitIf` xopt LangExt.BangPatterns flags - .|. HaddockBit `setBitIf` gopt Opt_Haddock flags - .|. MagicHashBit `setBitIf` xopt LangExt.MagicHash flags - .|. RecursiveDoBit `setBitIf` xopt LangExt.RecursiveDo flags - .|. UnicodeSyntaxBit `setBitIf` xopt LangExt.UnicodeSyntax flags - .|. UnboxedTuplesBit `setBitIf` xopt LangExt.UnboxedTuples flags - .|. UnboxedSumsBit `setBitIf` xopt LangExt.UnboxedSums flags - .|. DatatypeContextsBit `setBitIf` xopt LangExt.DatatypeContexts flags - .|. TransformComprehensionsBit `setBitIf` xopt LangExt.TransformListComp flags - .|. TransformComprehensionsBit `setBitIf` xopt LangExt.MonadComprehensions flags - .|. RawTokenStreamBit `setBitIf` gopt Opt_KeepRawTokenStream flags - .|. HpcBit `setBitIf` gopt Opt_Hpc flags - .|. AlternativeLayoutRuleBit `setBitIf` xopt LangExt.AlternativeLayoutRule flags - .|. RelaxedLayoutBit `setBitIf` xopt LangExt.RelaxedLayout flags - .|. SccProfilingOnBit `setBitIf` gopt Opt_SccProfilingOn flags - .|. NondecreasingIndentationBit `setBitIf` xopt LangExt.NondecreasingIndentation flags - .|. SafeHaskellBit `setBitIf` safeImportsOn flags - .|. TraditionalRecordSyntaxBit `setBitIf` xopt LangExt.TraditionalRecordSyntax flags - .|. ExplicitNamespacesBit `setBitIf` xopt LangExt.ExplicitNamespaces flags - .|. LambdaCaseBit `setBitIf` xopt LangExt.LambdaCase flags - .|. BinaryLiteralsBit `setBitIf` xopt LangExt.BinaryLiterals flags - .|. NegativeLiteralsBit `setBitIf` xopt LangExt.NegativeLiterals flags - .|. PatternSynonymsBit `setBitIf` xopt LangExt.PatternSynonyms flags - .|. TypeApplicationsBit `setBitIf` xopt LangExt.TypeApplications flags - .|. StaticPointersBit `setBitIf` xopt LangExt.StaticPointers flags - - setBitIf :: ExtBits -> Bool -> ExtsBitmap - b `setBitIf` cond | cond = xbit b - | otherwise = 0 + bitmap = safeHaskellBit .|. langExtBits .|. optBits + safeHaskellBit = + SafeHaskellBit `setBitIf` safeImportsOn flags + langExtBits = + FfiBit `xoptBit` LangExt.ForeignFunctionInterface + .|. InterruptibleFfiBit `xoptBit` LangExt.InterruptibleFFI + .|. CApiFfiBit `xoptBit` LangExt.CApiFFI + .|. ArrowsBit `xoptBit` LangExt.Arrows + .|. ThBit `xoptBit` LangExt.TemplateHaskell + .|. ThQuotesBit `xoptBit` LangExt.TemplateHaskellQuotes + .|. QqBit `xoptBit` LangExt.QuasiQuotes + .|. IpBit `xoptBit` LangExt.ImplicitParams + .|. OverloadedLabelsBit `xoptBit` LangExt.OverloadedLabels + .|. ExplicitForallBit `xoptBit` LangExt.ExplicitForAll + .|. BangPatBit `xoptBit` LangExt.BangPatterns + .|. MagicHashBit `xoptBit` LangExt.MagicHash + .|. RecursiveDoBit `xoptBit` LangExt.RecursiveDo + .|. UnicodeSyntaxBit `xoptBit` LangExt.UnicodeSyntax + .|. UnboxedTuplesBit `xoptBit` LangExt.UnboxedTuples + .|. UnboxedSumsBit `xoptBit` LangExt.UnboxedSums + .|. DatatypeContextsBit `xoptBit` LangExt.DatatypeContexts + .|. TransformComprehensionsBit `xoptBit` LangExt.TransformListComp + .|. TransformComprehensionsBit `xoptBit` LangExt.MonadComprehensions + .|. AlternativeLayoutRuleBit `xoptBit` LangExt.AlternativeLayoutRule + .|. RelaxedLayoutBit `xoptBit` LangExt.RelaxedLayout + .|. NondecreasingIndentationBit `xoptBit` LangExt.NondecreasingIndentation + .|. TraditionalRecordSyntaxBit `xoptBit` LangExt.TraditionalRecordSyntax + .|. ExplicitNamespacesBit `xoptBit` LangExt.ExplicitNamespaces + .|. LambdaCaseBit `xoptBit` LangExt.LambdaCase + .|. BinaryLiteralsBit `xoptBit` LangExt.BinaryLiterals + .|. NegativeLiteralsBit `xoptBit` LangExt.NegativeLiterals + .|. HexFloatLiteralsBit `xoptBit` LangExt.HexFloatLiterals + .|. PatternSynonymsBit `xoptBit` LangExt.PatternSynonyms + .|. TypeApplicationsBit `xoptBit` LangExt.TypeApplications + .|. StaticPointersBit `xoptBit` LangExt.StaticPointers + .|. NumericUnderscoresBit `xoptBit` LangExt.NumericUnderscores + .|. StarIsTypeBit `xoptBit` LangExt.StarIsType + optBits = + HaddockBit `goptBit` Opt_Haddock + .|. RawTokenStreamBit `goptBit` Opt_KeepRawTokenStream + .|. HpcBit `goptBit` Opt_Hpc + .|. SccProfilingOnBit `goptBit` Opt_SccProfilingOn + + xoptBit bit ext = bit `setBitIf` xopt ext flags + goptBit bit opt = bit `setBitIf` gopt opt flags + + setBitIf :: ExtBits -> Bool -> ExtsBitmap + b `setBitIf` cond | cond = xbit b + | otherwise = 0 -- | Creates a parse state from a 'DynFlags' value mkPState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState @@ -2331,6 +2506,7 @@ mkPStatePure options buf loc = alr_context = [], alr_expecting_ocurly = Nothing, alr_justClosedExplicitLetBlock = False, + use_pos_prags = True, annotations = [], comment_q = [], annotations_comments = [] @@ -2742,14 +2918,14 @@ reportLexError loc1 loc2 buf str lexTokenStream :: StringBuffer -> RealSrcLoc -> DynFlags -> ParseResult [Located Token] lexTokenStream buf loc dflags = unP go initState where dflags' = gopt_set (gopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream - initState = mkPState dflags' buf loc + initState = (mkPState dflags' buf loc) { use_pos_prags = False } go = do ltok <- lexer False return case ltok of L _ ITeof -> return [] _ -> liftM (ltok:) go -linePrags = Map.singleton "line" (begin line_prag2) +linePrags = Map.singleton "line" linePrag fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag), ("options_ghc", lex_string_prag IToptions_prag), @@ -2785,8 +2961,6 @@ oneWordPrags = Map.fromList [ ("unpack", strtoken (\s -> ITunpack_prag (SourceText s))), ("nounpack", strtoken (\s -> ITnounpack_prag (SourceText s))), ("ann", strtoken (\s -> ITann_prag (SourceText s))), - ("vectorize", strtoken (\s -> ITvect_prag (SourceText s))), - ("novectorize", strtoken (\s -> ITnovect_prag (SourceText s))), ("minimal", strtoken (\s -> ITminimal_prag (SourceText s))), ("overlaps", strtoken (\s -> IToverlaps_prag (SourceText s))), ("overlappable", strtoken (\s -> IToverlappable_prag (SourceText s))), @@ -2794,10 +2968,10 @@ oneWordPrags = Map.fromList [ ("incoherent", strtoken (\s -> ITincoherent_prag (SourceText s))), ("ctype", strtoken (\s -> ITctype (SourceText s))), ("complete", strtoken (\s -> ITcomplete_prag (SourceText s))), - ("column", begin column_prag) + ("column", columnPrag) ] -twoWordPrags = Map.fromList([ +twoWordPrags = Map.fromList [ ("inline conlike", strtoken (\s -> (ITinline_prag (SourceText s) Inline ConLike))), ("notinline conlike", @@ -2805,9 +2979,8 @@ twoWordPrags = Map.fromList([ ("specialize inline", strtoken (\s -> (ITspec_inline_prag (SourceText s) True))), ("specialize notinline", - strtoken (\s -> (ITspec_inline_prag (SourceText s) False))), - ("vectorize scalar", - strtoken (\s -> ITvect_scalar_prag (SourceText s)))]) + strtoken (\s -> (ITspec_inline_prag (SourceText s) False))) + ] dispatch_pragmas :: Map String Action -> Action dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of @@ -2829,8 +3002,6 @@ clean_pragma prag = canon_ws (map toLower (unprefix prag)) canonical prag' = case prag' of "noinline" -> "notinline" "specialise" -> "specialize" - "vectorise" -> "vectorize" - "novectorise" -> "novectorize" "constructorlike" -> "conlike" _ -> prag' canon_ws s = unwords (map canonical (words s)) |