diff options
Diffstat (limited to 'compiler/GHC/Parser/Lexer.x')
-rw-r--r-- | compiler/GHC/Parser/Lexer.x | 3294 |
1 files changed, 3294 insertions, 0 deletions
diff --git a/compiler/GHC/Parser/Lexer.x b/compiler/GHC/Parser/Lexer.x new file mode 100644 index 0000000000..17b6674c95 --- /dev/null +++ b/compiler/GHC/Parser/Lexer.x @@ -0,0 +1,3294 @@ +----------------------------------------------------------------------------- +-- (c) The University of Glasgow, 2006 +-- +-- GHC's lexer for Haskell 2010 [1]. +-- +-- This is a combination of an Alex-generated lexer [2] from a regex +-- definition, with some hand-coded bits. [3] +-- +-- Completely accurate information about token-spans within the source +-- file is maintained. Every token has a start and end RealSrcLoc +-- attached to it. +-- +-- References: +-- [1] https://www.haskell.org/onlinereport/haskell2010/haskellch2.html +-- [2] http://www.haskell.org/alex/ +-- [3] https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/parser +-- +----------------------------------------------------------------------------- + +-- ToDo / known bugs: +-- - parsing integers is a bit slow +-- - readRational is a bit slow +-- +-- Known bugs, that were also in the previous version: +-- - M... should be 3 tokens, not 1. +-- - pragma-end should be only valid in a pragma + +-- qualified operator NOTES. +-- +-- - If M.(+) is a single lexeme, then.. +-- - Probably (+) should be a single lexeme too, for consistency. +-- Otherwise ( + ) would be a prefix operator, but M.( + ) would not be. +-- - But we have to rule out reserved operators, otherwise (..) becomes +-- a different lexeme. +-- - Should we therefore also rule out reserved operators in the qualified +-- form? This is quite difficult to achieve. We don't do it for +-- qualified varids. + + +-- ----------------------------------------------------------------------------- +-- Alex "Haskell code fragment top" + +{ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE MultiWayIf #-} + +{-# OPTIONS_GHC -funbox-strict-fields #-} +{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} + +module GHC.Parser.Lexer ( + Token(..), lexer, lexerDbg, pragState, mkPState, mkPStatePure, PState(..), + P(..), ParseResult(..), mkParserFlags, mkParserFlags', ParserFlags(..), + appendWarning, + appendError, + allocateComments, + MonadP(..), + getRealSrcLoc, getPState, withThisPackage, + failMsgP, failLocMsgP, srcParseFail, + getErrorMessages, getMessages, + popContext, pushModuleContext, setLastToken, setSrcLoc, + activeContext, nextIsEOF, + getLexState, popLexState, pushLexState, + ExtBits(..), + xtest, + lexTokenStream, + AddAnn(..),mkParensApiAnn, + addAnnsAt, + commentToAnnotation + ) where + +import GhcPrelude + +-- base +import Control.Monad +import Data.Bits +import Data.Char +import Data.List +import Data.Maybe +import Data.Word + +import EnumSet (EnumSet) +import qualified EnumSet + +-- ghc-boot +import qualified GHC.LanguageExtensions as LangExt + +-- bytestring +import Data.ByteString (ByteString) + +-- containers +import Data.Map (Map) +import qualified Data.Map as Map + +-- compiler/utils +import Bag +import Outputable +import StringBuffer +import FastString +import GHC.Types.Unique.FM +import Util ( readRational, readHexRational ) + +-- compiler/main +import ErrUtils +import GHC.Driver.Session as DynFlags + +-- compiler/basicTypes +import GHC.Types.SrcLoc +import GHC.Types.Module +import GHC.Types.Basic ( InlineSpec(..), RuleMatchInfo(..), + IntegralLit(..), FractionalLit(..), + SourceText(..) ) + +-- compiler/parser +import GHC.Parser.CharClass + +import GHC.Parser.Annotation +} + +-- ----------------------------------------------------------------------------- +-- Alex "Character set macros" + +-- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs +-- Any changes here should likely be reflected there. +$unispace = \x05 -- Trick Alex into handling Unicode. See [Unicode in Alex]. +$nl = [\n\r\f] +$whitechar = [$nl\v\ $unispace] +$white_no_nl = $whitechar # \n -- TODO #8424 +$tab = \t + +$ascdigit = 0-9 +$unidigit = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex]. +$decdigit = $ascdigit -- for now, should really be $digit (ToDo) +$digit = [$ascdigit $unidigit] + +$special = [\(\)\,\;\[\]\`\{\}] +$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:] +$unisymbol = \x04 -- Trick Alex into handling Unicode. See [Unicode in Alex]. +$symbol = [$ascsymbol $unisymbol] # [$special \_\"\'] + +$unilarge = \x01 -- Trick Alex into handling Unicode. See [Unicode in Alex]. +$asclarge = [A-Z] +$large = [$asclarge $unilarge] + +$unismall = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex]. +$ascsmall = [a-z] +$small = [$ascsmall $unismall \_] + +$unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex]. +$graphic = [$small $large $symbol $digit $special $unigraphic \"\'] + +$binit = 0-1 +$octit = 0-7 +$hexit = [$decdigit A-F a-f] + +$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex]. +$idchar = [$small $large $digit $uniidchar \'] + +$pragmachar = [$small $large $digit] + +$docsym = [\| \^ \* \$] + + +-- ----------------------------------------------------------------------------- +-- Alex "Regular expression macros" + +@varid = $small $idchar* -- variable identifiers +@conid = $large $idchar* -- constructor identifiers + +@varsym = ($symbol # \:) $symbol* -- variable (operator) symbol +@consym = \: $symbol* -- constructor (operator) symbol + +-- See Note [Lexing NumericUnderscores extension] and #14473 +@numspc = _* -- numeric spacer (#14473) +@decimal = $decdigit(@numspc $decdigit)* +@binary = $binit(@numspc $binit)* +@octal = $octit(@numspc $octit)* +@hexadecimal = $hexit(@numspc $hexit)* +@exponent = @numspc [eE] [\-\+]? @decimal +@bin_exponent = @numspc [pP] [\-\+]? @decimal + +@qual = (@conid \.)+ +@qvarid = @qual @varid +@qconid = @qual @conid +@qvarsym = @qual @varsym +@qconsym = @qual @consym + +@floating_point = @numspc @decimal \. @decimal @exponent? | @numspc @decimal @exponent +@hex_floating_point = @numspc @hexadecimal \. @hexadecimal @bin_exponent? | @numspc @hexadecimal @bin_exponent + +-- normal signed numerical literals can only be explicitly negative, +-- not explicitly positive (contrast @exponent) +@negative = \- +@signed = @negative ? + + +-- ----------------------------------------------------------------------------- +-- Alex "Identifier" + +haskell :- + + +-- ----------------------------------------------------------------------------- +-- Alex "Rules" + +-- everywhere: skip whitespace +$white_no_nl+ ; +$tab { warnTab } + +-- Everywhere: deal with nested comments. We explicitly rule out +-- pragmas, "{-#", so that we don't accidentally treat them as comments. +-- (this can happen even though pragmas will normally take precedence due to +-- longest-match, because pragmas aren't valid in every state, but comments +-- are). We also rule out nested Haddock comments, if the -haddock flag is +-- set. + +"{-" / { isNormalComment } { nested_comment lexToken } + +-- Single-line comments are a bit tricky. Haskell 98 says that two or +-- more dashes followed by a symbol should be parsed as a varsym, so we +-- have to exclude those. + +-- Since Haddock comments aren't valid in every state, we need to rule them +-- out here. + +-- The following two rules match comments that begin with two dashes, but +-- continue with a different character. The rules test that this character +-- is not a symbol (in which case we'd have a varsym), and that it's not a +-- space followed by a Haddock comment symbol (docsym) (in which case we'd +-- have a Haddock comment). The rules then munch the rest of the line. + +"-- " ~$docsym .* { lineCommentToken } +"--" [^$symbol \ ] .* { lineCommentToken } + +-- Next, match Haddock comments if no -haddock flag + +"-- " $docsym .* / { alexNotPred (ifExtension HaddockBit) } { lineCommentToken } + +-- Now, when we've matched comments that begin with 2 dashes and continue +-- with a different character, we need to match comments that begin with three +-- or more dashes (which clearly can't be Haddock comments). We only need to +-- make sure that the first non-dash character isn't a symbol, and munch the +-- rest of the line. + +"---"\-* ~$symbol .* { lineCommentToken } + +-- Since the previous rules all match dashes followed by at least one +-- character, we also need to match a whole line filled with just dashes. + +"--"\-* / { atEOL } { lineCommentToken } + +-- We need this rule since none of the other single line comment rules +-- actually match this case. + +"-- " / { atEOL } { lineCommentToken } + +-- 'bol' state: beginning of a line. Slurp up all the whitespace (including +-- blank lines) until we find a non-whitespace character, then do layout +-- processing. +-- +-- One slight wibble here: what if the line begins with {-#? In +-- theory, we have to lex the pragma to see if it's one we recognise, +-- and if it is, then we backtrack and do_bol, otherwise we treat it +-- as a nested comment. We don't bother with this: if the line begins +-- with {-#, then we'll assume it's a pragma we know about and go for do_bol. +<bol> { + \n ; + ^\# line { begin line_prag1 } + ^\# / { followedByDigit } { begin line_prag1 } + ^\# pragma .* \n ; -- GCC 3.3 CPP generated, apparently + ^\# \! .* \n ; -- #!, for scripts + () { do_bol } +} + +-- after a layout keyword (let, where, do, of), we begin a new layout +-- context if the curly brace is missing. +-- Careful! This stuff is quite delicate. +<layout, layout_do, layout_if> { + \{ / { notFollowedBy '-' } { hopefully_open_brace } + -- we might encounter {-# here, but {- has been handled already + \n ; + ^\# (line)? { begin line_prag1 } +} + +-- after an 'if', a vertical bar starts a layout context for MultiWayIf +<layout_if> { + \| / { notFollowedBySymbol } { new_layout_context True dontGenerateSemic ITvbar } + () { pop } +} + +-- do is treated in a subtly different way, see new_layout_context +<layout> () { new_layout_context True generateSemic ITvocurly } +<layout_do> () { new_layout_context False generateSemic ITvocurly } + +-- after a new layout context which was found to be to the left of the +-- previous context, we have generated a '{' token, and we now need to +-- generate a matching '}' token. +<layout_left> () { do_layout_left } + +<0,option_prags> \n { begin bol } + +"{-#" $whitechar* $pragmachar+ / { known_pragma linePrags } + { dispatch_pragmas linePrags } + +-- single-line line pragmas, of the form +-- # <line> "<file>" <extra-stuff> \n +<line_prag1> { + @decimal $white_no_nl+ \" [$graphic \ ]* \" { setLineAndFile line_prag1a } + () { failLinePrag1 } +} +<line_prag1a> .* { popLinePrag1 } + +-- Haskell-style line pragmas, of the form +-- {-# LINE <line> "<file>" #-} +<line_prag2> { + @decimal $white_no_nl+ \" [$graphic \ ]* \" { setLineAndFile line_prag2a } +} +<line_prag2a> "#-}"|"-}" { pop } + -- NOTE: accept -} at the end of a LINE pragma, for compatibility + -- with older versions of GHC which generated these. + +-- Haskell-style column pragmas, of the form +-- {-# COLUMN <column> #-} +<column_prag> @decimal $whitechar* "#-}" { setColumn } + +<0,option_prags> { + "{-#" $whitechar* $pragmachar+ + $whitechar+ $pragmachar+ / { known_pragma twoWordPrags } + { dispatch_pragmas twoWordPrags } + + "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags } + { dispatch_pragmas oneWordPrags } + + -- We ignore all these pragmas, but don't generate a warning for them + "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags } + { dispatch_pragmas ignoredPrags } + + -- ToDo: should only be valid inside a pragma: + "#-}" { endPrag } +} + +<option_prags> { + "{-#" $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags } + { dispatch_pragmas fileHeaderPrags } +} + +<0> { + -- In the "0" mode we ignore these pragmas + "{-#" $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags } + { nested_comment lexToken } +} + +<0,option_prags> { + "{-#" { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma") + (nested_comment lexToken) } +} + +-- '0' state: ordinary lexemes + +-- Haddock comments + +<0,option_prags> { + "-- " $docsym / { ifExtension HaddockBit } { multiline_doc_comment } + "{-" \ ? $docsym / { ifExtension HaddockBit } { nested_doc_comment } +} + +-- "special" symbols + +<0> { + "[|" / { ifExtension ThQuotesBit } { token (ITopenExpQuote NoE NormalSyntax) } + "[||" / { ifExtension ThQuotesBit } { token (ITopenTExpQuote NoE) } + "[e|" / { ifExtension ThQuotesBit } { token (ITopenExpQuote HasE NormalSyntax) } + "[e||" / { ifExtension ThQuotesBit } { token (ITopenTExpQuote HasE) } + "[p|" / { ifExtension ThQuotesBit } { token ITopenPatQuote } + "[d|" / { ifExtension ThQuotesBit } { layout_token ITopenDecQuote } + "[t|" / { ifExtension ThQuotesBit } { token ITopenTypQuote } + "|]" / { ifExtension ThQuotesBit } { token (ITcloseQuote NormalSyntax) } + "||]" / { ifExtension ThQuotesBit } { token ITcloseTExpQuote } + + "[" @varid "|" / { ifExtension QqBit } { lex_quasiquote_tok } + + -- qualified quasi-quote (#5555) + "[" @qvarid "|" / { ifExtension QqBit } { lex_qquasiquote_tok } + + $unigraphic -- ⟦ + / { ifCurrentChar '⟦' `alexAndPred` + ifExtension UnicodeSyntaxBit `alexAndPred` + ifExtension ThQuotesBit } + { token (ITopenExpQuote NoE UnicodeSyntax) } + $unigraphic -- ⟧ + / { ifCurrentChar '⟧' `alexAndPred` + ifExtension UnicodeSyntaxBit `alexAndPred` + ifExtension ThQuotesBit } + { token (ITcloseQuote UnicodeSyntax) } +} + +<0> { + "(|" + / { ifExtension ArrowsBit `alexAndPred` + notFollowedBySymbol } + { special (IToparenbar NormalSyntax) } + "|)" + / { ifExtension ArrowsBit } + { special (ITcparenbar NormalSyntax) } + + $unigraphic -- ⦇ + / { ifCurrentChar '⦇' `alexAndPred` + ifExtension UnicodeSyntaxBit `alexAndPred` + ifExtension ArrowsBit } + { special (IToparenbar UnicodeSyntax) } + $unigraphic -- ⦈ + / { ifCurrentChar '⦈' `alexAndPred` + ifExtension UnicodeSyntaxBit `alexAndPred` + ifExtension ArrowsBit } + { special (ITcparenbar UnicodeSyntax) } +} + +<0> { + \? @varid / { ifExtension IpBit } { skip_one_varid ITdupipvarid } +} + +<0> { + "#" @varid / { ifExtension OverloadedLabelsBit } { skip_one_varid ITlabelvarid } +} + +<0> { + "(#" / { ifExtension UnboxedTuplesBit `alexOrPred` + ifExtension UnboxedSumsBit } + { token IToubxparen } + "#)" / { ifExtension UnboxedTuplesBit `alexOrPred` + ifExtension UnboxedSumsBit } + { token ITcubxparen } +} + +<0,option_prags> { + \( { special IToparen } + \) { special ITcparen } + \[ { special ITobrack } + \] { special ITcbrack } + \, { special ITcomma } + \; { special ITsemi } + \` { special ITbackquote } + + \{ { open_brace } + \} { close_brace } +} + +<0,option_prags> { + @qvarid { idtoken qvarid } + @qconid { idtoken qconid } + @varid { varid } + @conid { idtoken conid } +} + +<0> { + @qvarid "#"+ / { ifExtension MagicHashBit } { idtoken qvarid } + @qconid "#"+ / { ifExtension MagicHashBit } { idtoken qconid } + @varid "#"+ / { ifExtension MagicHashBit } { varid } + @conid "#"+ / { ifExtension MagicHashBit } { idtoken conid } +} + +-- Operators classified into prefix, suffix, tight infix, and loose infix. +-- See Note [Whitespace-sensitive operator parsing] +<0> { + @varsym / { precededByClosingToken `alexAndPred` followedByOpeningToken } { varsym_tight_infix } + @varsym / { followedByOpeningToken } { varsym_prefix } + @varsym / { precededByClosingToken } { varsym_suffix } + @varsym { varsym_loose_infix } +} + +-- ToDo: - move `var` and (sym) into lexical syntax? +-- - remove backquote from $special? +<0> { + @qvarsym { idtoken qvarsym } + @qconsym { idtoken qconsym } + @consym { consym } +} + +-- For the normal boxed literals we need to be careful +-- when trying to be close to Haskell98 + +-- Note [Lexing NumericUnderscores extension] (#14473) +-- +-- NumericUnderscores extension allows underscores in numeric literals. +-- Multiple underscores are represented with @numspc macro. +-- To be simpler, we have only the definitions with underscores. +-- And then we have a separate function (tok_integral and tok_frac) +-- that validates the literals. +-- If extensions are not enabled, check that there are no underscores. +-- +<0> { + -- Normal integral literals (:: Num a => a, from Integer) + @decimal { tok_num positive 0 0 decimal } + 0[bB] @numspc @binary / { ifExtension BinaryLiteralsBit } { tok_num positive 2 2 binary } + 0[oO] @numspc @octal { tok_num positive 2 2 octal } + 0[xX] @numspc @hexadecimal { tok_num positive 2 2 hexadecimal } + @negative @decimal / { ifExtension NegativeLiteralsBit } { tok_num negative 1 1 decimal } + @negative 0[bB] @numspc @binary / { ifExtension NegativeLiteralsBit `alexAndPred` + ifExtension BinaryLiteralsBit } { tok_num negative 3 3 binary } + @negative 0[oO] @numspc @octal / { ifExtension NegativeLiteralsBit } { tok_num negative 3 3 octal } + @negative 0[xX] @numspc @hexadecimal / { ifExtension NegativeLiteralsBit } { tok_num negative 3 3 hexadecimal } + + -- Normal rational literals (:: Fractional a => a, from Rational) + @floating_point { tok_frac 0 tok_float } + @negative @floating_point / { ifExtension NegativeLiteralsBit } { tok_frac 0 tok_float } + 0[xX] @numspc @hex_floating_point / { ifExtension HexFloatLiteralsBit } { tok_frac 0 tok_hex_float } + @negative 0[xX] @numspc @hex_floating_point + / { ifExtension HexFloatLiteralsBit `alexAndPred` + ifExtension NegativeLiteralsBit } { tok_frac 0 tok_hex_float } +} + +<0> { + -- Unboxed ints (:: Int#) and words (:: Word#) + -- It's simpler (and faster?) to give separate cases to the negatives, + -- especially considering octal/hexadecimal prefixes. + @decimal \# / { ifExtension MagicHashBit } { tok_primint positive 0 1 decimal } + 0[bB] @numspc @binary \# / { ifExtension MagicHashBit `alexAndPred` + ifExtension BinaryLiteralsBit } { tok_primint positive 2 3 binary } + 0[oO] @numspc @octal \# / { ifExtension MagicHashBit } { tok_primint positive 2 3 octal } + 0[xX] @numspc @hexadecimal \# / { ifExtension MagicHashBit } { tok_primint positive 2 3 hexadecimal } + @negative @decimal \# / { ifExtension MagicHashBit } { tok_primint negative 1 2 decimal } + @negative 0[bB] @numspc @binary \# / { ifExtension MagicHashBit `alexAndPred` + ifExtension BinaryLiteralsBit } { tok_primint negative 3 4 binary } + @negative 0[oO] @numspc @octal \# / { ifExtension MagicHashBit } { tok_primint negative 3 4 octal } + @negative 0[xX] @numspc @hexadecimal \# + / { ifExtension MagicHashBit } { tok_primint negative 3 4 hexadecimal } + + @decimal \# \# / { ifExtension MagicHashBit } { tok_primword 0 2 decimal } + 0[bB] @numspc @binary \# \# / { ifExtension MagicHashBit `alexAndPred` + ifExtension BinaryLiteralsBit } { tok_primword 2 4 binary } + 0[oO] @numspc @octal \# \# / { ifExtension MagicHashBit } { tok_primword 2 4 octal } + 0[xX] @numspc @hexadecimal \# \# / { ifExtension MagicHashBit } { tok_primword 2 4 hexadecimal } + + -- Unboxed floats and doubles (:: Float#, :: Double#) + -- prim_{float,double} work with signed literals + @signed @floating_point \# / { ifExtension MagicHashBit } { tok_frac 1 tok_primfloat } + @signed @floating_point \# \# / { ifExtension MagicHashBit } { tok_frac 2 tok_primdouble } +} + +-- Strings and chars are lexed by hand-written code. The reason is +-- that even if we recognise the string or char here in the regex +-- lexer, we would still have to parse the string afterward in order +-- to convert it to a String. +<0> { + \' { lex_char_tok } + \" { lex_string_tok } +} + +-- Note [Whitespace-sensitive operator parsing] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- In accord with GHC Proposal #229 https://github.com/ghc-proposals/ghc-proposals/blob/master/proposals/0229-whitespace-bang-patterns.rst +-- we classify operator occurrences into four categories: +-- +-- a ! b -- a loose infix occurrence +-- a!b -- a tight infix occurrence +-- a !b -- a prefix occurrence +-- a! b -- a suffix occurrence +-- +-- The rules are a bit more elaborate than simply checking for whitespace, in +-- order to accommodate the following use cases: +-- +-- f (!a) = ... -- prefix occurrence +-- g (a !) -- loose infix occurrence +-- g (! a) -- loose infix occurrence +-- +-- The precise rules are as follows: +-- +-- * Identifiers, literals, and opening brackets (, (#, [, [|, [||, [p|, [e|, +-- [t|, {, are considered "opening tokens". The function followedByOpeningToken +-- tests whether the next token is an opening token. +-- +-- * Identifiers, literals, and closing brackets ), #), ], |], }, +-- are considered "closing tokens". The function precededByClosingToken tests +-- whether the previous token is a closing token. +-- +-- * Whitespace, comments, separators, and other tokens, are considered +-- neither opening nor closing. +-- +-- * Any unqualified operator occurrence is classified as prefix, suffix, or +-- tight/loose infix, based on preceding and following tokens: +-- +-- precededByClosingToken | followedByOpeningToken | Occurrence +-- ------------------------+------------------------+------------ +-- False | True | prefix +-- True | False | suffix +-- True | True | tight infix +-- False | False | loose infix +-- ------------------------+------------------------+------------ +-- +-- A loose infix occurrence is always considered an operator. Other types of +-- occurrences may be assigned a special per-operator meaning override: +-- +-- Operator | Occurrence | Token returned +-- ----------+---------------+------------------------------------------ +-- ! | prefix | ITbang +-- | | strictness annotation or bang pattern, +-- | | e.g. f !x = rhs, data T = MkT !a +-- | not prefix | ITvarsym "!" +-- | | ordinary operator or type operator, +-- | | e.g. xs ! 3, (! x), Int ! Bool +-- ----------+---------------+------------------------------------------ +-- ~ | prefix | ITtilde +-- | | laziness annotation or lazy pattern, +-- | | e.g. f ~x = rhs, data T = MkT ~a +-- | not prefix | ITvarsym "~" +-- | | ordinary operator or type operator, +-- | | e.g. xs ~ 3, (~ x), Int ~ Bool +-- ----------+---------------+------------------------------------------ +-- $ $$ | prefix | ITdollar, ITdollardollar +-- | | untyped or typed Template Haskell splice, +-- | | e.g. $(f x), $$(f x), $$"str" +-- | not prefix | ITvarsym "$", ITvarsym "$$" +-- | | ordinary operator or type operator, +-- | | e.g. f $ g x, a $$ b +-- ----------+---------------+------------------------------------------ +-- @ | prefix | ITtypeApp +-- | | type application, e.g. fmap @Maybe +-- | tight infix | ITat +-- | | as-pattern, e.g. f p@(a,b) = rhs +-- | suffix | parse error +-- | | e.g. f p@ x = rhs +-- | loose infix | ITvarsym "@" +-- | | ordinary operator or type operator, +-- | | e.g. f @ g, (f @) +-- ----------+---------------+------------------------------------------ +-- +-- Also, some of these overrides are guarded behind language extensions. +-- According to the specification, we must determine the occurrence based on +-- surrounding *tokens* (see the proposal for the exact rules). However, in +-- the implementation we cheat a little and do the classification based on +-- characters, for reasons of both simplicity and efficiency (see +-- 'followedByOpeningToken' and 'precededByClosingToken') +-- +-- When an operator is subject to a meaning override, it is mapped to special +-- token: ITbang, ITtilde, ITat, ITdollar, ITdollardollar. Otherwise, it is +-- returned as ITvarsym. +-- +-- For example, this is how we process the (!): +-- +-- precededByClosingToken | followedByOpeningToken | Token +-- ------------------------+------------------------+------------- +-- False | True | ITbang +-- True | False | ITvarsym "!" +-- True | True | ITvarsym "!" +-- False | False | ITvarsym "!" +-- ------------------------+------------------------+------------- +-- +-- And this is how we process the (@): +-- +-- precededByClosingToken | followedByOpeningToken | Token +-- ------------------------+------------------------+------------- +-- False | True | ITtypeApp +-- True | False | parse error +-- True | True | ITat +-- False | False | ITvarsym "@" +-- ------------------------+------------------------+------------- + +-- ----------------------------------------------------------------------------- +-- Alex "Haskell code fragment bottom" + +{ + +-- ----------------------------------------------------------------------------- +-- The token type + +data Token + = ITas -- Haskell keywords + | ITcase + | ITclass + | ITdata + | ITdefault + | ITderiving + | ITdo + | ITelse + | IThiding + | ITforeign + | ITif + | ITimport + | ITin + | ITinfix + | ITinfixl + | ITinfixr + | ITinstance + | ITlet + | ITmodule + | ITnewtype + | ITof + | ITqualified + | ITthen + | ITtype + | ITwhere + + | ITforall IsUnicodeSyntax -- GHC extension keywords + | ITexport + | ITlabel + | ITdynamic + | ITsafe + | ITinterruptible + | ITunsafe + | ITstdcallconv + | ITccallconv + | ITcapiconv + | ITprimcallconv + | ITjavascriptcallconv + | ITmdo + | ITfamily + | ITrole + | ITgroup + | ITby + | ITusing + | ITpattern + | ITstatic + | ITstock + | ITanyclass + | ITvia + + -- Backpack tokens + | ITunit + | ITsignature + | ITdependency + | ITrequires + + -- Pragmas, see note [Pragma source text] in BasicTypes + | ITinline_prag SourceText InlineSpec RuleMatchInfo + | ITspec_prag SourceText -- SPECIALISE + | ITspec_inline_prag SourceText Bool -- SPECIALISE INLINE (or NOINLINE) + | ITsource_prag SourceText + | ITrules_prag SourceText + | ITwarning_prag SourceText + | ITdeprecated_prag SourceText + | ITline_prag SourceText -- not usually produced, see 'UsePosPragsBit' + | ITcolumn_prag SourceText -- not usually produced, see 'UsePosPragsBit' + | ITscc_prag SourceText + | ITgenerated_prag SourceText + | ITcore_prag SourceText -- hdaume: core annotations + | ITunpack_prag SourceText + | ITnounpack_prag SourceText + | ITann_prag SourceText + | ITcomplete_prag SourceText + | ITclose_prag + | IToptions_prag String + | ITinclude_prag String + | ITlanguage_prag + | ITminimal_prag SourceText + | IToverlappable_prag SourceText -- instance overlap mode + | IToverlapping_prag SourceText -- instance overlap mode + | IToverlaps_prag SourceText -- instance overlap mode + | ITincoherent_prag SourceText -- instance overlap mode + | ITctype SourceText + | ITcomment_line_prag -- See Note [Nested comment line pragmas] + + | ITdotdot -- reserved symbols + | ITcolon + | ITdcolon IsUnicodeSyntax + | ITequal + | ITlam + | ITlcase + | ITvbar + | ITlarrow IsUnicodeSyntax + | ITrarrow IsUnicodeSyntax + | ITdarrow IsUnicodeSyntax + | ITminus + | ITbang -- Prefix (!) only, e.g. f !x = rhs + | ITtilde -- Prefix (~) only, e.g. f ~x = rhs + | ITat -- Tight infix (@) only, e.g. f x@pat = rhs + | ITtypeApp -- Prefix (@) only, e.g. f @t + | ITstar IsUnicodeSyntax + | ITdot + + | ITbiglam -- GHC-extension symbols + + | ITocurly -- special symbols + | ITccurly + | ITvocurly + | ITvccurly + | ITobrack + | ITopabrack -- [:, for parallel arrays with -XParallelArrays + | ITcpabrack -- :], for parallel arrays with -XParallelArrays + | ITcbrack + | IToparen + | ITcparen + | IToubxparen + | ITcubxparen + | ITsemi + | ITcomma + | ITunderscore + | ITbackquote + | ITsimpleQuote -- ' + + | ITvarid FastString -- identifiers + | ITconid FastString + | ITvarsym FastString + | ITconsym FastString + | ITqvarid (FastString,FastString) + | ITqconid (FastString,FastString) + | ITqvarsym (FastString,FastString) + | ITqconsym (FastString,FastString) + + | ITdupipvarid FastString -- GHC extension: implicit param: ?x + | ITlabelvarid FastString -- Overloaded label: #x + + | ITchar SourceText Char -- Note [Literal source text] in BasicTypes + | ITstring SourceText FastString -- Note [Literal source text] in BasicTypes + | ITinteger IntegralLit -- Note [Literal source text] in BasicTypes + | ITrational FractionalLit + + | ITprimchar SourceText Char -- Note [Literal source text] in BasicTypes + | ITprimstring SourceText ByteString -- Note [Literal source text] @BasicTypes + | ITprimint SourceText Integer -- Note [Literal source text] in BasicTypes + | ITprimword SourceText Integer -- Note [Literal source text] in BasicTypes + | ITprimfloat FractionalLit + | ITprimdouble FractionalLit + + -- Template Haskell extension tokens + | ITopenExpQuote HasE IsUnicodeSyntax -- [| or [e| + | ITopenPatQuote -- [p| + | ITopenDecQuote -- [d| + | ITopenTypQuote -- [t| + | ITcloseQuote IsUnicodeSyntax -- |] + | ITopenTExpQuote HasE -- [|| or [e|| + | ITcloseTExpQuote -- ||] + | ITdollar -- prefix $ + | ITdollardollar -- prefix $$ + | ITtyQuote -- '' + | ITquasiQuote (FastString,FastString,PsSpan) + -- ITquasiQuote(quoter, quote, loc) + -- represents a quasi-quote of the form + -- [quoter| quote |] + | ITqQuasiQuote (FastString,FastString,FastString,PsSpan) + -- ITqQuasiQuote(Qual, quoter, quote, loc) + -- represents a qualified quasi-quote of the form + -- [Qual.quoter| quote |] + + -- Arrow notation extension + | ITproc + | ITrec + | IToparenbar IsUnicodeSyntax -- ^ @(|@ + | ITcparenbar IsUnicodeSyntax -- ^ @|)@ + | ITlarrowtail IsUnicodeSyntax -- ^ @-<@ + | ITrarrowtail IsUnicodeSyntax -- ^ @>-@ + | ITLarrowtail IsUnicodeSyntax -- ^ @-<<@ + | ITRarrowtail IsUnicodeSyntax -- ^ @>>-@ + + | ITunknown String -- ^ Used when the lexer can't make sense of it + | ITeof -- ^ end of file token + + -- Documentation annotations + | ITdocCommentNext String -- ^ something beginning @-- |@ + | ITdocCommentPrev String -- ^ something beginning @-- ^@ + | ITdocCommentNamed String -- ^ something beginning @-- $@ + | ITdocSection Int String -- ^ a section heading + | ITdocOptions String -- ^ doc options (prune, ignore-exports, etc) + | ITlineComment String -- ^ comment starting by "--" + | ITblockComment String -- ^ comment in {- -} + + deriving Show + +instance Outputable Token where + ppr x = text (show x) + + +-- the bitmap provided as the third component indicates whether the +-- corresponding extension keyword is valid under the extension options +-- provided to the compiler; if the extension corresponding to *any* of the +-- bits set in the bitmap is enabled, the keyword is valid (this setup +-- facilitates using a keyword in two different extensions that can be +-- activated independently) +-- +reservedWordsFM :: UniqFM (Token, ExtsBitmap) +reservedWordsFM = listToUFM $ + map (\(x, y, z) -> (mkFastString x, (y, z))) + [( "_", ITunderscore, 0 ), + ( "as", ITas, 0 ), + ( "case", ITcase, 0 ), + ( "class", ITclass, 0 ), + ( "data", ITdata, 0 ), + ( "default", ITdefault, 0 ), + ( "deriving", ITderiving, 0 ), + ( "do", ITdo, 0 ), + ( "else", ITelse, 0 ), + ( "hiding", IThiding, 0 ), + ( "if", ITif, 0 ), + ( "import", ITimport, 0 ), + ( "in", ITin, 0 ), + ( "infix", ITinfix, 0 ), + ( "infixl", ITinfixl, 0 ), + ( "infixr", ITinfixr, 0 ), + ( "instance", ITinstance, 0 ), + ( "let", ITlet, 0 ), + ( "module", ITmodule, 0 ), + ( "newtype", ITnewtype, 0 ), + ( "of", ITof, 0 ), + ( "qualified", ITqualified, 0 ), + ( "then", ITthen, 0 ), + ( "type", ITtype, 0 ), + ( "where", ITwhere, 0 ), + + ( "forall", ITforall NormalSyntax, 0), + ( "mdo", ITmdo, xbit RecursiveDoBit), + -- See Note [Lexing type pseudo-keywords] + ( "family", ITfamily, 0 ), + ( "role", ITrole, 0 ), + ( "pattern", ITpattern, xbit PatternSynonymsBit), + ( "static", ITstatic, xbit StaticPointersBit ), + ( "stock", ITstock, 0 ), + ( "anyclass", ITanyclass, 0 ), + ( "via", ITvia, 0 ), + ( "group", ITgroup, xbit TransformComprehensionsBit), + ( "by", ITby, xbit TransformComprehensionsBit), + ( "using", ITusing, xbit TransformComprehensionsBit), + + ( "foreign", ITforeign, xbit FfiBit), + ( "export", ITexport, xbit FfiBit), + ( "label", ITlabel, xbit FfiBit), + ( "dynamic", ITdynamic, xbit FfiBit), + ( "safe", ITsafe, xbit FfiBit .|. + xbit SafeHaskellBit), + ( "interruptible", ITinterruptible, xbit InterruptibleFfiBit), + ( "unsafe", ITunsafe, xbit FfiBit), + ( "stdcall", ITstdcallconv, xbit FfiBit), + ( "ccall", ITccallconv, xbit FfiBit), + ( "capi", ITcapiconv, xbit CApiFfiBit), + ( "prim", ITprimcallconv, xbit FfiBit), + ( "javascript", ITjavascriptcallconv, xbit FfiBit), + + ( "unit", ITunit, 0 ), + ( "dependency", ITdependency, 0 ), + ( "signature", ITsignature, 0 ), + + ( "rec", ITrec, xbit ArrowsBit .|. + xbit RecursiveDoBit), + ( "proc", ITproc, xbit ArrowsBit) + ] + +{----------------------------------- +Note [Lexing type pseudo-keywords] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One might think that we wish to treat 'family' and 'role' as regular old +varids whenever -XTypeFamilies and -XRoleAnnotations are off, respectively. +But, there is no need to do so. These pseudo-keywords are not stolen syntax: +they are only used after the keyword 'type' at the top-level, where varids are +not allowed. Furthermore, checks further downstream (GHC.Tc.TyCl) ensure that +type families and role annotations are never declared without their extensions +on. In fact, by unconditionally lexing these pseudo-keywords as special, we +can get better error messages. + +Also, note that these are included in the `varid` production in the parser -- +a key detail to make all this work. +-------------------------------------} + +reservedSymsFM :: UniqFM (Token, IsUnicodeSyntax, ExtsBitmap) +reservedSymsFM = listToUFM $ + map (\ (x,w,y,z) -> (mkFastString x,(w,y,z))) + [ ("..", ITdotdot, NormalSyntax, 0 ) + -- (:) is a reserved op, meaning only list cons + ,(":", ITcolon, NormalSyntax, 0 ) + ,("::", ITdcolon NormalSyntax, NormalSyntax, 0 ) + ,("=", ITequal, NormalSyntax, 0 ) + ,("\\", ITlam, NormalSyntax, 0 ) + ,("|", ITvbar, NormalSyntax, 0 ) + ,("<-", ITlarrow NormalSyntax, NormalSyntax, 0 ) + ,("->", ITrarrow NormalSyntax, NormalSyntax, 0 ) + ,("=>", ITdarrow NormalSyntax, NormalSyntax, 0 ) + ,("-", ITminus, NormalSyntax, 0 ) + + ,("*", ITstar NormalSyntax, NormalSyntax, xbit StarIsTypeBit) + + -- For 'forall a . t' + ,(".", ITdot, NormalSyntax, 0 ) + + ,("-<", ITlarrowtail NormalSyntax, NormalSyntax, xbit ArrowsBit) + ,(">-", ITrarrowtail NormalSyntax, NormalSyntax, xbit ArrowsBit) + ,("-<<", ITLarrowtail NormalSyntax, NormalSyntax, xbit ArrowsBit) + ,(">>-", ITRarrowtail NormalSyntax, NormalSyntax, xbit ArrowsBit) + + ,("∷", ITdcolon UnicodeSyntax, UnicodeSyntax, 0 ) + ,("⇒", ITdarrow UnicodeSyntax, UnicodeSyntax, 0 ) + ,("∀", ITforall UnicodeSyntax, UnicodeSyntax, 0 ) + ,("→", ITrarrow UnicodeSyntax, UnicodeSyntax, 0 ) + ,("←", ITlarrow UnicodeSyntax, UnicodeSyntax, 0 ) + + ,("⤙", ITlarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit) + ,("⤚", ITrarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit) + ,("⤛", ITLarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit) + ,("⤜", ITRarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit) + + ,("★", ITstar UnicodeSyntax, UnicodeSyntax, xbit StarIsTypeBit) + + -- ToDo: ideally, → and ∷ should be "specials", so that they cannot + -- form part of a large operator. This would let us have a better + -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe). + ] + +-- ----------------------------------------------------------------------------- +-- Lexer actions + +type Action = PsSpan -> StringBuffer -> Int -> P (PsLocated Token) + +special :: Token -> Action +special tok span _buf _len = return (L span tok) + +token, layout_token :: Token -> Action +token t span _buf _len = return (L span t) +layout_token t span _buf _len = pushLexState layout >> return (L span t) + +idtoken :: (StringBuffer -> Int -> Token) -> Action +idtoken f span buf len = return (L span $! (f buf len)) + +skip_one_varid :: (FastString -> Token) -> Action +skip_one_varid f span buf len + = return (L span $! f (lexemeToFastString (stepOn buf) (len-1))) + +skip_two_varid :: (FastString -> Token) -> Action +skip_two_varid f span buf len + = return (L span $! f (lexemeToFastString (stepOn (stepOn buf)) (len-2))) + +strtoken :: (String -> Token) -> Action +strtoken f span buf len = + return (L span $! (f $! lexemeToString buf len)) + +begin :: Int -> Action +begin code _span _str _len = do pushLexState code; lexToken + +pop :: Action +pop _span _buf _len = do _ <- popLexState + lexToken +-- See Note [Nested comment line pragmas] +failLinePrag1 :: Action +failLinePrag1 span _buf _len = do + b <- getBit InNestedCommentBit + if b then return (L span ITcomment_line_prag) + else lexError "lexical error in pragma" + +-- See Note [Nested comment line pragmas] +popLinePrag1 :: Action +popLinePrag1 span _buf _len = do + b <- getBit InNestedCommentBit + if b then return (L span ITcomment_line_prag) else do + _ <- popLexState + lexToken + +hopefully_open_brace :: Action +hopefully_open_brace span buf len + = do relaxed <- getBit RelaxedLayoutBit + ctx <- getContext + (AI l _) <- getInput + let offset = srcLocCol (psRealLoc l) + isOK = relaxed || + case ctx of + Layout prev_off _ : _ -> prev_off < offset + _ -> True + if isOK then pop_and open_brace span buf len + else addFatalError (mkSrcSpanPs span) (text "Missing block") + +pop_and :: Action -> Action +pop_and act span buf len = do _ <- popLexState + act span buf len + +-- See Note [Whitespace-sensitive operator parsing] +followedByOpeningToken :: AlexAccPred ExtsBitmap +followedByOpeningToken _ _ _ (AI _ buf) + | atEnd buf = False + | otherwise = + case nextChar buf of + ('{', buf') -> nextCharIsNot buf' (== '-') + ('(', _) -> True + ('[', _) -> True + ('\"', _) -> True + ('\'', _) -> True + ('_', _) -> True + (c, _) -> isAlphaNum c + +-- See Note [Whitespace-sensitive operator parsing] +precededByClosingToken :: AlexAccPred ExtsBitmap +precededByClosingToken _ (AI _ buf) _ _ = + case prevChar buf '\n' of + '}' -> decodePrevNChars 1 buf /= "-" + ')' -> True + ']' -> True + '\"' -> True + '\'' -> True + '_' -> True + c -> isAlphaNum c + +{-# INLINE nextCharIs #-} +nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool +nextCharIs buf p = not (atEnd buf) && p (currentChar buf) + +{-# INLINE nextCharIsNot #-} +nextCharIsNot :: StringBuffer -> (Char -> Bool) -> Bool +nextCharIsNot buf p = not (nextCharIs buf p) + +notFollowedBy :: Char -> AlexAccPred ExtsBitmap +notFollowedBy char _ _ _ (AI _ buf) + = nextCharIsNot buf (== char) + +notFollowedBySymbol :: AlexAccPred ExtsBitmap +notFollowedBySymbol _ _ _ (AI _ buf) + = nextCharIsNot buf (`elem` "!#$%&*+./<=>?@\\^|-~") + +followedByDigit :: AlexAccPred ExtsBitmap +followedByDigit _ _ _ (AI _ buf) + = afterOptionalSpace buf (\b -> nextCharIs b (`elem` ['0'..'9'])) + +ifCurrentChar :: Char -> AlexAccPred ExtsBitmap +ifCurrentChar char _ (AI _ buf) _ _ + = nextCharIs buf (== char) + +-- We must reject doc comments as being ordinary comments everywhere. +-- In some cases the doc comment will be selected as the lexeme due to +-- maximal munch, but not always, because the nested comment rule is +-- valid in all states, but the doc-comment rules are only valid in +-- the non-layout states. +isNormalComment :: AlexAccPred ExtsBitmap +isNormalComment bits _ _ (AI _ buf) + | HaddockBit `xtest` bits = notFollowedByDocOrPragma + | otherwise = nextCharIsNot buf (== '#') + where + notFollowedByDocOrPragma + = afterOptionalSpace buf (\b -> nextCharIsNot b (`elem` "|^*$#")) + +afterOptionalSpace :: StringBuffer -> (StringBuffer -> Bool) -> Bool +afterOptionalSpace buf p + = if nextCharIs buf (== ' ') + then p (snd (nextChar buf)) + else p buf + +atEOL :: AlexAccPred ExtsBitmap +atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n' + +ifExtension :: ExtBits -> AlexAccPred ExtsBitmap +ifExtension extBits bits _ _ _ = extBits `xtest` bits + +alexNotPred p userState in1 len in2 + = not (p userState in1 len in2) + +alexOrPred p1 p2 userState in1 len in2 + = p1 userState in1 len in2 || p2 userState in1 len in2 + +multiline_doc_comment :: Action +multiline_doc_comment span buf _len = withLexedDocType (worker "") + where + worker commentAcc input docType checkNextLine = case alexGetChar' input of + Just ('\n', input') + | checkNextLine -> case checkIfCommentLine input' of + Just input -> worker ('\n':commentAcc) input docType checkNextLine + Nothing -> docCommentEnd input commentAcc docType buf span + | otherwise -> docCommentEnd input commentAcc docType buf span + Just (c, input) -> worker (c:commentAcc) input docType checkNextLine + Nothing -> docCommentEnd input commentAcc docType buf span + + -- Check if the next line of input belongs to this doc comment as well. + -- A doc comment continues onto the next line when the following + -- conditions are met: + -- * The line starts with "--" + -- * The line doesn't start with "---". + -- * The line doesn't start with "-- $", because that would be the + -- start of a /new/ named haddock chunk (#10398). + checkIfCommentLine :: AlexInput -> Maybe AlexInput + checkIfCommentLine input = check (dropNonNewlineSpace input) + where + check input = do + ('-', input) <- alexGetChar' input + ('-', input) <- alexGetChar' input + (c, after_c) <- alexGetChar' input + case c of + '-' -> Nothing + ' ' -> case alexGetChar' after_c of + Just ('$', _) -> Nothing + _ -> Just input + _ -> Just input + + dropNonNewlineSpace input = case alexGetChar' input of + Just (c, input') + | isSpace c && c /= '\n' -> dropNonNewlineSpace input' + | otherwise -> input + Nothing -> input + +lineCommentToken :: Action +lineCommentToken span buf len = do + b <- getBit RawTokenStreamBit + if b then strtoken ITlineComment span buf len else lexToken + +{- + nested comments require traversing by hand, they can't be parsed + using regular expressions. +-} +nested_comment :: P (PsLocated Token) -> Action +nested_comment cont span buf len = do + input <- getInput + go (reverse $ lexemeToString buf len) (1::Int) input + where + go commentAcc 0 input = do + setInput input + b <- getBit RawTokenStreamBit + if b + then docCommentEnd input commentAcc ITblockComment buf span + else cont + go commentAcc n input = case alexGetChar' input of + Nothing -> errBrace input (psRealSpan span) + Just ('-',input) -> case alexGetChar' input of + Nothing -> errBrace input (psRealSpan span) + Just ('\125',input) -> go ('\125':'-':commentAcc) (n-1) input -- '}' + Just (_,_) -> go ('-':commentAcc) n input + Just ('\123',input) -> case alexGetChar' input of -- '{' char + Nothing -> errBrace input (psRealSpan span) + Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input + Just (_,_) -> go ('\123':commentAcc) n input + -- See Note [Nested comment line pragmas] + Just ('\n',input) -> case alexGetChar' input of + Nothing -> errBrace input (psRealSpan span) + Just ('#',_) -> do (parsedAcc,input) <- parseNestedPragma input + go (parsedAcc ++ '\n':commentAcc) n input + Just (_,_) -> go ('\n':commentAcc) n input + Just (c,input) -> go (c:commentAcc) n input + +nested_doc_comment :: Action +nested_doc_comment span buf _len = withLexedDocType (go "") + where + go commentAcc input docType _ = case alexGetChar' input of + Nothing -> errBrace input (psRealSpan span) + Just ('-',input) -> case alexGetChar' input of + Nothing -> errBrace input (psRealSpan span) + Just ('\125',input) -> + docCommentEnd input commentAcc docType buf span + Just (_,_) -> go ('-':commentAcc) input docType False + Just ('\123', input) -> case alexGetChar' input of + Nothing -> errBrace input (psRealSpan span) + Just ('-',input) -> do + setInput input + let cont = do input <- getInput; go commentAcc input docType False + nested_comment cont span buf _len + Just (_,_) -> go ('\123':commentAcc) input docType False + -- See Note [Nested comment line pragmas] + Just ('\n',input) -> case alexGetChar' input of + Nothing -> errBrace input (psRealSpan span) + Just ('#',_) -> do (parsedAcc,input) <- parseNestedPragma input + go (parsedAcc ++ '\n':commentAcc) input docType False + Just (_,_) -> go ('\n':commentAcc) input docType False + Just (c,input) -> go (c:commentAcc) input docType False + +-- See Note [Nested comment line pragmas] +parseNestedPragma :: AlexInput -> P (String,AlexInput) +parseNestedPragma input@(AI _ buf) = do + origInput <- getInput + setInput input + setExts (.|. xbit InNestedCommentBit) + pushLexState bol + lt <- lexToken + _ <- popLexState + setExts (.&. complement (xbit InNestedCommentBit)) + postInput@(AI _ postBuf) <- getInput + setInput origInput + case unLoc lt of + ITcomment_line_prag -> do + let bytes = byteDiff buf postBuf + diff = lexemeToString buf bytes + return (reverse diff, postInput) + lt' -> panic ("parseNestedPragma: unexpected token" ++ (show lt')) + +{- +Note [Nested comment line pragmas] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We used to ignore cpp-preprocessor-generated #line pragmas if they were inside +nested comments. + +Now, when parsing a nested comment, if we encounter a line starting with '#' we +call parseNestedPragma, which executes the following: +1. Save the current lexer input (loc, buf) for later +2. Set the current lexer input to the beginning of the line starting with '#' +3. Turn the 'InNestedComment' extension on +4. Push the 'bol' lexer state +5. Lex a token. Due to (2), (3), and (4), this should always lex a single line + or less and return the ITcomment_line_prag token. This may set source line + and file location if a #line pragma is successfully parsed +6. Restore lexer input and state to what they were before we did all this +7. Return control to the function parsing a nested comment, informing it of + what the lexer parsed + +Regarding (5) above: +Every exit from the 'bol' lexer state (do_bol, popLinePrag1, failLinePrag1) +checks if the 'InNestedComment' extension is set. If it is, that function will +return control to parseNestedPragma by returning the ITcomment_line_prag token. + +See #314 for more background on the bug this fixes. +-} + +withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (PsLocated Token)) + -> P (PsLocated Token) +withLexedDocType lexDocComment = do + input@(AI _ buf) <- getInput + case prevChar buf ' ' of + -- The `Bool` argument to lexDocComment signals whether or not the next + -- line of input might also belong to this doc comment. + '|' -> lexDocComment input ITdocCommentNext True + '^' -> lexDocComment input ITdocCommentPrev True + '$' -> lexDocComment input ITdocCommentNamed True + '*' -> lexDocSection 1 input + _ -> panic "withLexedDocType: Bad doc type" + where + lexDocSection n input = case alexGetChar' input of + Just ('*', input) -> lexDocSection (n+1) input + Just (_, _) -> lexDocComment input (ITdocSection n) False + Nothing -> do setInput input; lexToken -- eof reached, lex it normally + +-- RULES pragmas turn on the forall and '.' keywords, and we turn them +-- off again at the end of the pragma. +rulePrag :: Action +rulePrag span buf len = do + setExts (.|. xbit InRulePragBit) + let !src = lexemeToString buf len + return (L span (ITrules_prag (SourceText src))) + +-- When 'UsePosPragsBit' is not set, it is expected that we emit a token instead +-- of updating the position in 'PState' +linePrag :: Action +linePrag span buf len = do + usePosPrags <- getBit UsePosPragsBit + if usePosPrags + then begin line_prag2 span buf len + else let !src = lexemeToString buf len + in return (L span (ITline_prag (SourceText src))) + +-- When 'UsePosPragsBit' is not set, it is expected that we emit a token instead +-- of updating the position in 'PState' +columnPrag :: Action +columnPrag span buf len = do + usePosPrags <- getBit UsePosPragsBit + let !src = lexemeToString buf len + if usePosPrags + then begin column_prag span buf len + else let !src = lexemeToString buf len + in return (L span (ITcolumn_prag (SourceText src))) + +endPrag :: Action +endPrag span _buf _len = do + setExts (.&. complement (xbit InRulePragBit)) + return (L span ITclose_prag) + +-- docCommentEnd +------------------------------------------------------------------------------- +-- This function is quite tricky. We can't just return a new token, we also +-- need to update the state of the parser. Why? Because the token is longer +-- than what was lexed by Alex, and the lexToken function doesn't know this, so +-- it writes the wrong token length to the parser state. This function is +-- called afterwards, so it can just update the state. + +docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer -> + PsSpan -> P (PsLocated Token) +docCommentEnd input commentAcc docType buf span = do + setInput input + let (AI loc nextBuf) = input + comment = reverse commentAcc + span' = mkPsSpan (psSpanStart span) loc + last_len = byteDiff buf nextBuf + + span `seq` setLastToken span' last_len + return (L span' (docType comment)) + +errBrace :: AlexInput -> RealSrcSpan -> P a +errBrace (AI end _) span = failLocMsgP (realSrcSpanStart span) (psRealLoc end) "unterminated `{-'" + +open_brace, close_brace :: Action +open_brace span _str _len = do + ctx <- getContext + setContext (NoLayout:ctx) + return (L span ITocurly) +close_brace span _str _len = do + popContext + return (L span ITccurly) + +qvarid, qconid :: StringBuffer -> Int -> Token +qvarid buf len = ITqvarid $! splitQualName buf len False +qconid buf len = ITqconid $! splitQualName buf len False + +splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString) +-- takes a StringBuffer and a length, and returns the module name +-- and identifier parts of a qualified name. Splits at the *last* dot, +-- because of hierarchical module names. +splitQualName orig_buf len parens = split orig_buf orig_buf + where + split buf dot_buf + | orig_buf `byteDiff` buf >= len = done dot_buf + | c == '.' = found_dot buf' + | otherwise = split buf' dot_buf + where + (c,buf') = nextChar buf + + -- careful, we might get names like M.... + -- so, if the character after the dot is not upper-case, this is + -- the end of the qualifier part. + found_dot buf -- buf points after the '.' + | isUpper c = split buf' buf + | otherwise = done buf + where + (c,buf') = nextChar buf + + done dot_buf = + (lexemeToFastString orig_buf (qual_size - 1), + if parens -- Prelude.(+) + then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2) + else lexemeToFastString dot_buf (len - qual_size)) + where + qual_size = orig_buf `byteDiff` dot_buf + +varid :: Action +varid span buf len = + case lookupUFM reservedWordsFM fs of + Just (ITcase, _) -> do + lastTk <- getLastTk + keyword <- case lastTk of + Just ITlam -> do + lambdaCase <- getBit LambdaCaseBit + unless lambdaCase $ do + pState <- getPState + addError (mkSrcSpanPs (last_loc pState)) $ text + "Illegal lambda-case (use LambdaCase)" + return ITlcase + _ -> return ITcase + maybe_layout keyword + return $ L span keyword + Just (keyword, 0) -> do + maybe_layout keyword + return $ L span keyword + Just (keyword, i) -> do + exts <- getExts + if exts .&. i /= 0 + then do + maybe_layout keyword + return $ L span keyword + else + return $ L span $ ITvarid fs + Nothing -> + return $ L span $ ITvarid fs + where + !fs = lexemeToFastString buf len + +conid :: StringBuffer -> Int -> Token +conid buf len = ITconid $! lexemeToFastString buf len + +qvarsym, qconsym :: StringBuffer -> Int -> Token +qvarsym buf len = ITqvarsym $! splitQualName buf len False +qconsym buf len = ITqconsym $! splitQualName buf len False + +-- See Note [Whitespace-sensitive operator parsing] +varsym_prefix :: Action +varsym_prefix = sym $ \exts s -> + if | TypeApplicationsBit `xtest` exts, s == fsLit "@" + -> return ITtypeApp + | ThQuotesBit `xtest` exts, s == fsLit "$" + -> return ITdollar + | ThQuotesBit `xtest` exts, s == fsLit "$$" + -> return ITdollardollar + | s == fsLit "!" -> return ITbang + | s == fsLit "~" -> return ITtilde + | otherwise -> return (ITvarsym s) + +-- See Note [Whitespace-sensitive operator parsing] +varsym_suffix :: Action +varsym_suffix = sym $ \_ s -> + if | s == fsLit "@" + -> failMsgP "Suffix occurrence of @. For an as-pattern, remove the leading whitespace." + | otherwise -> return (ITvarsym s) + +-- See Note [Whitespace-sensitive operator parsing] +varsym_tight_infix :: Action +varsym_tight_infix = sym $ \_ s -> + if | s == fsLit "@" -> return ITat + | otherwise -> return (ITvarsym s) + +-- See Note [Whitespace-sensitive operator parsing] +varsym_loose_infix :: Action +varsym_loose_infix = sym (\_ s -> return $ ITvarsym s) + +consym :: Action +consym = sym (\_exts s -> return $ ITconsym s) + +sym :: (ExtsBitmap -> FastString -> P Token) -> Action +sym con span buf len = + case lookupUFM reservedSymsFM fs of + Just (keyword, NormalSyntax, 0) -> + return $ L span keyword + Just (keyword, NormalSyntax, i) -> do + exts <- getExts + if exts .&. i /= 0 + then return $ L span keyword + else L span <$!> con exts fs + Just (keyword, UnicodeSyntax, 0) -> do + exts <- getExts + if xtest UnicodeSyntaxBit exts + then return $ L span keyword + else L span <$!> con exts fs + Just (keyword, UnicodeSyntax, i) -> do + exts <- getExts + if exts .&. i /= 0 && xtest UnicodeSyntaxBit exts + then return $ L span keyword + else L span <$!> con exts fs + Nothing -> do + exts <- getExts + L span <$!> con exts fs + where + !fs = lexemeToFastString buf len + +-- Variations on the integral numeric literal. +tok_integral :: (SourceText -> Integer -> Token) + -> (Integer -> Integer) + -> Int -> Int + -> (Integer, (Char -> Int)) + -> Action +tok_integral itint transint transbuf translen (radix,char_to_int) span buf len = do + numericUnderscores <- getBit NumericUnderscoresBit -- #14473 + let src = lexemeToString buf len + when ((not numericUnderscores) && ('_' `elem` src)) $ do + pState <- getPState + addError (mkSrcSpanPs (last_loc pState)) $ text + "Use NumericUnderscores to allow underscores in integer literals" + return $ L span $ itint (SourceText src) + $! transint $ parseUnsignedInteger + (offsetBytes transbuf buf) (subtract translen len) radix char_to_int + +tok_num :: (Integer -> Integer) + -> Int -> Int + -> (Integer, (Char->Int)) -> Action +tok_num = tok_integral $ \case + st@(SourceText ('-':_)) -> itint st (const True) + st@(SourceText _) -> itint st (const False) + st@NoSourceText -> itint st (< 0) + where + itint :: SourceText -> (Integer -> Bool) -> Integer -> Token + itint !st is_negative !val = ITinteger ((IL st $! is_negative val) val) + +tok_primint :: (Integer -> Integer) + -> Int -> Int + -> (Integer, (Char->Int)) -> Action +tok_primint = tok_integral ITprimint + + +tok_primword :: Int -> Int + -> (Integer, (Char->Int)) -> Action +tok_primword = tok_integral ITprimword positive +positive, negative :: (Integer -> Integer) +positive = id +negative = negate +decimal, octal, hexadecimal :: (Integer, Char -> Int) +decimal = (10,octDecDigit) +binary = (2,octDecDigit) +octal = (8,octDecDigit) +hexadecimal = (16,hexDigit) + +-- readRational can understand negative rationals, exponents, everything. +tok_frac :: Int -> (String -> Token) -> Action +tok_frac drop f span buf len = do + numericUnderscores <- getBit NumericUnderscoresBit -- #14473 + let src = lexemeToString buf (len-drop) + when ((not numericUnderscores) && ('_' `elem` src)) $ do + pState <- getPState + addError (mkSrcSpanPs (last_loc pState)) $ text + "Use NumericUnderscores to allow underscores in floating literals" + return (L span $! (f $! src)) + +tok_float, tok_primfloat, tok_primdouble :: String -> Token +tok_float str = ITrational $! readFractionalLit str +tok_hex_float str = ITrational $! readHexFractionalLit str +tok_primfloat str = ITprimfloat $! readFractionalLit str +tok_primdouble str = ITprimdouble $! readFractionalLit str + +readFractionalLit :: String -> FractionalLit +readFractionalLit str = ((FL $! (SourceText str)) $! is_neg) $! readRational str + where is_neg = case str of ('-':_) -> True + _ -> False +readHexFractionalLit :: String -> FractionalLit +readHexFractionalLit str = + FL { fl_text = SourceText str + , fl_neg = case str of + '-' : _ -> True + _ -> False + , fl_value = readHexRational str + } + +-- ----------------------------------------------------------------------------- +-- Layout processing + +-- we're at the first token on a line, insert layout tokens if necessary +do_bol :: Action +do_bol span _str _len = do + -- See Note [Nested comment line pragmas] + b <- getBit InNestedCommentBit + if b then return (L span ITcomment_line_prag) else do + (pos, gen_semic) <- getOffside + case pos of + LT -> do + --trace "layout: inserting '}'" $ do + popContext + -- do NOT pop the lex state, we might have a ';' to insert + return (L span ITvccurly) + EQ | gen_semic -> do + --trace "layout: inserting ';'" $ do + _ <- popLexState + return (L span ITsemi) + _ -> do + _ <- popLexState + lexToken + +-- certain keywords put us in the "layout" state, where we might +-- add an opening curly brace. +maybe_layout :: Token -> P () +maybe_layout t = do -- If the alternative layout rule is enabled then + -- we never create an implicit layout context here. + -- Layout is handled XXX instead. + -- The code for closing implicit contexts, or + -- inserting implicit semi-colons, is therefore + -- irrelevant as it only applies in an implicit + -- context. + alr <- getBit AlternativeLayoutRuleBit + unless alr $ f t + where f ITdo = pushLexState layout_do + f ITmdo = pushLexState layout_do + f ITof = pushLexState layout + f ITlcase = pushLexState layout + f ITlet = pushLexState layout + f ITwhere = pushLexState layout + f ITrec = pushLexState layout + f ITif = pushLexState layout_if + f _ = return () + +-- Pushing a new implicit layout context. If the indentation of the +-- next token is not greater than the previous layout context, then +-- Haskell 98 says that the new layout context should be empty; that is +-- the lexer must generate {}. +-- +-- We are slightly more lenient than this: when the new context is started +-- by a 'do', then we allow the new context to be at the same indentation as +-- the previous context. This is what the 'strict' argument is for. +new_layout_context :: Bool -> Bool -> Token -> Action +new_layout_context strict gen_semic tok span _buf len = do + _ <- popLexState + (AI l _) <- getInput + let offset = srcLocCol (psRealLoc l) - len + ctx <- getContext + nondecreasing <- getBit NondecreasingIndentationBit + let strict' = strict || not nondecreasing + case ctx of + Layout prev_off _ : _ | + (strict' && prev_off >= offset || + not strict' && prev_off > offset) -> do + -- token is indented to the left of the previous context. + -- we must generate a {} sequence now. + pushLexState layout_left + return (L span tok) + _ -> do setContext (Layout offset gen_semic : ctx) + return (L span tok) + +do_layout_left :: Action +do_layout_left span _buf _len = do + _ <- popLexState + pushLexState bol -- we must be at the start of a line + return (L span ITvccurly) + +-- ----------------------------------------------------------------------------- +-- LINE pragmas + +setLineAndFile :: Int -> Action +setLineAndFile code (PsSpan span _) buf len = do + let src = lexemeToString buf (len - 1) -- drop trailing quotation mark + linenumLen = length $ head $ words src + linenum = parseUnsignedInteger buf linenumLen 10 octDecDigit + file = mkFastString $ go $ drop 1 $ dropWhile (/= '"') src + -- skip everything through first quotation mark to get to the filename + where go ('\\':c:cs) = c : go cs + go (c:cs) = c : go cs + go [] = [] + -- decode escapes in the filename. e.g. on Windows + -- when our filenames have backslashes in, gcc seems to + -- escape the backslashes. One symptom of not doing this + -- is that filenames in error messages look a bit strange: + -- C:\\foo\bar.hs + -- only the first backslash is doubled, because we apply + -- System.FilePath.normalise before printing out + -- filenames and it does not remove duplicate + -- backslashes after the drive letter (should it?). + resetAlrLastLoc file + setSrcLoc (mkRealSrcLoc file (fromIntegral linenum - 1) (srcSpanEndCol span)) + -- subtract one: the line number refers to the *following* line + addSrcFile file + _ <- popLexState + pushLexState code + lexToken + +setColumn :: Action +setColumn (PsSpan span _) buf len = do + let column = + case reads (lexemeToString buf len) of + [(column, _)] -> column + _ -> error "setColumn: expected integer" -- shouldn't happen + setSrcLoc (mkRealSrcLoc (srcSpanFile span) (srcSpanEndLine span) + (fromIntegral (column :: Integer))) + _ <- popLexState + lexToken + +alrInitialLoc :: FastString -> RealSrcSpan +alrInitialLoc file = mkRealSrcSpan loc loc + where -- This is a hack to ensure that the first line in a file + -- looks like it is after the initial location: + loc = mkRealSrcLoc file (-1) (-1) + +-- ----------------------------------------------------------------------------- +-- Options, includes and language pragmas. + +lex_string_prag :: (String -> Token) -> Action +lex_string_prag mkTok span _buf _len + = do input <- getInput + start <- getParsedLoc + tok <- go [] input + end <- getParsedLoc + return (L (mkPsSpan start end) tok) + where go acc input + = if isString input "#-}" + then do setInput input + return (mkTok (reverse acc)) + else case alexGetChar input of + Just (c,i) -> go (c:acc) i + Nothing -> err input + isString _ [] = True + isString i (x:xs) + = case alexGetChar i of + Just (c,i') | c == x -> isString i' xs + _other -> False + err (AI end _) = failLocMsgP (realSrcSpanStart (psRealSpan span)) (psRealLoc end) "unterminated options pragma" + + +-- ----------------------------------------------------------------------------- +-- Strings & Chars + +-- This stuff is horrible. I hates it. + +lex_string_tok :: Action +lex_string_tok span buf _len = do + tok <- lex_string "" + (AI end bufEnd) <- getInput + let + tok' = case tok of + ITprimstring _ bs -> ITprimstring (SourceText src) bs + ITstring _ s -> ITstring (SourceText src) s + _ -> panic "lex_string_tok" + src = lexemeToString buf (cur bufEnd - cur buf) + return (L (mkPsSpan (psSpanStart span) end) tok') + +lex_string :: String -> P Token +lex_string s = do + i <- getInput + case alexGetChar' i of + Nothing -> lit_error i + + Just ('"',i) -> do + setInput i + let s' = reverse s + magicHash <- getBit MagicHashBit + if magicHash + then do + i <- getInput + case alexGetChar' i of + Just ('#',i) -> do + setInput i + when (any (> '\xFF') s') $ do + pState <- getPState + addError (mkSrcSpanPs (last_loc pState)) $ text + "primitive string literal must contain only characters <= \'\\xFF\'" + return (ITprimstring (SourceText s') (unsafeMkByteString s')) + _other -> + return (ITstring (SourceText s') (mkFastString s')) + else + return (ITstring (SourceText s') (mkFastString s')) + + Just ('\\',i) + | Just ('&',i) <- next -> do + setInput i; lex_string s + | Just (c,i) <- next, c <= '\x7f' && is_space c -> do + -- is_space only works for <= '\x7f' (#3751, #5425) + setInput i; lex_stringgap s + where next = alexGetChar' i + + Just (c, i1) -> do + case c of + '\\' -> do setInput i1; c' <- lex_escape; lex_string (c':s) + c | isAny c -> do setInput i1; lex_string (c:s) + _other -> lit_error i + +lex_stringgap :: String -> P Token +lex_stringgap s = do + i <- getInput + c <- getCharOrFail i + case c of + '\\' -> lex_string s + c | c <= '\x7f' && is_space c -> lex_stringgap s + -- is_space only works for <= '\x7f' (#3751, #5425) + _other -> lit_error i + + +lex_char_tok :: Action +-- Here we are basically parsing character literals, such as 'x' or '\n' +-- but we additionally spot 'x and ''T, returning ITsimpleQuote and +-- ITtyQuote respectively, but WITHOUT CONSUMING the x or T part +-- (the parser does that). +-- So we have to do two characters of lookahead: when we see 'x we need to +-- see if there's a trailing quote +lex_char_tok span buf _len = do -- We've seen ' + i1 <- getInput -- Look ahead to first character + let loc = psSpanStart span + case alexGetChar' i1 of + Nothing -> lit_error i1 + + Just ('\'', i2@(AI end2 _)) -> do -- We've seen '' + setInput i2 + return (L (mkPsSpan loc end2) ITtyQuote) + + Just ('\\', i2@(AI _end2 _)) -> do -- We've seen 'backslash + setInput i2 + lit_ch <- lex_escape + i3 <- getInput + mc <- getCharOrFail i3 -- Trailing quote + if mc == '\'' then finish_char_tok buf loc lit_ch + else lit_error i3 + + Just (c, i2@(AI _end2 _)) + | not (isAny c) -> lit_error i1 + | otherwise -> + + -- We've seen 'x, where x is a valid character + -- (i.e. not newline etc) but not a quote or backslash + case alexGetChar' i2 of -- Look ahead one more character + Just ('\'', i3) -> do -- We've seen 'x' + setInput i3 + finish_char_tok buf loc c + _other -> do -- We've seen 'x not followed by quote + -- (including the possibility of EOF) + -- Just parse the quote only + let (AI end _) = i1 + return (L (mkPsSpan loc end) ITsimpleQuote) + +finish_char_tok :: StringBuffer -> PsLoc -> Char -> P (PsLocated Token) +finish_char_tok buf loc ch -- We've already seen the closing quote + -- Just need to check for trailing # + = do magicHash <- getBit MagicHashBit + i@(AI end bufEnd) <- getInput + let src = lexemeToString buf (cur bufEnd - cur buf) + if magicHash then do + case alexGetChar' i of + Just ('#',i@(AI end _)) -> do + setInput i + return (L (mkPsSpan loc end) + (ITprimchar (SourceText src) ch)) + _other -> + return (L (mkPsSpan loc end) + (ITchar (SourceText src) ch)) + else do + return (L (mkPsSpan loc end) (ITchar (SourceText src) ch)) + +isAny :: Char -> Bool +isAny c | c > '\x7f' = isPrint c + | otherwise = is_any c + +lex_escape :: P Char +lex_escape = do + i0 <- getInput + c <- getCharOrFail i0 + case c of + 'a' -> return '\a' + 'b' -> return '\b' + 'f' -> return '\f' + 'n' -> return '\n' + 'r' -> return '\r' + 't' -> return '\t' + 'v' -> return '\v' + '\\' -> return '\\' + '"' -> return '\"' + '\'' -> return '\'' + '^' -> do i1 <- getInput + c <- getCharOrFail i1 + if c >= '@' && c <= '_' + then return (chr (ord c - ord '@')) + else lit_error i1 + + 'x' -> readNum is_hexdigit 16 hexDigit + 'o' -> readNum is_octdigit 8 octDecDigit + x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x) + + c1 -> do + i <- getInput + case alexGetChar' i of + Nothing -> lit_error i0 + Just (c2,i2) -> + case alexGetChar' i2 of + Nothing -> do lit_error i0 + Just (c3,i3) -> + let str = [c1,c2,c3] in + case [ (c,rest) | (p,c) <- silly_escape_chars, + Just rest <- [stripPrefix p str] ] of + (escape_char,[]):_ -> do + setInput i3 + return escape_char + (escape_char,_:_):_ -> do + setInput i2 + return escape_char + [] -> lit_error i0 + +readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char +readNum is_digit base conv = do + i <- getInput + c <- getCharOrFail i + if is_digit c + then readNum2 is_digit base conv (conv c) + else lit_error i + +readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char +readNum2 is_digit base conv i = do + input <- getInput + read i input + where read i input = do + case alexGetChar' input of + Just (c,input') | is_digit c -> do + let i' = i*base + conv c + if i' > 0x10ffff + then setInput input >> lexError "numeric escape sequence out of range" + else read i' input' + _other -> do + setInput input; return (chr i) + + +silly_escape_chars :: [(String, Char)] +silly_escape_chars = [ + ("NUL", '\NUL'), + ("SOH", '\SOH'), + ("STX", '\STX'), + ("ETX", '\ETX'), + ("EOT", '\EOT'), + ("ENQ", '\ENQ'), + ("ACK", '\ACK'), + ("BEL", '\BEL'), + ("BS", '\BS'), + ("HT", '\HT'), + ("LF", '\LF'), + ("VT", '\VT'), + ("FF", '\FF'), + ("CR", '\CR'), + ("SO", '\SO'), + ("SI", '\SI'), + ("DLE", '\DLE'), + ("DC1", '\DC1'), + ("DC2", '\DC2'), + ("DC3", '\DC3'), + ("DC4", '\DC4'), + ("NAK", '\NAK'), + ("SYN", '\SYN'), + ("ETB", '\ETB'), + ("CAN", '\CAN'), + ("EM", '\EM'), + ("SUB", '\SUB'), + ("ESC", '\ESC'), + ("FS", '\FS'), + ("GS", '\GS'), + ("RS", '\RS'), + ("US", '\US'), + ("SP", '\SP'), + ("DEL", '\DEL') + ] + +-- before calling lit_error, ensure that the current input is pointing to +-- the position of the error in the buffer. This is so that we can report +-- a correct location to the user, but also so we can detect UTF-8 decoding +-- errors if they occur. +lit_error :: AlexInput -> P a +lit_error i = do setInput i; lexError "lexical error in string/character literal" + +getCharOrFail :: AlexInput -> P Char +getCharOrFail i = do + case alexGetChar' i of + Nothing -> lexError "unexpected end-of-file in string/character literal" + Just (c,i) -> do setInput i; return c + +-- ----------------------------------------------------------------------------- +-- QuasiQuote + +lex_qquasiquote_tok :: Action +lex_qquasiquote_tok span buf len = do + let (qual, quoter) = splitQualName (stepOn buf) (len - 2) False + quoteStart <- getParsedLoc + quote <- lex_quasiquote (psRealLoc quoteStart) "" + end <- getParsedLoc + return (L (mkPsSpan (psSpanStart span) end) + (ITqQuasiQuote (qual, + quoter, + mkFastString (reverse quote), + mkPsSpan quoteStart end))) + +lex_quasiquote_tok :: Action +lex_quasiquote_tok span buf len = do + let quoter = tail (lexemeToString buf (len - 1)) + -- 'tail' drops the initial '[', + -- while the -1 drops the trailing '|' + quoteStart <- getParsedLoc + quote <- lex_quasiquote (psRealLoc quoteStart) "" + end <- getParsedLoc + return (L (mkPsSpan (psSpanStart span) end) + (ITquasiQuote (mkFastString quoter, + mkFastString (reverse quote), + mkPsSpan quoteStart end))) + +lex_quasiquote :: RealSrcLoc -> String -> P String +lex_quasiquote start s = do + i <- getInput + case alexGetChar' i of + Nothing -> quasiquote_error start + + -- NB: The string "|]" terminates the quasiquote, + -- with absolutely no escaping. See the extensive + -- discussion on #5348 for why there is no + -- escape handling. + Just ('|',i) + | Just (']',i) <- alexGetChar' i + -> do { setInput i; return s } + + Just (c, i) -> do + setInput i; lex_quasiquote start (c : s) + +quasiquote_error :: RealSrcLoc -> P a +quasiquote_error start = do + (AI end buf) <- getInput + reportLexError start (psRealLoc end) buf "unterminated quasiquotation" + +-- ----------------------------------------------------------------------------- +-- Warnings + +warnTab :: Action +warnTab srcspan _buf _len = do + addTabWarning (psRealSpan srcspan) + lexToken + +warnThen :: WarningFlag -> SDoc -> Action -> Action +warnThen option warning action srcspan buf len = do + addWarning option (RealSrcSpan (psRealSpan srcspan) Nothing) warning + action srcspan buf len + +-- ----------------------------------------------------------------------------- +-- The Parse Monad + +-- | Do we want to generate ';' layout tokens? In some cases we just want to +-- generate '}', e.g. in MultiWayIf we don't need ';'s because '|' separates +-- alternatives (unlike a `case` expression where we need ';' to as a separator +-- between alternatives). +type GenSemic = Bool + +generateSemic, dontGenerateSemic :: GenSemic +generateSemic = True +dontGenerateSemic = False + +data LayoutContext + = NoLayout + | Layout !Int !GenSemic + deriving Show + +-- | The result of running a parser. +data ParseResult a + = POk -- ^ The parser has consumed a (possibly empty) prefix + -- of the input and produced a result. Use 'getMessages' + -- to check for accumulated warnings and non-fatal errors. + PState -- ^ The resulting parsing state. Can be used to resume parsing. + a -- ^ The resulting value. + | PFailed -- ^ The parser has consumed a (possibly empty) prefix + -- of the input and failed. + PState -- ^ The parsing state right before failure, including the fatal + -- parse error. 'getMessages' and 'getErrorMessages' must return + -- a non-empty bag of errors. + +-- | Test whether a 'WarningFlag' is set +warnopt :: WarningFlag -> ParserFlags -> Bool +warnopt f options = f `EnumSet.member` pWarningFlags options + +-- | The subset of the 'DynFlags' used by the parser. +-- See 'mkParserFlags' or 'mkParserFlags'' for ways to construct this. +data ParserFlags = ParserFlags { + pWarningFlags :: EnumSet WarningFlag + , pThisPackage :: UnitId -- ^ key of package currently being compiled + , pExtsBitmap :: !ExtsBitmap -- ^ bitmap of permitted extensions + } + +data PState = PState { + buffer :: StringBuffer, + options :: ParserFlags, + -- This needs to take DynFlags as an argument until + -- we have a fix for #10143 + messages :: DynFlags -> Messages, + tab_first :: Maybe RealSrcSpan, -- pos of first tab warning in the file + tab_count :: !Int, -- number of tab warnings in the file + last_tk :: Maybe Token, + last_loc :: PsSpan, -- pos of previous token + last_len :: !Int, -- len of previous token + loc :: PsLoc, -- current loc (end of prev token + 1) + context :: [LayoutContext], + lex_state :: [Int], + srcfiles :: [FastString], + -- Used in the alternative layout rule: + -- These tokens are the next ones to be sent out. They are + -- just blindly emitted, without the rule looking at them again: + alr_pending_implicit_tokens :: [PsLocated Token], + -- This is the next token to be considered or, if it is Nothing, + -- we need to get the next token from the input stream: + alr_next_token :: Maybe (PsLocated Token), + -- This is what we consider to be the location of the last token + -- emitted: + alr_last_loc :: PsSpan, + -- The stack of layout contexts: + alr_context :: [ALRContext], + -- Are we expecting a '{'? If it's Just, then the ALRLayout tells + -- us what sort of layout the '{' will open: + alr_expecting_ocurly :: Maybe ALRLayout, + -- Have we just had the '}' for a let block? If so, than an 'in' + -- token doesn't need to close anything: + alr_justClosedExplicitLetBlock :: Bool, + + -- The next three are used to implement Annotations giving the + -- locations of 'noise' tokens in the source, so that users of + -- the GHC API can do source to source conversions. + -- See note [Api annotations] in GHC.Parser.Annotation + annotations :: [(ApiAnnKey,[RealSrcSpan])], + eof_pos :: Maybe RealSrcSpan, + comment_q :: [RealLocated AnnotationComment], + annotations_comments :: [(RealSrcSpan,[RealLocated AnnotationComment])] + } + -- last_loc and last_len are used when generating error messages, + -- and in pushCurrentContext only. Sigh, if only Happy passed the + -- current token to happyError, we could at least get rid of last_len. + -- Getting rid of last_loc would require finding another way to + -- implement pushCurrentContext (which is only called from one place). + +data ALRContext = ALRNoLayout Bool{- does it contain commas? -} + Bool{- is it a 'let' block? -} + | ALRLayout ALRLayout Int +data ALRLayout = ALRLayoutLet + | ALRLayoutWhere + | ALRLayoutOf + | ALRLayoutDo + +-- | The parsing monad, isomorphic to @StateT PState Maybe@. +newtype P a = P { unP :: PState -> ParseResult a } + +instance Functor P where + fmap = liftM + +instance Applicative P where + pure = returnP + (<*>) = ap + +instance Monad P where + (>>=) = thenP + +returnP :: a -> P a +returnP a = a `seq` (P $ \s -> POk s a) + +thenP :: P a -> (a -> P b) -> P b +(P m) `thenP` k = P $ \ s -> + case m s of + POk s1 a -> (unP (k a)) s1 + PFailed s1 -> PFailed s1 + +failMsgP :: String -> P a +failMsgP msg = do + pState <- getPState + addFatalError (mkSrcSpanPs (last_loc pState)) (text msg) + +failLocMsgP :: RealSrcLoc -> RealSrcLoc -> String -> P a +failLocMsgP loc1 loc2 str = + addFatalError (RealSrcSpan (mkRealSrcSpan loc1 loc2) Nothing) (text str) + +getPState :: P PState +getPState = P $ \s -> POk s s + +withThisPackage :: (UnitId -> a) -> P a +withThisPackage f = P $ \s@(PState{options = o}) -> POk s (f (pThisPackage o)) + +getExts :: P ExtsBitmap +getExts = P $ \s -> POk s (pExtsBitmap . options $ s) + +setExts :: (ExtsBitmap -> ExtsBitmap) -> P () +setExts f = P $ \s -> POk s { + options = + let p = options s + in p { pExtsBitmap = f (pExtsBitmap p) } + } () + +setSrcLoc :: RealSrcLoc -> P () +setSrcLoc new_loc = + P $ \s@(PState{ loc = PsLoc _ buf_loc }) -> + POk s{ loc = PsLoc new_loc buf_loc } () + +getRealSrcLoc :: P RealSrcLoc +getRealSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s (psRealLoc loc) + +getParsedLoc :: P PsLoc +getParsedLoc = P $ \s@(PState{ loc=loc }) -> POk s loc + +addSrcFile :: FastString -> P () +addSrcFile f = P $ \s -> POk s{ srcfiles = f : srcfiles s } () + +setEofPos :: RealSrcSpan -> P () +setEofPos span = P $ \s -> POk s{ eof_pos = Just span } () + +setLastToken :: PsSpan -> Int -> P () +setLastToken loc len = P $ \s -> POk s { + last_loc=loc, + last_len=len + } () + +setLastTk :: Token -> P () +setLastTk tk = P $ \s -> POk s { last_tk = Just tk } () + +getLastTk :: P (Maybe Token) +getLastTk = P $ \s@(PState { last_tk = last_tk }) -> POk s last_tk + +data AlexInput = AI PsLoc StringBuffer + +{- +Note [Unicode in Alex] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Although newer versions of Alex support unicode, this grammar is processed with +the old style '--latin1' behaviour. This means that when implementing the +functions + + alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) + alexInputPrevChar :: AlexInput -> Char + +which Alex uses to take apart our 'AlexInput', we must + + * return a latin1 character in the 'Word8' that 'alexGetByte' expects + * return a latin1 character in 'alexInputPrevChar'. + +We handle this in 'adjustChar' by squishing entire classes of unicode +characters into single bytes. +-} + +{-# INLINE adjustChar #-} +adjustChar :: Char -> Word8 +adjustChar c = fromIntegral $ ord adj_c + where non_graphic = '\x00' + upper = '\x01' + lower = '\x02' + digit = '\x03' + symbol = '\x04' + space = '\x05' + other_graphic = '\x06' + uniidchar = '\x07' + + adj_c + | c <= '\x07' = non_graphic + | c <= '\x7f' = c + -- Alex doesn't handle Unicode, so when Unicode + -- character is encountered we output these values + -- with the actual character value hidden in the state. + | otherwise = + -- NB: The logic behind these definitions is also reflected + -- in basicTypes/Lexeme.hs + -- Any changes here should likely be reflected there. + + case generalCategory c of + UppercaseLetter -> upper + LowercaseLetter -> lower + TitlecaseLetter -> upper + ModifierLetter -> uniidchar -- see #10196 + OtherLetter -> lower -- see #1103 + NonSpacingMark -> uniidchar -- see #7650 + SpacingCombiningMark -> other_graphic + EnclosingMark -> other_graphic + DecimalNumber -> digit + LetterNumber -> other_graphic + OtherNumber -> digit -- see #4373 + ConnectorPunctuation -> symbol + DashPunctuation -> symbol + OpenPunctuation -> other_graphic + ClosePunctuation -> other_graphic + InitialQuote -> other_graphic + FinalQuote -> other_graphic + OtherPunctuation -> symbol + MathSymbol -> symbol + CurrencySymbol -> symbol + ModifierSymbol -> symbol + OtherSymbol -> symbol + Space -> space + _other -> non_graphic + +-- Getting the previous 'Char' isn't enough here - we need to convert it into +-- the same format that 'alexGetByte' would have produced. +-- +-- See Note [Unicode in Alex] and #13986. +alexInputPrevChar :: AlexInput -> Char +alexInputPrevChar (AI _ buf) = chr (fromIntegral (adjustChar pc)) + where pc = prevChar buf '\n' + +-- backwards compatibility for Alex 2.x +alexGetChar :: AlexInput -> Maybe (Char,AlexInput) +alexGetChar inp = case alexGetByte inp of + Nothing -> Nothing + Just (b,i) -> c `seq` Just (c,i) + where c = chr $ fromIntegral b + +-- See Note [Unicode in Alex] +alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) +alexGetByte (AI loc s) + | atEnd s = Nothing + | otherwise = byte `seq` loc' `seq` s' `seq` + --trace (show (ord c)) $ + Just (byte, (AI loc' s')) + where (c,s') = nextChar s + loc' = advancePsLoc loc c + byte = adjustChar c + +-- This version does not squash unicode characters, it is used when +-- lexing strings. +alexGetChar' :: AlexInput -> Maybe (Char,AlexInput) +alexGetChar' (AI loc s) + | atEnd s = Nothing + | otherwise = c `seq` loc' `seq` s' `seq` + --trace (show (ord c)) $ + Just (c, (AI loc' s')) + where (c,s') = nextChar s + loc' = advancePsLoc loc c + +getInput :: P AlexInput +getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b) + +setInput :: AlexInput -> P () +setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } () + +nextIsEOF :: P Bool +nextIsEOF = do + AI _ s <- getInput + return $ atEnd s + +pushLexState :: Int -> P () +pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} () + +popLexState :: P Int +popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls + +getLexState :: P Int +getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls + +popNextToken :: P (Maybe (PsLocated Token)) +popNextToken + = P $ \s@PState{ alr_next_token = m } -> + POk (s {alr_next_token = Nothing}) m + +activeContext :: P Bool +activeContext = do + ctxt <- getALRContext + expc <- getAlrExpectingOCurly + impt <- implicitTokenPending + case (ctxt,expc) of + ([],Nothing) -> return impt + _other -> return True + +resetAlrLastLoc :: FastString -> P () +resetAlrLastLoc file = + P $ \s@(PState {alr_last_loc = PsSpan _ buf_span}) -> + POk s{ alr_last_loc = PsSpan (alrInitialLoc file) buf_span } () + +setAlrLastLoc :: PsSpan -> P () +setAlrLastLoc l = P $ \s -> POk (s {alr_last_loc = l}) () + +getAlrLastLoc :: P PsSpan +getAlrLastLoc = P $ \s@(PState {alr_last_loc = l}) -> POk s l + +getALRContext :: P [ALRContext] +getALRContext = P $ \s@(PState {alr_context = cs}) -> POk s cs + +setALRContext :: [ALRContext] -> P () +setALRContext cs = P $ \s -> POk (s {alr_context = cs}) () + +getJustClosedExplicitLetBlock :: P Bool +getJustClosedExplicitLetBlock + = P $ \s@(PState {alr_justClosedExplicitLetBlock = b}) -> POk s b + +setJustClosedExplicitLetBlock :: Bool -> P () +setJustClosedExplicitLetBlock b + = P $ \s -> POk (s {alr_justClosedExplicitLetBlock = b}) () + +setNextToken :: PsLocated Token -> P () +setNextToken t = P $ \s -> POk (s {alr_next_token = Just t}) () + +implicitTokenPending :: P Bool +implicitTokenPending + = P $ \s@PState{ alr_pending_implicit_tokens = ts } -> + case ts of + [] -> POk s False + _ -> POk s True + +popPendingImplicitToken :: P (Maybe (PsLocated Token)) +popPendingImplicitToken + = P $ \s@PState{ alr_pending_implicit_tokens = ts } -> + case ts of + [] -> POk s Nothing + (t : ts') -> POk (s {alr_pending_implicit_tokens = ts'}) (Just t) + +setPendingImplicitTokens :: [PsLocated Token] -> P () +setPendingImplicitTokens ts = P $ \s -> POk (s {alr_pending_implicit_tokens = ts}) () + +getAlrExpectingOCurly :: P (Maybe ALRLayout) +getAlrExpectingOCurly = P $ \s@(PState {alr_expecting_ocurly = b}) -> POk s b + +setAlrExpectingOCurly :: Maybe ALRLayout -> P () +setAlrExpectingOCurly b = P $ \s -> POk (s {alr_expecting_ocurly = b}) () + +-- | For reasons of efficiency, boolean parsing flags (eg, language extensions +-- or whether we are currently in a @RULE@ pragma) are represented by a bitmap +-- stored in a @Word64@. +type ExtsBitmap = Word64 + +xbit :: ExtBits -> ExtsBitmap +xbit = bit . fromEnum + +xtest :: ExtBits -> ExtsBitmap -> Bool +xtest ext xmap = testBit xmap (fromEnum ext) + +-- | Various boolean flags, mostly language extensions, that impact lexing and +-- parsing. Note that a handful of these can change during lexing/parsing. +data ExtBits + -- Flags that are constant once parsing starts + = FfiBit + | InterruptibleFfiBit + | CApiFfiBit + | ArrowsBit + | ThBit + | ThQuotesBit + | IpBit + | OverloadedLabelsBit -- #x overloaded labels + | ExplicitForallBit -- the 'forall' keyword + | BangPatBit -- Tells the parser to understand bang-patterns + -- (doesn't affect the lexer) + | PatternSynonymsBit -- pattern synonyms + | HaddockBit-- Lex and parse Haddock comments + | MagicHashBit -- "#" in both functions and operators + | RecursiveDoBit -- mdo + | UnicodeSyntaxBit -- the forall symbol, arrow symbols, etc + | UnboxedTuplesBit -- (# and #) + | UnboxedSumsBit -- (# and #) + | DatatypeContextsBit + | MonadComprehensionsBit + | TransformComprehensionsBit + | QqBit -- enable quasiquoting + | RawTokenStreamBit -- producing a token stream with all comments included + | AlternativeLayoutRuleBit + | ALRTransitionalBit + | RelaxedLayoutBit + | NondecreasingIndentationBit + | SafeHaskellBit + | TraditionalRecordSyntaxBit + | ExplicitNamespacesBit + | LambdaCaseBit + | BinaryLiteralsBit + | NegativeLiteralsBit + | HexFloatLiteralsBit + | TypeApplicationsBit + | StaticPointersBit + | NumericUnderscoresBit + | StarIsTypeBit + | BlockArgumentsBit + | NPlusKPatternsBit + | DoAndIfThenElseBit + | MultiWayIfBit + | GadtSyntaxBit + | ImportQualifiedPostBit + + -- Flags that are updated once parsing starts + | InRulePragBit + | InNestedCommentBit -- See Note [Nested comment line pragmas] + | UsePosPragsBit + -- ^ If this is enabled, '{-# LINE ... -#}' and '{-# COLUMN ... #-}' + -- update the internal position. Otherwise, those pragmas are lexed as + -- tokens of their own. + deriving Enum + + + + + +-- PState for parsing options pragmas +-- +pragState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState +pragState dynflags buf loc = (mkPState dynflags buf loc) { + lex_state = [bol, option_prags, 0] + } + +{-# INLINE mkParserFlags' #-} +mkParserFlags' + :: EnumSet WarningFlag -- ^ warnings flags enabled + -> EnumSet LangExt.Extension -- ^ permitted language extensions enabled + -> UnitId -- ^ key of package currently being compiled + -> Bool -- ^ are safe imports on? + -> Bool -- ^ keeping Haddock comment tokens + -> Bool -- ^ keep regular comment tokens + + -> Bool + -- ^ If this is enabled, '{-# LINE ... -#}' and '{-# COLUMN ... #-}' update + -- the internal position kept by the parser. Otherwise, those pragmas are + -- lexed as 'ITline_prag' and 'ITcolumn_prag' tokens. + + -> ParserFlags +-- ^ Given exactly the information needed, set up the 'ParserFlags' +mkParserFlags' warningFlags extensionFlags thisPackage + safeImports isHaddock rawTokStream usePosPrags = + ParserFlags { + pWarningFlags = warningFlags + , pThisPackage = thisPackage + , pExtsBitmap = safeHaskellBit .|. langExtBits .|. optBits + } + where + safeHaskellBit = SafeHaskellBit `setBitIf` safeImports + langExtBits = + FfiBit `xoptBit` LangExt.ForeignFunctionInterface + .|. InterruptibleFfiBit `xoptBit` LangExt.InterruptibleFFI + .|. CApiFfiBit `xoptBit` LangExt.CApiFFI + .|. ArrowsBit `xoptBit` LangExt.Arrows + .|. ThBit `xoptBit` LangExt.TemplateHaskell + .|. ThQuotesBit `xoptBit` LangExt.TemplateHaskellQuotes + .|. QqBit `xoptBit` LangExt.QuasiQuotes + .|. IpBit `xoptBit` LangExt.ImplicitParams + .|. OverloadedLabelsBit `xoptBit` LangExt.OverloadedLabels + .|. ExplicitForallBit `xoptBit` LangExt.ExplicitForAll + .|. BangPatBit `xoptBit` LangExt.BangPatterns + .|. MagicHashBit `xoptBit` LangExt.MagicHash + .|. RecursiveDoBit `xoptBit` LangExt.RecursiveDo + .|. UnicodeSyntaxBit `xoptBit` LangExt.UnicodeSyntax + .|. UnboxedTuplesBit `xoptBit` LangExt.UnboxedTuples + .|. UnboxedSumsBit `xoptBit` LangExt.UnboxedSums + .|. DatatypeContextsBit `xoptBit` LangExt.DatatypeContexts + .|. TransformComprehensionsBit `xoptBit` LangExt.TransformListComp + .|. MonadComprehensionsBit `xoptBit` LangExt.MonadComprehensions + .|. AlternativeLayoutRuleBit `xoptBit` LangExt.AlternativeLayoutRule + .|. ALRTransitionalBit `xoptBit` LangExt.AlternativeLayoutRuleTransitional + .|. RelaxedLayoutBit `xoptBit` LangExt.RelaxedLayout + .|. NondecreasingIndentationBit `xoptBit` LangExt.NondecreasingIndentation + .|. TraditionalRecordSyntaxBit `xoptBit` LangExt.TraditionalRecordSyntax + .|. ExplicitNamespacesBit `xoptBit` LangExt.ExplicitNamespaces + .|. LambdaCaseBit `xoptBit` LangExt.LambdaCase + .|. BinaryLiteralsBit `xoptBit` LangExt.BinaryLiterals + .|. NegativeLiteralsBit `xoptBit` LangExt.NegativeLiterals + .|. HexFloatLiteralsBit `xoptBit` LangExt.HexFloatLiterals + .|. PatternSynonymsBit `xoptBit` LangExt.PatternSynonyms + .|. TypeApplicationsBit `xoptBit` LangExt.TypeApplications + .|. StaticPointersBit `xoptBit` LangExt.StaticPointers + .|. NumericUnderscoresBit `xoptBit` LangExt.NumericUnderscores + .|. StarIsTypeBit `xoptBit` LangExt.StarIsType + .|. BlockArgumentsBit `xoptBit` LangExt.BlockArguments + .|. NPlusKPatternsBit `xoptBit` LangExt.NPlusKPatterns + .|. DoAndIfThenElseBit `xoptBit` LangExt.DoAndIfThenElse + .|. MultiWayIfBit `xoptBit` LangExt.MultiWayIf + .|. GadtSyntaxBit `xoptBit` LangExt.GADTSyntax + .|. ImportQualifiedPostBit `xoptBit` LangExt.ImportQualifiedPost + optBits = + HaddockBit `setBitIf` isHaddock + .|. RawTokenStreamBit `setBitIf` rawTokStream + .|. UsePosPragsBit `setBitIf` usePosPrags + + xoptBit bit ext = bit `setBitIf` EnumSet.member ext extensionFlags + + setBitIf :: ExtBits -> Bool -> ExtsBitmap + b `setBitIf` cond | cond = xbit b + | otherwise = 0 + +-- | Extracts the flag information needed for parsing +mkParserFlags :: DynFlags -> ParserFlags +mkParserFlags = + mkParserFlags' + <$> DynFlags.warningFlags + <*> DynFlags.extensionFlags + <*> DynFlags.thisPackage + <*> safeImportsOn + <*> gopt Opt_Haddock + <*> gopt Opt_KeepRawTokenStream + <*> const True + +-- | Creates a parse state from a 'DynFlags' value +mkPState :: DynFlags -> StringBuffer -> RealSrcLoc -> PState +mkPState flags = mkPStatePure (mkParserFlags flags) + +-- | Creates a parse state from a 'ParserFlags' value +mkPStatePure :: ParserFlags -> StringBuffer -> RealSrcLoc -> PState +mkPStatePure options buf loc = + PState { + buffer = buf, + options = options, + messages = const emptyMessages, + tab_first = Nothing, + tab_count = 0, + last_tk = Nothing, + last_loc = mkPsSpan init_loc init_loc, + last_len = 0, + loc = init_loc, + context = [], + lex_state = [bol, 0], + srcfiles = [], + alr_pending_implicit_tokens = [], + alr_next_token = Nothing, + alr_last_loc = PsSpan (alrInitialLoc (fsLit "<no file>")) (BufSpan (BufPos 0) (BufPos 0)), + alr_context = [], + alr_expecting_ocurly = Nothing, + alr_justClosedExplicitLetBlock = False, + annotations = [], + eof_pos = Nothing, + comment_q = [], + annotations_comments = [] + } + where init_loc = PsLoc loc (BufPos 0) + +-- | An mtl-style class for monads that support parsing-related operations. +-- For example, sometimes we make a second pass over the parsing results to validate, +-- disambiguate, or rearrange them, and we do so in the PV monad which cannot consume +-- input but can report parsing errors, check for extension bits, and accumulate +-- parsing annotations. Both P and PV are instances of MonadP. +-- +-- MonadP grants us convenient overloading. The other option is to have separate operations +-- for each monad: addErrorP vs addErrorPV, getBitP vs getBitPV, and so on. +-- +class Monad m => MonadP m where + -- | Add a non-fatal error. Use this when the parser can produce a result + -- despite the error. + -- + -- For example, when GHC encounters a @forall@ in a type, + -- but @-XExplicitForAll@ is disabled, the parser constructs @ForAllTy@ + -- as if @-XExplicitForAll@ was enabled, adding a non-fatal error to + -- the accumulator. + -- + -- Control flow wise, non-fatal errors act like warnings: they are added + -- to the accumulator and parsing continues. This allows GHC to report + -- more than one parse error per file. + -- + addError :: SrcSpan -> SDoc -> m () + -- | Add a warning to the accumulator. + -- Use 'getMessages' to get the accumulated warnings. + addWarning :: WarningFlag -> SrcSpan -> SDoc -> m () + -- | Add a fatal error. This will be the last error reported by the parser, and + -- the parser will not produce any result, ending in a 'PFailed' state. + addFatalError :: SrcSpan -> SDoc -> m a + -- | Check if a given flag is currently set in the bitmap. + getBit :: ExtBits -> m Bool + -- | Given a location and a list of AddAnn, apply them all to the location. + addAnnotation :: SrcSpan -- SrcSpan of enclosing AST construct + -> AnnKeywordId -- The first two parameters are the key + -> SrcSpan -- The location of the keyword itself + -> m () + +appendError + :: SrcSpan + -> SDoc + -> (DynFlags -> Messages) + -> (DynFlags -> Messages) +appendError srcspan msg m = + \d -> + let (ws, es) = m d + errormsg = mkErrMsg d srcspan alwaysQualify msg + es' = es `snocBag` errormsg + in (ws, es') + +appendWarning + :: ParserFlags + -> WarningFlag + -> SrcSpan + -> SDoc + -> (DynFlags -> Messages) + -> (DynFlags -> Messages) +appendWarning o option srcspan warning m = + \d -> + let (ws, es) = m d + warning' = makeIntoWarning (Reason option) $ + mkWarnMsg d srcspan alwaysQualify warning + ws' = if warnopt option o then ws `snocBag` warning' else ws + in (ws', es) + +instance MonadP P where + addError srcspan msg + = P $ \s@PState{messages=m} -> + POk s{messages=appendError srcspan msg m} () + addWarning option srcspan warning + = P $ \s@PState{messages=m, options=o} -> + POk s{messages=appendWarning o option srcspan warning m} () + addFatalError span msg = + addError span msg >> P PFailed + getBit ext = P $ \s -> let b = ext `xtest` pExtsBitmap (options s) + in b `seq` POk s b + addAnnotation (RealSrcSpan l _) a (RealSrcSpan v _) = do + addAnnotationOnly l a v + allocateCommentsP l + addAnnotation _ _ _ = return () + +addAnnsAt :: MonadP m => SrcSpan -> [AddAnn] -> m () +addAnnsAt l = mapM_ (\(AddAnn a v) -> addAnnotation l a v) + +addTabWarning :: RealSrcSpan -> P () +addTabWarning srcspan + = P $ \s@PState{tab_first=tf, tab_count=tc, options=o} -> + let tf' = if isJust tf then tf else Just srcspan + tc' = tc + 1 + s' = if warnopt Opt_WarnTabs o + then s{tab_first = tf', tab_count = tc'} + else s + in POk s' () + +mkTabWarning :: PState -> DynFlags -> Maybe ErrMsg +mkTabWarning PState{tab_first=tf, tab_count=tc} d = + let middle = if tc == 1 + then text "" + else text ", and in" <+> speakNOf (tc - 1) (text "further location") + message = text "Tab character found here" + <> middle + <> text "." + $+$ text "Please use spaces instead." + in fmap (\s -> makeIntoWarning (Reason Opt_WarnTabs) $ + mkWarnMsg d (RealSrcSpan s Nothing) alwaysQualify message) tf + +-- | Get a bag of the errors that have been accumulated so far. +-- Does not take -Werror into account. +getErrorMessages :: PState -> DynFlags -> ErrorMessages +getErrorMessages PState{messages=m} d = + let (_, es) = m d in es + +-- | Get the warnings and errors accumulated so far. +-- Does not take -Werror into account. +getMessages :: PState -> DynFlags -> Messages +getMessages p@PState{messages=m} d = + let (ws, es) = m d + tabwarning = mkTabWarning p d + ws' = maybe ws (`consBag` ws) tabwarning + in (ws', es) + +getContext :: P [LayoutContext] +getContext = P $ \s@PState{context=ctx} -> POk s ctx + +setContext :: [LayoutContext] -> P () +setContext ctx = P $ \s -> POk s{context=ctx} () + +popContext :: P () +popContext = P $ \ s@(PState{ buffer = buf, options = o, context = ctx, + last_len = len, last_loc = last_loc }) -> + case ctx of + (_:tl) -> + POk s{ context = tl } () + [] -> + unP (addFatalError (mkSrcSpanPs last_loc) (srcParseErr o buf len)) s + +-- Push a new layout context at the indentation of the last token read. +pushCurrentContext :: GenSemic -> P () +pushCurrentContext gen_semic = P $ \ s@PState{ last_loc=loc, context=ctx } -> + POk s{context = Layout (srcSpanStartCol (psRealSpan loc)) gen_semic : ctx} () + +-- This is only used at the outer level of a module when the 'module' keyword is +-- missing. +pushModuleContext :: P () +pushModuleContext = pushCurrentContext generateSemic + +getOffside :: P (Ordering, Bool) +getOffside = P $ \s@PState{last_loc=loc, context=stk} -> + let offs = srcSpanStartCol (psRealSpan loc) in + let ord = case stk of + Layout n gen_semic : _ -> + --trace ("layout: " ++ show n ++ ", offs: " ++ show offs) $ + (compare offs n, gen_semic) + _ -> + (GT, dontGenerateSemic) + in POk s ord + +-- --------------------------------------------------------------------------- +-- Construct a parse error + +srcParseErr + :: ParserFlags + -> StringBuffer -- current buffer (placed just after the last token) + -> Int -- length of the previous token + -> MsgDoc +srcParseErr options buf len + = if null token + then text "parse error (possibly incorrect indentation or mismatched brackets)" + else text "parse error on input" <+> quotes (text token) + $$ ppWhen (not th_enabled && token == "$") -- #7396 + (text "Perhaps you intended to use TemplateHaskell") + $$ ppWhen (token == "<-") + (if mdoInLast100 + then text "Perhaps you intended to use RecursiveDo" + else text "Perhaps this statement should be within a 'do' block?") + $$ ppWhen (token == "=" && doInLast100) -- #15849 + (text "Perhaps you need a 'let' in a 'do' block?" + $$ text "e.g. 'let x = 5' instead of 'x = 5'") + $$ ppWhen (not ps_enabled && pattern == "pattern ") -- #12429 + (text "Perhaps you intended to use PatternSynonyms") + where token = lexemeToString (offsetBytes (-len) buf) len + pattern = decodePrevNChars 8 buf + last100 = decodePrevNChars 100 buf + doInLast100 = "do" `isInfixOf` last100 + mdoInLast100 = "mdo" `isInfixOf` last100 + th_enabled = ThQuotesBit `xtest` pExtsBitmap options + ps_enabled = PatternSynonymsBit `xtest` pExtsBitmap options + +-- Report a parse failure, giving the span of the previous token as +-- the location of the error. This is the entry point for errors +-- detected during parsing. +srcParseFail :: P a +srcParseFail = P $ \s@PState{ buffer = buf, options = o, last_len = len, + last_loc = last_loc } -> + unP (addFatalError (mkSrcSpanPs last_loc) (srcParseErr o buf len)) s + +-- A lexical error is reported at a particular position in the source file, +-- not over a token range. +lexError :: String -> P a +lexError str = do + loc <- getRealSrcLoc + (AI end buf) <- getInput + reportLexError loc (psRealLoc end) buf str + +-- ----------------------------------------------------------------------------- +-- This is the top-level function: called from the parser each time a +-- new token is to be read from the input. + +lexer, lexerDbg :: Bool -> (Located Token -> P a) -> P a + +lexer queueComments cont = do + alr <- getBit AlternativeLayoutRuleBit + let lexTokenFun = if alr then lexTokenAlr else lexToken + (L span tok) <- lexTokenFun + --trace ("token: " ++ show tok) $ do + + if (queueComments && isDocComment tok) + then queueComment (L (psRealSpan span) tok) + else return () + + if (queueComments && isComment tok) + then queueComment (L (psRealSpan span) tok) >> lexer queueComments cont + else cont (L (mkSrcSpanPs span) tok) + +-- Use this instead of 'lexer' in GHC.Parser to dump the tokens for debugging. +lexerDbg queueComments cont = lexer queueComments contDbg + where + contDbg tok = trace ("token: " ++ show (unLoc tok)) (cont tok) + +lexTokenAlr :: P (PsLocated Token) +lexTokenAlr = do mPending <- popPendingImplicitToken + t <- case mPending of + Nothing -> + do mNext <- popNextToken + t <- case mNext of + Nothing -> lexToken + Just next -> return next + alternativeLayoutRuleToken t + Just t -> + return t + setAlrLastLoc (getLoc t) + case unLoc t of + ITwhere -> setAlrExpectingOCurly (Just ALRLayoutWhere) + ITlet -> setAlrExpectingOCurly (Just ALRLayoutLet) + ITof -> setAlrExpectingOCurly (Just ALRLayoutOf) + ITlcase -> setAlrExpectingOCurly (Just ALRLayoutOf) + ITdo -> setAlrExpectingOCurly (Just ALRLayoutDo) + ITmdo -> setAlrExpectingOCurly (Just ALRLayoutDo) + ITrec -> setAlrExpectingOCurly (Just ALRLayoutDo) + _ -> return () + return t + +alternativeLayoutRuleToken :: PsLocated Token -> P (PsLocated Token) +alternativeLayoutRuleToken t + = do context <- getALRContext + lastLoc <- getAlrLastLoc + mExpectingOCurly <- getAlrExpectingOCurly + transitional <- getBit ALRTransitionalBit + justClosedExplicitLetBlock <- getJustClosedExplicitLetBlock + setJustClosedExplicitLetBlock False + let thisLoc = getLoc t + thisCol = srcSpanStartCol (psRealSpan thisLoc) + newLine = srcSpanStartLine (psRealSpan thisLoc) > srcSpanEndLine (psRealSpan lastLoc) + case (unLoc t, context, mExpectingOCurly) of + -- This case handles a GHC extension to the original H98 + -- layout rule... + (ITocurly, _, Just alrLayout) -> + do setAlrExpectingOCurly Nothing + let isLet = case alrLayout of + ALRLayoutLet -> True + _ -> False + setALRContext (ALRNoLayout (containsCommas ITocurly) isLet : context) + return t + -- ...and makes this case unnecessary + {- + -- I think our implicit open-curly handling is slightly + -- different to John's, in how it interacts with newlines + -- and "in" + (ITocurly, _, Just _) -> + do setAlrExpectingOCurly Nothing + setNextToken t + lexTokenAlr + -} + (_, ALRLayout _ col : _ls, Just expectingOCurly) + | (thisCol > col) || + (thisCol == col && + isNonDecreasingIndentation expectingOCurly) -> + do setAlrExpectingOCurly Nothing + setALRContext (ALRLayout expectingOCurly thisCol : context) + setNextToken t + return (L thisLoc ITvocurly) + | otherwise -> + do setAlrExpectingOCurly Nothing + setPendingImplicitTokens [L lastLoc ITvccurly] + setNextToken t + return (L lastLoc ITvocurly) + (_, _, Just expectingOCurly) -> + do setAlrExpectingOCurly Nothing + setALRContext (ALRLayout expectingOCurly thisCol : context) + setNextToken t + return (L thisLoc ITvocurly) + -- We do the [] cases earlier than in the spec, as we + -- have an actual EOF token + (ITeof, ALRLayout _ _ : ls, _) -> + do setALRContext ls + setNextToken t + return (L thisLoc ITvccurly) + (ITeof, _, _) -> + return t + -- the other ITeof case omitted; general case below covers it + (ITin, _, _) + | justClosedExplicitLetBlock -> + return t + (ITin, ALRLayout ALRLayoutLet _ : ls, _) + | newLine -> + do setPendingImplicitTokens [t] + setALRContext ls + return (L thisLoc ITvccurly) + -- This next case is to handle a transitional issue: + (ITwhere, ALRLayout _ col : ls, _) + | newLine && thisCol == col && transitional -> + do addWarning Opt_WarnAlternativeLayoutRuleTransitional + (mkSrcSpanPs thisLoc) + (transitionalAlternativeLayoutWarning + "`where' clause at the same depth as implicit layout block") + setALRContext ls + setNextToken t + -- Note that we use lastLoc, as we may need to close + -- more layouts, or give a semicolon + return (L lastLoc ITvccurly) + -- This next case is to handle a transitional issue: + (ITvbar, ALRLayout _ col : ls, _) + | newLine && thisCol == col && transitional -> + do addWarning Opt_WarnAlternativeLayoutRuleTransitional + (mkSrcSpanPs thisLoc) + (transitionalAlternativeLayoutWarning + "`|' at the same depth as implicit layout block") + setALRContext ls + setNextToken t + -- Note that we use lastLoc, as we may need to close + -- more layouts, or give a semicolon + return (L lastLoc ITvccurly) + (_, ALRLayout _ col : ls, _) + | newLine && thisCol == col -> + do setNextToken t + let loc = psSpanStart thisLoc + zeroWidthLoc = mkPsSpan loc loc + return (L zeroWidthLoc ITsemi) + | newLine && thisCol < col -> + do setALRContext ls + setNextToken t + -- Note that we use lastLoc, as we may need to close + -- more layouts, or give a semicolon + return (L lastLoc ITvccurly) + -- We need to handle close before open, as 'then' is both + -- an open and a close + (u, _, _) + | isALRclose u -> + case context of + ALRLayout _ _ : ls -> + do setALRContext ls + setNextToken t + return (L thisLoc ITvccurly) + ALRNoLayout _ isLet : ls -> + do let ls' = if isALRopen u + then ALRNoLayout (containsCommas u) False : ls + else ls + setALRContext ls' + when isLet $ setJustClosedExplicitLetBlock True + return t + [] -> + do let ls = if isALRopen u + then [ALRNoLayout (containsCommas u) False] + else [] + setALRContext ls + -- XXX This is an error in John's code, but + -- it looks reachable to me at first glance + return t + (u, _, _) + | isALRopen u -> + do setALRContext (ALRNoLayout (containsCommas u) False : context) + return t + (ITin, ALRLayout ALRLayoutLet _ : ls, _) -> + do setALRContext ls + setPendingImplicitTokens [t] + return (L thisLoc ITvccurly) + (ITin, ALRLayout _ _ : ls, _) -> + do setALRContext ls + setNextToken t + return (L thisLoc ITvccurly) + -- the other ITin case omitted; general case below covers it + (ITcomma, ALRLayout _ _ : ls, _) + | topNoLayoutContainsCommas ls -> + do setALRContext ls + setNextToken t + return (L thisLoc ITvccurly) + (ITwhere, ALRLayout ALRLayoutDo _ : ls, _) -> + do setALRContext ls + setPendingImplicitTokens [t] + return (L thisLoc ITvccurly) + -- the other ITwhere case omitted; general case below covers it + (_, _, _) -> return t + +transitionalAlternativeLayoutWarning :: String -> SDoc +transitionalAlternativeLayoutWarning msg + = text "transitional layout will not be accepted in the future:" + $$ text msg + +isALRopen :: Token -> Bool +isALRopen ITcase = True +isALRopen ITif = True +isALRopen ITthen = True +isALRopen IToparen = True +isALRopen ITobrack = True +isALRopen ITocurly = True +-- GHC Extensions: +isALRopen IToubxparen = True +isALRopen _ = False + +isALRclose :: Token -> Bool +isALRclose ITof = True +isALRclose ITthen = True +isALRclose ITelse = True +isALRclose ITcparen = True +isALRclose ITcbrack = True +isALRclose ITccurly = True +-- GHC Extensions: +isALRclose ITcubxparen = True +isALRclose _ = False + +isNonDecreasingIndentation :: ALRLayout -> Bool +isNonDecreasingIndentation ALRLayoutDo = True +isNonDecreasingIndentation _ = False + +containsCommas :: Token -> Bool +containsCommas IToparen = True +containsCommas ITobrack = True +-- John doesn't have {} as containing commas, but records contain them, +-- which caused a problem parsing Cabal's Distribution.Simple.InstallDirs +-- (defaultInstallDirs). +containsCommas ITocurly = True +-- GHC Extensions: +containsCommas IToubxparen = True +containsCommas _ = False + +topNoLayoutContainsCommas :: [ALRContext] -> Bool +topNoLayoutContainsCommas [] = False +topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls +topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b + +lexToken :: P (PsLocated Token) +lexToken = do + inp@(AI loc1 buf) <- getInput + sc <- getLexState + exts <- getExts + case alexScanUser exts inp sc of + AlexEOF -> do + let span = mkPsSpan loc1 loc1 + setEofPos (psRealSpan span) + setLastToken span 0 + return (L span ITeof) + AlexError (AI loc2 buf) -> + reportLexError (psRealLoc loc1) (psRealLoc loc2) buf "lexical error" + AlexSkip inp2 _ -> do + setInput inp2 + lexToken + AlexToken inp2@(AI end buf2) _ t -> do + setInput inp2 + let span = mkPsSpan loc1 end + let bytes = byteDiff buf buf2 + span `seq` setLastToken span bytes + lt <- t span buf bytes + let lt' = unLoc lt + unless (isComment lt') (setLastTk lt') + return lt + +reportLexError :: RealSrcLoc -> RealSrcLoc -> StringBuffer -> [Char] -> P a +reportLexError loc1 loc2 buf str + | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input") + | otherwise = + let c = fst (nextChar buf) + in if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar# + then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)") + else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c) + +lexTokenStream :: StringBuffer -> RealSrcLoc -> DynFlags -> ParseResult [Located Token] +lexTokenStream buf loc dflags = unP go initState{ options = opts' } + where dflags' = gopt_set (gopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream + initState@PState{ options = opts } = mkPState dflags' buf loc + opts' = opts{ pExtsBitmap = complement (xbit UsePosPragsBit) .&. pExtsBitmap opts } + go = do + ltok <- lexer False return + case ltok of + L _ ITeof -> return [] + _ -> liftM (ltok:) go + +linePrags = Map.singleton "line" linePrag + +fileHeaderPrags = Map.fromList([("options", lex_string_prag IToptions_prag), + ("options_ghc", lex_string_prag IToptions_prag), + ("options_haddock", lex_string_prag ITdocOptions), + ("language", token ITlanguage_prag), + ("include", lex_string_prag ITinclude_prag)]) + +ignoredPrags = Map.fromList (map ignored pragmas) + where ignored opt = (opt, nested_comment lexToken) + impls = ["hugs", "nhc98", "jhc", "yhc", "catch", "derive"] + options_pragmas = map ("options_" ++) impls + -- CFILES is a hugs-only thing. + pragmas = options_pragmas ++ ["cfiles", "contract"] + +oneWordPrags = Map.fromList [ + ("rules", rulePrag), + ("inline", + strtoken (\s -> (ITinline_prag (SourceText s) Inline FunLike))), + ("inlinable", + strtoken (\s -> (ITinline_prag (SourceText s) Inlinable FunLike))), + ("inlineable", + strtoken (\s -> (ITinline_prag (SourceText s) Inlinable FunLike))), + -- Spelling variant + ("notinline", + strtoken (\s -> (ITinline_prag (SourceText s) NoInline FunLike))), + ("specialize", strtoken (\s -> ITspec_prag (SourceText s))), + ("source", strtoken (\s -> ITsource_prag (SourceText s))), + ("warning", strtoken (\s -> ITwarning_prag (SourceText s))), + ("deprecated", strtoken (\s -> ITdeprecated_prag (SourceText s))), + ("scc", strtoken (\s -> ITscc_prag (SourceText s))), + ("generated", strtoken (\s -> ITgenerated_prag (SourceText s))), + ("core", strtoken (\s -> ITcore_prag (SourceText s))), + ("unpack", strtoken (\s -> ITunpack_prag (SourceText s))), + ("nounpack", strtoken (\s -> ITnounpack_prag (SourceText s))), + ("ann", strtoken (\s -> ITann_prag (SourceText s))), + ("minimal", strtoken (\s -> ITminimal_prag (SourceText s))), + ("overlaps", strtoken (\s -> IToverlaps_prag (SourceText s))), + ("overlappable", strtoken (\s -> IToverlappable_prag (SourceText s))), + ("overlapping", strtoken (\s -> IToverlapping_prag (SourceText s))), + ("incoherent", strtoken (\s -> ITincoherent_prag (SourceText s))), + ("ctype", strtoken (\s -> ITctype (SourceText s))), + ("complete", strtoken (\s -> ITcomplete_prag (SourceText s))), + ("column", columnPrag) + ] + +twoWordPrags = Map.fromList [ + ("inline conlike", + strtoken (\s -> (ITinline_prag (SourceText s) Inline ConLike))), + ("notinline conlike", + strtoken (\s -> (ITinline_prag (SourceText s) NoInline ConLike))), + ("specialize inline", + strtoken (\s -> (ITspec_inline_prag (SourceText s) True))), + ("specialize notinline", + strtoken (\s -> (ITspec_inline_prag (SourceText s) False))) + ] + +dispatch_pragmas :: Map String Action -> Action +dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of + Just found -> found span buf len + Nothing -> lexError "unknown pragma" + +known_pragma :: Map String Action -> AlexAccPred ExtsBitmap +known_pragma prags _ (AI _ startbuf) _ (AI _ curbuf) + = isKnown && nextCharIsNot curbuf pragmaNameChar + where l = lexemeToString startbuf (byteDiff startbuf curbuf) + isKnown = isJust $ Map.lookup (clean_pragma l) prags + pragmaNameChar c = isAlphaNum c || c == '_' + +clean_pragma :: String -> String +clean_pragma prag = canon_ws (map toLower (unprefix prag)) + where unprefix prag' = case stripPrefix "{-#" prag' of + Just rest -> rest + Nothing -> prag' + canonical prag' = case prag' of + "noinline" -> "notinline" + "specialise" -> "specialize" + "constructorlike" -> "conlike" + _ -> prag' + canon_ws s = unwords (map canonical (words s)) + + + +{- +%************************************************************************ +%* * + Helper functions for generating annotations in the parser +%* * +%************************************************************************ +-} + +-- | Encapsulated call to addAnnotation, requiring only the SrcSpan of +-- the AST construct the annotation belongs to; together with the +-- AnnKeywordId, this is the key of the annotation map. +-- +-- This type is useful for places in the parser where it is not yet +-- known what SrcSpan an annotation should be added to. The most +-- common situation is when we are parsing a list: the annotations +-- need to be associated with the AST element that *contains* the +-- list, not the list itself. 'AddAnn' lets us defer adding the +-- annotations until we finish parsing the list and are now parsing +-- the enclosing element; we then apply the 'AddAnn' to associate +-- the annotations. Another common situation is where a common fragment of +-- the AST has been factored out but there is no separate AST node for +-- this fragment (this occurs in class and data declarations). In this +-- case, the annotation belongs to the parent data declaration. +-- +-- The usual way an 'AddAnn' is created is using the 'mj' ("make jump") +-- function, and then it can be discharged using the 'ams' function. +data AddAnn = AddAnn AnnKeywordId SrcSpan + +addAnnotationOnly :: RealSrcSpan -> AnnKeywordId -> RealSrcSpan -> P () +addAnnotationOnly l a v = P $ \s -> POk s { + annotations = ((l,a), [v]) : annotations s + } () + +-- |Given a 'SrcSpan' that surrounds a 'HsPar' or 'HsParTy', generate +-- 'AddAnn' values for the opening and closing bordering on the start +-- and end of the span +mkParensApiAnn :: SrcSpan -> [AddAnn] +mkParensApiAnn (UnhelpfulSpan _) = [] +mkParensApiAnn (RealSrcSpan ss _) = [AddAnn AnnOpenP lo,AddAnn AnnCloseP lc] + where + f = srcSpanFile ss + sl = srcSpanStartLine ss + sc = srcSpanStartCol ss + el = srcSpanEndLine ss + ec = srcSpanEndCol ss + lo = RealSrcSpan (mkRealSrcSpan (realSrcSpanStart ss) (mkRealSrcLoc f sl (sc+1))) Nothing + lc = RealSrcSpan (mkRealSrcSpan (mkRealSrcLoc f el (ec - 1)) (realSrcSpanEnd ss)) Nothing + +queueComment :: RealLocated Token -> P() +queueComment c = P $ \s -> POk s { + comment_q = commentToAnnotation c : comment_q s + } () + +-- | Go through the @comment_q@ in @PState@ and remove all comments +-- that belong within the given span +allocateCommentsP :: RealSrcSpan -> P () +allocateCommentsP ss = P $ \s -> + let (comment_q', newAnns) = allocateComments ss (comment_q s) in + POk s { + comment_q = comment_q' + , annotations_comments = newAnns ++ (annotations_comments s) + } () + +allocateComments + :: RealSrcSpan + -> [RealLocated AnnotationComment] + -> ([RealLocated AnnotationComment], [(RealSrcSpan,[RealLocated AnnotationComment])]) +allocateComments ss comment_q = + let + (before,rest) = break (\(L l _) -> isRealSubspanOf l ss) comment_q + (middle,after) = break (\(L l _) -> not (isRealSubspanOf l ss)) rest + comment_q' = before ++ after + newAnns = if null middle then [] + else [(ss,middle)] + in + (comment_q', newAnns) + + +commentToAnnotation :: RealLocated Token -> RealLocated AnnotationComment +commentToAnnotation (L l (ITdocCommentNext s)) = L l (AnnDocCommentNext s) +commentToAnnotation (L l (ITdocCommentPrev s)) = L l (AnnDocCommentPrev s) +commentToAnnotation (L l (ITdocCommentNamed s)) = L l (AnnDocCommentNamed s) +commentToAnnotation (L l (ITdocSection n s)) = L l (AnnDocSection n s) +commentToAnnotation (L l (ITdocOptions s)) = L l (AnnDocOptions s) +commentToAnnotation (L l (ITlineComment s)) = L l (AnnLineComment s) +commentToAnnotation (L l (ITblockComment s)) = L l (AnnBlockComment s) +commentToAnnotation _ = panic "commentToAnnotation" + +-- --------------------------------------------------------------------- + +isComment :: Token -> Bool +isComment (ITlineComment _) = True +isComment (ITblockComment _) = True +isComment _ = False + +isDocComment :: Token -> Bool +isDocComment (ITdocCommentNext _) = True +isDocComment (ITdocCommentPrev _) = True +isDocComment (ITdocCommentNamed _) = True +isDocComment (ITdocSection _ _) = True +isDocComment (ITdocOptions _) = True +isDocComment _ = False +} |