diff options
Diffstat (limited to 'compiler/parser/Lexer.x')
-rw-r--r-- | compiler/parser/Lexer.x | 253 |
1 files changed, 187 insertions, 66 deletions
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x index 2ada289db4..fc6779a359 100644 --- a/compiler/parser/Lexer.x +++ b/compiler/parser/Lexer.x @@ -44,6 +44,7 @@ {-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE LambdaCase #-} +{-# LANGUAGE MultiWayIf #-} {-# OPTIONS_GHC -funbox-strict-fields #-} @@ -376,10 +377,6 @@ $tab { warnTab } "[t|" / { ifExtension ThQuotesBit } { token ITopenTypQuote } "|]" / { ifExtension ThQuotesBit } { token (ITcloseQuote NormalSyntax) } "||]" / { ifExtension ThQuotesBit } { token ITcloseTExpQuote } - \$ @varid / { ifExtension ThBit } { skip_one_varid ITidEscape } - "$$" @varid / { ifExtension ThBit } { skip_two_varid ITidTyEscape } - "$(" / { ifExtension ThBit } { token ITparenEscape } - "$$(" / { ifExtension ThBit } { token ITparenTyEscape } "[" @varid "|" / { ifExtension QqBit } { lex_quasiquote_tok } @@ -398,14 +395,6 @@ $tab { warnTab } { token (ITcloseQuote UnicodeSyntax) } } - -- See Note [Lexing type applications] -<0> { - [^ $idchar \) ] ^ - "@" - / { ifExtension TypeApplicationsBit `alexAndPred` notFollowedBySymbol } - { token ITtypeApp } -} - <0> { "(|" / { ifExtension ArrowsBit `alexAndPred` @@ -471,12 +460,20 @@ $tab { warnTab } @conid "#"+ / { ifExtension MagicHashBit } { idtoken conid } } +-- Operators classified into prefix, suffix, tight infix, and loose infix. +-- See Note [Whitespace-sensitive operator parsing] +<0> { + @varsym / { precededByClosingToken `alexAndPred` followedByOpeningToken } { varsym_tight_infix } + @varsym / { followedByOpeningToken } { varsym_prefix } + @varsym / { precededByClosingToken } { varsym_suffix } + @varsym { varsym_loose_infix } +} + -- ToDo: - move `var` and (sym) into lexical syntax? -- - remove backquote from $special? <0> { @qvarsym { idtoken qvarsym } @qconsym { idtoken qconsym } - @varsym { varsym } @consym { consym } } @@ -550,32 +547,114 @@ $tab { warnTab } \" { lex_string_tok } } --- Note [Lexing type applications] --- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- The desired syntax for type applications is to prefix the type application --- with '@', like this: +-- Note [Whitespace-sensitive operator parsing] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- In accord with GHC Proposal #229 https://github.com/ghc-proposals/ghc-proposals/blob/master/proposals/0229-whitespace-bang-patterns.rst +-- we classify operator occurrences into four categories: +-- +-- a ! b -- a loose infix occurrence +-- a!b -- a tight infix occurrence +-- a !b -- a prefix occurrence +-- a! b -- a suffix occurrence +-- +-- The rules are a bit more elaborate than simply checking for whitespace, in +-- order to accommodate the following use cases: +-- +-- f (!a) = ... -- prefix occurrence +-- g (a !) -- loose infix occurrence +-- g (! a) -- loose infix occurrence +-- +-- The precise rules are as follows: +-- +-- * Identifiers, literals, and opening brackets (, (#, [, [|, [||, [p|, [e|, +-- [t|, {, are considered "opening tokens". The function followedByOpeningToken +-- tests whether the next token is an opening token. +-- +-- * Identifiers, literals, and closing brackets ), #), ], |], }, +-- are considered "closing tokens". The function precededByClosingToken tests +-- whether the previous token is a closing token. -- --- foo @Int @Bool baz bum +-- * Whitespace, comments, separators, and other tokens, are considered +-- neither opening nor closing. -- --- This, of course, conflicts with as-patterns. The conflict arises because --- expressions and patterns use the same parser, and also because we want --- to allow type patterns within expression patterns. +-- * Any unqualified operator occurrence is classified as prefix, suffix, or +-- tight/loose infix, based on preceding and following tokens: -- --- Disambiguation is accomplished by requiring *something* to appear between --- type application and the preceding token. This something must end with --- a character that cannot be the end of the variable bound in an as-pattern. --- Currently (June 2015), this means that the something cannot end with a --- $idchar or a close-paren. (The close-paren is necessary if the as-bound --- identifier is symbolic.) +-- precededByClosingToken | followedByOpeningToken | Occurrence +-- ------------------------+------------------------+------------ +-- False | True | prefix +-- True | False | suffix +-- True | True | tight infix +-- False | False | loose infix +-- ------------------------+------------------------+------------ -- --- Note that looking for whitespace before the '@' is insufficient, because --- of this pathological case: +-- A loose infix occurrence is always considered an operator. Other types of +-- occurrences may be assigned a special per-operator meaning override: -- --- foo {- hi -}@Int +-- Operator | Occurrence | Token returned +-- ----------+---------------+------------------------------------------ +-- ! | prefix | ITbang +-- | | strictness annotation or bang pattern, +-- | | e.g. f !x = rhs, data T = MkT !a +-- | not prefix | ITvarsym "!" +-- | | ordinary operator or type operator, +-- | | e.g. xs ! 3, (! x), Int ! Bool +-- ----------+---------------+------------------------------------------ +-- ~ | prefix | ITtilde +-- | | laziness annotation or lazy pattern, +-- | | e.g. f ~x = rhs, data T = MkT ~a +-- | not prefix | ITvarsym "~" +-- | | ordinary operator or type operator, +-- | | e.g. xs ~ 3, (~ x), Int ~ Bool +-- ----------+---------------+------------------------------------------ +-- $ $$ | prefix | ITdollar, ITdollardollar +-- | | untyped or typed Template Haskell splice, +-- | | e.g. $(f x), $$(f x), $$"str" +-- | not prefix | ITvarsym "$", ITvarsym "$$" +-- | | ordinary operator or type operator, +-- | | e.g. f $ g x, a $$ b +-- ----------+---------------+------------------------------------------ +-- @ | prefix | ITtypeApp +-- | | type application, e.g. fmap @Maybe +-- | tight infix | ITat +-- | | as-pattern, e.g. f p@(a,b) = rhs +-- | suffix | parse error +-- | | e.g. f p@ x = rhs +-- | loose infix | ITvarsym "@" +-- | | ordinary operator or type operator, +-- | | e.g. f @ g, (f @) +-- ----------+---------------+------------------------------------------ -- --- This design is predicated on the fact that as-patterns are generally --- whitespace-free, and also that this whole thing is opt-in, with the --- TypeApplications extension. +-- Also, some of these overrides are guarded behind language extensions. +-- According to the specification, we must determine the occurrence based on +-- surrounding *tokens* (see the proposal for the exact rules). However, in +-- the implementation we cheat a little and do the classification based on +-- characters, for reasons of both simplicity and efficiency (see +-- 'followedByOpeningToken' and 'precededByClosingToken') +-- +-- When an operator is subject to a meaning override, it is mapped to special +-- token: ITbang, ITtilde, ITat, ITdollar, ITdollardollar. Otherwise, it is +-- returned as ITvarsym. +-- +-- For example, this is how we process the (!): +-- +-- precededByClosingToken | followedByOpeningToken | Token +-- ------------------------+------------------------+------------- +-- False | True | ITbang +-- True | False | ITvarsym "!" +-- True | True | ITvarsym "!" +-- False | False | ITvarsym "!" +-- ------------------------+------------------------+------------- +-- +-- And this is how we process the (@): +-- +-- precededByClosingToken | followedByOpeningToken | Token +-- ------------------------+------------------------+------------- +-- False | True | ITtypeApp +-- True | False | parse error +-- True | True | ITat +-- False | False | ITvarsym "@" +-- ------------------------+------------------------+------------- -- ----------------------------------------------------------------------------- -- Alex "Haskell code fragment bottom" @@ -680,11 +759,12 @@ data Token | ITvbar | ITlarrow IsUnicodeSyntax | ITrarrow IsUnicodeSyntax - | ITat - | ITtilde | ITdarrow IsUnicodeSyntax | ITminus - | ITbang + | ITbang -- Prefix (!) only, e.g. f !x = rhs + | ITtilde -- Prefix (~) only, e.g. f ~x = rhs + | ITat -- Tight infix (@) only, e.g. f x@pat = rhs + | ITtypeApp -- Prefix (@) only, e.g. f @t | ITstar IsUnicodeSyntax | ITdot @@ -740,10 +820,8 @@ data Token | ITcloseQuote IsUnicodeSyntax -- |] | ITopenTExpQuote HasE -- [|| or [e|| | ITcloseTExpQuote -- ||] - | ITidEscape FastString -- $x - | ITparenEscape -- $( - | ITidTyEscape FastString -- $$x - | ITparenTyEscape -- $$( + | ITdollar -- prefix $ + | ITdollardollar -- prefix $$ | ITtyQuote -- '' | ITquasiQuote (FastString,FastString,RealSrcSpan) -- ITquasiQuote(quoter, quote, loc) @@ -764,11 +842,6 @@ data Token | ITLarrowtail IsUnicodeSyntax -- ^ @-<<@ | ITRarrowtail IsUnicodeSyntax -- ^ @>>-@ - -- | Type application '@' (lexed differently than as-pattern '@', - -- due to checking for preceding whitespace) - | ITtypeApp - - | ITunknown String -- ^ Used when the lexer can't make sense of it | ITeof -- ^ end of file token @@ -889,11 +962,8 @@ reservedSymsFM = listToUFM $ ,("|", ITvbar, NormalSyntax, 0 ) ,("<-", ITlarrow NormalSyntax, NormalSyntax, 0 ) ,("->", ITrarrow NormalSyntax, NormalSyntax, 0 ) - ,("@", ITat, NormalSyntax, 0 ) - ,("~", ITtilde, NormalSyntax, 0 ) ,("=>", ITdarrow NormalSyntax, NormalSyntax, 0 ) ,("-", ITminus, NormalSyntax, 0 ) - ,("!", ITbang, NormalSyntax, 0 ) ,("*", ITstar NormalSyntax, NormalSyntax, xbit StarIsTypeBit) @@ -988,6 +1058,32 @@ pop_and :: Action -> Action pop_and act span buf len = do _ <- popLexState act span buf len +-- See Note [Whitespace-sensitive operator parsing] +followedByOpeningToken :: AlexAccPred ExtsBitmap +followedByOpeningToken _ _ _ (AI _ buf) + | atEnd buf = False + | otherwise = + case nextChar buf of + ('{', buf') -> nextCharIsNot buf' (== '-') + ('(', _) -> True + ('[', _) -> True + ('\"', _) -> True + ('\'', _) -> True + ('_', _) -> True + (c, _) -> isAlphaNum c + +-- See Note [Whitespace-sensitive operator parsing] +precededByClosingToken :: AlexAccPred ExtsBitmap +precededByClosingToken _ (AI _ buf) _ _ = + case prevChar buf '\n' of + '}' -> decodePrevNChars 1 buf /= "-" + ')' -> True + ']' -> True + '\"' -> True + '\'' -> True + '_' -> True + c -> isAlphaNum c + {-# INLINE nextCharIs #-} nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool nextCharIs buf p = not (atEnd buf) && p (currentChar buf) @@ -1348,11 +1444,40 @@ qvarsym, qconsym :: StringBuffer -> Int -> Token qvarsym buf len = ITqvarsym $! splitQualName buf len False qconsym buf len = ITqconsym $! splitQualName buf len False -varsym, consym :: Action -varsym = sym ITvarsym -consym = sym ITconsym - -sym :: (FastString -> Token) -> Action +-- See Note [Whitespace-sensitive operator parsing] +varsym_prefix :: Action +varsym_prefix = sym $ \exts s -> + if | TypeApplicationsBit `xtest` exts, s == fsLit "@" + -> return ITtypeApp + | ThBit `xtest` exts, s == fsLit "$" + -> return ITdollar + | ThBit `xtest` exts, s == fsLit "$$" + -> return ITdollardollar + | s == fsLit "!" -> return ITbang + | s == fsLit "~" -> return ITtilde + | otherwise -> return (ITvarsym s) + +-- See Note [Whitespace-sensitive operator parsing] +varsym_suffix :: Action +varsym_suffix = sym $ \_ s -> + if | s == fsLit "@" + -> failMsgP "Suffix occurrence of @. For an as-pattern, remove the leading whitespace." + | otherwise -> return (ITvarsym s) + +-- See Note [Whitespace-sensitive operator parsing] +varsym_tight_infix :: Action +varsym_tight_infix = sym $ \_ s -> + if | s == fsLit "@" -> return ITat + | otherwise -> return (ITvarsym s) + +-- See Note [Whitespace-sensitive operator parsing] +varsym_loose_infix :: Action +varsym_loose_infix = sym (\_ s -> return $ ITvarsym s) + +consym :: Action +consym = sym (\_exts s -> return $ ITconsym s) + +sym :: (ExtsBitmap -> FastString -> P Token) -> Action sym con span buf len = case lookupUFM reservedSymsFM fs of Just (keyword, NormalSyntax, 0) -> @@ -1361,19 +1486,20 @@ sym con span buf len = exts <- getExts if exts .&. i /= 0 then return $ L span keyword - else return $ L span (con fs) + else L span <$!> con exts fs Just (keyword, UnicodeSyntax, 0) -> do exts <- getExts if xtest UnicodeSyntaxBit exts then return $ L span keyword - else return $ L span (con fs) + else L span <$!> con exts fs Just (keyword, UnicodeSyntax, i) -> do exts <- getExts if exts .&. i /= 0 && xtest UnicodeSyntaxBit exts then return $ L span keyword - else return $ L span (con fs) - Nothing -> - return $ L span $! con fs + else L span <$!> con exts fs + Nothing -> do + exts <- getExts + L span <$!> con exts fs where !fs = lexemeToFastString buf len @@ -2889,8 +3015,6 @@ isALRopen ITobrack = True isALRopen ITocurly = True -- GHC Extensions: isALRopen IToubxparen = True -isALRopen ITparenEscape = True -isALRopen ITparenTyEscape = True isALRopen _ = False isALRclose :: Token -> Bool @@ -2945,12 +3069,9 @@ lexToken = do let bytes = byteDiff buf buf2 span `seq` setLastToken span bytes lt <- t span buf bytes - case unRealSrcSpan lt of - ITlineComment _ -> return lt - ITblockComment _ -> return lt - lt' -> do - setLastTk lt' - return lt + let lt' = unRealSrcSpan lt + unless (isComment lt') (setLastTk lt') + return lt reportLexError :: RealSrcLoc -> RealSrcLoc -> StringBuffer -> [Char] -> P a reportLexError loc1 loc2 buf str |