summaryrefslogtreecommitdiff
path: root/compiler/parser/Lexer.x
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/parser/Lexer.x')
-rw-r--r--compiler/parser/Lexer.x253
1 files changed, 187 insertions, 66 deletions
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 2ada289db4..fc6779a359 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -44,6 +44,7 @@
{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE MultiWayIf #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}
@@ -376,10 +377,6 @@ $tab { warnTab }
"[t|" / { ifExtension ThQuotesBit } { token ITopenTypQuote }
"|]" / { ifExtension ThQuotesBit } { token (ITcloseQuote NormalSyntax) }
"||]" / { ifExtension ThQuotesBit } { token ITcloseTExpQuote }
- \$ @varid / { ifExtension ThBit } { skip_one_varid ITidEscape }
- "$$" @varid / { ifExtension ThBit } { skip_two_varid ITidTyEscape }
- "$(" / { ifExtension ThBit } { token ITparenEscape }
- "$$(" / { ifExtension ThBit } { token ITparenTyEscape }
"[" @varid "|" / { ifExtension QqBit } { lex_quasiquote_tok }
@@ -398,14 +395,6 @@ $tab { warnTab }
{ token (ITcloseQuote UnicodeSyntax) }
}
- -- See Note [Lexing type applications]
-<0> {
- [^ $idchar \) ] ^
- "@"
- / { ifExtension TypeApplicationsBit `alexAndPred` notFollowedBySymbol }
- { token ITtypeApp }
-}
-
<0> {
"(|"
/ { ifExtension ArrowsBit `alexAndPred`
@@ -471,12 +460,20 @@ $tab { warnTab }
@conid "#"+ / { ifExtension MagicHashBit } { idtoken conid }
}
+-- Operators classified into prefix, suffix, tight infix, and loose infix.
+-- See Note [Whitespace-sensitive operator parsing]
+<0> {
+ @varsym / { precededByClosingToken `alexAndPred` followedByOpeningToken } { varsym_tight_infix }
+ @varsym / { followedByOpeningToken } { varsym_prefix }
+ @varsym / { precededByClosingToken } { varsym_suffix }
+ @varsym { varsym_loose_infix }
+}
+
-- ToDo: - move `var` and (sym) into lexical syntax?
-- - remove backquote from $special?
<0> {
@qvarsym { idtoken qvarsym }
@qconsym { idtoken qconsym }
- @varsym { varsym }
@consym { consym }
}
@@ -550,32 +547,114 @@ $tab { warnTab }
\" { lex_string_tok }
}
--- Note [Lexing type applications]
--- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- The desired syntax for type applications is to prefix the type application
--- with '@', like this:
+-- Note [Whitespace-sensitive operator parsing]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-- In accord with GHC Proposal #229 https://github.com/ghc-proposals/ghc-proposals/blob/master/proposals/0229-whitespace-bang-patterns.rst
+-- we classify operator occurrences into four categories:
+--
+-- a ! b -- a loose infix occurrence
+-- a!b -- a tight infix occurrence
+-- a !b -- a prefix occurrence
+-- a! b -- a suffix occurrence
+--
+-- The rules are a bit more elaborate than simply checking for whitespace, in
+-- order to accommodate the following use cases:
+--
+-- f (!a) = ... -- prefix occurrence
+-- g (a !) -- loose infix occurrence
+-- g (! a) -- loose infix occurrence
+--
+-- The precise rules are as follows:
+--
+-- * Identifiers, literals, and opening brackets (, (#, [, [|, [||, [p|, [e|,
+-- [t|, {, are considered "opening tokens". The function followedByOpeningToken
+-- tests whether the next token is an opening token.
+--
+-- * Identifiers, literals, and closing brackets ), #), ], |], },
+-- are considered "closing tokens". The function precededByClosingToken tests
+-- whether the previous token is a closing token.
--
--- foo @Int @Bool baz bum
+-- * Whitespace, comments, separators, and other tokens, are considered
+-- neither opening nor closing.
--
--- This, of course, conflicts with as-patterns. The conflict arises because
--- expressions and patterns use the same parser, and also because we want
--- to allow type patterns within expression patterns.
+-- * Any unqualified operator occurrence is classified as prefix, suffix, or
+-- tight/loose infix, based on preceding and following tokens:
--
--- Disambiguation is accomplished by requiring *something* to appear between
--- type application and the preceding token. This something must end with
--- a character that cannot be the end of the variable bound in an as-pattern.
--- Currently (June 2015), this means that the something cannot end with a
--- $idchar or a close-paren. (The close-paren is necessary if the as-bound
--- identifier is symbolic.)
+-- precededByClosingToken | followedByOpeningToken | Occurrence
+-- ------------------------+------------------------+------------
+-- False | True | prefix
+-- True | False | suffix
+-- True | True | tight infix
+-- False | False | loose infix
+-- ------------------------+------------------------+------------
--
--- Note that looking for whitespace before the '@' is insufficient, because
--- of this pathological case:
+-- A loose infix occurrence is always considered an operator. Other types of
+-- occurrences may be assigned a special per-operator meaning override:
--
--- foo {- hi -}@Int
+-- Operator | Occurrence | Token returned
+-- ----------+---------------+------------------------------------------
+-- ! | prefix | ITbang
+-- | | strictness annotation or bang pattern,
+-- | | e.g. f !x = rhs, data T = MkT !a
+-- | not prefix | ITvarsym "!"
+-- | | ordinary operator or type operator,
+-- | | e.g. xs ! 3, (! x), Int ! Bool
+-- ----------+---------------+------------------------------------------
+-- ~ | prefix | ITtilde
+-- | | laziness annotation or lazy pattern,
+-- | | e.g. f ~x = rhs, data T = MkT ~a
+-- | not prefix | ITvarsym "~"
+-- | | ordinary operator or type operator,
+-- | | e.g. xs ~ 3, (~ x), Int ~ Bool
+-- ----------+---------------+------------------------------------------
+-- $ $$ | prefix | ITdollar, ITdollardollar
+-- | | untyped or typed Template Haskell splice,
+-- | | e.g. $(f x), $$(f x), $$"str"
+-- | not prefix | ITvarsym "$", ITvarsym "$$"
+-- | | ordinary operator or type operator,
+-- | | e.g. f $ g x, a $$ b
+-- ----------+---------------+------------------------------------------
+-- @ | prefix | ITtypeApp
+-- | | type application, e.g. fmap @Maybe
+-- | tight infix | ITat
+-- | | as-pattern, e.g. f p@(a,b) = rhs
+-- | suffix | parse error
+-- | | e.g. f p@ x = rhs
+-- | loose infix | ITvarsym "@"
+-- | | ordinary operator or type operator,
+-- | | e.g. f @ g, (f @)
+-- ----------+---------------+------------------------------------------
--
--- This design is predicated on the fact that as-patterns are generally
--- whitespace-free, and also that this whole thing is opt-in, with the
--- TypeApplications extension.
+-- Also, some of these overrides are guarded behind language extensions.
+-- According to the specification, we must determine the occurrence based on
+-- surrounding *tokens* (see the proposal for the exact rules). However, in
+-- the implementation we cheat a little and do the classification based on
+-- characters, for reasons of both simplicity and efficiency (see
+-- 'followedByOpeningToken' and 'precededByClosingToken')
+--
+-- When an operator is subject to a meaning override, it is mapped to special
+-- token: ITbang, ITtilde, ITat, ITdollar, ITdollardollar. Otherwise, it is
+-- returned as ITvarsym.
+--
+-- For example, this is how we process the (!):
+--
+-- precededByClosingToken | followedByOpeningToken | Token
+-- ------------------------+------------------------+-------------
+-- False | True | ITbang
+-- True | False | ITvarsym "!"
+-- True | True | ITvarsym "!"
+-- False | False | ITvarsym "!"
+-- ------------------------+------------------------+-------------
+--
+-- And this is how we process the (@):
+--
+-- precededByClosingToken | followedByOpeningToken | Token
+-- ------------------------+------------------------+-------------
+-- False | True | ITtypeApp
+-- True | False | parse error
+-- True | True | ITat
+-- False | False | ITvarsym "@"
+-- ------------------------+------------------------+-------------
-- -----------------------------------------------------------------------------
-- Alex "Haskell code fragment bottom"
@@ -680,11 +759,12 @@ data Token
| ITvbar
| ITlarrow IsUnicodeSyntax
| ITrarrow IsUnicodeSyntax
- | ITat
- | ITtilde
| ITdarrow IsUnicodeSyntax
| ITminus
- | ITbang
+ | ITbang -- Prefix (!) only, e.g. f !x = rhs
+ | ITtilde -- Prefix (~) only, e.g. f ~x = rhs
+ | ITat -- Tight infix (@) only, e.g. f x@pat = rhs
+ | ITtypeApp -- Prefix (@) only, e.g. f @t
| ITstar IsUnicodeSyntax
| ITdot
@@ -740,10 +820,8 @@ data Token
| ITcloseQuote IsUnicodeSyntax -- |]
| ITopenTExpQuote HasE -- [|| or [e||
| ITcloseTExpQuote -- ||]
- | ITidEscape FastString -- $x
- | ITparenEscape -- $(
- | ITidTyEscape FastString -- $$x
- | ITparenTyEscape -- $$(
+ | ITdollar -- prefix $
+ | ITdollardollar -- prefix $$
| ITtyQuote -- ''
| ITquasiQuote (FastString,FastString,RealSrcSpan)
-- ITquasiQuote(quoter, quote, loc)
@@ -764,11 +842,6 @@ data Token
| ITLarrowtail IsUnicodeSyntax -- ^ @-<<@
| ITRarrowtail IsUnicodeSyntax -- ^ @>>-@
- -- | Type application '@' (lexed differently than as-pattern '@',
- -- due to checking for preceding whitespace)
- | ITtypeApp
-
-
| ITunknown String -- ^ Used when the lexer can't make sense of it
| ITeof -- ^ end of file token
@@ -889,11 +962,8 @@ reservedSymsFM = listToUFM $
,("|", ITvbar, NormalSyntax, 0 )
,("<-", ITlarrow NormalSyntax, NormalSyntax, 0 )
,("->", ITrarrow NormalSyntax, NormalSyntax, 0 )
- ,("@", ITat, NormalSyntax, 0 )
- ,("~", ITtilde, NormalSyntax, 0 )
,("=>", ITdarrow NormalSyntax, NormalSyntax, 0 )
,("-", ITminus, NormalSyntax, 0 )
- ,("!", ITbang, NormalSyntax, 0 )
,("*", ITstar NormalSyntax, NormalSyntax, xbit StarIsTypeBit)
@@ -988,6 +1058,32 @@ pop_and :: Action -> Action
pop_and act span buf len = do _ <- popLexState
act span buf len
+-- See Note [Whitespace-sensitive operator parsing]
+followedByOpeningToken :: AlexAccPred ExtsBitmap
+followedByOpeningToken _ _ _ (AI _ buf)
+ | atEnd buf = False
+ | otherwise =
+ case nextChar buf of
+ ('{', buf') -> nextCharIsNot buf' (== '-')
+ ('(', _) -> True
+ ('[', _) -> True
+ ('\"', _) -> True
+ ('\'', _) -> True
+ ('_', _) -> True
+ (c, _) -> isAlphaNum c
+
+-- See Note [Whitespace-sensitive operator parsing]
+precededByClosingToken :: AlexAccPred ExtsBitmap
+precededByClosingToken _ (AI _ buf) _ _ =
+ case prevChar buf '\n' of
+ '}' -> decodePrevNChars 1 buf /= "-"
+ ')' -> True
+ ']' -> True
+ '\"' -> True
+ '\'' -> True
+ '_' -> True
+ c -> isAlphaNum c
+
{-# INLINE nextCharIs #-}
nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
@@ -1348,11 +1444,40 @@ qvarsym, qconsym :: StringBuffer -> Int -> Token
qvarsym buf len = ITqvarsym $! splitQualName buf len False
qconsym buf len = ITqconsym $! splitQualName buf len False
-varsym, consym :: Action
-varsym = sym ITvarsym
-consym = sym ITconsym
-
-sym :: (FastString -> Token) -> Action
+-- See Note [Whitespace-sensitive operator parsing]
+varsym_prefix :: Action
+varsym_prefix = sym $ \exts s ->
+ if | TypeApplicationsBit `xtest` exts, s == fsLit "@"
+ -> return ITtypeApp
+ | ThBit `xtest` exts, s == fsLit "$"
+ -> return ITdollar
+ | ThBit `xtest` exts, s == fsLit "$$"
+ -> return ITdollardollar
+ | s == fsLit "!" -> return ITbang
+ | s == fsLit "~" -> return ITtilde
+ | otherwise -> return (ITvarsym s)
+
+-- See Note [Whitespace-sensitive operator parsing]
+varsym_suffix :: Action
+varsym_suffix = sym $ \_ s ->
+ if | s == fsLit "@"
+ -> failMsgP "Suffix occurrence of @. For an as-pattern, remove the leading whitespace."
+ | otherwise -> return (ITvarsym s)
+
+-- See Note [Whitespace-sensitive operator parsing]
+varsym_tight_infix :: Action
+varsym_tight_infix = sym $ \_ s ->
+ if | s == fsLit "@" -> return ITat
+ | otherwise -> return (ITvarsym s)
+
+-- See Note [Whitespace-sensitive operator parsing]
+varsym_loose_infix :: Action
+varsym_loose_infix = sym (\_ s -> return $ ITvarsym s)
+
+consym :: Action
+consym = sym (\_exts s -> return $ ITconsym s)
+
+sym :: (ExtsBitmap -> FastString -> P Token) -> Action
sym con span buf len =
case lookupUFM reservedSymsFM fs of
Just (keyword, NormalSyntax, 0) ->
@@ -1361,19 +1486,20 @@ sym con span buf len =
exts <- getExts
if exts .&. i /= 0
then return $ L span keyword
- else return $ L span (con fs)
+ else L span <$!> con exts fs
Just (keyword, UnicodeSyntax, 0) -> do
exts <- getExts
if xtest UnicodeSyntaxBit exts
then return $ L span keyword
- else return $ L span (con fs)
+ else L span <$!> con exts fs
Just (keyword, UnicodeSyntax, i) -> do
exts <- getExts
if exts .&. i /= 0 && xtest UnicodeSyntaxBit exts
then return $ L span keyword
- else return $ L span (con fs)
- Nothing ->
- return $ L span $! con fs
+ else L span <$!> con exts fs
+ Nothing -> do
+ exts <- getExts
+ L span <$!> con exts fs
where
!fs = lexemeToFastString buf len
@@ -2889,8 +3015,6 @@ isALRopen ITobrack = True
isALRopen ITocurly = True
-- GHC Extensions:
isALRopen IToubxparen = True
-isALRopen ITparenEscape = True
-isALRopen ITparenTyEscape = True
isALRopen _ = False
isALRclose :: Token -> Bool
@@ -2945,12 +3069,9 @@ lexToken = do
let bytes = byteDiff buf buf2
span `seq` setLastToken span bytes
lt <- t span buf bytes
- case unRealSrcSpan lt of
- ITlineComment _ -> return lt
- ITblockComment _ -> return lt
- lt' -> do
- setLastTk lt'
- return lt
+ let lt' = unRealSrcSpan lt
+ unless (isComment lt') (setLastTk lt')
+ return lt
reportLexError :: RealSrcLoc -> RealSrcLoc -> StringBuffer -> [Char] -> P a
reportLexError loc1 loc2 buf str