diff options
author | Alec Theriault <alec.theriault@gmail.com> | 2017-10-25 15:52:38 -0400 |
---|---|---|
committer | Ben Gamari <ben@smart-cactus.org> | 2017-10-25 16:44:22 -0400 |
commit | 821adee12e89dbd0a52fde872b633e4e2e9051dc (patch) | |
tree | beef26068d86bc80b5942e45e3534c039e5b4e74 | |
parent | f7f270eb6ba616feda79d370336db7e66f9ab79c (diff) | |
download | haskell-821adee12e89dbd0a52fde872b633e4e2e9051dc.tar.gz |
Fix a bug in 'alexInputPrevChar'
The lexer hacks around unicode by squishing any character into a 'Word8'
and then storing the actual character in its state. This happens at
'alexGetByte'.
That is all and well, but we ought to be careful that the characters we
retrieve via 'alexInputPrevChar' also fit this convention.
In fact, #13986 exposes nicely what can go wrong: the regex in the left
context of the type application rule uses the '$idchar' character set
which relies on the unicode hack. However, a left context corresponds
to a call to 'alexInputPrevChar', and we end up passing full blown
unicode characters to '$idchar', despite it not being equipped to deal
with these.
Test Plan: Added a regression test case
Reviewers: austin, bgamari
Reviewed By: bgamari
Subscribers: rwbarton, thomie
GHC Trac Issues: #13986
Differential Revision: https://phabricator.haskell.org/D4105
-rw-r--r-- | compiler/parser/Lexer.x | 78 | ||||
-rw-r--r-- | testsuite/tests/parser/should_compile/T13986.hs | 5 | ||||
-rw-r--r-- | testsuite/tests/parser/should_compile/all.T | 1 |
3 files changed, 59 insertions, 25 deletions
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x index 8c17315fb0..3bf249bd7e 100644 --- a/compiler/parser/Lexer.x +++ b/compiler/parser/Lexer.x @@ -129,38 +129,38 @@ import ApiAnnotation -- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs -- Any changes here should likely be reflected there. -$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetByte. +$unispace = \x05 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $nl = [\n\r\f] $whitechar = [$nl\v\ $unispace] $white_no_nl = $whitechar # \n -- TODO #8424 $tab = \t $ascdigit = 0-9 -$unidigit = \x03 -- Trick Alex into handling Unicode. See alexGetByte. +$unidigit = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $decdigit = $ascdigit -- for now, should really be $digit (ToDo) $digit = [$ascdigit $unidigit] $special = [\(\)\,\;\[\]\`\{\}] $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:] -$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetByte. +$unisymbol = \x04 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $symbol = [$ascsymbol $unisymbol] # [$special \_\"\'] -$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetByte. +$unilarge = \x01 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $asclarge = [A-Z] $large = [$asclarge $unilarge] -$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetByte. +$unismall = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $ascsmall = [a-z] $small = [$ascsmall $unismall \_] -$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetByte. +$unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $graphic = [$small $large $symbol $digit $special $unigraphic \"\'] $binit = 0-1 $octit = 0-7 $hexit = [$decdigit A-F a-f] -$uniidchar = \x07 -- Trick Alex into handling Unicode. See alexGetByte. +$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $idchar = [$small $large $digit $uniidchar \'] $pragmachar = [$small $large $digit] @@ -1968,27 +1968,29 @@ getLastTk = P $ \s@(PState { last_tk = last_tk }) -> POk s last_tk data AlexInput = AI RealSrcLoc StringBuffer -alexInputPrevChar :: AlexInput -> Char -alexInputPrevChar (AI _ buf) = prevChar buf '\n' +{- +Note [Unicode in Alex] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Although newer versions of Alex support unicode, this grammar is processed with +the old style '--latin1' behaviour. This means that when implementing the +functions --- backwards compatibility for Alex 2.x -alexGetChar :: AlexInput -> Maybe (Char,AlexInput) -alexGetChar inp = case alexGetByte inp of - Nothing -> Nothing - Just (b,i) -> c `seq` Just (c,i) - where c = chr $ fromIntegral b + alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) + alexInputPrevChar :: AlexInput -> Char -alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) -alexGetByte (AI loc s) - | atEnd s = Nothing - | otherwise = byte `seq` loc' `seq` s' `seq` - --trace (show (ord c)) $ - Just (byte, (AI loc' s')) - where (c,s') = nextChar s - loc' = advanceSrcLoc loc c - byte = fromIntegral $ ord adj_c +which Alex uses to to take apart our 'AlexInput', we must + + * return a latin1 character in the 'Word8' that 'alexGetByte' expects + * return a latin1 character in 'alexInputPrevChar'. + +We handle this in 'adjustChar' by squishing entire classes of unicode +characters into single bytes. +-} - non_graphic = '\x00' +{-# INLINE adjustChar #-} +adjustChar :: Char -> Word8 +adjustChar c = fromIntegral $ ord adj_c + where non_graphic = '\x00' upper = '\x01' lower = '\x02' digit = '\x03' @@ -2034,6 +2036,32 @@ alexGetByte (AI loc s) Space -> space _other -> non_graphic +-- Getting the previous 'Char' isn't enough here - we need to convert it into +-- the same format that 'alexGetByte' would have produced. +-- +-- See Note [Unicode in Alex] and #13986. +alexInputPrevChar :: AlexInput -> Char +alexInputPrevChar (AI _ buf) = chr (fromIntegral (adjustChar pc)) + where pc = prevChar buf '\n' + +-- backwards compatibility for Alex 2.x +alexGetChar :: AlexInput -> Maybe (Char,AlexInput) +alexGetChar inp = case alexGetByte inp of + Nothing -> Nothing + Just (b,i) -> c `seq` Just (c,i) + where c = chr $ fromIntegral b + +-- See Note [Unicode in Alex] +alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) +alexGetByte (AI loc s) + | atEnd s = Nothing + | otherwise = byte `seq` loc' `seq` s' `seq` + --trace (show (ord c)) $ + Just (byte, (AI loc' s')) + where (c,s') = nextChar s + loc' = advanceSrcLoc loc c + byte = adjustChar c + -- This version does not squash unicode characters, it is used when -- lexing strings. alexGetChar' :: AlexInput -> Maybe (Char,AlexInput) diff --git a/testsuite/tests/parser/should_compile/T13986.hs b/testsuite/tests/parser/should_compile/T13986.hs new file mode 100644 index 0000000000..b1b4882a35 --- /dev/null +++ b/testsuite/tests/parser/should_compile/T13986.hs @@ -0,0 +1,5 @@ +{-# LANGUAGE TypeApplications #-} + +module T13986 where + +foo x₁@True = 10 diff --git a/testsuite/tests/parser/should_compile/all.T b/testsuite/tests/parser/should_compile/all.T index c008bd439a..e2f68f6e96 100644 --- a/testsuite/tests/parser/should_compile/all.T +++ b/testsuite/tests/parser/should_compile/all.T @@ -109,3 +109,4 @@ test('DumpRenamedAst', normal, compile, ['-dsuppress-uniques -ddump-rn-ast'] test('DumpTypecheckedAst', normal, compile, ['-dsuppress-uniques -ddump-tc-ast']) test('T13747', normal, compile, ['']) test('T14189', normal, compile, ['-dsuppress-uniques -ddump-rn-ast']) +test('T13986', normal, compile, ['']) |