Fix a bug in 'alexInputPrevChar'

The lexer hacks around unicode by squishing any character into a 'Word8' and then storing the actual character in its state. This happens at 'alexGetByte'. That is all and well, but we ought to be careful that the characters we retrieve via 'alexInputPrevChar' also fit this convention. In fact, #13986 exposes nicely what can go wrong: the regex in the left context of the type application rule uses the '$idchar' character set which relies on the unicode hack. However, a left context corresponds to a call to 'alexInputPrevChar', and we end up passing full blown unicode characters to '$idchar', despite it not being equipped to deal with these. Test Plan: Added a regression test case Reviewers: austin, bgamari Reviewed By: bgamari Subscribers: rwbarton, thomie GHC Trac Issues: #13986 Differential Revision: https://phabricator.haskell.org/D4105
author: Alec Theriault <alec.theriault@gmail.com> 2017-10-25 15:52:38 -0400
committer: Ben Gamari <ben@smart-cactus.org> 2017-10-25 16:44:22 -0400
commit: 821adee12e89dbd0a52fde872b633e4e2e9051dc (patch)
tree: beef26068d86bc80b5942e45e3534c039e5b4e74
parent: f7f270eb6ba616feda79d370336db7e66f9ab79c (diff)
download: haskell-821adee12e89dbd0a52fde872b633e4e2e9051dc.tar.gz
3 files changed, 59 insertions, 25 deletions
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 8c17315fb0..3bf249bd7e 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -129,38 +129,38 @@ import ApiAnnotation
 
 -- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs
 -- Any changes here should likely be reflected there.
-$unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetByte.
+$unispace    = \x05 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $nl          = [\n\r\f]
 $whitechar   = [$nl\v\ $unispace]
 $white_no_nl = $whitechar # \n -- TODO #8424
 $tab         = \t
 
 $ascdigit  = 0-9
-$unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetByte.
+$unidigit  = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
 $digit     = [$ascdigit $unidigit]
 
 $special   = [\(\)\,\;\[\]\`\{\}]
 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
-$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetByte.
+$unisymbol = \x04 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $symbol    = [$ascsymbol $unisymbol] # [$special \_\"\']
 
-$unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetByte.
+$unilarge  = \x01 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $asclarge  = [A-Z]
 $large     = [$asclarge $unilarge]
 
-$unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetByte.
+$unismall  = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $ascsmall  = [a-z]
 $small     = [$ascsmall $unismall \_]
 
-$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetByte.
+$unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $graphic   = [$small $large $symbol $digit $special $unigraphic \"\']
 
 $binit     = 0-1
 $octit     = 0-7
 $hexit     = [$decdigit A-F a-f]
 
-$uniidchar = \x07 -- Trick Alex into handling Unicode. See alexGetByte.
+$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $idchar    = [$small $large $digit $uniidchar \']
 
 $pragmachar = [$small $large $digit]
@@ -1968,27 +1968,29 @@ getLastTk = P $ \s@(PState { last_tk = last_tk }) -> POk s last_tk
 
 data AlexInput = AI RealSrcLoc StringBuffer
 
-alexInputPrevChar :: AlexInput -> Char
-alexInputPrevChar (AI _ buf) = prevChar buf '\n'
+{-
+Note [Unicode in Alex]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Although newer versions of Alex support unicode, this grammar is processed with
+the old style '--latin1' behaviour. This means that when implementing the
+functions
 
--- backwards compatibility for Alex 2.x
-alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
-alexGetChar inp = case alexGetByte inp of
-                    Nothing    -> Nothing
-                    Just (b,i) -> c `seq` Just (c,i)
-                       where c = chr $ fromIntegral b
+    alexGetByte       :: AlexInput -> Maybe (Word8,AlexInput)
+    alexInputPrevChar :: AlexInput -> Char
 
-alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
-alexGetByte (AI loc s)
-  | atEnd s   = Nothing
-  | otherwise = byte `seq` loc' `seq` s' `seq`
-                --trace (show (ord c)) $
-                Just (byte, (AI loc' s'))
-  where (c,s') = nextChar s
-        loc'   = advanceSrcLoc loc c
-        byte   = fromIntegral $ ord adj_c
+which Alex uses to to take apart our 'AlexInput', we must
+
+  * return a latin1 character in the 'Word8' that 'alexGetByte' expects
+  * return a latin1 character in 'alexInputPrevChar'.
+
+We handle this in 'adjustChar' by squishing entire classes of unicode
+characters into single bytes.
+-}
 
-        non_graphic     = '\x00'
+{-# INLINE adjustChar #-}
+adjustChar :: Char -> Word8
+adjustChar c = fromIntegral $ ord adj_c
+  where non_graphic     = '\x00'
         upper           = '\x01'
         lower           = '\x02'
         digit           = '\x03'
@@ -2034,6 +2036,32 @@ alexGetByte (AI loc s)
                   Space                 -> space
                   _other                -> non_graphic
 
+-- Getting the previous 'Char' isn't enough here - we need to convert it into
+-- the same format that 'alexGetByte' would have produced.
+--
+-- See Note [Unicode in Alex] and #13986.
+alexInputPrevChar :: AlexInput -> Char
+alexInputPrevChar (AI _ buf) = chr (fromIntegral (adjustChar pc))
+  where pc = prevChar buf '\n'
+
+-- backwards compatibility for Alex 2.x
+alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
+alexGetChar inp = case alexGetByte inp of
+                    Nothing    -> Nothing
+                    Just (b,i) -> c `seq` Just (c,i)
+                       where c = chr $ fromIntegral b
+
+-- See Note [Unicode in Alex]
+alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
+alexGetByte (AI loc s)
+  | atEnd s   = Nothing
+  | otherwise = byte `seq` loc' `seq` s' `seq`
+                --trace (show (ord c)) $
+                Just (byte, (AI loc' s'))
+  where (c,s') = nextChar s
+        loc'   = advanceSrcLoc loc c
+        byte   = adjustChar c
+
 -- This version does not squash unicode characters, it is used when
 -- lexing strings.
 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
diff --git a/testsuite/tests/parser/should_compile/T13986.hs b/testsuite/tests/parser/should_compile/T13986.hs
new file mode 100644
index 0000000000..b1b4882a35
--- /dev/null
+++ b/testsuite/tests/parser/should_compile/T13986.hs
@@ -0,0 +1,5 @@
+{-# LANGUAGE TypeApplications #-}
+
+module T13986 where
+
+foo x₁@True = 10
diff --git a/testsuite/tests/parser/should_compile/all.T b/testsuite/tests/parser/should_compile/all.T
index c008bd439a..e2f68f6e96 100644
--- a/testsuite/tests/parser/should_compile/all.T
+++ b/testsuite/tests/parser/should_compile/all.T
@@ -109,3 +109,4 @@ test('DumpRenamedAst',     normal, compile, ['-dsuppress-uniques -ddump-rn-ast']
 test('DumpTypecheckedAst', normal, compile, ['-dsuppress-uniques -ddump-tc-ast'])
 test('T13747', normal, compile, [''])
 test('T14189',     normal, compile, ['-dsuppress-uniques -ddump-rn-ast'])
+test('T13986', normal, compile, [''])
author	Alec Theriault <alec.theriault@gmail.com>	2017-10-25 15:52:38 -0400
committer	Ben Gamari <ben@smart-cactus.org>	2017-10-25 16:44:22 -0400
commit	821adee12e89dbd0a52fde872b633e4e2e9051dc (patch)
tree	beef26068d86bc80b5942e45e3534c039e5b4e74
parent	f7f270eb6ba616feda79d370336db7e66f9ab79c (diff)
download	haskell-821adee12e89dbd0a52fde872b633e4e2e9051dc.tar.gz