Implement proposal 403: Lexer cleanup

This allows Letter Numbers to be used in identifiers (as Other Numbers already are), and also documents other, already existing lexer divergences from the Haskell Report.
author: Oleg Grenrus <oleg.grenrus@iki.fi> 2021-02-14 13:33:21 +0200
committer: Marge Bot <ben+marge-bot@smart-cactus.org> 2021-04-05 20:38:07 -0400
commit: 9c9adbd0ffe6ecc37d3a565811d8e79f24383943 (patch)
tree: 6beab45ec5caf3e406c3aecb7dfb7f4361d3fb05
parent: 918d5021ad74a6b7e5e027df2f7d7605d833b486 (diff)
download: haskell-9c9adbd0ffe6ecc37d3a565811d8e79f24383943.tar.gz
6 files changed, 63 insertions, 7 deletions
diff --git a/compiler/GHC/Parser/Lexer.x b/compiler/GHC/Parser/Lexer.x
index eec5171eb8..9fa6d05cf0 100644
--- a/compiler/GHC/Parser/Lexer.x
+++ b/compiler/GHC/Parser/Lexer.x
@@ -131,7 +131,7 @@ $tab         = \t
 
 $ascdigit  = 0-9
 $unidigit  = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex].
-$decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
+$decdigit  = $ascdigit -- exactly $ascdigit, no more no less.
 $digit     = [$ascdigit $unidigit]
 
 $special   = [\(\)\,\;\[\]\`\{\}]
@@ -147,17 +147,17 @@ $unismall  = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $ascsmall  = [a-z]
 $small     = [$ascsmall $unismall \_]
 
+$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex].
+$idchar    = [$small $large $digit $uniidchar \']
+
 $unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex].
-$graphic   = [$small $large $symbol $digit $special $unigraphic \"\']
+$graphic   = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
 
 $binit     = 0-1
 $octit     = 0-7
 $hexit     = [$decdigit A-F a-f]
 
-$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex].
-$idchar    = [$small $large $digit $uniidchar \']
-
-$pragmachar = [$small $large $digit]
+$pragmachar = [$small $large $digit $uniidchar ]
 
 $docsym    = [\| \^ \* \$]
 
@@ -2521,7 +2521,7 @@ adjustChar c = fromIntegral $ ord adj_c
                   SpacingCombiningMark  -> other_graphic
                   EnclosingMark         -> other_graphic
                   DecimalNumber         -> digit
-                  LetterNumber          -> other_graphic
+                  LetterNumber          -> digit
                   OtherNumber           -> digit -- see #4373
                   ConnectorPunctuation  -> symbol
                   DashPunctuation       -> symbol
diff --git a/docs/users_guide/bugs.rst b/docs/users_guide/bugs.rst
index 2b533fa42f..df34f186e4 100644
--- a/docs/users_guide/bugs.rst
+++ b/docs/users_guide/bugs.rst
@@ -93,6 +93,27 @@ Lexical syntax
    See `GHC Proposal #229 <https://github.com/ghc-proposals/ghc-proposals/blob/master/proposals/0229-whitespace-bang-patterns.rst>`__
    for the precise rules.
 
+- The Haskell Report allows any Unicode Decimal Number in decimal literals.
+  However, GHC accepts only ASCII digits::
+
+     ascDigit    →   0 | 1 | … | 9
+     decimal     →   ascDigit {ascDigit}
+
+- GHC is more lenient in which characters are allowed in identifiers.
+  Unicode Other Letters are considered to be small letters,
+  therefore variable identifiers can begin with them.
+  The digit class contains all Unicode numbers instead of just Decimal Numbers.
+  Modifier Letters and Non-Spacing Marks can appear in the tail
+  of identifiers::
+
+     uniSmall    →   any Unicode Lowercase Letter or Other Letter
+     uniDigit    →   any Unicode Decimal Number, Letter Number or Other Number
+
+     uniIdchar   →   any Unicode Modifier Letter or Non-Spacing Mark
+     idchar      →   small | large | digit | uniIdchar | '
+
+     varid       →   small {idchar} ⟨reservedid⟩
+     conid       →   large {idchar}
 
 .. _infelicities-syntax:
 
diff --git a/testsuite/tests/parser/unicode/T18158.hs b/testsuite/tests/parser/unicode/T18158.hs
new file mode 100644
index 0000000000..510ba858c5
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T18158.hs
@@ -0,0 +1,15 @@
+main = print nⅯⅯⅩⅩ
+    where nⅯⅯⅩⅩ = 11
+
+-- The ⅯⅯⅩⅩ characters are in the LetterNumber Unicode category.
+-- We now allow them to be used in identifiers, but they are neither
+-- lower nor upper case, so they cannot be the first character.
+--
+-- Just like 'OtherNumber' (#4373), 'ModifierLetter' (#10196) and
+-- 'NonSpacingMark' (#7650).
+--
+-- > map generalCategory "ⅯⅯⅩⅩ"
+-- [LetterNumber,LetterNumber,LetterNumber,LetterNumber]
+--
+-- > map show "ⅯⅯⅩⅩ"
+-- ["'\\8559'","'\\8559'","'\\8553'","'\\8553'"]
diff --git a/testsuite/tests/parser/unicode/T18158b.hs b/testsuite/tests/parser/unicode/T18158b.hs
new file mode 100644
index 0000000000..0bd8780b31
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T18158b.hs
@@ -0,0 +1,15 @@
+main = print ⅯⅯⅩⅩ
+    where ⅯⅯⅩⅩ = 11
+
+-- The ⅯⅯⅩⅩ characters are in the LetterNumber Unicode category.
+-- We now allow them to be used in identifiers, but they are neither
+-- lower nor upper case, so they cannot be the first character.
+--
+-- Just like 'OtherNumber' (#4373), 'ModifierLetter' (#10196) and
+-- 'NonSpacingMark' (#7650).
+--
+-- > map generalCategory "ⅯⅯⅩⅩ"
+-- [LetterNumber,LetterNumber,LetterNumber,LetterNumber]
+--
+-- > map show "ⅯⅯⅩⅩ"
+-- ["'\\8559'","'\\8559'","'\\8553'","'\\8553'"]
diff --git a/testsuite/tests/parser/unicode/T18158b.stderr b/testsuite/tests/parser/unicode/T18158b.stderr
new file mode 100644
index 0000000000..a2148b3908
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T18158b.stderr
@@ -0,0 +1,2 @@
+
+T18158b.hs:1:14: error: lexical error at character '\8559'
diff --git a/testsuite/tests/parser/unicode/all.T b/testsuite/tests/parser/unicode/all.T
index 54a3b7cb1f..c854ea41c4 100644
--- a/testsuite/tests/parser/unicode/all.T
+++ b/testsuite/tests/parser/unicode/all.T
@@ -30,3 +30,6 @@ test('T7650', normal, compile, [''])
 test('brackets', normal, compile, [''])
 test('T18225A', normal, compile, [''])
 test('T18225B', normal, compile_fail, [''])
+
+test('T18158', normal, compile, [''])
+test('T18158b', normal, compile_fail, [''])
author	Oleg Grenrus <oleg.grenrus@iki.fi>	2021-02-14 13:33:21 +0200
committer	Marge Bot <ben+marge-bot@smart-cactus.org>	2021-04-05 20:38:07 -0400
commit	9c9adbd0ffe6ecc37d3a565811d8e79f24383943 (patch)
tree	6beab45ec5caf3e406c3aecb7dfb7f4361d3fb05
parent	918d5021ad74a6b7e5e027df2f7d7605d833b486 (diff)
download	haskell-9c9adbd0ffe6ecc37d3a565811d8e79f24383943.tar.gz