summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOleg Grenrus <oleg.grenrus@iki.fi>2021-02-14 13:33:21 +0200
committerMarge Bot <ben+marge-bot@smart-cactus.org>2021-04-05 20:38:07 -0400
commit9c9adbd0ffe6ecc37d3a565811d8e79f24383943 (patch)
tree6beab45ec5caf3e406c3aecb7dfb7f4361d3fb05
parent918d5021ad74a6b7e5e027df2f7d7605d833b486 (diff)
downloadhaskell-9c9adbd0ffe6ecc37d3a565811d8e79f24383943.tar.gz
Implement proposal 403: Lexer cleanup
This allows Letter Numbers to be used in identifiers, and also documents other, already existing lexer divergences from the Haskell Report
-rw-r--r--compiler/GHC/Parser/Lexer.x14
-rw-r--r--docs/users_guide/bugs.rst21
-rw-r--r--testsuite/tests/parser/unicode/T18158.hs15
-rw-r--r--testsuite/tests/parser/unicode/T18158b.hs15
-rw-r--r--testsuite/tests/parser/unicode/T18158b.stderr2
-rw-r--r--testsuite/tests/parser/unicode/all.T3
6 files changed, 63 insertions, 7 deletions
diff --git a/compiler/GHC/Parser/Lexer.x b/compiler/GHC/Parser/Lexer.x
index eec5171eb8..9fa6d05cf0 100644
--- a/compiler/GHC/Parser/Lexer.x
+++ b/compiler/GHC/Parser/Lexer.x
@@ -131,7 +131,7 @@ $tab = \t
$ascdigit = 0-9
$unidigit = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex].
-$decdigit = $ascdigit -- for now, should really be $digit (ToDo)
+$decdigit = $ascdigit -- exactly $ascdigit, no more no less.
$digit = [$ascdigit $unidigit]
$special = [\(\)\,\;\[\]\`\{\}]
@@ -147,17 +147,17 @@ $unismall = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex].
$ascsmall = [a-z]
$small = [$ascsmall $unismall \_]
+$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex].
+$idchar = [$small $large $digit $uniidchar \']
+
$unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex].
-$graphic = [$small $large $symbol $digit $special $unigraphic \"\']
+$graphic = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
$binit = 0-1
$octit = 0-7
$hexit = [$decdigit A-F a-f]
-$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex].
-$idchar = [$small $large $digit $uniidchar \']
-
-$pragmachar = [$small $large $digit]
+$pragmachar = [$small $large $digit $uniidchar ]
$docsym = [\| \^ \* \$]
@@ -2521,7 +2521,7 @@ adjustChar c = fromIntegral $ ord adj_c
SpacingCombiningMark -> other_graphic
EnclosingMark -> other_graphic
DecimalNumber -> digit
- LetterNumber -> other_graphic
+ LetterNumber -> digit
OtherNumber -> digit -- see #4373
ConnectorPunctuation -> symbol
DashPunctuation -> symbol
diff --git a/docs/users_guide/bugs.rst b/docs/users_guide/bugs.rst
index 2b533fa42f..df34f186e4 100644
--- a/docs/users_guide/bugs.rst
+++ b/docs/users_guide/bugs.rst
@@ -93,6 +93,27 @@ Lexical syntax
See `GHC Proposal #229 <https://github.com/ghc-proposals/ghc-proposals/blob/master/proposals/0229-whitespace-bang-patterns.rst>`__
for the precise rules.
+- The Haskell Report allows any Unicode Decimal Number in decimal literals.
+ However, GHC accepts only ASCII digits::
+
+ ascDigit → 0 | 1 | … | 9
+ decimal → ascDigit {ascDigit}
+
+- GHC is more lenient about which characters are allowed in identifiers.
+ Unicode Other Letters are considered to be small letters,
+ therefore variable identifiers can begin with them.
+ The digit class contains all Unicode numbers instead of just Decimal Numbers.
+ Modifier Letters and Non-Spacing Marks can appear in the tail
+ of identifiers::
+
+ uniSmall → any Unicode Lowercase Letter or Other Letter
+ uniDigit → any Unicode Decimal Number, Letter Number or Other Number
+
+ uniIdchar → any Unicode Modifier Letter or Non-Spacing Mark
+ idchar → small | large | digit | uniIdchar | '
+
+ varid → small {idchar} ⟨reservedid⟩
+ conid → large {idchar}
.. _infelicities-syntax:
diff --git a/testsuite/tests/parser/unicode/T18158.hs b/testsuite/tests/parser/unicode/T18158.hs
new file mode 100644
index 0000000000..510ba858c5
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T18158.hs
@@ -0,0 +1,15 @@
+main = print nⅯⅯⅩⅩ
+ where nⅯⅯⅩⅩ = 11
+
+-- The characters in ⅯⅯⅩⅩ are in the LetterNumber Unicode category.
+-- We now allow them to be used in identifiers, but they
+-- are not lower or upper case, so they cannot be the first character.
+--
+-- Just like 'OtherNumber' (#4373), 'ModifierLetter' (#10196) and
+-- NonSpacingMark (#7650).
+--
+-- > map generalCategory "ⅯⅯⅩⅩ"
+-- [LetterNumber,LetterNumber,LetterNumber,LetterNumber]
+--
+-- > map show "ⅯⅯⅩⅩ"
+-- ["'\\8559'","'\\8559'","'\\8553'","'\\8553'"]
diff --git a/testsuite/tests/parser/unicode/T18158b.hs b/testsuite/tests/parser/unicode/T18158b.hs
new file mode 100644
index 0000000000..0bd8780b31
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T18158b.hs
@@ -0,0 +1,15 @@
+main = print ⅯⅯⅩⅩ
+ where ⅯⅯⅩⅩ = 11
+
+-- The characters in ⅯⅯⅩⅩ are in the LetterNumber Unicode category.
+-- We now allow them to be used in identifiers, but they
+-- are not lower or upper case, so they cannot be the first character.
+--
+-- Just like 'OtherNumber' (#4373), 'ModifierLetter' (#10196) and
+-- NonSpacingMark (#7650).
+--
+-- > map generalCategory "ⅯⅯⅩⅩ"
+-- [LetterNumber,LetterNumber,LetterNumber,LetterNumber]
+--
+-- > map show "ⅯⅯⅩⅩ"
+-- ["'\\8559'","'\\8559'","'\\8553'","'\\8553'"]
diff --git a/testsuite/tests/parser/unicode/T18158b.stderr b/testsuite/tests/parser/unicode/T18158b.stderr
new file mode 100644
index 0000000000..a2148b3908
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T18158b.stderr
@@ -0,0 +1,2 @@
+
+T18158b.hs:1:14: error: lexical error at character '\8559'
diff --git a/testsuite/tests/parser/unicode/all.T b/testsuite/tests/parser/unicode/all.T
index 54a3b7cb1f..c854ea41c4 100644
--- a/testsuite/tests/parser/unicode/all.T
+++ b/testsuite/tests/parser/unicode/all.T
@@ -30,3 +30,6 @@ test('T7650', normal, compile, [''])
test('brackets', normal, compile, [''])
test('T18225A', normal, compile, [''])
test('T18225B', normal, compile_fail, [''])
+
+test('T18158', normal, compile, [''])
+test('T18158b', normal, compile_fail, [''])