diff options
author | Oleg Grenrus <oleg.grenrus@iki.fi> | 2021-02-14 13:33:21 +0200 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2021-04-05 20:38:07 -0400 |
commit | 9c9adbd0ffe6ecc37d3a565811d8e79f24383943 (patch) | |
tree | 6beab45ec5caf3e406c3aecb7dfb7f4361d3fb05 | |
parent | 918d5021ad74a6b7e5e027df2f7d7605d833b486 (diff) | |
download | haskell-9c9adbd0ffe6ecc37d3a565811d8e79f24383943.tar.gz |
Implement proposal 403: Lexer cleanup
This allows Unicode Letter Numbers to be used in identifiers (as Other Numbers already are),
and also documents other, already existing lexer divergences from the Haskell Report
-rw-r--r-- | compiler/GHC/Parser/Lexer.x | 14 | ||||
-rw-r--r-- | docs/users_guide/bugs.rst | 21 | ||||
-rw-r--r-- | testsuite/tests/parser/unicode/T18158.hs | 15 | ||||
-rw-r--r-- | testsuite/tests/parser/unicode/T18158b.hs | 15 | ||||
-rw-r--r-- | testsuite/tests/parser/unicode/T18158b.stderr | 2 | ||||
-rw-r--r-- | testsuite/tests/parser/unicode/all.T | 3 |
6 files changed, 63 insertions, 7 deletions
diff --git a/compiler/GHC/Parser/Lexer.x b/compiler/GHC/Parser/Lexer.x index eec5171eb8..9fa6d05cf0 100644 --- a/compiler/GHC/Parser/Lexer.x +++ b/compiler/GHC/Parser/Lexer.x @@ -131,7 +131,7 @@ $tab = \t $ascdigit = 0-9 $unidigit = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex]. -$decdigit = $ascdigit -- for now, should really be $digit (ToDo) +$decdigit = $ascdigit -- exactly $ascdigit, no more no less. $digit = [$ascdigit $unidigit] $special = [\(\)\,\;\[\]\`\{\}] @@ -147,17 +147,17 @@ $unismall = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex]. $ascsmall = [a-z] $small = [$ascsmall $unismall \_] +$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex]. +$idchar = [$small $large $digit $uniidchar \'] + $unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex]. -$graphic = [$small $large $symbol $digit $special $unigraphic \"\'] +$graphic = [$small $large $symbol $digit $idchar $special $unigraphic \"\'] $binit = 0-1 $octit = 0-7 $hexit = [$decdigit A-F a-f] -$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex]. -$idchar = [$small $large $digit $uniidchar \'] - -$pragmachar = [$small $large $digit] +$pragmachar = [$small $large $digit $uniidchar ] $docsym = [\| \^ \* \$] @@ -2521,7 +2521,7 @@ adjustChar c = fromIntegral $ ord adj_c SpacingCombiningMark -> other_graphic EnclosingMark -> other_graphic DecimalNumber -> digit - LetterNumber -> other_graphic + LetterNumber -> digit OtherNumber -> digit -- see #4373 ConnectorPunctuation -> symbol DashPunctuation -> symbol diff --git a/docs/users_guide/bugs.rst b/docs/users_guide/bugs.rst index 2b533fa42f..df34f186e4 100644 --- a/docs/users_guide/bugs.rst +++ b/docs/users_guide/bugs.rst @@ -93,6 +93,27 @@ Lexical syntax See `GHC Proposal #229 <https://github.com/ghc-proposals/ghc-proposals/blob/master/proposals/0229-whitespace-bang-patterns.rst>`__ for the precise rules. 
+- The Haskell Report allows any Unicode Decimal Number in decimal literals. + However, GHC accepts only ASCII digits:: + + ascDigit → 0 | 1 | … | 9 + decimal → ascDigit {ascDigit} + +- GHC is more lenient in which characters are allowed in identifiers. + Unicode Other Letters are considered to be small letters, + therefore variable identifiers can begin with them. + The digit class contains all Unicode numbers instead of just Decimal Numbers. + Modifier Letters and Non-Spacing Marks can appear in the tail + of identifiers:: + + uniSmall → any Unicode Lowercase Letter or Other Letter + uniDigit → any Unicode Decimal Number, Letter Number or Other Number + + uniIdchar → any Unicode Modifier Letter or Non-Spacing Mark + idchar → small | large | digit | uniIdchar | ' + + varid → small {idchar} ⟨reservedid⟩ + conid → large {idchar} .. _infelicities-syntax: diff --git a/testsuite/tests/parser/unicode/T18158.hs b/testsuite/tests/parser/unicode/T18158.hs new file mode 100644 index 0000000000..510ba858c5 --- /dev/null +++ b/testsuite/tests/parser/unicode/T18158.hs @@ -0,0 +1,15 @@ +main = print nⅯⅯⅩⅩ + where nⅯⅯⅩⅩ = 11 + +-- The characters ⅯⅯⅩⅩ are in the LetterNumber Unicode category. +-- We now allow them to be used in identifiers, but they +-- are neither lower nor upper case, so they cannot be the first character. +-- +-- Just like 'OtherNumber' (#4373), 'ModifierLetter' (#10196) and +-- 'NonSpacingMark' (#7650). +-- +-- > map generalCategory "ⅯⅯⅩⅩ" +-- [LetterNumber,LetterNumber,LetterNumber,LetterNumber] +-- +-- > map show "ⅯⅯⅩⅩ" +-- ["'\\8559'","'\\8559'","'\\8553'","'\\8553'"] diff --git a/testsuite/tests/parser/unicode/T18158b.hs b/testsuite/tests/parser/unicode/T18158b.hs new file mode 100644 index 0000000000..0bd8780b31 --- /dev/null +++ b/testsuite/tests/parser/unicode/T18158b.hs @@ -0,0 +1,15 @@ +main = print ⅯⅯⅩⅩ + where ⅯⅯⅩⅩ = 11 + +-- The characters ⅯⅯⅩⅩ are in the LetterNumber Unicode category. 
+-- We now allow them to be used in identifiers, but they +-- are neither lower nor upper case, so they cannot be the first character. +-- +-- Just like 'OtherNumber' (#4373), 'ModifierLetter' (#10196) and +-- 'NonSpacingMark' (#7650). +-- +-- > map generalCategory "ⅯⅯⅩⅩ" +-- [LetterNumber,LetterNumber,LetterNumber,LetterNumber] +-- +-- > map show "ⅯⅯⅩⅩ" +-- ["'\\8559'","'\\8559'","'\\8553'","'\\8553'"] diff --git a/testsuite/tests/parser/unicode/T18158b.stderr b/testsuite/tests/parser/unicode/T18158b.stderr new file mode 100644 index 0000000000..a2148b3908 --- /dev/null +++ b/testsuite/tests/parser/unicode/T18158b.stderr @@ -0,0 +1,2 @@ + +T18158b.hs:1:14: error: lexical error at character '\8559' diff --git a/testsuite/tests/parser/unicode/all.T b/testsuite/tests/parser/unicode/all.T index 54a3b7cb1f..c854ea41c4 100644 --- a/testsuite/tests/parser/unicode/all.T +++ b/testsuite/tests/parser/unicode/all.T @@ -30,3 +30,6 @@ test('T7650', normal, compile, ['']) test('brackets', normal, compile, ['']) test('T18225A', normal, compile, ['']) test('T18225B', normal, compile_fail, ['']) + +test('T18158', normal, compile, ['']) +test('T18158b', normal, compile_fail, ['']) |