summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Thomas Miedema <thomasmiedema@gmail.com> 2016-02-20 23:50:28 +0100
committer Thomas Miedema <thomasmiedema@gmail.com> 2016-02-23 12:28:05 +0100
commit 2aee41960aa00fe09a2cd1983e02c15e06013037 (patch)
tree 1b7ef718063f4322ebdbd79eb941a0a3b6af4ecd
parent 661aa07ed1b133a5ba1ae90525115f8aca0ac92b (diff)
download haskell-2aee41960aa00fe09a2cd1983e02c15e06013037.tar.gz
Allow combining characters in identifiers (#7650)
Reviewed by: austin, rwbarton

Differential Revision: https://phabricator.haskell.org/D1938
-rw-r--r-- compiler/basicTypes/Lexeme.hs 5
-rw-r--r-- compiler/parser/Lexer.x 12
-rw-r--r-- testsuite/tests/parser/unicode/T7650.hs 11
-rw-r--r-- testsuite/tests/parser/unicode/T7650.stdout 1
-rw-r--r-- testsuite/tests/parser/unicode/all.T 1
5 files changed, 22 insertions, 8 deletions
diff --git a/compiler/basicTypes/Lexeme.hs b/compiler/basicTypes/Lexeme.hs
index 9e75376dae..22515c172c 100644
--- a/compiler/basicTypes/Lexeme.hs
+++ b/compiler/basicTypes/Lexeme.hs
@@ -194,9 +194,10 @@ okIdChar c = case generalCategory c of
LowercaseLetter -> True
TitlecaseLetter -> True
ModifierLetter -> True -- See #10196
- OtherLetter -> True
+ OtherLetter -> True -- See #1103
+ NonSpacingMark -> True -- See #7650
DecimalNumber -> True
- OtherNumber -> True
+ OtherNumber -> True -- See #4373
_ -> c == '\'' || c == '_'
-- | Is this character acceptable in a symbol (after the first char)?
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 5f3bdee5fa..3f959f2a03 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -155,8 +155,8 @@ $binit = 0-1
$octit = 0-7
$hexit = [$decdigit A-F a-f]
-$modifier = \x07 -- Trick Alex into handling Unicode. See alexGetByte.
-$idchar = [$small $large $digit $modifier \']
+$uniidchar = \x07 -- Trick Alex into handling Unicode. See alexGetByte.
+$idchar = [$small $large $digit $uniidchar \']
$pragmachar = [$small $large $digit]
@@ -1874,10 +1874,10 @@ alexGetByte (AI loc s)
symbol = '\x04'
space = '\x05'
other_graphic = '\x06'
- modifier = '\x07'
+ uniidchar = '\x07'
adj_c
- | c <= '\x06' = non_graphic
+ | c <= '\x07' = non_graphic
| c <= '\x7f' = c
-- Alex doesn't handle Unicode, so when Unicode
-- character is encountered we output these values
@@ -1891,9 +1891,9 @@ alexGetByte (AI loc s)
UppercaseLetter -> upper
LowercaseLetter -> lower
TitlecaseLetter -> upper
- ModifierLetter -> modifier -- see #10196
+ ModifierLetter -> uniidchar -- see #10196
OtherLetter -> lower -- see #1103
- NonSpacingMark -> other_graphic
+ NonSpacingMark -> uniidchar -- see #7650
SpacingCombiningMark -> other_graphic
EnclosingMark -> other_graphic
DecimalNumber -> digit
diff --git a/testsuite/tests/parser/unicode/T7650.hs b/testsuite/tests/parser/unicode/T7650.hs
new file mode 100644
index 0000000000..c474bc0645
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T7650.hs
@@ -0,0 +1,11 @@
+main = print spın̈alTap
+ where spın̈alTap = 11
+
+-- n̈ is a combining character sequence. We now allow it to be used in
+-- identifiers (#7650).
+--
+-- > map generalCategory "n̈"
+-- [LowercaseLetter,NonSpacingMark]
+--
+-- > map show "n̈"
+-- ["'n'","'\776'"]
diff --git a/testsuite/tests/parser/unicode/T7650.stdout b/testsuite/tests/parser/unicode/T7650.stdout
new file mode 100644
index 0000000000..b4de394767
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T7650.stdout
@@ -0,0 +1 @@
+11
diff --git a/testsuite/tests/parser/unicode/all.T b/testsuite/tests/parser/unicode/all.T
index 6972a0d602..36554cc143 100644
--- a/testsuite/tests/parser/unicode/all.T
+++ b/testsuite/tests/parser/unicode/all.T
@@ -25,3 +25,4 @@ test('T7671', normal, compile, [''])
# TODO: This test ought to be run in a non-UTF8 locale, but this is not yet
# supported by the test suite (see 10907)
test('T10907', normal, compile, [''])
+test('T7650', normal, compile_and_run, [''])  # run it so T7650.stdout ("11") is actually checked