diff options
author | Kurt McKee <contactme@kurtmckee.org> | 2020-09-06 07:53:35 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-09-06 14:53:35 +0200 |
commit | 5ec283a3592dfdef7aff34ab00ca8685c4d37470 (patch) | |
tree | dc2f7a3013dcda614bf63e7c9c643924d4f9c4e2 | |
parent | 43c280b18596bf3f8905232083f1239aca6ef9fd (diff) | |
download | pygments-git-5ec283a3592dfdef7aff34ab00ca8685c4d37470.tar.gz |
Overhaul Javascript numeric literals (#1534)
* Rename the "Javascript" tests to reflect that they are for CoffeeScript
This change also modifies the module docstring to reflect the file's purpose.
* Overhaul the Javascript numeric literal parsing
Fixes #307
This patch contains the following changes:
* Adds 50+ unit tests for Javascript numeric literals
* Forces ASCII numbers for float literals (so, now reject `.୪` (U+0B6A))
* Adds support for Javascript's BigInt notation (`100n`)
* Adds support for leading-zero-only octal notation (`0777`)
* Adds support for scientific notation with no significand (`1e10`)
Numeric literal parsing is based on information at:
* https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Grammar_and_types
* https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures
-rw-r--r-- | pygments/lexers/javascript.py | 13 | ||||
-rw-r--r-- | tests/test_coffeescript.py | 85 | ||||
-rw-r--r-- | tests/test_javascript.py | 139 |
3 files changed, 162 insertions, 75 deletions
diff --git a/pygments/lexers/javascript.py b/pygments/lexers/javascript.py index 335af320..14b51ebb 100644 --- a/pygments/lexers/javascript.py +++ b/pygments/lexers/javascript.py @@ -64,11 +64,14 @@ class JavascriptLexer(RegexLexer): (r'\A#! ?/.*?\n', Comment.Hashbang), # recognized by node.js (r'^(?=\s|/|<!--)', Text, 'slashstartsregex'), include('commentsandwhitespace'), - (r'(\.\d+|[0-9]+\.[0-9]*)([eE][-+]?[0-9]+)?', Number.Float), - (r'0[bB][01]+', Number.Bin), - (r'0[oO][0-7]+', Number.Oct), - (r'0[xX][0-9a-fA-F]+', Number.Hex), - (r'[0-9]+', Number.Integer), + + # Numeric literals + (r'0[bB][01]+n?', Number.Bin), + (r'0[oO]?[0-7]+n?', Number.Oct), # Browsers support "0o7" and "07" notations + (r'0[xX][0-9a-fA-F]+n?', Number.Hex), + (r'[0-9]+n', Number.Integer), # Javascript BigInt requires an "n" postfix + (r'(\.[0-9]+|[0-9]+\.[0-9]*|[0-9]+)([eE][-+]?[0-9]+)?', Number.Float), + (r'\.\.\.|=>', Punctuation), (r'\+\+|--|~|&&|\?|:|\|\||\\(?=\n)|' r'(<<|>>>?|==?|!=?|[-<>+*%&|^/])=?', Operator, 'slashstartsregex'), diff --git a/tests/test_coffeescript.py b/tests/test_coffeescript.py new file mode 100644 index 00000000..41ca8e0d --- /dev/null +++ b/tests/test_coffeescript.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +""" + CoffeeScript tests + ~~~~~~~~~~~~~~~~~~ + + :copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS. + :license: BSD, see LICENSE for details. 
+""" + +import pytest + +from pygments.lexers import CoffeeScriptLexer +from pygments.token import Token + +COFFEE_SLASH_GOLDEN = [ + # input_str, slashes_are_regex_here + (r'/\\/', True), + (r'/\\/i', True), + (r'/\//', True), + (r'/(\s)/', True), + ('/a{2,8}/', True), + ('/b*c?d+/', True), + ('/(capture-match)/', True), + ('/(?:do-not-capture-match)/', True), + ('/this|or|that/', True), + ('/[char-set]/', True), + ('/[^neg-char_st]/', True), + ('/^.*$/', True), + (r'/\n(\f)\0\1\d\b\cm\u1234/', True), + (r'/^.?([^/\\\n\w]*)a\1+$/.something(or_other) # something more complex', True), + ("foo = (str) ->\n /'|\"/.test str", True), + ('a = a / b / c', False), + ('a = a/b/c', False), + ('a = a/b/ c', False), + ('a = a /b/c', False), + ('a = 1 + /d/.test(a)', True), +] + + +@pytest.fixture(scope='module') +def lexer(): + yield CoffeeScriptLexer() + + +@pytest.mark.parametrize('golden', COFFEE_SLASH_GOLDEN) +def test_coffee_slashes(lexer, golden): + input_str, slashes_are_regex_here = golden + output = list(lexer.get_tokens(input_str)) + print(output) + for t, s in output: + if '/' in s: + is_regex = t is Token.String.Regex + assert is_regex == slashes_are_regex_here, (t, s) + + +def test_mixed_slashes(lexer): + fragment = u'a?/foo/:1/2;\n' + tokens = [ + (Token.Name.Other, u'a'), + (Token.Operator, u'?'), + (Token.Literal.String.Regex, u'/foo/'), + (Token.Operator, u':'), + (Token.Literal.Number.Integer, u'1'), + (Token.Operator, u'/'), + (Token.Literal.Number.Integer, u'2'), + (Token.Punctuation, u';'), + (Token.Text, u'\n'), + ] + assert list(lexer.get_tokens(fragment)) == tokens + + +def test_beware_infinite_loop(lexer): + # This demonstrates the case that "This isn't really guarding" comment + # refers to. 
+ fragment = '/a/x;\n' + tokens = [ + (Token.Text, ''), + (Token.Operator, '/'), + (Token.Name.Other, 'a'), + (Token.Operator, '/'), + (Token.Name.Other, 'x'), + (Token.Punctuation, ';'), + (Token.Text, '\n'), + ] + assert list(lexer.get_tokens(fragment)) == tokens diff --git a/tests/test_javascript.py b/tests/test_javascript.py index 100e1f22..78350612 100644 --- a/tests/test_javascript.py +++ b/tests/test_javascript.py @@ -9,77 +9,76 @@ import pytest -from pygments.lexers import CoffeeScriptLexer -from pygments.token import Token - -COFFEE_SLASH_GOLDEN = [ - # input_str, slashes_are_regex_here - (r'/\\/', True), - (r'/\\/i', True), - (r'/\//', True), - (r'/(\s)/', True), - ('/a{2,8}/', True), - ('/b*c?d+/', True), - ('/(capture-match)/', True), - ('/(?:do-not-capture-match)/', True), - ('/this|or|that/', True), - ('/[char-set]/', True), - ('/[^neg-char_st]/', True), - ('/^.*$/', True), - (r'/\n(\f)\0\1\d\b\cm\u1234/', True), - (r'/^.?([^/\\\n\w]*)a\1+$/.something(or_other) # something more complex', True), - ("foo = (str) ->\n /'|\"/.test str", True), - ('a = a / b / c', False), - ('a = a/b/c', False), - ('a = a/b/ c', False), - ('a = a /b/c', False), - ('a = 1 + /d/.test(a)', True), -] +from pygments.lexers.javascript import JavascriptLexer +from pygments.token import Number @pytest.fixture(scope='module') def lexer(): - yield CoffeeScriptLexer() - - -@pytest.mark.parametrize('golden', COFFEE_SLASH_GOLDEN) -def test_coffee_slashes(lexer, golden): - input_str, slashes_are_regex_here = golden - output = list(lexer.get_tokens(input_str)) - print(output) - for t, s in output: - if '/' in s: - is_regex = t is Token.String.Regex - assert is_regex == slashes_are_regex_here, (t, s) - - -def test_mixed_slashes(lexer): - fragment = u'a?/foo/:1/2;\n' - tokens = [ - (Token.Name.Other, u'a'), - (Token.Operator, u'?'), - (Token.Literal.String.Regex, u'/foo/'), - (Token.Operator, u':'), - (Token.Literal.Number.Integer, u'1'), - (Token.Operator, u'/'), - 
(Token.Literal.Number.Integer, u'2'), - (Token.Punctuation, u';'), - (Token.Text, u'\n'), - ] - assert list(lexer.get_tokens(fragment)) == tokens - - -def test_beware_infinite_loop(lexer): - # This demonstrates the case that "This isn't really guarding" comment - # refers to. - fragment = '/a/x;\n' - tokens = [ - (Token.Text, ''), - (Token.Operator, '/'), - (Token.Name.Other, 'a'), - (Token.Operator, '/'), - (Token.Name.Other, 'x'), - (Token.Punctuation, ';'), - (Token.Text, '\n'), - ] - assert list(lexer.get_tokens(fragment)) == tokens + yield JavascriptLexer() + + +@pytest.mark.parametrize( + 'text', + ( + '1', '1.', '.1', '1.1', '1e1', '1E1', '1e+1', '1E-1', '1.e1', '.1e1', + '0888', # octal prefix with non-octal numbers + ) +) +def test_float_literal_positive_matches(lexer, text): + """Test literals that should be tokenized as float literals.""" + assert list(lexer.get_tokens(text))[0] == (Number.Float, text) + + +@pytest.mark.parametrize('text', ('.\u0b6a', '.', '1..', '1n', '1ee', '1e', '1e-', '1e--1', '1e++1', '1e1.0')) +def test_float_literals_negative_matches(lexer, text): + """Test text that should **not** be tokenized as float literals.""" + assert list(lexer.get_tokens(text))[0] != (Number.Float, text) + + +@pytest.mark.parametrize('text', ('0n', '123n')) +def test_integer_literal_positive_matches(lexer, text): + """Test literals that should be tokenized as integer literals.""" + assert list(lexer.get_tokens(text))[0] == (Number.Integer, text) + + +@pytest.mark.parametrize('text', ('1N', '1', '1.0')) +def test_integer_literals_negative_matches(lexer, text): + """Test text that should **not** be tokenized as integer literals.""" + assert list(lexer.get_tokens(text))[0] != (Number.Integer, text) + + +@pytest.mark.parametrize('text', ('0b01', '0B10n')) +def test_binary_literal_positive_matches(lexer, text): + """Test literals that should be tokenized as binary literals.""" + assert list(lexer.get_tokens(text))[0] == (Number.Bin, text) + + 
+@pytest.mark.parametrize('text', ('0b0N', '0b', '0bb', '0b2')) +def test_binary_literals_negative_matches(lexer, text): + """Test text that should **not** be tokenized as binary literals.""" + assert list(lexer.get_tokens(text))[0] != (Number.Bin, text) + + +@pytest.mark.parametrize('text', ('017', '071n', '0o11', '0O77n')) +def test_octal_literal_positive_matches(lexer, text): + """Test literals that should be tokenized as octal literals.""" + assert list(lexer.get_tokens(text))[0] == (Number.Oct, text) + + +@pytest.mark.parametrize('text', ('01N', '089', '098n', '0o', '0OO', '0o88', '0O88n')) +def test_octal_literals_negative_matches(lexer, text): + """Test text that should **not** be tokenized as octal literals.""" + assert list(lexer.get_tokens(text))[0] != (Number.Oct, text) + + +@pytest.mark.parametrize('text', ('0x01', '0Xefn', '0x0EF')) +def test_hexadecimal_literal_positive_matches(lexer, text): + """Test literals that should be tokenized as hexadecimal literals.""" + assert list(lexer.get_tokens(text))[0] == (Number.Hex, text) + + +@pytest.mark.parametrize('text', ('0x0N', '0x', '0Xx', '0xg', '0xhn')) +def test_hexadecimal_literals_negative_matches(lexer, text): + """Test text that should **not** be tokenized as hexadecimal literals.""" + assert list(lexer.get_tokens(text))[0] != (Number.Hex, text) |