author     Kurt McKee <contactme@kurtmckee.org>     2020-09-06 07:40:17 -0500
committer  GitHub <noreply@github.com>              2020-09-06 14:40:17 +0200
commit     b3f1691125bf611cc073fa0ad3303518ae04c094 (patch)
tree       b91a4a346eee96937084076ab5c2ec4656779073 /tests/test_mysql.py
parent     40baa94a6bf0c62be8c8b03a942116869ce80128 (diff)
Overhaul the MySQL lexer (#1527)
* Overhaul the MySQL lexer
Fixes #975, #1063, #1453
Changes include:
Documentation
-------------
* Note in the lexer docstring that Oracle MySQL is the target syntax.
MariaDB syntax is not a target (though there is significant overlap).
Unit tests
----------
* Add 140 unit tests for MySQL.
Literals
--------
* Hexadecimal/binary/date/time/timestamp literals are supported.
* Scientific notation with an integer mantissa (e.g. '1e10') is supported.
* In-string escapes are now tokenized properly.
* Support the "unknown" constant.
Comments
--------
* Optimizer hints are now supported, and hint keywords are
recognized and tokenized as preprocessor instructions.
* Remove support for nested multi-line comments, which MySQL
no longer allows.
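A quick sketch of the hint behavior, mirroring the tests below:

    from pygments.lexers.sql import MySqlLexer
    from pygments.token import Comment

    lexer = MySqlLexer()

    # A hint keyword inside '/*+ ... */' surfaces as a preprocessor token.
    assert (Comment.Preproc, 'BKA') in lexer.get_tokens('/*+ BKA(), */')

    # The same word in an ordinary comment stays a plain comment.
    assert (Comment.Preproc, 'BKA') not in lexer.get_tokens('/* BKA */')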
Variables
---------
* Support the '@' prefix for variable names.
* Lift restrictions on characters in unquoted variable names.
(MySQL does not impose a restriction on leading characters.)
* Support single/double/backtick-quoted variable names, including escapes.
* Support the '@@' prefix for system variable names.
* Support '?' as a variable so people can demonstrate prepared statements.
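A minimal sketch based on the tests below; all of these lex as
Name.Variable tokens:

    from pygments.lexers.sql import MySqlLexer
    from pygments.token import Name

    lexer = MySqlLexer()

    for text in ('@a', '@@session.auto_increment_offset', '?'):
        tokens = list(lexer.get_tokens(text))
        # Everything except the trailing newline token is the variable.
        assert all(t[0] == Name.Variable for t in tokens[:-1])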
Keywords
--------
* Keyword / data type / function lists are now in a separate, auto-updating file.
* Support 25 additional data types (including spatial and JSON types).
* Support 460 additional MySQL keywords.
* Support 372 MySQL functions.
Explicit function support resolves a bug that caused non-function
items to be treated as functions simply because they were followed
by an opening parenthesis.
* Support exceptions for the 'SET' keyword, which is either a data type
or a keyword depending on context.
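A sketch of the context-sensitive behavior, mirroring the tests below:

    from pygments.lexers.sql import MySqlLexer
    from pygments.token import Keyword, Name

    lexer = MySqlLexer()

    # 'SET' lexes as a keyword in a SET statement...
    assert list(lexer.get_tokens('SET @abc = 1;'))[0][0] == Keyword

    # ...but as a data type when followed by a parenthesized value list.
    assert list(lexer.get_tokens('SET("r", "g", "b")'))[0][0] == Keyword.Type

    # Known functions are recognized only when an opening parenthesis follows.
    assert list(lexer.get_tokens('IFNULL('))[0] == (Name.Function, 'IFNULL')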
Schema object names
-------------------
* Support Unicode in MySQL schema object names.
* Support parsing of backtick-quoted schema object name escapes.
(Escapes do not produce a distinct token type at this time.)
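For example (mirroring the tests below):

    from pygments.lexers.sql import MySqlLexer
    from pygments.token import Name

    lexer = MySqlLexer()

    # Unicode and backtick-quoted names (including `` escapes) lex as Name.
    for text in ('上市年限', '`select`', '````'):
        tokens = list(lexer.get_tokens(text))[:-1]  # drop trailing newline
        assert all(token[0] == Name for token in tokens)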
Operators
---------
* Remove non-operator characters from the list of operators.
* Remove non-punctuation characters from the list of punctuation.
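A short sketch of the resulting behavior, mirroring the tests below:

    from pygments.lexers.sql import MySqlLexer
    from pygments.token import Operator

    lexer = MySqlLexer()

    # JSON-path and assignment operators lex as Operator tokens.
    for text in ('->>', ':='):
        assert list(lexer.get_tokens(text))[0] == (Operator, text)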
* Cleanup items based on feedback
* Remove an unnecessary optional newline lookahead for single-line comments
Diffstat (limited to 'tests/test_mysql.py')
-rw-r--r-- | tests/test_mysql.py | 249
1 file changed, 249 insertions(+), 0 deletions(-)
diff --git a/tests/test_mysql.py b/tests/test_mysql.py
new file mode 100644
index 00000000..9b5e2b8c
--- /dev/null
+++ b/tests/test_mysql.py
@@ -0,0 +1,249 @@
+# -*- coding: utf-8 -*-
+"""
+    Pygments MySQL lexer tests
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    :copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import pytest
+
+from pygments.lexers.sql import MySqlLexer
+
+from pygments.token import \
+    Comment, \
+    Keyword, \
+    Literal, \
+    Name, \
+    Number, \
+    Operator, \
+    Punctuation, \
+    String, \
+    Text
+
+
+@pytest.fixture(scope='module')
+def lexer():
+    yield MySqlLexer()
+
+
+@pytest.mark.parametrize('text', ('123',))
+def test_integer_literals(lexer, text):
+    assert list(lexer.get_tokens(text))[0] == (Number.Integer, text)
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        '.123', '1.23', '123.',
+        '1e10', '1.0e10', '1.e-10', '.1e+10',
+    ),
+)
+def test_float_literals(lexer, text):
+    assert list(lexer.get_tokens(text))[0] == (Number.Float, text)
+
+
+@pytest.mark.parametrize('text', ("X'0af019'", "x'0AF019'", "0xaf019"))
+def test_hexadecimal_literals(lexer, text):
+    assert list(lexer.get_tokens(text))[0] == (Number.Hex, text)
+
+
+@pytest.mark.parametrize('text', ("B'010'", "b'010'", "0b010"))
+def test_binary_literals(lexer, text):
+    assert list(lexer.get_tokens(text))[0] == (Number.Bin, text)
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        "{d'2020-01-01'}", "{ d ' 2020^01@01 ' }",
+        "{t'8 9:10:11'}", "{ t ' 09:10:11.12 ' }", "{ t ' 091011 ' }",
+        '{ts"2020-01-01 09:10:11"}', "{ ts ' 2020@01/01 09:10:11 ' }",
+    ),
+)
+def test_temporal_literals(lexer, text):
+    assert list(lexer.get_tokens(text))[0] == (Literal.Date, text)
+
+
+@pytest.mark.parametrize(
+    'text, expected_types',
+    (
+        (r"'a'", (String.Single,) * 3),
+        (r"""'""'""", (String.Single,) * 3),
+        (r"''''", (String.Single, String.Escape, String.Single)),
+        (r"'\''", (String.Single, String.Escape, String.Single)),
+        (r'"a"', (String.Double,) * 3),
+        (r'''"''"''', (String.Double,) * 3),
+        (r'""""', (String.Double, String.Escape, String.Double)),
+        (r'"\""', (String.Double, String.Escape, String.Double)),
+    ),
+)
+def test_string_literals(lexer, text, expected_types):
+    tokens = list(lexer.get_tokens(text))[:len(expected_types)]
+    assert all(t[0] == e for t, e in zip(tokens, expected_types))
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        "@a", "@1", "@._.$",
+        "@'?'", """@'abc''def"`ghi'""",
+        '@"#"', '''@"abc""def'`ghi"''',
+        '@`^`', """@`abc``def'"ghi`""",
+        "@@timestamp",
+        "@@session.auto_increment_offset",
+        "@@global.auto_increment_offset",
+        "@@persist.auto_increment_offset",
+        "@@persist_only.auto_increment_offset",
+        '?',
+    ),
+)
+def test_variables(lexer, text):
+    tokens = list(lexer.get_tokens(text))
+    assert all(t[0] == Name.Variable for t in tokens[:-1])
+    assert ''.join([t[1] for t in tokens]).strip() == text.strip()
+
+
+@pytest.mark.parametrize('text', ('true', 'false', 'null', 'unknown'))
+def test_constants(lexer, text):
+    assert list(lexer.get_tokens(text))[0] == (Name.Constant, text)
+
+
+@pytest.mark.parametrize('text', ('-- abc', '--\tabc', '#abc'))
+def test_comments_single_line(lexer, text):
+    # Test the standalone comment.
+    tokens = list(lexer.get_tokens(text))
+    assert tokens[0] == (Comment.Single, text)
+
+    # Test the comment with mixed tokens.
+    tokens = list(lexer.get_tokens('select' + text + '\nselect'))
+    assert tokens[0] == (Keyword, 'select')
+    assert tokens[1] == (Comment.Single, text)
+    assert tokens[-2] == (Keyword, 'select')
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        '/**/a', '/*a*b/c*/a', '/*\nabc\n*/a',
+        '/* /* */a'
+    )
+)
+def test_comments_multi_line(lexer, text):
+    tokens = list(lexer.get_tokens(text))
+    assert all(token[0] == Comment.Multiline for token in tokens[:-2])
+    assert ''.join(token[1] for token in tokens).strip() == text.strip()
+
+    # Validate nested comments are not supported.
+    assert tokens[-2][0] != Comment.Multiline
+
+
+@pytest.mark.parametrize(
+    'text', ('BKA', 'SEMIJOIN'))
+def test_optimizer_hints(lexer, text):
+    good = '/*+ ' + text + '(), */'
+    ignore = '/* ' + text + ' */'
+    bad1 = '/*+ a' + text + '() */'
+    bad2 = '/*+ ' + text + 'a */'
+    assert (Comment.Preproc, text) in lexer.get_tokens(good)
+    assert (Comment.Preproc, text) not in lexer.get_tokens(ignore)
+    assert (Comment.Preproc, text) not in lexer.get_tokens(bad1)
+    assert (Comment.Preproc, text) not in lexer.get_tokens(bad2)
+
+
+@pytest.mark.parametrize(
+    'text, expected_types',
+    (
+        # SET exceptions
+        ('SET', (Keyword,)),
+        ('SET abc = 1;', (Keyword,)),
+        ('SET @abc = 1;', (Keyword,)),
+        ('CHARACTER SET latin1', (Keyword, Text, Keyword)),
+        ('SET("r", "g", "b")', (Keyword.Type, Punctuation)),
+        ('SET ("r", "g", "b")', (Keyword.Type, Text, Punctuation)),
+    ),
+)
+def test_exceptions(lexer, text, expected_types):
+    tokens = list(lexer.get_tokens(text))[:len(expected_types)]
+    assert all(t[0] == e for t, e in zip(tokens, expected_types))
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        'SHOW', 'CREATE', 'ALTER', 'DROP',
+        'SELECT', 'INSERT', 'UPDATE', 'DELETE',
+        'WHERE', 'GROUP', 'ORDER', 'BY', 'AS',
+        'DISTINCT', 'JOIN', 'WITH', 'RECURSIVE',
+        'PARTITION', 'NTILE', 'MASTER_PASSWORD', 'XA',
+        'REQUIRE_TABLE_PRIMARY_KEY_CHECK', 'STREAM',
+    ),
+)
+def test_keywords(lexer, text):
+    assert list(lexer.get_tokens(text))[0] == (Keyword, text)
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        # Standard
+        'INT(', 'VARCHAR(', 'ENUM(', 'DATETIME', 'GEOMETRY', 'POINT', 'JSON',
+        # Aliases and compatibility
+        'FIXED', 'MEDIUMINT', 'INT3', 'REAL', 'SERIAL',
+        'LONG', 'NATIONAL', 'PRECISION', 'VARYING',
+    ),
+)
+def test_data_types(lexer, text):
+    assert list(lexer.get_tokens(text))[0] == (Keyword.Type, text.strip('('))
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        # Common
+        'CAST', 'CONCAT_WS', 'DAYNAME', 'IFNULL', 'NOW', 'SUBSTR',
+        # Less common
+        'CAN_ACCESS_COLUMN', 'JSON_CONTAINS_PATH', 'ST_GEOMFROMGEOJSON',
+    ),
+)
+def test_functions(lexer, text):
+    assert list(lexer.get_tokens(text + '('))[0] == (Name.Function, text)
+    assert list(lexer.get_tokens(text + ' ('))[0] == (Name.Function, text)
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        'abc_$123', '上市年限', 'ひらがな',
+        '`a`', '`上市年限`', '`ひらがな`', '`select`', '`concat(`',
+        '````', r'`\``', r'`\\`',
+        '`-- `', '`/*`', '`#`',
+    ),
+)
+def test_schema_object_names(lexer, text):
+    tokens = list(lexer.get_tokens(text))[:-1]
+    assert all(token[0] == Name for token in tokens)
+    assert ''.join(token[1] for token in tokens) == text
+
+
+@pytest.mark.parametrize(
+    'text',
+    ('+', '*', '/', '%', '&&', ':=', '!', '<', '->>', '^', '|', '~'),
+)
+def test_operators(lexer, text):
+    assert list(lexer.get_tokens(text))[0] == (Operator, text)
+
+
+@pytest.mark.parametrize(
+    'text, expected_types',
+    (
+        ('abc.efg', (Name, Punctuation, Name)),
+        ('abc,efg', (Name, Punctuation, Name)),
+        ('MAX(abc)',
+         (Name.Function, Punctuation, Name, Punctuation)),
+        ('efg;', (Name, Punctuation)),
+    ),
+)
+def test_punctuation(lexer, text, expected_types):
+    tokens = list(lexer.get_tokens(text))[:len(expected_types)]
+    assert all(t[0] == e for t, e in zip(tokens, expected_types))