"""
    HTML Lexer Tests
    ~~~~~~~~~~~~~~~~

    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import time

import pytest

from pygments.lexers.html import HtmlLexer
from pygments.token import Token

MAX_HL_TIME = 10  # seconds; generous bound that still catches pathological backtracking


@pytest.fixture(scope='module')
def lexer_html():
    """Share a single HtmlLexer instance across all tests in this module."""
    yield HtmlLexer()


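# The tests below all follow the same "lex it and time it" pattern. A minimal
# helper sketch (hypothetical, not used by the tests as written) showing how
# that pattern could be factored out:
def _lex_within_time_limit(lexer, fragment, message):
    # Lex *fragment* completely, fail with *message* if lexing exceeds
    # MAX_HL_TIME seconds, and return the tokens for further assertions.
    start_time = time.time()
    tokens = list(lexer.get_tokens(fragment))
    assert time.time() - start_time < MAX_HL_TIME, message
    return tokens

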
def test_happy_javascript_fragment(lexer_html):
    """valid, even long JavaScript fragments should still get parsed ok"""

    fragment = '<script type="text/javascript">' + 'alert("hi");' * 2000 + '</script>'
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    # get_tokens yields (tokentype, value) pairs, so the token type is x[0]
    assert all(x[0] != Token.Error for x in tokens)
    assert time.time() - start_time < MAX_HL_TIME, \
        'The HTML lexer might have an expensive happy-path script case'


def test_happy_css_fragment(lexer_html):
    """valid, even long CSS fragments should still get parsed ok"""

    fragment = '<style>' + '.ui-helper-hidden{display:none}' * 2000 + '</style>'
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    # again, compare the token type (x[0]), not the token value (x[1])
    assert all(x[0] != Token.Error for x in tokens)
    assert time.time() - start_time < MAX_HL_TIME, \
        'The HTML lexer might have an expensive happy-path style case'


def test_long_unclosed_javascript_fragment(lexer_html):
    """unclosed, long JavaScript fragments should parse quickly"""

    reps = 2000
    fragment = '<script type="text/javascript">' + 'alert("hi");' * reps
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    assert time.time() - start_time < MAX_HL_TIME, \
        'The HTML lexer might have an expensive error script case'
    tokens_intro = [
        (Token.Punctuation, '<'),
        (Token.Name.Tag, 'script'),
        (Token.Text, ' '),
        (Token.Name.Attribute, 'type'),
        (Token.Operator, '='),
        (Token.Literal.String, '"text/javascript"'),
        (Token.Punctuation, '>'),
    ]
    tokens_body = [
        (Token.Name.Other, 'alert'),
        (Token.Punctuation, '('),
        (Token.Literal.String.Double, '"hi"'),
        (Token.Punctuation, ')'),
        (Token.Punctuation, ';'),
    ]

    # make sure we get the right opening tokens
    assert tokens[:len(tokens_intro)] == tokens_intro
    # and make sure we get the right body tokens even though the script is
    # unclosed
    assert tokens[len(tokens_intro):-1] == tokens_body * reps
    # and of course, the newline we get for free from get_tokens
    assert tokens[-1] == (Token.Text.Whitespace, "\n")


def test_long_unclosed_css_fragment(lexer_html):
    """unclosed, long CSS fragments should parse quickly"""

    reps = 2000
    fragment = '<style>' + '.ui-helper-hidden{display:none}' * reps
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    assert time.time() - start_time < MAX_HL_TIME, \
        'The HTML lexer might have an expensive error style case'

    tokens_intro = [
        (Token.Punctuation, '<'),
        (Token.Name.Tag, 'style'),
        (Token.Punctuation, '>'),
    ]
    tokens_body = [
        (Token.Punctuation, '.'),
        (Token.Name.Class, 'ui-helper-hidden'),
        (Token.Punctuation, '{'),
        (Token.Keyword, 'display'),
        (Token.Punctuation, ':'),
        (Token.Keyword.Constant, 'none'),
        (Token.Punctuation, '}'),
    ]

    # make sure we get the right opening tokens
    assert tokens[:len(tokens_intro)] == tokens_intro
    # and make sure we get the right body tokens even though the style block is
    # unclosed
    assert tokens[len(tokens_intro):-1] == tokens_body * reps
    # and of course, the newline we get for free from get_tokens
    assert tokens[-1] == (Token.Text.Whitespace, "\n")


def test_unclosed_fragment_with_newline_recovery(lexer_html):
    """unclosed JavaScript fragments should recover on the next line"""

    fragment = ('<script type="text/javascript">' + 'alert("hi");' * 20
                + '\n<div>hi</div>')
    tokens = list(lexer_html.get_tokens(fragment))
    recovery_tokens = [
        (Token.Punctuation, '<'),
        (Token.Name.Tag, 'div'),
        (Token.Punctuation, '>'),
        (Token.Text, 'hi'),
        (Token.Punctuation, '<'),
        (Token.Punctuation, '/'),
        (Token.Name.Tag, 'div'),
        (Token.Punctuation, '>'),
        (Token.Text, '\n'),
    ]
    assert tokens[-len(recovery_tokens):] == recovery_tokens
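

# The recovery above relies on the lexer giving up on an unclosed <script> at
# the first newline and popping back to plain HTML. A sketch of the analogous
# check for an unclosed <style> block (hypothetical; the expected tokens mirror
# the script case and should be verified against actual lexer output):
def test_unclosed_style_fragment_with_newline_recovery(lexer_html):
    """unclosed CSS fragments should recover on the next line"""

    fragment = '<style>' + '.ui-helper-hidden{display:none}' * 20 + '\n<div>hi</div>'
    tokens = list(lexer_html.get_tokens(fragment))
    recovery_tokens = [
        (Token.Punctuation, '<'),
        (Token.Name.Tag, 'div'),
        (Token.Punctuation, '>'),
        (Token.Text, 'hi'),
        (Token.Punctuation, '<'),
        (Token.Punctuation, '/'),
        (Token.Name.Tag, 'div'),
        (Token.Punctuation, '>'),
        (Token.Text, '\n'),
    ]
    assert tokens[-len(recovery_tokens):] == recovery_tokens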