1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
#!/usr/bin/env python
"""
Checker for repeated tokens
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Helper script to find suspicious lexers which produce the same token
repeatedly, for example:
.. code::
'd' Text
'a' Text
't' Text
'a' Text
'b' Text
'a' Text
's' Text
'e' Text
This script has two test modes: Check for tokens repeating more often than
a given threshold, and exclude anything but single-character tokens.
Repeated single-character tokens are quite problematic as they result in
bloated output and are usually an indication that someone is missing a + or *
in the regex.
:copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import argparse
import os
import sys
def unpack_file(path):
    """Unpack a file into text, token pairs.

    Every non-blank line is split into two parts: the text is whatever sits
    between the first and the last single quote on the line, and the token
    is the last whitespace-separated word. The two-character escapes
    backslash-n and backslash-t in the text are converted back into real
    newline and tab characters.

    :param path: path of the file to read.
    :return: a generator of ``TextTokenPair(text, token)`` named tuples, one
        per non-blank line, in file order.
    """
    from collections import namedtuple

    pair = namedtuple('TextTokenPair', ['text', 'token'])

    # Read the lines inside a ``with`` block so the handle is closed right
    # away, even when the consumer stops iterating before the end.
    with open(path) as handle:
        lines = handle.readlines()

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # First and last quote, so quotes inside the text are kept intact.
        quotation_start = line.find('\'')
        quotation_end = line.rfind('\'')
        text = line[quotation_start+1:quotation_end]
        token = line.split()[-1]
        text = text.replace('\\n', '\n')
        text = text.replace('\\t', '\t')
        yield pair(text, token)
def check_file(path, threshold, single_only):
    """Check one token dump for a token type repeating too often in a row.

    The pairs produced by ``unpack_file(path)`` are scanned in order while
    counting how many consecutive entries carry the same token. As soon as
    that count exceeds *threshold* the file is considered suspicious, its
    path is printed and scanning stops.

    :param path: path of the file to check.
    :param threshold: largest run length that is still accepted.
    :param single_only: if true, only single-character texts are counted;
        any longer text ends the current run.
    :return: ``True`` if the file is fine, ``False`` if it is suspicious.
    """
    current_token = ''
    current_token_repeat_count = 1

    is_suspicious = False

    for value, token in unpack_file(path):
        if single_only and len(value) > 1:
            # A longer text ends the current run. The remembered token type
            # must be forgotten too, otherwise the next single-character
            # token of the same type would start at 2 instead of 1.
            current_token = ''
            current_token_repeat_count = 1
            continue

        if token != current_token:
            current_token = token
            current_token_repeat_count = 1
        else:
            current_token_repeat_count += 1

        if current_token_repeat_count > threshold:
            is_suspicious = True
            break

    if is_suspicious:
        print(path)

    return not is_suspicious
def main(args):
    """Check every ``.output`` file found below ``args.TEST_ROOT``.

    The directory tree is walked with ``os.walk`` and each file whose name
    ends in ``.output`` is passed to ``check_file`` together with
    ``args.threshold`` and ``args.single``.

    :param args: object providing ``TEST_ROOT``, ``threshold`` and ``single``.
    :return: ``1`` if at least one file failed the check, ``0`` otherwise.
    """
    candidates = (
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(args.TEST_ROOT)
        for name in names
        if name.endswith('.output')
    )

    # Each candidate is checked exactly once, in walk order.
    failures = sum(
        1
        for candidate in candidates
        if not check_file(candidate, args.threshold, args.single)
    )

    return 1 if failures > 0 else 0
if __name__ == '__main__':
    # Command-line entry point: parse the options, run the check over the
    # given directory tree and use main()'s result as the exit status.
    parser = argparse.ArgumentParser()
    parser.add_argument('TEST_ROOT',
                        help='Root directory containing the tests')
    parser.add_argument('-t', '--threshold', type=int, default=5,
                        help='Warn if a token repeats itself more often than this number.')
    parser.add_argument('-s', '--single', action='store_true', default=False,
                        help='Only look at tokens matching a single character')

    args = parser.parse_args()
    sys.exit(main(args))
|