Add a script to check for repeated tokens.

author: Matthäus G. Chajdas <dev@anteru.net> 2021-11-06 13:44:01 +0100
committer: Matthäus G. Chajdas <dev@anteru.net> 2021-11-06 13:44:01 +0100
commit: dd4d624ab51b610cfe1a7047368d8c305156916b (patch)
tree: 676ec91285a7f46ec6ab0c594ac50ea33c7f3b69 /scripts
parent: a9d9df203ce80de7e45618ad99901ef404757e8e (diff)
download: pygments-git-dd4d624ab51b610cfe1a7047368d8c305156916b.tar.gz
1 files changed, 103 insertions, 0 deletions
diff --git a/scripts/check_repeated_token.py b/scripts/check_repeated_token.py
new file mode 100755
index 00000000..cb98095f
--- /dev/null
+++ b/scripts/check_repeated_token.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+"""
+    Checker for repeated tokens
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    Helper script to find suspicious lexers which produce the same token
+    repeatedly, i.e. for example:
+
+    .. code::
+   
+      'd'           Text
+      'a'           Text
+      't'           Text
+      'a'           Text
+      'b'           Text
+      'a'           Text
+      's'           Text
+      'e'           Text
+   
+    This script has two test modes: Check for tokens repeating more often than
+    a given threshold, and exclude anything but single-character tokens.
+    Repeated single-character tokens are quite problematic as they result in
+    bloated output and are usually an indication that someone is missing a + or *
+    in the regex. 
+
+    :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+import argparse
+import os
+import sys
+
+
+
+def unpack_file(path):
+    """Unpack a file into text, token pairs."""
+    from collections import namedtuple
+    pair = namedtuple('TextTokenPair', ['text', 'token'])
+    for line in open(path).readlines():
+        line = line.strip()
+        if line:
+            quotation_start = line.find('\'')
+            quotation_end = line.rfind('\'')
+            text = line[quotation_start+1:quotation_end]
+            token = line.split()[-1]
+            text = text.replace('\\n', '\n')
+            text = text.replace('\\t', '\t')
+            yield pair(text, token)
+
+def check_file(path, threshold, single_only):
+    current_token = ''
+    current_token_repeat_count = 1
+
+    is_suspicious = False
+
+    for value, token in unpack_file(path):
+        if single_only and len(value) > 1:
+            token = ''
+            current_token_repeat_count = 1
+            continue
+
+        if token != current_token:
+            current_token = token
+            current_token_repeat_count = 1
+        else:
+            current_token_repeat_count += 1
+        
+        if current_token_repeat_count > threshold:
+            is_suspicious = True
+            break
+
+    if is_suspicious:
+        print(path)
+
+    return not is_suspicious
+        
+
+def main(args):
+    errors = 0
+    for dir, _, files in os.walk(args.TEST_ROOT):
+        for file in files:
+            if not file.endswith('.output'):
+                continue
+            
+            path = os.path.join(dir, file)
+            if not check_file(path, args.threshold, args.single):
+                errors += 1
+
+    if errors > 0:
+        return 1
+    return 0
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('TEST_ROOT',
+        help='Root directory containing the tests')
+    parser.add_argument('-t', '--threshold', type=int, default=5,
+        help='Warn if a token repeats itself more often then this number.')
+    parser.add_argument('-s', '--single', action='store_true', default=False,
+        help='Only look at tokens matching a single character')
+    args = parser.parse_args()
+    sys.exit(main(args))
+\ No newline at end of file
author	Matthäus G. Chajdas <dev@anteru.net>	2021-11-06 13:44:01 +0100
committer	Matthäus G. Chajdas <dev@anteru.net>	2021-11-06 13:44:01 +0100
commit	dd4d624ab51b610cfe1a7047368d8c305156916b (patch)
tree	676ec91285a7f46ec6ab0c594ac50ea33c7f3b69 /scripts
parent	a9d9df203ce80de7e45618ad99901ef404757e8e (diff)
download	pygments-git-dd4d624ab51b610cfe1a7047368d8c305156916b.tar.gz