Add another check script for whitespace.

Add a script which checks for whitespace that is not emitted as a `Whitespace` token, similar to the script checking for repeated tokens. Also move the functionality shared between the two scripts into a utility file, and make check_repeated_token PEP8 compliant.
author: Matthäus G. Chajdas <dev@anteru.net> 2022-12-04 15:12:42 +0100
committer: Matthäus G. Chajdas <dev@anteru.net> 2022-12-04 15:12:42 +0100
commit: d98309343786205a34b483c559be5eefcfd7eade (patch)
tree: 27dd9bd9102b00ba3b453d9dfd96d4f8963f7b97 /scripts
parent: dc492aea253a33f3335177dda23066aaf85050a3 (diff)
download: pygments-git-d98309343786205a34b483c559be5eefcfd7eade.tar.gz
3 files changed, 108 insertions, 47 deletions
diff --git a/scripts/check_repeated_token.py b/scripts/check_repeated_token.py
index 7226098d..16362813 100755
--- a/scripts/check_repeated_token.py
+++ b/scripts/check_repeated_token.py
@@ -7,7 +7,7 @@
     repeatedly, i.e. for example:
 
     .. code::
-   
+
       'd'           Text
       'a'           Text
       't'           Text
@@ -16,46 +16,27 @@
       'a'           Text
       's'           Text
       'e'           Text
-   
+
     This script has two test modes: Check for tokens repeating more often than
     a given threshold, and exclude anything but single-character tokens.
     Repeated single-character tokens are quite problematic as they result in
-    bloated output and are usually an indication that someone is missing a + or *
-    in the regex. 
+    bloated output and are usually an indication that someone is missing
+    a + or * in the regex.
 
     :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
 import argparse
-import os
 import sys
 
-
-def unpack_file(path):
-    """Unpack a file into text, token pairs."""
-    from collections import namedtuple
-    pair = namedtuple('TextTokenPair', ['text', 'token'])
-    for line in open(path).readlines():
-        line = line.strip()
-        if line:
-            # Line can start with ' or ", so let's check which one it is
-            # and find the matching one
-            quotation_start = 0
-            quotation_end = line.rfind(line[0])
-            text = line[quotation_start+1:quotation_end]
-            token = line.split()[-1]
-            text = text.replace('\\n', '\n')
-            text = text.replace('\\t', '\t')
-            yield pair(text, token)
+from utility import unpack_output_file, process_output_files
 
 
 def check_file(path, threshold, single_only):
     current_token = ''
     current_token_repeat_count = 1
 
-    is_suspicious = False
-
-    for value, token in unpack_file(path):
+    for value, token, linenumber in unpack_output_file(path):
         if single_only and len(value) > 1:
             token = ''
             current_token_repeat_count = 1
@@ -66,29 +47,19 @@ def check_file(path, threshold, single_only):
             current_token_repeat_count = 1
         else:
             current_token_repeat_count += 1
-        
+
         if current_token_repeat_count > threshold:
-            is_suspicious = True
-            break
+            print(f'{path}:{linenumber}')
+            return False
 
-    if is_suspicious:
-        print(path)
+    return True
 
-    return not is_suspicious
-        
 
 def main(args):
-    errors = 0
-    for dir, _, files in os.walk(args.TEST_ROOT):
-        for file in files:
-            if not file.endswith('.output'):
-                continue
-            
-            path = os.path.join(dir, file)
-            if not check_file(path, args.threshold, args.single):
-                errors += 1
-
-    if errors > 0:
+    def check_file_callback(path):
+        return check_file(path, args.threshold, args.single)
+
+    if process_output_files(args.TEST_ROOT, check_file_callback) > 0:
         return 1
     return 0
 
@@ -96,10 +67,11 @@ def main(args):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('TEST_ROOT',
-        help='Root directory containing the tests')
+                        help='Root directory containing the tests')
     parser.add_argument('-t', '--threshold', type=int, default=5,
-        help='Warn if a token repeats itself more often then this number.')
+                        help='Warn if a token repeats itself more often then '
+                             'this number.')
     parser.add_argument('-s', '--single', action='store_true', default=False,
-        help='Only look at tokens matching a single character')
+                        help='Only look at tokens matching a single character')
     args = parser.parse_args()
-    sys.exit(main(args))
-\ No newline at end of file
+    sys.exit(main(args))
diff --git a/scripts/check_whitespace_token.py b/scripts/check_whitespace_token.py
new file mode 100644
index 00000000..9fb56ab3
--- /dev/null
+++ b/scripts/check_whitespace_token.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+"""
+    Checker for whitespace tokens
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    Helper script to find whitespace which is not of token type `Whitespace`
+
+    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+import argparse
+import sys
+import re
+
+from utility import unpack_output_file, process_output_files
+
+
+def check_file(path):
+    whitespace_re = re.compile('\s+')
+
+    for value, token, linenumber in unpack_output_file(path):
+        if whitespace_re.fullmatch(value) and 'Whitespace' not in token:
+            print(f'{path}:{linenumber}')
+            return False
+
+    return True
+
+
+def main(args):
+    if process_output_files(args.TEST_ROOT, check_file) > 0:
+        return 1
+    return 0
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('TEST_ROOT',
+                        help='Root directory containing the tests')
+    args = parser.parse_args()
+    sys.exit(main(args))
diff --git a/scripts/utility.py b/scripts/utility.py
new file mode 100644
index 00000000..6ce225b4
--- /dev/null
+++ b/scripts/utility.py
@@ -0,0 +1,49 @@
+"""
+    Utility functions for test scripts
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import os
+
+def unpack_output_file(path):
+    """
+    Unpack an output file into objects containing the line number, the text,
+    and the token name.
+    """
+    from collections import namedtuple
+    entry = namedtuple('OutputEntry', ['text', 'token', 'linenumber'])
+    for linenumber, line in enumerate(open(path).readlines()):
+        line = line.strip()
+        if line:
+            # Line can start with ' or ", so let's check which one it is
+            # and find the matching one
+            quotation_start = 0
+            quotation_end = line.rfind(line[0])
+            text = line[quotation_start+1:quotation_end]
+            token = line.split()[-1]
+            text = text.replace('\\n', '\n')
+            text = text.replace('\\t', '\t')
+            yield entry(text, token, linenumber + 1)
+
+def process_output_files(root_directory, callback):
+    """
+    Process all output files in a directory using the provided callback.
+    The callback should return `True` in case of success, `False` otherwise.
+
+    The function returns the number of files for which the callback returned
+    `False`.
+    """
+    errors = 0
+    for dir, _, files in os.walk(root_directory):
+        for file in files:
+            if not file.endswith('.output'):
+                continue
+            
+            path = os.path.join(dir, file)
+            if not callback(path):
+                errors += 1
+
+    return errors
author	Matthäus G. Chajdas <dev@anteru.net>	2022-12-04 15:12:42 +0100
committer	Matthäus G. Chajdas <dev@anteru.net>	2022-12-04 15:12:42 +0100
commit	d98309343786205a34b483c559be5eefcfd7eade (patch)
tree	27dd9bd9102b00ba3b453d9dfd96d4f8963f7b97 /scripts
parent	dc492aea253a33f3335177dda23066aaf85050a3 (diff)
download	pygments-git-d98309343786205a34b483c559be5eefcfd7eade.tar.gz