summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
author Matthäus G. Chajdas <dev@anteru.net> 2022-12-04 15:12:42 +0100
committer Matthäus G. Chajdas <dev@anteru.net> 2022-12-04 15:12:42 +0100
commit d98309343786205a34b483c559be5eefcfd7eade (patch)
tree 27dd9bd9102b00ba3b453d9dfd96d4f8963f7b97 /scripts
parent dc492aea253a33f3335177dda23066aaf85050a3 (diff)
download pygments-git-d98309343786205a34b483c559be5eefcfd7eade.tar.gz
Add another check script for whitespace.
Add a script which checks for whitespace tokens, similar to the script checking for repeated tokens. Also move some functionality shared between them into a utility file, and make check_repeated_token PEP8 compliant.
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/check_repeated_token.py 66
-rw-r--r-- scripts/check_whitespace_token.py 40
-rw-r--r-- scripts/utility.py 49
3 files changed, 108 insertions, 47 deletions
diff --git a/scripts/check_repeated_token.py b/scripts/check_repeated_token.py
index 7226098d..16362813 100755
--- a/scripts/check_repeated_token.py
+++ b/scripts/check_repeated_token.py
@@ -7,7 +7,7 @@
repeatedly, i.e. for example:
.. code::
-
+
'd' Text
'a' Text
't' Text
@@ -16,46 +16,27 @@
'a' Text
's' Text
'e' Text
-
+
This script has two test modes: Check for tokens repeating more often than
a given threshold, and exclude anything but single-character tokens.
Repeated single-character tokens are quite problematic as they result in
- bloated output and are usually an indication that someone is missing a + or *
- in the regex.
+ bloated output and are usually an indication that someone is missing
+ a + or * in the regex.
:copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import argparse
-import os
import sys
-
-def unpack_file(path):
- """Unpack a file into text, token pairs."""
- from collections import namedtuple
- pair = namedtuple('TextTokenPair', ['text', 'token'])
- for line in open(path).readlines():
- line = line.strip()
- if line:
- # Line can start with ' or ", so let's check which one it is
- # and find the matching one
- quotation_start = 0
- quotation_end = line.rfind(line[0])
- text = line[quotation_start+1:quotation_end]
- token = line.split()[-1]
- text = text.replace('\\n', '\n')
- text = text.replace('\\t', '\t')
- yield pair(text, token)
+from utility import unpack_output_file, process_output_files
def check_file(path, threshold, single_only):
current_token = ''
current_token_repeat_count = 1
- is_suspicious = False
-
- for value, token in unpack_file(path):
+ for value, token, linenumber in unpack_output_file(path):
if single_only and len(value) > 1:
token = ''
current_token_repeat_count = 1
@@ -66,29 +47,19 @@ def check_file(path, threshold, single_only):
current_token_repeat_count = 1
else:
current_token_repeat_count += 1
-
+
if current_token_repeat_count > threshold:
- is_suspicious = True
- break
+ print(f'{path}:{linenumber}')
+ return False
- if is_suspicious:
- print(path)
+ return True
- return not is_suspicious
-
def main(args):
- errors = 0
- for dir, _, files in os.walk(args.TEST_ROOT):
- for file in files:
- if not file.endswith('.output'):
- continue
-
- path = os.path.join(dir, file)
- if not check_file(path, args.threshold, args.single):
- errors += 1
-
- if errors > 0:
+ def check_file_callback(path):
+ return check_file(path, args.threshold, args.single)
+
+ if process_output_files(args.TEST_ROOT, check_file_callback) > 0:
return 1
return 0
@@ -96,10 +67,11 @@ def main(args):
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('TEST_ROOT',
- help='Root directory containing the tests')
+ help='Root directory containing the tests')
parser.add_argument('-t', '--threshold', type=int, default=5,
- help='Warn if a token repeats itself more often then this number.')
+ help='Warn if a token repeats itself more often then '
+ 'this number.')
parser.add_argument('-s', '--single', action='store_true', default=False,
- help='Only look at tokens matching a single character')
+ help='Only look at tokens matching a single character')
args = parser.parse_args()
- sys.exit(main(args)) \ No newline at end of file
+ sys.exit(main(args))
diff --git a/scripts/check_whitespace_token.py b/scripts/check_whitespace_token.py
new file mode 100644
index 00000000..9fb56ab3
--- /dev/null
+++ b/scripts/check_whitespace_token.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+"""
+ Checker for whitespace tokens
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Helper script to find whitespace which is not of token type `Whitespace`
+
+ :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
+ :license: BSD, see LICENSE for details.
+"""
+import argparse
+import sys
+import re
+
+from utility import unpack_output_file, process_output_files
+
+
+def check_file(path):
+ whitespace_re = re.compile('\s+')
+
+ for value, token, linenumber in unpack_output_file(path):
+ if whitespace_re.fullmatch(value) and 'Whitespace' not in token:
+ print(f'{path}:{linenumber}')
+ return False
+
+ return True
+
+
+def main(args):
+ if process_output_files(args.TEST_ROOT, check_file) > 0:
+ return 1
+ return 0
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('TEST_ROOT',
+ help='Root directory containing the tests')
+ args = parser.parse_args()
+ sys.exit(main(args))
diff --git a/scripts/utility.py b/scripts/utility.py
new file mode 100644
index 00000000..6ce225b4
--- /dev/null
+++ b/scripts/utility.py
@@ -0,0 +1,49 @@
+"""
+ Utility functions for test scripts
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
+ :license: BSD, see LICENSE for details.
+"""
+
+import os
+
+def unpack_output_file(path):
+ """
+ Unpack an output file into objects contining the line number, the text,
+ and the token name.
+ """
+ from collections import namedtuple
+ entry = namedtuple('OutputEntry', ['text', 'token', 'linenumber'])
+ for linenumber, line in enumerate(open(path).readlines()):
+ line = line.strip()
+ if line:
+ # Line can start with ' or ", so let's check which one it is
+ # and find the matching one
+ quotation_start = 0
+ quotation_end = line.rfind(line[0])
+ text = line[quotation_start+1:quotation_end]
+ token = line.split()[-1]
+ text = text.replace('\\n', '\n')
+ text = text.replace('\\t', '\t')
+ yield entry(text, token, linenumber + 1)
+
+def process_output_files(root_directory, callback):
+ """
+ Process all output files in a directory using the provided callback.
+ The callback should return `True` in case of success, `False` otherwise.
+
+ The function returns the number of files for which the callback returned
+ `False`.
+ """
+ errors = 0
+ for dir, _, files in os.walk(root_directory):
+ for file in files:
+ if not file.endswith('.output'):
+ continue
+
+ path = os.path.join(dir, file)
+ if not callback(path):
+ errors += 1
+
+ return errors