From d98309343786205a34b483c559be5eefcfd7eade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matth=C3=A4us=20G=2E=20Chajdas?= Date: Sun, 4 Dec 2022 15:12:42 +0100 Subject: Add another check script for whitespace. Add a script which checks for whitespace tokens, similar to the script checking for repeated tokens. Also move some functionality shared between them into a utility file, and make check_repeated_token PEP8 compliant. --- scripts/check_repeated_token.py | 66 +++++++++++---------------------------- scripts/check_whitespace_token.py | 40 ++++++++++++++++++++++++ scripts/utility.py | 49 +++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 47 deletions(-) create mode 100644 scripts/check_whitespace_token.py create mode 100644 scripts/utility.py (limited to 'scripts') diff --git a/scripts/check_repeated_token.py b/scripts/check_repeated_token.py index 7226098d..16362813 100755 --- a/scripts/check_repeated_token.py +++ b/scripts/check_repeated_token.py @@ -7,7 +7,7 @@ repeatedly, i.e. for example: .. code:: - + 'd' Text 'a' Text 't' Text @@ -16,46 +16,27 @@ 'a' Text 's' Text 'e' Text - + This script has two test modes: Check for tokens repeating more often than a given threshold, and exclude anything but single-character tokens. Repeated single-character tokens are quite problematic as they result in - bloated output and are usually an indication that someone is missing a + or * - in the regex. + bloated output and are usually an indication that someone is missing + a + or * in the regex. :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS. :license: BSD, see LICENSE for details. 
""" import argparse -import os import sys - -def unpack_file(path): - """Unpack a file into text, token pairs.""" - from collections import namedtuple - pair = namedtuple('TextTokenPair', ['text', 'token']) - for line in open(path).readlines(): - line = line.strip() - if line: - # Line can start with ' or ", so let's check which one it is - # and find the matching one - quotation_start = 0 - quotation_end = line.rfind(line[0]) - text = line[quotation_start+1:quotation_end] - token = line.split()[-1] - text = text.replace('\\n', '\n') - text = text.replace('\\t', '\t') - yield pair(text, token) +from utility import unpack_output_file, process_output_files def check_file(path, threshold, single_only): current_token = '' current_token_repeat_count = 1 - is_suspicious = False - - for value, token in unpack_file(path): + for value, token, linenumber in unpack_output_file(path): if single_only and len(value) > 1: token = '' current_token_repeat_count = 1 @@ -66,29 +47,19 @@ def check_file(path, threshold, single_only): current_token_repeat_count = 1 else: current_token_repeat_count += 1 - + if current_token_repeat_count > threshold: - is_suspicious = True - break + print(f'{path}:{linenumber}') + return False - if is_suspicious: - print(path) + return True - return not is_suspicious - def main(args): - errors = 0 - for dir, _, files in os.walk(args.TEST_ROOT): - for file in files: - if not file.endswith('.output'): - continue - - path = os.path.join(dir, file) - if not check_file(path, args.threshold, args.single): - errors += 1 - - if errors > 0: + def check_file_callback(path): + return check_file(path, args.threshold, args.single) + + if process_output_files(args.TEST_ROOT, check_file_callback) > 0: return 1 return 0 @@ -96,10 +67,11 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('TEST_ROOT', - help='Root directory containing the tests') + help='Root directory containing the tests') parser.add_argument('-t', 
'--threshold', type=int, default=5, - help='Warn if a token repeats itself more often then this number.') + help='Warn if a token repeats itself more often then ' + 'this number.') parser.add_argument('-s', '--single', action='store_true', default=False, - help='Only look at tokens matching a single character') + help='Only look at tokens matching a single character') args = parser.parse_args() - sys.exit(main(args)) \ No newline at end of file + sys.exit(main(args)) diff --git a/scripts/check_whitespace_token.py b/scripts/check_whitespace_token.py new file mode 100644 index 00000000..9fb56ab3 --- /dev/null +++ b/scripts/check_whitespace_token.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +""" + Checker for whitespace tokens + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Helper script to find whitespace which is not of token type `Whitespace` + + :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS. + :license: BSD, see LICENSE for details. +""" +import argparse +import sys +import re + +from utility import unpack_output_file, process_output_files + + +def check_file(path): + whitespace_re = re.compile('\s+') + + for value, token, linenumber in unpack_output_file(path): + if whitespace_re.fullmatch(value) and 'Whitespace' not in token: + print(f'{path}:{linenumber}') + return False + + return True + + +def main(args): + if process_output_files(args.TEST_ROOT, check_file) > 0: + return 1 + return 0 + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('TEST_ROOT', + help='Root directory containing the tests') + args = parser.parse_args() + sys.exit(main(args)) diff --git a/scripts/utility.py b/scripts/utility.py new file mode 100644 index 00000000..6ce225b4 --- /dev/null +++ b/scripts/utility.py @@ -0,0 +1,49 @@ +""" + Utility functions for test scripts + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS. + :license: BSD, see LICENSE for details. 
+""" + +import os + +def unpack_output_file(path): + """ + Unpack an output file into objects containing the line number, the text, + and the token name. + """ + from collections import namedtuple + entry = namedtuple('OutputEntry', ['text', 'token', 'linenumber']) + for linenumber, line in enumerate(open(path).readlines()): + line = line.strip() + if line: + # Line can start with ' or ", so let's check which one it is + # and find the matching one + quotation_start = 0 + quotation_end = line.rfind(line[0]) + text = line[quotation_start+1:quotation_end] + token = line.split()[-1] + text = text.replace('\\n', '\n') + text = text.replace('\\t', '\t') + yield entry(text, token, linenumber + 1) + +def process_output_files(root_directory, callback): + """ + Process all output files in a directory using the provided callback. + The callback should return `True` in case of success, `False` otherwise. + + The function returns the number of files for which the callback returned + `False`. + """ + errors = 0 + for dir, _, files in os.walk(root_directory): + for file in files: + if not file.endswith('.output'): + continue + + path = os.path.join(dir, file) + if not callback(path): + errors += 1 + + return errors -- cgit v1.2.1