diff options
author | Georg Brandl <georg@python.org> | 2020-12-25 12:02:45 +0100 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2020-12-25 13:21:48 +0100 |
commit | fc12532b639214a0208cb0ae571bff928c4dcdd0 (patch) | |
tree | 28241825e320efc59903279ea88b77f62b9bfe62 /scripts | |
parent | bd378d0ef57a6bc53e9079c9363a6eab0d54f0dc (diff) | |
download | pygments-git-fc12532b639214a0208cb0ae571bff928c4dcdd0.tar.gz |
debug_lexer: add a -u option to decode non-UTF-8 input as UTF-8 with the "ignore" error handler instead of falling back to latin1 (useful for fuzzer test cases)
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/debug_lexer.py | 18 |
1 file changed, 14 insertions, 4 deletions
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py index 059a489f..7ef87fad 100755 --- a/scripts/debug_lexer.py +++ b/scripts/debug_lexer.py @@ -103,9 +103,14 @@ def main(fn, lexer=None, options={}): with open(fn, 'rb') as fp: text = fp.read().decode('utf-8') except UnicodeError: - print('Warning: non-UTF8 input, using latin1') - with open(fn, 'rb') as fp: - text = fp.read().decode('latin1') + if decode_strategy == 'latin1': + print('Warning: non-UTF8 input, using latin1') + with open(fn, 'rb') as fp: + text = fp.read().decode('latin1') + elif decode_strategy == 'utf8-ignore': + print('Warning: ignoring non-UTF8 bytes in input') + with open(fn, 'rb') as fp: + text = fp.read().decode('utf-8', 'ignore') text = text.strip('\n') + '\n' if lexer is not None: @@ -199,6 +204,8 @@ Selecting lexer and options: -l NAME use lexer named NAME (default is to guess from the given filenames) -g guess lexer from content + -u if input is non-utf8, use "ignore" handler instead + of using latin1 encoding -O OPTIONSTR use lexer options parsed from OPTIONSTR Debugging lexing errors: @@ -225,10 +232,11 @@ options = {} profile = False profsort = 4 guess = False +decode_strategy = 'latin1' if __name__ == '__main__': import getopt - opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hg') + opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hgu') for opt, val in opts: if opt == '-n': num = int(val) @@ -246,6 +254,8 @@ if __name__ == '__main__': options = _parse_options([val]) elif opt == '-g': guess = True + elif opt == '-u': + decode_strategy = 'utf8-ignore' elif opt == '-h': print_help() sys.exit(0) |