diff options
author | Georg Brandl <georg@python.org> | 2021-01-17 11:18:47 +0100 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2021-01-17 11:19:02 +0100 |
commit | edaa50dfabc48394c116783ccd7e0f523a00dd89 (patch) | |
tree | 0adeb7969d9b3743b9a13242990a08d02df45c49 /scripts | |
parent | b4594169025485f7d64d5ffd71f42673e7a5972d (diff) | |
download | pygments-git-edaa50dfabc48394c116783ccd7e0f523a00dd89.tar.gz |
debug_lexer: add Atheris fuzzer mode (for Google OSS-Fuzz)
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/debug_lexer.py | 61 |
1 files changed, 50 insertions, 11 deletions
```diff
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py
index 9a506625..e173d344 100755
--- a/scripts/debug_lexer.py
+++ b/scripts/debug_lexer.py
@@ -14,6 +14,7 @@
 
 import os
 import sys
+import struct
 
 # always prefer Pygments from source if exists
 srcpath = os.path.join(os.path.dirname(__file__), '..')
@@ -95,22 +96,56 @@ class DebuggingRegexLexer(ExtendedRegexLexer):
                     break
 
 
+def decode_atheris(bstr):
+    """Decode a byte string into a Unicode string using the algorithm
+    of Google's Atheris fuzzer library, which aims to produce a wide
+    range of possible Unicode inputs.
+
+    Corresponds to ConsumeUnicodeImpl() with filter_surrogates=false in
+    https://github.com/google/atheris/blob/master/fuzzed_data_provider.cc
+    """
+    if len(bstr) < 2:
+        return ''
+    # The first byte only selects if the rest is decoded as ascii, "utf-16" or "utf-32"
+    spec, bstr = bstr[0], bstr[1:]
+    if spec & 1:  # pure ASCII
+        return ''.join(chr(ch & 0x7f) for ch in bstr)
+    elif spec & 2:  # UTF-16
+        bstr = bstr if len(bstr) % 2 == 0 else bstr[:-1]
+        return bstr.decode('utf16')
+
+    # else UTF-32
+    def valid_codepoint(ch):
+        ch &= 0x1fffff
+        if ch & 0x100000:
+            ch &= ~0x0f0000
+        return chr(ch)
+
+    chars = struct.unpack('%dI%dx' % divmod(len(bstr), 4), bstr)
+    return ''.join(map(valid_codepoint), chars)
+
+
 def main(fn, lexer=None, options={}):
     if fn == '-':
         text = sys.stdin.read()
     else:
-        try:
-            with open(fn, 'rb') as fp:
-                text = fp.read().decode('utf-8')
-        except UnicodeError:
-            if decode_strategy == 'latin1':
+        with open(fn, 'rb') as fp:
+            text = fp.read()
+        if decode_strategy == 'latin1':
+            try:
+                text = text.decode('utf8')
+            except UnicodeError:
                 print('Warning: non-UTF8 input, using latin1')
-                with open(fn, 'rb') as fp:
-                    text = fp.read().decode('latin1')
-            elif decode_strategy == 'utf8-ignore':
+                text = text.decode('latin1')
+        elif decode_strategy == 'utf8-ignore':
+            try:
+                text = text.decode('utf8')
+            except UnicodeError:
                 print('Warning: ignoring non-UTF8 bytes in input')
-                with open(fn, 'rb') as fp:
-                    text = fp.read().decode('utf-8', 'ignore')
+                text = text.decode('utf8', 'ignore')
+        elif decode_strategy == 'atheris':
+            text = decode_atheris(text)
+
     text = text.strip('\n') + '\n'
 
     if lexer is not None:
@@ -206,6 +241,8 @@ Selecting lexer and options:
     -g              guess lexer from content
     -u              if input is non-utf8, use "ignore" handler instead
                     of using latin1 encoding
+    -U              use Atheris fuzzer's method of converting
+                    byte input to Unicode
     -O OPTIONSTR    use lexer options parsed from OPTIONSTR
 
 Debugging lexing errors:
@@ -236,7 +273,7 @@ decode_strategy = 'latin1'
 
 if __name__ == '__main__':
     import getopt
-    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hgu')
+    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hguU')
     for opt, val in opts:
         if opt == '-n':
             num = int(val)
@@ -256,6 +293,8 @@ if __name__ == '__main__':
             guess = True
         elif opt == '-u':
             decode_strategy = 'utf8-ignore'
+        elif opt == '-U':
+            decode_strategy = 'atheris'
         elif opt == '-h':
             print_help()
             sys.exit(0)
```