summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorGeorg Brandl <georg@python.org>2021-01-17 11:18:47 +0100
committerGeorg Brandl <georg@python.org>2021-01-17 11:19:02 +0100
commitedaa50dfabc48394c116783ccd7e0f523a00dd89 (patch)
tree0adeb7969d9b3743b9a13242990a08d02df45c49 /scripts
parentb4594169025485f7d64d5ffd71f42673e7a5972d (diff)
downloadpygments-git-edaa50dfabc48394c116783ccd7e0f523a00dd89.tar.gz
debug_lexer: add Atheris fuzzer mode (for Google OSS-Fuzz)
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/debug_lexer.py61
1 file changed, 50 insertions, 11 deletions
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py
index 9a506625..e173d344 100755
--- a/scripts/debug_lexer.py
+++ b/scripts/debug_lexer.py
@@ -14,6 +14,7 @@
import os
import sys
+import struct
# always prefer Pygments from source if exists
srcpath = os.path.join(os.path.dirname(__file__), '..')
@@ -95,22 +96,56 @@ class DebuggingRegexLexer(ExtendedRegexLexer):
break
def decode_atheris(bstr):
    """Decode a byte string into a Unicode string using the algorithm
    of Google's Atheris fuzzer library, which aims to produce a wide
    range of possible Unicode inputs.

    Corresponds to ConsumeUnicodeImpl() with filter_surrogates=false in
    https://github.com/google/atheris/blob/master/fuzzed_data_provider.cc

    Returns '' for inputs shorter than 2 bytes.  Otherwise the first
    byte selects the decoding mode for the remainder: ASCII, UTF-16 or
    UTF-32.
    """
    if len(bstr) < 2:
        return ''
    # The first byte only selects if the rest is decoded as ascii, "utf-16" or "utf-32"
    spec, bstr = bstr[0], bstr[1:]
    if spec & 1:  # pure ASCII
        # Mask off the high bit so every byte maps into U+0000..U+007F.
        return ''.join(chr(ch & 0x7f) for ch in bstr)
    elif spec & 2:  # UTF-16
        # Drop a trailing odd byte so the buffer is a whole number of
        # 16-bit code units.
        # NOTE(review): unpaired surrogate code units will raise
        # UnicodeDecodeError here (Atheris itself does not filter
        # surrogates in this mode) -- confirm this is acceptable for
        # the fuzzing driver.
        bstr = bstr if len(bstr) % 2 == 0 else bstr[:-1]
        return bstr.decode('utf16')

    # else UTF-32
    def valid_codepoint(ch):
        # Clamp to 21 bits; if the result is beyond the Unicode range
        # (bit 0x100000 set together with any of 0x0f0000), clear those
        # bits so chr() always receives a valid code point.
        ch &= 0x1fffff
        if ch & 0x100000:
            ch &= ~0x0f0000
        return chr(ch)

    # Interpret the buffer as native-endian 32-bit words; the '%dx'
    # pad-byte count makes unpack() consume the trailing remainder.
    chars = struct.unpack('%dI%dx' % divmod(len(bstr), 4), bstr)
    # Bug fix: the original was ''.join(map(valid_codepoint), chars),
    # which raises TypeError (map() needs the iterable argument and
    # join() takes exactly one argument).
    return ''.join(map(valid_codepoint, chars))
+
+
def main(fn, lexer=None, options={}):
if fn == '-':
text = sys.stdin.read()
else:
- try:
- with open(fn, 'rb') as fp:
- text = fp.read().decode('utf-8')
- except UnicodeError:
- if decode_strategy == 'latin1':
+ with open(fn, 'rb') as fp:
+ text = fp.read()
+ if decode_strategy == 'latin1':
+ try:
+ text = text.decode('utf8')
+ except UnicodeError:
print('Warning: non-UTF8 input, using latin1')
- with open(fn, 'rb') as fp:
- text = fp.read().decode('latin1')
- elif decode_strategy == 'utf8-ignore':
+ text = text.decode('latin1')
+ elif decode_strategy == 'utf8-ignore':
+ try:
+ text = text.decode('utf8')
+ except UnicodeError:
print('Warning: ignoring non-UTF8 bytes in input')
- with open(fn, 'rb') as fp:
- text = fp.read().decode('utf-8', 'ignore')
+ text = text.decode('utf8', 'ignore')
+ elif decode_strategy == 'atheris':
+ text = decode_atheris(text)
+
text = text.strip('\n') + '\n'
if lexer is not None:
@@ -206,6 +241,8 @@ Selecting lexer and options:
-g guess lexer from content
-u if input is non-utf8, use "ignore" handler instead
of using latin1 encoding
+ -U use Atheris fuzzer's method of converting
+ byte input to Unicode
-O OPTIONSTR use lexer options parsed from OPTIONSTR
Debugging lexing errors:
@@ -236,7 +273,7 @@ decode_strategy = 'latin1'
if __name__ == '__main__':
import getopt
- opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hgu')
+ opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hguU')
for opt, val in opts:
if opt == '-n':
num = int(val)
@@ -256,6 +293,8 @@ if __name__ == '__main__':
guess = True
elif opt == '-u':
decode_strategy = 'utf8-ignore'
+ elif opt == '-U':
+ decode_strategy = 'atheris'
elif opt == '-h':
print_help()
sys.exit(0)