#!/usr/bin/python
"""
    Lexing error finder
    ~~~~~~~~~~~~~~~~~~~

    For the source files given on the command line, display the text
    where Error tokens are being generated, along with some context.

    :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import os
import sys
import struct

# always prefer Pygments from source if exists
srcpath = os.path.join(os.path.dirname(__file__), '..')
if os.path.isdir(os.path.join(srcpath, 'pygments')):
    sys.path.insert(0, srcpath)

from pygments.lexer import ExtendedRegexLexer, LexerContext
from pygments.lexers import get_lexer_by_name, find_lexer_class, \
    find_lexer_class_for_filename, guess_lexer
from pygments.token import Error, Text, _TokenType
from pygments.cmdline import _parse_options


class DebuggingRegexLexer(ExtendedRegexLexer):
    """Make the state stack, position and current match instance attributes."""

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (index, tokentype, text) tuples.

        ``stack`` is the initial stack (default: ``['root']``).

        Mirrors ``ExtendedRegexLexer.get_tokens_unprocessed`` but keeps the
        lexer context (``self.ctx``) and the last regex match (``self.m``)
        on the instance so error reports can show the state stack.
        """
        tokendefs = self._tokens
        self.ctx = ctx = LexerContext(text, 0)
        ctx.stack = list(stack)
        statetokens = tokendefs[ctx.stack[-1]]
        while True:
            for rexmatch, action, new_state in statetokens:
                self.m = m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            if not isinstance(self, ExtendedRegexLexer):
                                yield from action(self, m)
                                ctx.pos = m.end()
                            else:
                                yield from action(self, m, ctx)
                                if not new_state:
                                    # altered the state stack?
                                    statetokens = tokendefs[ctx.stack[-1]]
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # pop: negative index truncates the stack
                            del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, 'wrong state def: %r' % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                # no rule matched at the current position
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to 'root'
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break


def decode_atheris(bstr):
    """Decode a byte string into a Unicode string using the algorithm of
    Google's Atheris fuzzer library, which aims to produce a wide range
    of possible Unicode inputs.

    Corresponds to ConsumeUnicodeImpl() with filter_surrogates=false in
    https://github.com/google/atheris/blob/master/fuzzed_data_provider.cc
    """
    if len(bstr) < 2:
        return ''
    # The first byte only selects if the rest is decoded as ascii,
    # "utf-16" or "utf-32"
    spec, bstr = bstr[0], bstr[1:]
    if spec & 1:  # pure ASCII
        return ''.join(chr(ch & 0x7f) for ch in bstr)
    elif spec & 2:  # UTF-16
        # drop a trailing odd byte so the length is a multiple of 2
        bstr = bstr if len(bstr) % 2 == 0 else bstr[:-1]
        return bstr.decode('utf16')

    # else UTF-32
    def valid_codepoint(ch):
        # fold arbitrary 32-bit values into the valid code point range
        ch &= 0x1fffff
        if ch & 0x100000:
            ch &= ~0x0f0000
        return chr(ch)

    # '%dI%dx': as many 32-bit ints as fit, remaining bytes are padding
    chars = struct.unpack('%dI%dx' % divmod(len(bstr), 4), bstr)
    # BUGFIX: was ``''.join(map(valid_codepoint), chars)`` which raises
    # TypeError (map needs its iterable; join takes one argument).
    return ''.join(map(valid_codepoint, chars))


def main(fn, lexer=None, options=None):
    """Lex file ``fn`` (or stdin for '-') and report Error tokens.

    ``lexer`` is an explicit lexer alias (otherwise the lexer is guessed
    from content or filename); ``options`` are lexer options.  Returns 1
    if an Error token was found (and ``-e`` was not given), else 0.
    """
    options = options or {}  # avoid mutable default argument
    if fn == '-':
        text = sys.stdin.read()
    else:
        with open(fn, 'rb') as fp:
            text = fp.read()
        # decode the raw bytes according to the selected strategy
        if decode_strategy == 'latin1':
            try:
                text = text.decode('utf8')
            except UnicodeError:
                print('Warning: non-UTF8 input, using latin1')
                text = text.decode('latin1')
        elif decode_strategy == 'utf8-ignore':
            try:
                text = text.decode('utf8')
            except UnicodeError:
                print('Warning: ignoring non-UTF8 bytes in input')
                text = text.decode('utf8', 'ignore')
        elif decode_strategy == 'atheris':
            text = decode_atheris(text)

    text = text.strip('\n') + '\n'

    # pick the lexer class: explicit name, content guess, or filename
    if lexer is not None:
        lxcls = get_lexer_by_name(lexer).__class__
    elif guess:
        lxcls = guess_lexer(text).__class__
        print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
                                           lxcls.__name__))
    else:
        lxcls = find_lexer_class_for_filename(os.path.basename(fn))
        if lxcls is None:
            name, rest = fn.split('_', 1)
            lxcls = find_lexer_class(name)
            if lxcls is None:
                raise AssertionError('no lexer found for file %r' % fn)
        print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
                                           lxcls.__name__))
    debug_lexer = False
    # NOTE(review): the profiling/debugging base-class swap is disabled
    # upstream; kept for reference since the -p/-s options still parse.
    # if profile:
    #     # does not work for e.g. ExtendedRegexLexers
    #     if lxcls.__bases__ == (RegexLexer,):
    #         # yes we can!  (change the metaclass)
    #         lxcls.__class__ = ProfilingRegexLexerMeta
    #         lxcls.__bases__ = (ProfilingRegexLexer,)
    #         lxcls._prof_sort_index = profsort
    # else:
    #     if lxcls.__bases__ == (RegexLexer,):
    #         lxcls.__bases__ = (DebuggingRegexLexer,)
    #         debug_lexer = True
    #     elif lxcls.__bases__ == (DebuggingRegexLexer,):
    #         # already debugged before
    #         debug_lexer = True
    #     else:
    #         # HACK: ExtendedRegexLexer subclasses will only partially work here.
    #         lxcls.__bases__ = (DebuggingRegexLexer,)
    #         debug_lexer = True
    lx = lxcls(**options)
    lno = 1
    tokens = []
    states = []

    def show_token(tok, state):
        # print one "repr(value)  TokenType [: state stack]" line
        reprs = list(map(repr, tok))
        print('   ' + reprs[1] + ' ' + ' ' * (29-len(reprs[1])) + reprs[0],
              end=' ')
        if debug_lexer:
            print(' ' + ' ' * (29-len(reprs[0])) + ' : '.join(state)
                  if state else '', end=' ')
        print()

    for ttype, val in lx.get_tokens(text):
        lno += val.count('\n')
        if ttype == Error and not ignerror:
            print('Error parsing', fn, 'on line', lno)
            if not showall:
                print('Previous tokens' +
                      (debug_lexer and ' and states' or '') + ':')
                for i in range(max(len(tokens) - num, 0), len(tokens)):
                    if debug_lexer:
                        show_token(tokens[i], states[i])
                    else:
                        show_token(tokens[i], None)
            print('Error token:')
            vlen = len(repr(val))
            print('   ' + repr(val), end=' ')
            if debug_lexer and hasattr(lx, 'ctx'):
                print(' ' * (60-vlen) + ' : '.join(lx.ctx.stack), end=' ')
            print()
            print()
            return 1
        tokens.append((ttype, val))
        if debug_lexer:
            if hasattr(lx, 'ctx'):
                states.append(lx.ctx.stack[:])
            else:
                states.append(None)
        if showall:
            show_token((ttype, val), states[-1] if debug_lexer else None)
    return 0


def print_help():
    print('''\
Pygments development helper to quickly debug lexers.

    scripts/debug_lexer.py [options] file ...

Give one or more filenames to lex them and display possible error tokens
and/or profiling info.  Files are assumed to be encoded in UTF-8.

Selecting lexer and options:

    -l NAME         use lexer named NAME (default is to guess from
                    the given filenames)
    -g              guess lexer from content
    -u              if input is non-utf8, use "ignore" handler instead
                    of using latin1 encoding
    -U              use Atheris fuzzer's method of converting
                    byte input to Unicode
    -O OPTIONSTR    use lexer options parsed from OPTIONSTR

Debugging lexing errors:

    -n N            show the last N tokens on error
    -a              always show all lexed tokens (default is only
                    to show them when an error occurs)
    -e              do not stop on error tokens

Profiling:

    -p              use the ProfilingRegexLexer to profile regexes
                    instead of the debugging lexer
    -s N            sort profiling output by column N (default is
                    column 4, the time per call)
''')


# option defaults, overridden by the command-line flags below
num = 10
showall = False
ignerror = False
lexer = None
options = {}
profile = False
profsort = 4
guess = False
decode_strategy = 'latin1'

if __name__ == '__main__':
    import getopt
    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hguU')
    for opt, val in opts:
        if opt == '-n':
            num = int(val)
        elif opt == '-a':
            showall = True
        elif opt == '-e':
            ignerror = True
        elif opt == '-l':
            lexer = val
        elif opt == '-p':
            profile = True
        elif opt == '-s':
            profsort = int(val)
        elif opt == '-O':
            options = _parse_options([val])
        elif opt == '-g':
            guess = True
        elif opt == '-u':
            decode_strategy = 'utf8-ignore'
        elif opt == '-U':
            decode_strategy = 'atheris'
        elif opt == '-h':
            print_help()
            sys.exit(0)
    ret = 0
    if not args:
        print_help()
    for f in args:
        ret += main(f, lexer, options)
    sys.exit(bool(ret))