Diffstat (limited to 'scripts/debug_lexer.py')
-rwxr-xr-x  scripts/debug_lexer.py  246
1 file changed, 0 insertions(+), 246 deletions(-)
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py
deleted file mode 100755
index ef01a23f..00000000
--- a/scripts/debug_lexer.py
+++ /dev/null
@@ -1,246 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-"""
-    Lexing error finder
-    ~~~~~~~~~~~~~~~~~~~
-
-    For the source files given on the command line, display
-    the text where Error tokens are being generated, along
-    with some context.
-
-    :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
-    :license: BSD, see LICENSE for details.
-"""
-
-from __future__ import print_function
-
-import os
-import sys
-
-# always prefer Pygments from source if exists
-srcpath = os.path.join(os.path.dirname(__file__), '..')
-if os.path.isdir(os.path.join(srcpath, 'pygments')):
-    sys.path.insert(0, srcpath)
-
-
-from pygments.lexer import RegexLexer, ExtendedRegexLexer, LexerContext, \
-    ProfilingRegexLexer, ProfilingRegexLexerMeta
-from pygments.lexers import get_lexer_by_name, find_lexer_class, \
-    find_lexer_class_for_filename
-from pygments.token import Error, Text, _TokenType
-from pygments.cmdline import _parse_options
-
-
-class DebuggingRegexLexer(ExtendedRegexLexer):
-    """Make the state stack, position and current match instance attributes."""
-
-    def get_tokens_unprocessed(self, text, stack=('root',)):
-        """
-        Split ``text`` into (tokentype, text) pairs.
-
-        ``stack`` is the initial stack (default: ``['root']``)
-        """
-        tokendefs = self._tokens
-        self.ctx = ctx = LexerContext(text, 0)
-        ctx.stack = list(stack)
-        statetokens = tokendefs[ctx.stack[-1]]
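-        # scan loop: try each rule of the current state at the current
-        # position; the for/else below runs its else only when no rule matched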
-        while 1:
-            for rexmatch, action, new_state in statetokens:
-                self.m = m = rexmatch(text, ctx.pos, ctx.end)
-                if m:
-                    if action is not None:
-                        if type(action) is _TokenType:
-                            yield ctx.pos, action, m.group()
-                            ctx.pos = m.end()
-                        else:
-                            if not isinstance(self, ExtendedRegexLexer):
-                                for item in action(self, m):
-                                    yield item
-                                ctx.pos = m.end()
-                            else:
-                                for item in action(self, m, ctx):
-                                    yield item
-                                if not new_state:
-                                    # altered the state stack?
-                                    statetokens = tokendefs[ctx.stack[-1]]
-                    if new_state is not None:
-                        # state transition
-                        if isinstance(new_state, tuple):
-                            for state in new_state:
-                                if state == '#pop':
-                                    ctx.stack.pop()
-                                elif state == '#push':
-                                    ctx.stack.append(ctx.stack[-1])
-                                else:
-                                    ctx.stack.append(state)
-                        elif isinstance(new_state, int):
-                            # pop
-                            del ctx.stack[new_state:]
-                        elif new_state == '#push':
-                            ctx.stack.append(ctx.stack[-1])
-                        else:
-                            assert False, 'wrong state def: %r' % new_state
-                        statetokens = tokendefs[ctx.stack[-1]]
-                    break
-            else:
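-                # no rule matched here: resynchronize at newlines, otherwise
-                # emit the offending character as an Error token and advance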
-                try:
-                    if ctx.pos >= ctx.end:
-                        break
-                    if text[ctx.pos] == '\n':
-                        # at EOL, reset state to 'root'
-                        ctx.stack = ['root']
-                        statetokens = tokendefs['root']
-                        yield ctx.pos, Text, u'\n'
-                        ctx.pos += 1
-                        continue
-                    yield ctx.pos, Error, text[ctx.pos]
-                    ctx.pos += 1
-                except IndexError:
-                    break
-
-
-def main(fn, lexer=None, options={}):
-    if lexer is not None:
-        lxcls = get_lexer_by_name(lexer).__class__
-    else:
-        lxcls = find_lexer_class_for_filename(os.path.basename(fn))
-        if lxcls is None:
-            name, rest = fn.split('_', 1)
-            lxcls = find_lexer_class(name)
-            if lxcls is None:
-                raise AssertionError('no lexer found for file %r' % fn)
-    print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
-                                       lxcls.__name__))
-    debug_lexer = False
-    # if profile:
-    #     # does not work for e.g. ExtendedRegexLexers
-    #     if lxcls.__bases__ == (RegexLexer,):
-    #         # yes we can! (change the metaclass)
-    #         lxcls.__class__ = ProfilingRegexLexerMeta
-    #         lxcls.__bases__ = (ProfilingRegexLexer,)
-    #         lxcls._prof_sort_index = profsort
-    # else:
-    #     if lxcls.__bases__ == (RegexLexer,):
-    #         lxcls.__bases__ = (DebuggingRegexLexer,)
-    #         debug_lexer = True
-    #     elif lxcls.__bases__ == (DebuggingRegexLexer,):
-    #         # already debugged before
-    #         debug_lexer = True
-    #     else:
-    #         # HACK: ExtendedRegexLexer subclasses will only partially work here.
-    #         lxcls.__bases__ = (DebuggingRegexLexer,)
-    #         debug_lexer = True
-
-    lx = lxcls(**options)
-    lno = 1
-    if fn == '-':
-        text = sys.stdin.read()
-    else:
-        with open(fn, 'rb') as fp:
-            text = fp.read().decode('utf-8')
-    text = text.strip('\n') + '\n'
-    tokens = []
-    states = []
-
-    def show_token(tok, state):
-        reprs = list(map(repr, tok))
-        print(' ' + reprs[1] + ' ' + ' ' * (29-len(reprs[1])) + reprs[0], end=' ')
-        if debug_lexer:
-            print(' ' + ' ' * (29-len(reprs[0])) + ' : '.join(state) if state else '', end=' ')
-        print()
-
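-    # walk the token stream; on an Error token (unless -e was given),
-    # show the preceding tokens/states for context and stop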
-    for type, val in lx.get_tokens(text):
-        lno += val.count('\n')
-        if type == Error and not ignerror:
-            print('Error parsing', fn, 'on line', lno)
-            if not showall:
-                print('Previous tokens' + (debug_lexer and ' and states' or '') + ':')
-                for i in range(max(len(tokens) - num, 0), len(tokens)):
-                    if debug_lexer:
-                        show_token(tokens[i], states[i])
-                    else:
-                        show_token(tokens[i], None)
-                print('Error token:')
-            l = len(repr(val))
-            print(' ' + repr(val), end=' ')
-            if debug_lexer and hasattr(lx, 'ctx'):
-                print(' ' * (60-l) + ' : '.join(lx.ctx.stack), end=' ')
-            print()
-            print()
-            return 1
-        tokens.append((type, val))
-        if debug_lexer:
-            if hasattr(lx, 'ctx'):
-                states.append(lx.ctx.stack[:])
-            else:
-                states.append(None)
-        if showall:
-            show_token((type, val), states[-1] if debug_lexer else None)
-    return 0
-
-
-def print_help():
-    print('''\
-Pygments development helper to quickly debug lexers.
-
-    scripts/debug_lexer.py [options] file ...
-
-Give one or more filenames to lex them and display possible error tokens
-and/or profiling info. Files are assumed to be encoded in UTF-8.
-
-Selecting lexer and options:
-
-    -l NAME         use lexer named NAME (default is to guess from
-                    the given filenames)
-    -O OPTIONSTR    use lexer options parsed from OPTIONSTR
-
-Debugging lexing errors:
-
-    -n N            show the last N tokens on error
-    -a              always show all lexed tokens (default is only
-                    to show them when an error occurs)
-    -e              do not stop on error tokens
-
-Profiling:
-
-    -p              use the ProfilingRegexLexer to profile regexes
-                    instead of the debugging lexer
-    -s N            sort profiling output by column N (default is
-                    column 4, the time per call)
-''')
-
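-# defaults for the command-line options parsed in the __main__ block below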
-num = 10
-showall = False
-ignerror = False
-lexer = None
-options = {}
-profile = False
-profsort = 4
-
-if __name__ == '__main__':
-    import getopt
-    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:h')
-    for opt, val in opts:
-        if opt == '-n':
-            num = int(val)
-        elif opt == '-a':
-            showall = True
-        elif opt == '-e':
-            ignerror = True
-        elif opt == '-l':
-            lexer = val
-        elif opt == '-p':
-            profile = True
-        elif opt == '-s':
-            profsort = int(val)
-        elif opt == '-O':
-            options = _parse_options([val])
-        elif opt == '-h':
-            print_help()
-            sys.exit(0)
-    ret = 0
-    if not args:
-        print_help()
-    for f in args:
-        ret += main(f, lexer, options)
-    sys.exit(bool(ret))
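
For reference, a minimal sketch of how the removed DebuggingRegexLexer can be driven directly, assuming the class definition from the deleted file is available; MiniLexer and the 'foo(bar' input are hypothetical, chosen only to show how the lexer context stays inspectable through lx.ctx:

    from pygments.token import Name, Punctuation

    class MiniLexer(DebuggingRegexLexer):
        # hypothetical two-state lexer: '(' pushes 'paren', ')' pops it
        tokens = {
            'root': [
                (r'\w+', Name),
                (r'\(', Punctuation, 'paren'),
            ],
            'paren': [
                (r'\)', Punctuation, '#pop'),
                (r'\w+', Name),
            ],
        }

    lx = MiniLexer()
    # the unmatched '(' leaves 'paren' on the stack, which lx.ctx exposes
    for pos, tok, val in lx.get_tokens_unprocessed('foo(bar'):
        print(pos, tok, repr(val), '->', lx.ctx.stack)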