1 files changed, 233 insertions, 0 deletions
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py
new file mode 100755
index 00000000..dfc28ce2
--- /dev/null
+++ b/scripts/debug_lexer.py
@@ -0,0 +1,233 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+    Lexing error finder
+    ~~~~~~~~~~~~~~~~~~~
+
+    For the source files given on the command line, display
+    the text where Error tokens are being generated, along
+    with some context.
+
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+from __future__ import print_function
+
+import os
+import sys
+
+# If the directory above this script holds a "pygments" directory, import from it first.
+srcpath = os.path.join(os.path.dirname(__file__), '..')
+if os.path.isdir(os.path.join(srcpath, 'pygments')):
+    sys.path.insert(0, srcpath)
+
+
+from pygments.lexer import RegexLexer, ProfilingRegexLexer, ProfilingRegexLexerMeta
+from pygments.lexers import get_lexer_by_name, find_lexer_class, \
+    find_lexer_class_for_filename
+from pygments.token import Error, Text, _TokenType
+from pygments.cmdline import _parse_options
+
+
+class DebuggingRegexLexer(RegexLexer):
+    """Expose the state stack, position and current match as instance attributes."""
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        """
+        Yield ``(index, tokentype, value)`` items for ``text``, keeping
+        ``self.pos``, ``self.statestack`` and ``self.m`` current while lexing.
+        ``stack`` is the initial state stack (default: ``('root',)``).
+        """
+        self.pos = 0
+        tokendefs = self._tokens
+        self.statestack = list(stack)
+        statetokens = tokendefs[self.statestack[-1]]
+        while True:
+            for rexmatch, action, new_state in statetokens:
+                self.m = m = rexmatch(text, self.pos)
+                if m:
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield self.pos, action, m.group()
+                        else:
+                            for item in action(self, m):
+                                yield item
+                    self.pos = m.end()
+                    if new_state is not None:
+                        # state transition
+                        if isinstance(new_state, tuple):
+                            for state in new_state:
+                                if state == '#pop':
+                                    self.statestack.pop()
+                                elif state == '#push':
+                                    self.statestack.append(self.statestack[-1])
+                                else:
+                                    self.statestack.append(state)
+                        elif isinstance(new_state, int):
+                            # int: cut the stack at that index (-1 drops the top state)
+                            del self.statestack[new_state:]
+                        elif new_state == '#push':
+                            self.statestack.append(self.statestack[-1])
+                        else:
+                            assert False, 'wrong state def: %r' % new_state
+                        statetokens = tokendefs[self.statestack[-1]]
+                    break
+            else:
+                try:
+                    if text[self.pos] == '\n':
+                        # at EOL, reset state to 'root'; pos advances after the yield
+                        self.statestack = ['root']
+                        statetokens = tokendefs['root']
+                        yield self.pos, Text, u'\n'
+                        self.pos += 1
+                        continue
+                    yield self.pos, Error, text[self.pos]
+                    self.pos += 1
+                except IndexError:
+                    break
+
+
+def main(fn, lexer=None, options=None):
+    """
+    Lex the file ``fn`` and report the first Error token with some context.
+
+    ``lexer`` is a name for ``get_lexer_by_name``; when it is None the lexer
+    class is found from the file name, falling back to the part of the base
+    name before the first ``_`` as a name for ``find_lexer_class``.
+    ``options`` is a dict of keyword arguments for the lexer class.  A ``fn``
+    of ``'-'`` reads standard input; other files are decoded as UTF-8.
+
+    Reads the module globals ``profile``, ``profsort``, ``showall`` and
+    ``num``.  Returns 1 if an Error token was produced, else 0; raises
+    AssertionError if no lexer class can be found.
+    """
+    if lexer is not None:
+        lxcls = get_lexer_by_name(lexer).__class__
+    else:
+        lxcls = find_lexer_class_for_filename(os.path.basename(fn))
+        if lxcls is None:
+            # "<name>_rest" file naming; [0] also copes with no underscore
+            lxcls = find_lexer_class(os.path.basename(fn).split('_', 1)[0])
+            if lxcls is None:
+                raise AssertionError('no lexer found for file %r' % fn)
+    debug_lexer = False
+    if profile:
+        # does not work for e.g. ExtendedRegexLexers
+        if lxcls.__bases__ == (RegexLexer,):
+            # yes we can!  (change the metaclass)
+            lxcls.__class__ = ProfilingRegexLexerMeta
+            lxcls.__bases__ = (ProfilingRegexLexer,)
+            lxcls._prof_sort_index = profsort
+    else:
+        # HACK: ExtendedRegexLexer subclasses will only partially work here.
+        if lxcls.__bases__ != (DebuggingRegexLexer,):  # not yet rebased
+            lxcls.__bases__ = (DebuggingRegexLexer,)
+        debug_lexer = True
+
+    lx = lxcls(**(options or {}))
+    lno = 1
+    if fn == '-':
+        text = sys.stdin.read()
+    else:
+        with open(fn, 'rb') as fp:
+            text = fp.read().decode('utf-8')
+    text = text.strip('\n') + '\n'
+    tokens = []
+    states = []
+
+    def show_token(tok, state):
+        reprs = list(map(repr, tok))
+        print('   ' + reprs[1] + ' ' + ' ' * (29-len(reprs[1])) + reprs[0], end=' ')
+        if debug_lexer:
+            print(' ' + ' ' * (29-len(reprs[0])) + repr(state), end=' ')
+        print()
+
+    def show_tokens(start):
+        # print tokens[start:] together with the state recorded for each
+        for tok, state in zip(tokens[start:], states[start:]):
+            show_token(tok, state)
+
+    for ttype, val in lx.get_tokens(text):
+        lno += val.count('\n')
+        if ttype == Error:
+            print('Error parsing', fn, 'on line', lno)
+            print('Previous tokens' + (' and states' if debug_lexer else '') + ':')
+            show_tokens(0 if showall else max(len(tokens) - num, 0))
+            print('Error token:')
+            width = len(repr(val))
+            print('   ' + repr(val), end=' ')
+            if debug_lexer and hasattr(lx, 'statestack'):
+                print(' ' * (60-width) + repr(lx.statestack), end=' ')
+            print()
+            print()
+            return 1
+        tokens.append((ttype, val))
+        # always record a state (None if unavailable) so both lists line up
+        states.append(lx.statestack[:] if hasattr(lx, 'statestack') else None)
+    if showall:
+        show_tokens(0)
+    return 0
+
+
+def print_help():  # prints the usage text below to stdout; takes and returns nothing
+    print('''\
+Pygments development helper to quickly debug lexers.
+
+    scripts/debug_lexer.py [options] file ...
+
+Give one or more filenames to lex them and display possible error tokens
+and/or profiling info.  Files are assumed to be encoded in UTF-8.
+
+Selecting lexer and options:
+
+    -l NAME         use lexer named NAME (default is to guess from
+                    the given filenames)
+    -O OPTIONSTR    use lexer options parsed from OPTIONSTR
+
+Debugging lexing errors:
+
+    -n N            show the last N tokens on error
+    -a              always show all lexed tokens (default is only
+                    to show them when an error occurs)
+
+Profiling:
+
+    -p              use the ProfilingRegexLexer to profile regexes
+                    instead of the debugging lexer
+    -s N            sort profiling output by column N (default is
+                    column 4, the time per call)
+''')
+
+num = 10          # -n: number of preceding tokens main() shows on error
+showall = False   # -a: main() shows every lexed token
+lexer = None      # -l: lexer name; None lets main() guess from the filename
+options = {}      # -O: parsed lexer options handed to main()
+profile = False   # -p: main() profiles regexes instead of debugging
+profsort = 4      # -s: column the profiling output is sorted by
+
+if __name__ == '__main__':
+    import getopt
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], 'n:l:apO:s:h')
+        for opt, val in opts:
+            if opt == '-n':
+                num = int(val)
+            elif opt == '-a':
+                showall = True
+            elif opt == '-l':
+                lexer = val
+            elif opt == '-p':
+                profile = True
+            elif opt == '-s':
+                profsort = int(val)
+            elif opt == '-O':
+                options = _parse_options([val])
+            elif opt == '-h':
+                print_help()
+                sys.exit(0)
+    except (getopt.GetoptError, ValueError) as err:
+        sys.exit('%s (use -h for help)' % err)  # bad option/number: stderr, status 1
+    if not args:
+        print_help()
+    sys.exit(any([main(f, lexer, options) for f in args]))  # list: lex every file