diff options
author | Georg Brandl <georg@python.org> | 2010-02-18 11:19:11 +0100 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2010-02-18 11:19:11 +0100 |
commit | efba3200cf5d5bd3a50239531eef40eb2ef55fb5 (patch) | |
tree | 36f3a1403439fad1ce48fc3bad31437ee47a5476 /scripts/find_error.py | |
parent | 629031ea2df80c98801636a447b46c6ea8ef98a3 (diff) | |
download | pygments-efba3200cf5d5bd3a50239531eef40eb2ef55fb5.tar.gz |
Make the find_error script more usable: make it possible to inspect the state stack for simple regex lexers, and make the output prettier.
Diffstat (limited to 'scripts/find_error.py')
-rwxr-xr-x[-rw-r--r--] | scripts/find_error.py | 98 |
1 file changed, 90 insertions, 8 deletions
```diff
diff --git a/scripts/find_error.py b/scripts/find_error.py
index a3cdad4f..82914b50 100644..100755
--- a/scripts/find_error.py
+++ b/scripts/find_error.py
@@ -18,38 +18,120 @@ try:
     import pygments
 except ImportError:
     # try parent path
-    sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+    sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
 from pygments import highlight
+from pygments.lexer import RegexLexer
 from pygments.lexers import get_lexer_for_filename, get_lexer_by_name
-from pygments.token import Error
+from pygments.token import Error, _TokenType
+
+
+class DebuggingRegexLexer(RegexLexer):
+    """Make the state stack, position and current match instance attributes."""
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        """
+        Split ``text`` into (tokentype, text) pairs.
+
+        ``stack`` is the inital stack (default: ``['root']``)
+        """
+        self.pos = 0
+        tokendefs = self._tokens
+        self.statestack = list(stack)
+        statetokens = tokendefs[self.statestack[-1]]
+        while 1:
+            for rexmatch, action, new_state in statetokens:
+                self.m = rexmatch(text, self.pos)
+                if self.m:
+                    if type(action) is _TokenType:
+                        yield self.pos, action, self.m.group()
+                    else:
+                        for item in action(self, self.m):
+                            yield item
+                    self.pos = self.m.end()
+                    if new_state is not None:
+                        # state transition
+                        if isinstance(new_state, tuple):
+                            for state in new_state:
+                                if state == '#pop':
+                                    self.statestack.pop()
+                                elif state == '#push':
+                                    self.statestack.append(self.statestack[-1])
+                                else:
+                                    self.statestack.append(state)
+                        elif isinstance(new_state, int):
+                            # pop
+                            del self.statestack[new_state:]
+                        elif new_state == '#push':
+                            self.statestack.append(self.statestack[-1])
+                        else:
+                            assert False, 'wrong state def: %r' % new_state
+                        statetokens = tokendefs[self.statestack[-1]]
+                    break
+            else:
+                try:
+                    if text[self.pos] == '\n':
+                        # at EOL, reset state to 'root'
+                        self.pos += 1
+                        self.statestack = ['root']
+                        statetokens = tokendefs['root']
+                        yield self.pos, Text, u'\n'
+                        continue
+                    yield self.pos, Error, text[self.pos]
+                    self.pos += 1
+                except IndexError:
+                    break
+
+
 def main(fn):
     try:
-        lx = get_lexer_for_filename(fn)
+        lx = get_lexer_for_filename(os.path.basename(fn))
     except ValueError:
         try:
-            name, rest = fn.split("_", 1)
+            name, rest = fn.split('_', 1)
             lx = get_lexer_by_name(name)
         except ValueError:
             raise AssertionError('no lexer found for file %r' % fn)
+    debug_lexer = False
+    # does not work for e.g. ExtendedRegexLexers
+    if lx.__class__.__bases__ == (RegexLexer,):
+        lx.__class__.__bases__ = (DebuggingRegexLexer,)
+        debug_lexer = True
+    lno = 1
     text = file(fn, 'U').read()
     text = text.strip('\n') + '\n'
     text = text.decode('latin1')
     ntext = []
+    states = []
     for type, val in lx.get_tokens(text):
+        lno += val.count('\n')
         if type == Error:
-            print "Error parsing", fn
-            print "\n".join([' ' + repr(x) for x in ntext[-num:]])
-            print `val` + "<<<"
+            print 'Error parsing', fn, 'on line', lno
+            print 'Previous tokens' + (debug_lexer and ' and states' or '') + ':'
+            for i in range(len(ntext) - num, len(ntext)):
+                reprs = map(repr, ntext[i])
+                print ' ' + reprs[1] + ' ' + ' ' * (29-len(reprs[1])) + reprs[0],
+                if debug_lexer:
+                    print ' ' + ' ' * (29-len(reprs[0])) + repr(states[i]),
+                print
+            print 'Error token:'
+            l = len(repr(val))
+            print ' ' + repr(val),
+            if debug_lexer:
+                print ' ' * (60-l) + repr(lx.statestack),
+            print
+            print
             return 1
         ntext.append((type,val))
+        if debug_lexer:
+            states.append(lx.statestack[:])
     return 0
 num = 10
-if __name__ == "__main__":
+if __name__ == '__main__':
     if sys.argv[1][:2] == '-n':
         num = int(sys.argv[1][2:])
         del sys.argv[1]
```