#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Lexing error finder
~~~~~~~~~~~~~~~~~~~
For the source files given on the command line, display
the text where Error tokens are being generated, along
with some context.
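
Usage: find_error.py [-n TOKENS] [-a] FILE ...

The -n option sets how many tokens of context to print before the
error (default: 10); -a prints every token when no error is found.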
:copyright: Copyright 2006-2010 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import os
import sys
try:
import pygments
except ImportError:
# try parent path
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from pygments.lexer import RegexLexer
from pygments.lexers import get_lexer_for_filename, get_lexer_by_name
from pygments.token import Error, Text, _TokenType


class DebuggingRegexLexer(RegexLexer):
"""Make the state stack, position and current match instance attributes."""
def get_tokens_unprocessed(self, text, stack=('root',)):
"""
Split ``text`` into (tokentype, text) pairs.
        ``stack`` is the initial stack (default: ``['root']``)
"""
self.pos = 0
tokendefs = self._tokens
self.statestack = list(stack)
statetokens = tokendefs[self.statestack[-1]]
        while True:
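            # try every rule of the current state at the current position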
for rexmatch, action, new_state in statetokens:
self.m = m = rexmatch(text, self.pos)
if m:
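                    # a plain token type is yielded directly; callable actions
                    # (e.g. bygroups) yield their own (pos, token, value) tuples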
if type(action) is _TokenType:
yield self.pos, action, m.group()
else:
                        yield from action(self, m)
self.pos = m.end()
if new_state is not None:
# state transition
if isinstance(new_state, tuple):
for state in new_state:
if state == '#pop':
self.statestack.pop()
elif state == '#push':
self.statestack.append(self.statestack[-1])
else:
self.statestack.append(state)
elif isinstance(new_state, int):
# pop
del self.statestack[new_state:]
elif new_state == '#push':
self.statestack.append(self.statestack[-1])
else:
assert False, 'wrong state def: %r' % new_state
statetokens = tokendefs[self.statestack[-1]]
break
else:
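                # no rule matched: consume a newline and reset to 'root',
                # or emit the offending character as an Error token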
try:
if text[self.pos] == '\n':
# at EOL, reset state to 'root'
self.pos += 1
self.statestack = ['root']
statetokens = tokendefs['root']
                        yield self.pos, Text, '\n'
continue
yield self.pos, Error, text[self.pos]
self.pos += 1
except IndexError:
break


def main(fn):
try:
lx = get_lexer_for_filename(os.path.basename(fn))
except ValueError:
try:
name, rest = fn.split('_', 1)
lx = get_lexer_by_name(name)
except ValueError:
raise AssertionError('no lexer found for file %r' % fn)
debug_lexer = False
# does not work for e.g. ExtendedRegexLexers
if lx.__class__.__bases__ == (RegexLexer,):
lx.__class__.__bases__ = (DebuggingRegexLexer,)
debug_lexer = True
lno = 1
    # latin1 decodes any byte sequence, so undecodable input cannot crash us
    with open(fn, encoding='latin1') as fp:
        text = fp.read()
    text = text.strip('\n') + '\n'
ntext = []
states = []
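    # ntext collects the (tokentype, value) pairs seen so far; states keeps
    # a snapshot of the debug lexer's state stack alongside each token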
    def show_token(tok, state):
        reprs = [repr(x) for x in tok]
        print(' ' + reprs[1] + ' ' + ' ' * (29 - len(reprs[1])) + reprs[0], end='')
        if debug_lexer:
            print(' ' + ' ' * (29 - len(reprs[0])) + repr(state), end='')
        print()
    for tp, val in lx.get_tokens(text):
        lno += val.count('\n')
        if tp == Error:
            print('Error parsing', fn, 'on line', lno)
            print('Previous tokens' + (' and states' if debug_lexer else '') + ':')
            for i in range(max(len(ntext) - num, 0), len(ntext)):
                show_token(ntext[i], states[i] if debug_lexer else None)
            print('Error token:')
            vlen = len(repr(val))
            print(' ' + repr(val), end='')
            if debug_lexer:
                print(' ' * (60 - vlen) + repr(lx.statestack), end='')
            print()
            print()
            return 1
        ntext.append((tp, val))
if debug_lexer:
states.append(lx.statestack[:])
    if showall:
        for i, tok in enumerate(ntext):
            show_token(tok, states[i] if debug_lexer else None)
    return 0


num = 10
showall = False

if __name__ == '__main__':
import getopt
opts, args = getopt.getopt(sys.argv[1:], 'n:a')
for opt, val in opts:
if opt == '-n':
num = int(val)
elif opt == '-a':
showall = True
ret = 0
for f in args:
ret += main(f)
    sys.exit(1 if ret else 0)