summaryrefslogtreecommitdiff
path: root/Cython/Plex/Traditional.py
blob: ec7252daed9963acc16369418152755e9e8eca30 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#=======================================================================
#
#   Python Lexical Analyser
#
#   Traditional Regular Expression Syntax
#
#=======================================================================

from __future__ import absolute_import

from .Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
from .Errors import PlexError


class RegexpSyntaxError(PlexError):
    """Raised when a traditional regular expression string cannot be parsed."""


def re(s):
    """
    Build the Plex regexp equivalent to the traditional regular
    expression string |s|.

    The string is handed to a fresh REParser and the result of parsing
    the complete pattern is returned.
    """
    parser = REParser(s)
    return parser.parse_re()


class REParser(object):
    """
    Recursive-descent parser turning a traditional regular expression
    string into Plex regexp objects.

    Parser state:
      s    -- the pattern string being parsed
      i    -- index of the current character within |s|
      c    -- the current character, or '' once the input is exhausted
      end  -- 1 once the input is exhausted, 0 before that
    """

    def __init__(self, s):
        self.s = s
        self.end = 0
        # Start one position before the string so that the first call to
        # next() loads the character at index 0.
        self.i = -1
        self.next()

    def parse_re(self):
        """Parse the whole pattern; leftover input is a syntax error."""
        result = self.parse_alt()
        if self.end:
            return result
        # parse_alt() stopped early, e.g. on an unmatched ')'.
        self.error("Unexpected %s" % repr(self.c))
        return result

    def parse_alt(self):
        """Parse one or more sequences separated by '|'."""
        first = self.parse_seq()
        if self.c != '|':
            # Only one alternative: return it without an Alt wrapper.
            return first
        branches = [first]
        while self.c == '|':
            self.next()
            branches.append(self.parse_seq())
        return Alt(*branches)

    def parse_seq(self):
        """Parse consecutive modified primitives up to '|', ')' or the end."""
        items = []
        while not (self.end or self.c in "|)"):
            items.append(self.parse_mod())
        return Seq(*items)

    def parse_mod(self):
        """Parse a primitive and apply any trailing *, + or ? modifiers."""
        wrappers = {'*': Rep, '+': Rep1, '?': Opt}
        result = self.parse_prim()
        # Modifiers may be stacked; each one wraps the result so far.
        while not self.end and self.c in "*+?":
            result = wrappers[self.c](result)
            self.next()
        return result

    def parse_prim(self):
        """Parse a single primitive: ., ^, $, a group, a charset or a char."""
        c = self.get()
        if c == '.':
            return AnyBut("\n")
        if c == '^':
            return Bol
        if c == '$':
            return Eol
        if c == '(':
            group = self.parse_alt()
            self.expect(')')
            return group
        if c == '[':
            charset = self.parse_charset()
            self.expect(']')
            return charset
        if c == '\\':
            # A backslash makes the following character a literal.
            c = self.get()
        return Char(c)

    def parse_charset(self):
        """
        Parse the inside of a [...] character set.

        The opening '[' has already been consumed by the caller and the
        closing ']' is left for the caller to consume.
        """
        members = []
        negated = self.c == '^'
        if negated:
            self.next()
        if self.c == ']':
            # A ']' in first position is a member, not the terminator.
            members.append(']')
            self.next()
        while not (self.end or self.c == ']'):
            low = self.get()
            # A '-' directly before the closing ']' is an ordinary member.
            is_range = self.c == '-' and self.lookahead(1) != ']'
            if not is_range:
                members.append(low)
                continue
            self.next()
            high = self.get()
            members.extend(
                chr(code) for code in range(ord(low), ord(high) + 1))
        chars = ''.join(members)
        return AnyBut(chars) if negated else Any(chars)

    def next(self):
        """Move to the following character, flagging the end of input."""
        pos = self.i + 1
        self.i = pos
        if pos >= len(self.s):
            self.c = ''
            self.end = 1
            return
        self.c = self.s[pos]

    def get(self):
        """Return the current character and advance past it."""
        if self.end:
            self.error("Premature end of string")
        current = self.c
        self.next()
        return current

    def lookahead(self, n):
        """Return the char |n| places past the current one, or '' beyond the end."""
        target = self.i + n
        return self.s[target] if target < len(self.s) else ''

    def expect(self, c):
        """
        Consume character |c| if it is the current character;
        otherwise signal a syntax error.
        """
        if self.c != c:
            self.error("Missing %s" % repr(c))
        else:
            self.next()

    def error(self, mess):
        """Raise RegexpSyntaxError reporting |mess| at the current position."""
        detail = "Syntax error in regexp %s at position %d: %s" % (
            repr(self.s), self.i, mess)
        raise RegexpSyntaxError(detail)