# -*- coding: utf-8 -*-
"""
    Python documentation LaTeX file tokenizer
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    For more documentation, look into the ``restwriter.py`` file.

    :copyright: 2007 by Georg Brandl.
    :license: Python license.
"""

import re

from .scanner import Scanner
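
# ``Scanner`` (see scanner.py) is assumed to provide the cursor interface
# used below: ``scan(regex)`` tries a pattern at the current position and
# records the match object in ``self.match``, ``eos`` is true at end of
# input, and ``pos``/``data`` expose the cursor and the raw source string.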

class Tokenizer(Scanner):
    """ Lex a Python doc LaTeX document.

    Tokens are yielded as 4-tuples ``(lineno, type, value, raw)``: *type*
    is ``'command'``, ``'comment'``, ``'text'``, ``'parasep'`` or one of
    the names in ``specials``; *value* is the processed payload and *raw*
    is the exact source text consumed.
    """

    specials = {
        '{': 'bgroup',
        '}': 'egroup',
        '[': 'boptional',
        ']': 'eoptional',
        '~': 'tilde',
        '$': 'mathmode',
    }

    @property
    def mtext(self):
        """ The full text of the most recent match. """
        return self.match.group()

    def tokenize(self):
        """ Return a TokenStream over the whole document. """
        return TokenStream(self._tokenize())

    def _tokenize(self):
        lineno = 1
        while not self.eos:
            if self.scan(r'\\verb([^a-zA-Z])(.*?)(\1)'):
                # special-case \verb: its argument is delimited by an
                # arbitrary non-letter character rather than by braces
                yield lineno, 'command', 'verb', '\\verb'
                yield lineno, 'text', self.match.group(1), self.match.group(1)
                yield lineno, 'text', self.match.group(2), self.match.group(2)
                yield lineno, 'text', self.match.group(3), self.match.group(3)
            elif self.scan(r'\\([a-zA-Z]+\*?)[ \t]*'):
                yield lineno, 'command', self.match.group(1), self.mtext
            elif self.scan(r'\\.'):
                yield lineno, 'command', self.mtext[1], self.mtext
            elif self.scan(r'\\\n'):
                yield lineno, 'text', self.mtext, self.mtext
                lineno += 1
            elif self.scan(r'%(.*)\n[ \t]*'):
                yield lineno, 'comment', self.match.group(1), self.mtext
                lineno += 1
            elif self.scan(r'[{}\[\]~$]'):
                yield lineno, self.specials[self.mtext], self.mtext, self.mtext
            elif self.scan(r'(\n[ \t]*){2,}'):
                lines = self.mtext.count('\n')
                yield lineno, 'parasep', '\n' * lines, self.mtext
                lineno += lines
            elif self.scan(r'\n[ \t]*'):
                yield lineno, 'text', ' ', self.mtext
                lineno += 1
            elif self.scan(r'[^\\%}{\[\]~$\n]+'):
                # plain text, up to the next character handled by a rule above
                yield lineno, 'text', self.mtext, self.mtext
            else:
                raise RuntimeError('unexpected text on line %d: %r' %
                                   (lineno, self.data[self.pos:self.pos+100]))
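
# For example (assuming ``Scanner`` is constructed with the source string),
# ``Tokenizer('\\section{Intro}').tokenize()`` yields, in order:
#
#   (1, 'command', 'section', '\\section')
#   (1, 'bgroup', '{', '{')
#   (1, 'text', 'Intro', 'Intro')
#   (1, 'egroup', '}', '}')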


class TokenStream(object):
    """
    A token stream works like a normal generator, except that it also
    supports peeking at tokens and pushing them back onto the stream.
    """

    def __init__(self, generator):
        self._generator = generator
        self._pushed = []
        self.last = (1, 'initial', '')

    def __iter__(self):
        return self

    def __nonzero__(self):
        """ Are we at the end of the tokenstream? """
        if self._pushed:
            return True
        try:
            self.push(self.next())
        except StopIteration:
            return False
        return True

    def pop(self):
        """ Return the next token from the stream. """
        if self._pushed:
            rv = self._pushed.pop()
        else:
            rv = self._generator.next()
        self.last = rv
        return rv

    next = pop  # generator protocol: ``for tok in stream`` consumes via this

    def popmany(self, num=1):
        """ Pop and return a list of *num* tokens. """
        return [self.next() for _ in range(num)]

    def peek(self):
        """ Pop and push a token, return it. """
        token = self.next()
        self.push(token)
        return token

    def peekmany(self, num=1):
        """ Return the next *num* tokens without consuming them. """
        tokens = self.popmany(num)
        # push in reverse: _pushed is a LIFO stack, so this preserves the
        # original order for subsequent pop() calls
        for tok in reversed(tokens):
            self.push(tok)
        return tokens

    def push(self, item):
        """ Push a token back to the stream. """
        self._pushed.append(item)
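

# ---------------------------------------------------------------------------
# Minimal usage sketch.  It assumes that ``Scanner`` accepts the raw LaTeX
# source string as its constructor argument; adapt if scanner.py differs.
# Because of the relative import above, run it as
# ``python -m converter.tokenizer`` from the directory containing the
# package, rather than as a plain script.
if __name__ == '__main__':
    stream = Tokenizer('\\section{Intro}\n\nSome $x$ text.').tokenize()
    # peek() looks at the next token without consuming it, so the loop
    # below starts with the same token again
    print(stream.peek())
    for token in stream:
        print(token)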