# Source file: pygments/lexers/special.py
# -*- coding: utf-8 -*-
"""
    pygments.lexers.special
    ~~~~~~~~~~~~~~~~~~~~~~~

    Special lexers.

    :copyright: 2006 by Georg Brandl.
    :license: BSD, see LICENSE for more details.
"""

import re
import cStringIO

from pygments.lexer import Lexer
from pygments.token import Token, Error, Text


__all__ = ['TextLexer', 'RawTokenLexer']


class TextLexer(Lexer):
    """
    Lexer for plain text: the input is handed back without any analysis.
    """
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']

    def get_tokens_unprocessed(self, text):
        """
        Yield the whole of `text` as one ``Text`` token starting at offset 0.
        """
        # There is nothing to split: a single (offset, type, value) triple
        # covers the complete input.
        whole_input = (0, Text, text)
        yield whole_input


_ttype_cache = {}

line_re = re.compile('.*?\n')

class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the RawTokenFormatter.

    The input is processed line by line.  Every line is expected to consist
    of a dotted token type name, a tab character and an escaped value; lines
    that contain no tab are passed through as ``Error`` tokens.

    Additional options accepted:

    ``compress``
        If set to "gz" or "bz2", decompress the token stream with
        the given compression algorithm (default: '').
    """
    name = 'Raw token data'
    aliases = ['raw']
    filenames = ['*.raw']
    mimetypes = ['application/x-pygments-tokens']

    def __init__(self, **options):
        """
        Store the ``compress`` option and forward all options to `Lexer`.
        """
        self.compress = options.get('compress', '')
        Lexer.__init__(self, **options)

    def get_tokens(self, text):
        """
        Decompress `text` according to ``self.compress`` and return the
        result of `Lexer.get_tokens` on the decompressed data.

        Any ``compress`` value other than "gz" or "bz2" leaves `text`
        untouched.
        """
        if self.compress == 'gz':
            import gzip
            gzipfile = gzip.GzipFile('', 'rb', 9, cStringIO.StringIO(text))
            # Release the GzipFile even if the data turns out to be corrupt.
            try:
                text = gzipfile.read()
            finally:
                gzipfile.close()
        elif self.compress == 'bz2':
            import bz2
            text = bz2.decompress(text)
        return Lexer.get_tokens(self, text)

    def get_tokens_unprocessed(self, text):
        """
        Yield ``(offset, tokentype, value)`` triples, one per input line.

        ``offset`` is the running total of the lengths of all values yielded
        so far, i.e. the position in the reconstructed text rather than in
        the raw token data.
        """
        length = 0
        for match in line_re.finditer(text):
            line = match.group()
            try:
                ttypestr, val = line.split('\t', 1)
            except ValueError:
                # No tab in this line: hand the raw line back as an error.
                val = line
                ttype = Error
            else:
                ttype = _ttype_cache.get(ttypestr)
                # Compare against None instead of relying on truthiness, so
                # that a cached token type which happens to evaluate false
                # is still treated as a cache hit and not rebuilt each time.
                if ttype is None:
                    ttype = Token
                    # The first dotted component is skipped (presumably the
                    # literal "Token" prefix -- confirm against the
                    # formatter); the remaining ones are resolved as nested
                    # attributes.
                    # NOTE(review): the attribute names come straight from
                    # the input; if the data can be untrusted, they should
                    # be validated before being passed to getattr().
                    for name in ttypestr.split('.')[1:]:
                        ttype = getattr(ttype, name)
                    _ttype_cache[ttypestr] = ttype
                # Drop the first character and the last two (presumably the
                # opening quote and the closing quote plus newline of a
                # repr()-style string -- confirm), then undo the backslash
                # escapes.
                val = val[1:-2].decode('string-escape')
            yield length, ttype, val
            length += len(val)