# -*- coding: utf-8 -*-
"""
    pygments.lexers.special
    ~~~~~~~~~~~~~~~~~~~~~~~

    Special lexers.

    :copyright: 2006 by Georg Brandl.
    :license: GNU LGPL, see LICENSE for more details.
"""

import re
import cStringIO

from pygments.lexer import Lexer
from pygments.token import Token, Text


__all__ = ['TextLexer', 'RawTokenLexer']


class TextLexer(Lexer):
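    """
    "Null" lexer: does no highlighting and emits the whole input as a
    single ``Text`` token.
    """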
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']

    def get_tokens_unprocessed(self, text):
        yield 0, Text, text


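# module-level cache mapping token type strings (e.g. 'Token.Name.Function')
# back to token type objects, shared by all RawTokenLexer instances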
_ttype_cache = {}

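# matches one line at a time, including the trailing newline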
line_re = re.compile('.*?\n')

class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the RawTokenFormatter.

    Additional options accepted:

    ``compress``
        If set to "gz" or "bz2", decompress the token stream with
        the given compression algorithm (default: '').
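
    A minimal round-trip sketch, assuming ``RawTokenFormatter`` (and its
    matching ``compress`` option) is available from
    ``pygments.formatters``::

        from pygments import highlight
        from pygments.formatters import RawTokenFormatter
        from pygments.lexers import PythonLexer

        raw = highlight('x = 1', PythonLexer(),
                        RawTokenFormatter(compress='gz'))
        # feed the raw dump back in and get (tokentype, value) pairs again
        tokens = list(RawTokenLexer(compress='gz').get_tokens(raw))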
    """
    name = 'Raw token data'
    aliases = ['raw']
    filenames = ['*.raw']

    def __init__(self, **options):
        self.compress = options.get('compress', '')
        Lexer.__init__(self, **options)

    def get_tokens(self, text):
        if self.compress == 'gz':
            import gzip
            gzipfile = gzip.GzipFile('', 'rb', 9, cStringIO.StringIO(text))
            text = gzipfile.read()
        elif self.compress == 'bz2':
            import bz2
            text = bz2.decompress(text)
        return Lexer.get_tokens(self, text)

    def get_tokens_unprocessed(self, text):
        length = 0
        for match in line_re.finditer(text):
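            # each line has the form "Token.Xxx<TAB>repr(value)"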
            ttypestr, val = match.group().split('\t', 1)
            ttype = _ttype_cache.get(ttypestr)
            if ttype is None:
                ttype = Token
                ttypes = ttypestr.split('.')[1:]
                for ttype_ in ttypes:
                    ttype = getattr(ttype, ttype_)
                _ttype_cache[ttypestr] = ttype
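            # strip the surrounding quotes and trailing newline from the
            # repr()'d value, then undo the string escaping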
            val = val[1:-2].decode('string-escape')
            yield length, ttype, val
            length += len(val)