# -*- coding: utf-8 -*-
"""
    pygments.lexers.special
    ~~~~~~~~~~~~~~~~~~~~~~~

    Special lexers.

    :copyright: 2006 by Georg Brandl.
    :license: GNU LGPL, see LICENSE for more details.
"""

import re
import cStringIO

from pygments.lexer import Lexer, RegexLexer
from pygments.token import Token, \
     Text, Comment, Operator, Keyword, Name, String, Number


__all__ = ['TextLexer', 'RawTokenLexer']


class TextLexer(Lexer):
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']

    def get_tokens_unprocessed(self, text):
        yield 0, Text, text


_ttype_cache = {}

line_re = re.compile('.*?\n')


class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the RawTokenFormatter.

    Additional options accepted:

    ``compress``
        If set to "gz" or "bz2", decompress the token stream with
        the given compression algorithm (default: '').
    """
    name = 'Raw token data'
    aliases = ['raw']
    filenames = ['*.raw']

    def __init__(self, **options):
        self.compress = options.get('compress', '')
        Lexer.__init__(self, **options)

    def get_tokens(self, text):
        if self.compress == 'gz':
            import gzip
            gzipfile = gzip.GzipFile('', 'rb', 9, cStringIO.StringIO(text))
            text = gzipfile.read()
        elif self.compress == 'bz2':
            import bz2
            text = bz2.decompress(text)
        return Lexer.get_tokens(self, text)

    def get_tokens_unprocessed(self, text):
        for match in line_re.finditer(text):
            # Each raw line has the form "<token type>\t<repr(value)>\n".
            ttypestr, val = match.group().split('\t', 1)
            ttype = _ttype_cache.get(ttypestr)
            if not ttype:
                # Resolve e.g. "Token.Literal.String" attribute by attribute.
                ttype = Token
                ttypes = ttypestr.split('.')[1:]
                for ttype_ in ttypes:
                    ttype = getattr(ttype, ttype_)
                _ttype_cache[ttypestr] = ttype
            # Strip the surrounding quotes and the trailing newline, then
            # undo the repr() escaping.
            val = val[1:-2].decode('string-escape')
            yield 0, ttype, val
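

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): feeds a
# hand-written raw token dump through RawTokenLexer to show how the
# "<token type>\t<repr(value)>\n" lines are turned back into
# (index, token type, value) tuples.  The sample data below is assumed, not
# produced by a real RawTokenFormatter run.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    sample = ("Token.Keyword\t'print'\n"
              "Token.Text\t' '\n"
              "Token.Literal.String\t\"'hello'\"\n")
    for index, ttype, value in RawTokenLexer().get_tokens_unprocessed(sample):
        print index, ttype, repr(value)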