Diffstat (limited to 'pygments/lexer.py')
-rw-r--r--	pygments/lexer.py	41
1 file changed, 31 insertions, 10 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 0ede7927..f3543d41 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -8,17 +8,20 @@
     :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
-import re, itertools
+
+import re
+import itertools
 
 from pygments.filter import apply_filters, Filter
 from pygments.filters import get_filter_by_name
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
-     make_analysator, text_type, add_metaclass, iteritems
-
+    make_analysator, text_type, add_metaclass, iteritems
+from pygments.regexopt import regex_opt
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
-           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this', 'default']
+           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
+           'default', 'words']
 
 
 _encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
@@ -160,7 +163,7 @@ class Lexer(object):
                     break
             # no BOM found, so use chardet
             if decoded is None:
-                enc = chardet.detect(text[:1024]) # Guess using first 1KB
+                enc = chardet.detect(text[:1024])  # Guess using first 1KB
                 decoded = text.decode(enc.get('encoding') or 'utf-8',
                                       'replace')
             text = decoded
@@ -237,7 +240,7 @@ class DelegatingLexer(Lexer):
                self.root_lexer.get_tokens_unprocessed(buffered))
 
 
-#-------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 # RegexLexer and ExtendedRegexLexer
 #
 
@@ -387,12 +390,27 @@ class default:
     """
     Indicates a state or state action (e.g. #pop) to apply.
     For example default('#pop') is equivalent to ('', Token, '#pop')
-    Note that state tuples may be used as well
+    Note that state tuples may be used as well.
+
+    .. versionadded:: 2.0
     """
     def __init__(self, state):
         self.state = state
 
 
+class words:
+    """
+    Indicates a list of literal words that is transformed into an optimized
+    regex that matches any of the words.
+
+    .. versionadded:: 2.0
+    """
+    def __init__(self, words, prefix='', suffix=''):
+        self.words = words
+        self.prefix = prefix
+        self.suffix = suffix
+
+
 class RegexLexerMeta(LexerMeta):
     """
     Metaclass for RegexLexer, creates the self._tokens attribute from
@@ -401,12 +419,15 @@ class RegexLexerMeta(LexerMeta):
 
     def _process_regex(cls, regex, rflags):
         """Preprocess the regular expression component of a token definition."""
+        if isinstance(regex, words):
+            return regex_opt(regex.words, rflags, prefix=regex.prefix,
+                             suffix=regex.suffix).match
         return re.compile(regex, rflags).match
 
     def _process_token(cls, token):
         """Preprocess the token component of a token definition."""
         assert type(token) is _TokenType or callable(token), \
-               'token type must be simple type or callable, not %r' % (token,)
+            'token type must be simple type or callable, not %r' % (token,)
         return token
 
     def _process_new_state(cls, new_state, unprocessed, processed):
@@ -439,7 +460,7 @@ class RegexLexerMeta(LexerMeta):
             for istate in new_state:
                 assert (istate in unprocessed or
                         istate in ('#pop', '#push')), \
-                       'unknown new state ' + istate
+                    'unknown new state ' + istate
             return new_state
         else:
             assert False, 'unknown new state def %r' % new_state
@@ -645,7 +666,7 @@ class LexerContext(object):
     def __init__(self, text, pos, stack=None, end=None):
         self.text = text
         self.pos = pos
-        self.end = end or len(text) # end=0 not supported ;-)
+        self.end = end or len(text)  # end=0 not supported ;-)
         self.stack = stack or ['root']
 
     def __repr__(self):
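For context, here is a minimal sketch of how a lexer could use the new words() helper once this change lands. KeywordDemoLexer and its token set are hypothetical examples, not part of Pygments; only RegexLexer, words, and the token types come from the library.

    from pygments.lexer import RegexLexer, words
    from pygments.token import Keyword, Name, Text


    class KeywordDemoLexer(RegexLexer):
        """Hypothetical demo lexer, not shipped with Pygments."""
        name = 'KeywordDemo'

        tokens = {
            'root': [
                # words() is expanded at class-creation time by
                # RegexLexerMeta._process_regex, which hands the word list
                # to pygments.regexopt.regex_opt to build a single
                # optimized regex rather than a plain
                # 'if|elif|else|while' alternation.
                (words(('if', 'elif', 'else', 'while'), suffix=r'\b'), Keyword),
                (r'[A-Za-z_]\w*', Name),
                (r'\s+', Text),
            ],
        }


    if __name__ == '__main__':
        for ttype, value in KeywordDemoLexer().get_tokens('if x else y'):
            print(ttype, repr(value))

The prefix and suffix arguments wrap the generated pattern; passing suffix=r'\b' as above keeps a word like 'elsewhere' from matching as the keyword 'else'.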