Diffstat (limited to 'pygments/lexer.py')
-rw-r--r-- | pygments/lexer.py | 39 |
1 files changed, 29 insertions, 10 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 53ea5ac1..82f09318 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -5,7 +5,7 @@
 
     Base lexer classes.
 
-    :copyright: Copyright 2006-2011 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2012 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
 import re
@@ -21,6 +21,12 @@ __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
            'LexerContext', 'include', 'bygroups', 'using', 'this']
 
 
+_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
+                 ('\xff\xfe\0\0', 'utf-32'),
+                 ('\0\0\xfe\xff', 'utf-32be'),
+                 ('\xff\xfe', 'utf-16'),
+                 ('\xfe\xff', 'utf-16be')]
+
 _default_analyse = staticmethod(lambda x: 0.0)
 
 
@@ -142,8 +148,19 @@ class Lexer(object):
                     raise ImportError('To enable chardet encoding guessing, '
                                       'please install the chardet library '
                                       'from http://chardet.feedparser.org/')
-                enc = chardet.detect(text)
-                text = text.decode(enc['encoding'])
+                # check for BOM first
+                decoded = None
+                for bom, encoding in _encoding_map:
+                    if text.startswith(bom):
+                        decoded = unicode(text[len(bom):], encoding,
                                          errors='replace')
+                        break
+                # no BOM found, so use chardet
+                if decoded is None:
+                    enc = chardet.detect(text[:1024]) # Guess using first 1KB
+                    decoded = unicode(text, enc.get('encoding') or 'utf-8',
                                      errors='replace')
+                text = decoded
             else:
                 text = text.decode(self.encoding)
         # text now *is* a unicode string
@@ -274,12 +291,14 @@ def bygroups(*args):
                 if data:
                     yield match.start(i + 1), action, data
             else:
-                if ctx:
-                    ctx.pos = match.start(i + 1)
-                for item in action(lexer, _PseudoMatch(match.start(i + 1),
                                   match.group(i + 1)), ctx):
-                    if item:
-                        yield item
+                data = match.group(i + 1)
+                if data is not None:
+                    if ctx:
+                        ctx.pos = match.start(i + 1)
+                    for item in action(lexer, _PseudoMatch(match.start(i + 1),
                                       data), ctx):
+                        if item:
+                            yield item
         if ctx:
             ctx.pos = match.end()
     return callback
@@ -439,7 +458,7 @@ class RegexLexerMeta(LexerMeta):
 
     def __call__(cls, *args, **kwds):
         """Instantiate cls after preprocessing its token definitions."""
-        if not hasattr(cls, '_tokens'):
+        if '_tokens' not in cls.__dict__:
             cls._all_tokens = {}
             cls._tmpname = 0
             if hasattr(cls, 'token_variants') and cls.token_variants:
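
The first two hunks change how Lexer.get_tokens handles encoding='chardet': a byte-order mark, if present, now decides the encoding outright, and chardet is only consulted (on the first 1KB of input) when no BOM is found. Below is a minimal standalone sketch of that logic, written for Python 3 rather than the Python 2 of the patch; the function name guess_decode and the use of the codecs BOM constants are illustrative, not part of the patch.

import codecs

# Longer BOMs first: the UTF-16 LE BOM is a prefix of the UTF-32 LE BOM,
# so UTF-32 must be tested before UTF-16.
_BOM_MAP = [
    (codecs.BOM_UTF8, 'utf-8'),
    (codecs.BOM_UTF32_LE, 'utf-32'),
    (codecs.BOM_UTF32_BE, 'utf-32be'),
    (codecs.BOM_UTF16_LE, 'utf-16'),
    (codecs.BOM_UTF16_BE, 'utf-16be'),
]

def guess_decode(data):
    """Decode raw bytes, trusting an explicit BOM over statistical guessing."""
    for bom, encoding in _BOM_MAP:
        if data.startswith(bom):
            # Strip the BOM and decode the rest with the matching codec.
            return data[len(bom):].decode(encoding, errors='replace')
    # No BOM: fall back to chardet, but only feed it the first 1KB --
    # enough for a reasonable guess and much cheaper on large inputs.
    import chardet
    enc = chardet.detect(data[:1024]).get('encoding') or 'utf-8'
    return data.decode(enc, errors='replace')

The ordering of the table is the one design point worth noting: the four-byte UTF-32 LE signature must be checked before the two-byte UTF-16 LE signature, because the latter is a prefix of the former.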
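
The bygroups hunk guards against regex groups that did not participate in the match: match.group(n) returns None for an optional group that matched nothing, and the old code passed that value straight into the callback action. A small illustration of the condition the new "if data is not None:" check catches, using plain re and independent of Pygments:

import re

m = re.match(r'(\w+)(?:(\s+)(\w+))?', 'solo')
print(m.group(1))   # 'solo'
print(m.group(3))   # None -- the optional group never matched
# The patched callback now skips such groups instead of building a
# _PseudoMatch around None and handing it to the action callable.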
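
The last hunk replaces hasattr(cls, '_tokens') with '_tokens' not in cls.__dict__ in RegexLexerMeta.__call__. hasattr walks the inheritance chain, so once a base lexer had been instantiated and given a _tokens attribute, a subclass would appear to already have one and its own token definitions would never be processed; checking the class's own __dict__ looks only at that class. A generic sketch of the difference, with illustrative class names that are not part of the patch:

class Base(object):
    pass

class Child(Base):
    pass

Base._tokens = {'root': []}            # set on the base class only

print(hasattr(Child, '_tokens'))       # True  -- found via inheritance
print('_tokens' in Child.__dict__)     # False -- Child has no _tokens of its own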