Diffstat (limited to 'pygments/lexer.py')
-rw-r--r--  pygments/lexer.py  39

1 file changed, 29 insertions(+), 10 deletions(-)
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 53ea5ac1..82f09318 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -5,7 +5,7 @@
     Base lexer classes.
-    :copyright: Copyright 2006-2011 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2012 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
 import re
@@ -21,6 +21,12 @@ __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
            'LexerContext', 'include', 'bygroups', 'using', 'this']
+_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
+                 ('\xff\xfe\0\0', 'utf-32'),
+                 ('\0\0\xfe\xff', 'utf-32be'),
+                 ('\xff\xfe', 'utf-16'),
+                 ('\xfe\xff', 'utf-16be')]
+
 _default_analyse = staticmethod(lambda x: 0.0)
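
The new _encoding_map pairs each byte-order mark with the codec to use for the rest of the input, and its ordering matters: the UTF-16-LE BOM '\xff\xfe' is a prefix of the UTF-32-LE BOM '\xff\xfe\0\0', so the four-byte signatures are listed (and therefore checked) before the two-byte ones. A minimal sketch of that lookup, using a hypothetical sniff_bom helper that is not part of lexer.py:

    # Hypothetical helper, not from lexer.py: the first matching entry wins,
    # so the longer UTF-32 signatures must come before the UTF-16 ones.
    _encoding_map = [('\xef\xbb\xbf', 'utf-8'),
                     ('\xff\xfe\0\0', 'utf-32'),
                     ('\0\0\xfe\xff', 'utf-32be'),
                     ('\xff\xfe', 'utf-16'),
                     ('\xfe\xff', 'utf-16be')]

    def sniff_bom(data):
        """Return (encoding, bom_length), or (None, 0) if no BOM is found."""
        for bom, encoding in _encoding_map:
            if data.startswith(bom):
                return encoding, len(bom)
        return None, 0

    print(sniff_bom('\xff\xfe\0\0a\0\0\0'))  # ('utf-32', 4), not utf-16
    print(sniff_bom('plain ascii'))          # (None, 0)
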
@@ -142,8 +148,19 @@ class Lexer(object):
                     raise ImportError('To enable chardet encoding guessing, '
                                       'please install the chardet library '
                                       'from http://chardet.feedparser.org/')
-                enc = chardet.detect(text)
-                text = text.decode(enc['encoding'])
+                # check for BOM first
+                decoded = None
+                for bom, encoding in _encoding_map:
+                    if text.startswith(bom):
+                        decoded = unicode(text[len(bom):], encoding,
+                                          errors='replace')
+                        break
+                # no BOM found, so use chardet
+                if decoded is None:
+                    enc = chardet.detect(text[:1024]) # Guess using first 1KB
+                    decoded = unicode(text, enc.get('encoding') or 'utf-8',
+                                      errors='replace')
+                text = decoded
             else:
                 text = text.decode(self.encoding)
         # text now *is* a unicode string
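
In short, the reworked 'chardet' branch does three things: a recognised BOM decides the encoding outright, otherwise chardet guesses from the first 1KB only, and the result is decoded with errors='replace', falling back to utf-8 when chardet cannot decide (its detect() may report an encoding of None). The old code handed the whole text to chardet and then called text.decode(enc['encoding']), which could fail on a missing or wrong guess. A rough standalone sketch of the same control flow, reusing the _encoding_map table from the hunk above (guess_decode is a hypothetical name, and Python 2 str/unicode semantics are assumed, as in the file itself):

    def guess_decode(text):
        # A recognised BOM wins outright: strip it and decode the remainder
        # with the matching codec.
        for bom, encoding in _encoding_map:
            if text.startswith(bom):
                return unicode(text[len(bom):], encoding, errors='replace')
        # No BOM: let chardet guess from a 1KB sample rather than the whole
        # input, which keeps sniffing cheap for large files.
        import chardet
        enc = chardet.detect(text[:1024])
        # chardet may return {'encoding': None} for undecidable input, so
        # fall back to utf-8; errors='replace' ensures decoding never raises.
        return unicode(text, enc.get('encoding') or 'utf-8', errors='replace')
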
@@ -274,12 +291,14 @@ def bygroups(*args):
                 if data:
                     yield match.start(i + 1), action, data
             else:
-                if ctx:
-                    ctx.pos = match.start(i + 1)
-                for item in action(lexer, _PseudoMatch(match.start(i + 1),
-                                          match.group(i + 1)), ctx):
-                    if item:
-                        yield item
+                data = match.group(i + 1)
+                if data is not None:
+                    if ctx:
+                        ctx.pos = match.start(i + 1)
+                    for item in action(lexer, _PseudoMatch(match.start(i + 1),
+                                              data), ctx):
+                        if item:
+                            yield item
         if ctx:
             ctx.pos = match.end()
     return callback
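
This hunk guards the callback branch of bygroups() against optional groups that did not take part in the match: for such a group, match.group(i + 1) returns None, and the old code wrapped that None in a _PseudoMatch and invoked the action (typically one created by using()) on it. The new `data is not None` check simply skips the action, mirroring the `if data:` test in the token-type branch just above. A small illustration of the underlying re behaviour (hypothetical pattern, not from Pygments):

    import re

    # Group 1 is optional and does not participate when the input is 'bar',
    # so match.group(1) is None rather than an empty string.
    m = re.match(r'(foo)?(bar)', 'bar')
    print(m.group(1))   # None
    print(m.group(2))   # 'bar'
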
@@ -439,7 +458,7 @@ class RegexLexerMeta(LexerMeta):
     def __call__(cls, *args, **kwds):
         """Instantiate cls after preprocessing its token definitions."""
-        if not hasattr(cls, '_tokens'):
+        if '_tokens' not in cls.__dict__:
             cls._all_tokens = {}
             cls._tmpname = 0
             if hasattr(cls, 'token_variants') and cls.token_variants:
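
The last hunk changes how RegexLexerMeta decides whether a class still needs its token definitions preprocessed. hasattr(cls, '_tokens') is also true when _tokens is merely inherited from an already-instantiated parent lexer, so a subclass defining its own tokens could be skipped; looking in cls.__dict__ restricts the check to attributes set on that exact class. A tiny illustration with hypothetical classes (not from Pygments):

    class Base(object):
        _tokens = {'root': []}   # filled in when Base was first instantiated

    class Sub(Base):
        pass                     # would define its own token table

    print(hasattr(Sub, '_tokens'))     # True  -- inherited, old check skips Sub
    print('_tokens' in Sub.__dict__)   # False -- Sub has not been processed yet
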