diff options
author | Georg Brandl <georg@python.org> | 2012-02-06 07:53:37 +0100 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2012-02-06 07:53:37 +0100 |
commit | 0b21c9695cdd86af64cba4fdb3a85b51ec5f510b (patch) | |
tree | c69ee5bc1e5487a37d5911d267b77e69a2dc81ce /pygments/lexer.py | |
parent | 8ba7c3fd309800a353f517ce6c374b40c47ca028 (diff) | |
download | pygments-0b21c9695cdd86af64cba4fdb3a85b51ec5f510b.tar.gz |
Closes #692: when using chardet mode, look for a BOM and default to "replace" error handling.
Diffstat (limited to 'pygments/lexer.py')
-rw-r--r-- | pygments/lexer.py | 21 |
1 file changed, 19 insertions, 2 deletions
```diff
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 46f86076..82f09318 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -21,6 +21,12 @@ __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
            'LexerContext', 'include', 'bygroups', 'using', 'this']
 
 
+_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
+                 ('\xff\xfe\0\0', 'utf-32'),
+                 ('\0\0\xfe\xff', 'utf-32be'),
+                 ('\xff\xfe', 'utf-16'),
+                 ('\xfe\xff', 'utf-16be')]
+
 _default_analyse = staticmethod(lambda x: 0.0)
 
 
@@ -142,8 +148,19 @@ class Lexer(object):
                     raise ImportError('To enable chardet encoding guessing, '
                                       'please install the chardet library '
                                       'from http://chardet.feedparser.org/')
-                enc = chardet.detect(text)
-                text = text.decode(enc['encoding'])
+                # check for BOM first
+                decoded = None
+                for bom, encoding in _encoding_map:
+                    if text.startswith(bom):
+                        decoded = unicode(text[len(bom):], encoding,
+                                          errors='replace')
+                        break
+                # no BOM found, so use chardet
+                if decoded is None:
+                    enc = chardet.detect(text[:1024]) # Guess using first 1KB
+                    decoded = unicode(text, enc.get('encoding') or 'utf-8',
+                                      errors='replace')
+                text = decoded
             else:
                 text = text.decode(self.encoding)
             # text now *is* a unicode string
```