diff options
author | Georg Brandl <georg@python.org> | 2014-10-07 14:45:58 +0200 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2014-10-07 14:45:58 +0200 |
commit | e3b37e0dc370ec4671ffb09030cf7e723438b83b (patch) | |
tree | 9987f1dfd5637c17186d6d3b9cd251cf5afbc97e /pygments/lexer.py | |
parent | 9a51e6a6df8a56aebede133687e91e519a186122 (diff) | |
download | pygments-e3b37e0dc370ec4671ffb09030cf7e723438b83b.tar.gz |
Closes #979: improve encoding behavior of cmdline invocations
Now the preferred locale encoding is used for stdin/stdout if the encoding is not set
on the file objects.
Also, if no encoding option is given, code from input files is now decoded by trying
UTF-8 first, then the locale encoding, and finally latin-1 as a last resort.
Diffstat (limited to 'pygments/lexer.py')
-rw-r--r-- | pygments/lexer.py | 14 |
1 file changed, 5 insertions, 9 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py index 4a041523..8d781e85 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -20,7 +20,7 @@ from pygments.filter import apply_filters, Filter from pygments.filters import get_filter_by_name from pygments.token import Error, Text, Other, _TokenType from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \ - make_analysator, text_type, add_metaclass, iteritems, Future + make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode from pygments.regexopt import regex_opt __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer', @@ -72,8 +72,9 @@ class Lexer(object): If given, must be an encoding name. This encoding will be used to convert the input string to Unicode, if it is not already a Unicode string (default: ``'latin1'``). - Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or - ``'chardet'`` to use the chardet library, if it is installed. + Can also be ``'guess'`` to use a simple UTF-8 / Locale / Latin1 + detection, or ``'chardet'`` to use the chardet library, if it is + installed. """ #: Name of the lexer @@ -146,12 +147,7 @@ class Lexer(object): """ if not isinstance(text, text_type): if self.encoding == 'guess': - try: - text = text.decode('utf-8') - if text.startswith(u'\ufeff'): - text = text[len(u'\ufeff'):] - except UnicodeDecodeError: - text = text.decode('latin1') + text = guess_decode(text) elif self.encoding == 'chardet': try: import chardet |