Closes #979: improve encoding behavior of cmdline invocations

The preferred locale encoding is now used for stdin/stdout if no encoding is set on the file objects. Also, if no encoding option is given, code from input files is decoded by trying UTF-8 first, then the locale encoding, and finally latin-1 as a last resort.
author: Georg Brandl <georg@python.org> 2014-10-07 14:45:58 +0200
committer: Georg Brandl <georg@python.org> 2014-10-07 14:45:58 +0200
commit: e3b37e0dc370ec4671ffb09030cf7e723438b83b (patch)
tree: 9987f1dfd5637c17186d6d3b9cd251cf5afbc97e /pygments/lexer.py
parent: 9a51e6a6df8a56aebede133687e91e519a186122 (diff)
download: pygments-e3b37e0dc370ec4671ffb09030cf7e723438b83b.tar.gz
1 files changed, 5 insertions, 9 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 4a041523..8d781e85 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -20,7 +20,7 @@ from pygments.filter import apply_filters, Filter
 from pygments.filters import get_filter_by_name
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
-    make_analysator, text_type, add_metaclass, iteritems, Future
+    make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode
 from pygments.regexopt import regex_opt
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
@@ -72,8 +72,9 @@ class Lexer(object):
         If given, must be an encoding name. This encoding will be used to
         convert the input string to Unicode, if it is not already a Unicode
         string (default: ``'latin1'``).
-        Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
-        ``'chardet'`` to use the chardet library, if it is installed.
+        Can also be ``'guess'`` to use a simple UTF-8 / Locale / Latin1
+        detection, or ``'chardet'`` to use the chardet library, if it is
+        installed.
     """
 
     #: Name of the lexer
@@ -146,12 +147,7 @@ class Lexer(object):
         """
         if not isinstance(text, text_type):
             if self.encoding == 'guess':
-                try:
-                    text = text.decode('utf-8')
-                    if text.startswith(u'\ufeff'):
-                        text = text[len(u'\ufeff'):]
-                except UnicodeDecodeError:
-                    text = text.decode('latin1')
+                text = guess_decode(text)
             elif self.encoding == 'chardet':
                 try:
                     import chardet
author	Georg Brandl <georg@python.org>	2014-10-07 14:45:58 +0200
committer	Georg Brandl <georg@python.org>	2014-10-07 14:45:58 +0200
commit	e3b37e0dc370ec4671ffb09030cf7e723438b83b (patch)
tree	9987f1dfd5637c17186d6d3b9cd251cf5afbc97e /pygments/lexer.py
parent	9a51e6a6df8a56aebede133687e91e519a186122 (diff)
download	pygments-e3b37e0dc370ec4671ffb09030cf7e723438b83b.tar.gz