Closes #979: improve encoding behavior of cmdline invocations

The preferred locale encoding is now used for stdin/stdout if no encoding is set on the file objects. Also, if no encoding option is given, code from input files is decoded by trying UTF-8 first, then the preferred locale encoding, and finally latin-1 as a last resort.
author: Georg Brandl <georg@python.org> 2014-10-07 14:45:58 +0200
committer: Georg Brandl <georg@python.org> 2014-10-07 14:45:58 +0200
commit: e3b37e0dc370ec4671ffb09030cf7e723438b83b (patch)
tree: 9987f1dfd5637c17186d6d3b9cd251cf5afbc97e
parent: 9a51e6a6df8a56aebede133687e91e519a186122 (diff)
download: pygments-e3b37e0dc370ec4671ffb09030cf7e723438b83b.tar.gz
3 files changed, 52 insertions, 20 deletions
diff --git a/pygments/cmdline.py b/pygments/cmdline.py
index 035f8c87..b6e83c24 100644
--- a/pygments/cmdline.py
+++ b/pygments/cmdline.py
@@ -16,13 +16,14 @@ import getopt
 from textwrap import dedent
 
 from pygments import __version__, highlight
-from pygments.util import ClassNotFound, OptionError, docstring_headline
-from pygments.lexers import get_all_lexers, get_lexer_by_name, get_lexer_for_filename, \
-     find_lexer_class, guess_lexer, TextLexer
+from pygments.util import ClassNotFound, OptionError, docstring_headline, \
+    text_type, guess_decode
+from pygments.lexers import get_all_lexers, get_lexer_by_name, guess_lexer, \
+    get_lexer_for_filename, find_lexer_class, TextLexer
 from pygments.formatters.latex import LatexEmbeddedLexer, LatexFormatter
 from pygments.formatters import get_all_formatters, get_formatter_by_name, \
-     get_formatter_for_filename, find_formatter_class, \
-     TerminalFormatter  # pylint:disable-msg=E0611
+    get_formatter_for_filename, find_formatter_class, \
+    TerminalFormatter  # pylint:disable-msg=E0611
 from pygments.filters import get_all_filters, find_filter_class
 from pygments.styles import get_all_styles, get_style_by_name
 
@@ -188,6 +189,18 @@ def _print_list(what):
             print("    %s" % docstring_headline(cls))
 
 
+def _get_termencoding():
+    """Return terminal encoding for stdin/stdout.
+
+    Defaults to preferred locale encoding.
+    """
+    import locale
+    defencoding = locale.getpreferredencoding()
+    inencoding = getattr(sys.stdin, 'encoding', None) or defencoding
+    outencoding = getattr(sys.stdout, 'encoding', None) or defencoding
+    return inencoding, outencoding
+
+
 def main(args=sys.argv):
     """
     Main command line entry point.
@@ -376,6 +389,8 @@ def main(args=sys.argv):
         except Exception as err:
             print('Error: cannot read infile:', err, file=sys.stderr)
             return 1
+        if 'encoding' not in parsed_opts:
+            code = guess_decode(code)
 
         if not lexer:
             try:
@@ -401,11 +416,14 @@ def main(args=sys.argv):
             except ClassNotFound:
                 lexer = TextLexer(**parsed_opts)
         elif not lexer:
-            print('Error: no lexer name given and reading ' + \
-                                'from stdin (try using -g or -l <lexer>)', file=sys.stderr)
+            print('Error: no lexer name given and reading '
+                  'from stdin (try using -g or -l <lexer>)', file=sys.stderr)
             return 2
         else:
             code = sys.stdin.read()
+        if not isinstance(code, text_type):
+            # Python 2; Python 3's terminal is already fine
+            code = code.decode(_get_termencoding()[0])
 
     # When using the LaTeX formatter and the option `escapeinside` is
     # specified, we need a special lexer which collects escaped text
@@ -426,10 +444,7 @@ def main(args=sys.argv):
         else:
             if sys.version_info < (3,):
                 # use terminal encoding; Python 3's terminals already do that
-                lexer.encoding = getattr(sys.stdin, 'encoding',
-                                         None) or 'ascii'
-                fmter.encoding = getattr(sys.stdout, 'encoding',
-                                         None) or 'ascii'
+                lexer.encoding, fmter.encoding = _get_termencoding()
     elif not outfn and sys.version_info > (3,):
         # output to terminal with encoding -> use .buffer
         outfile = sys.stdout.buffer
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 4a041523..8d781e85 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -20,7 +20,7 @@ from pygments.filter import apply_filters, Filter
 from pygments.filters import get_filter_by_name
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
-    make_analysator, text_type, add_metaclass, iteritems, Future
+    make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode
 from pygments.regexopt import regex_opt
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
@@ -72,8 +72,9 @@ class Lexer(object):
         If given, must be an encoding name. This encoding will be used to
         convert the input string to Unicode, if it is not already a Unicode
         string (default: ``'latin1'``).
-        Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
-        ``'chardet'`` to use the chardet library, if it is installed.
+        Can also be ``'guess'`` to use a simple UTF-8 / Locale / Latin1
+        detection, or ``'chardet'`` to use the chardet library, if it is
+        installed.
     """
 
     #: Name of the lexer
@@ -146,12 +147,7 @@ class Lexer(object):
         """
         if not isinstance(text, text_type):
             if self.encoding == 'guess':
-                try:
-                    text = text.decode('utf-8')
-                    if text.startswith(u'\ufeff'):
-                        text = text[len(u'\ufeff'):]
-                except UnicodeDecodeError:
-                    text = text.decode('latin1')
+                text = guess_decode(text)
             elif self.encoding == 'chardet':
                 try:
                     import chardet
diff --git a/pygments/util.py b/pygments/util.py
index 6f0c4148..486be579 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -288,6 +288,27 @@ class Future(object):
         raise NotImplementedError
 
 
+def guess_decode(text):
+    """Decode *text* with guessed encoding.
+
+    First try UTF-8; this should fail for non-UTF-8 encodings.
+    Then try the preferred locale encoding.
+    Fall back to latin-1, which always works.
+    """
+    try:
+        text = text.decode('utf-8')
+    except UnicodeDecodeError:
+        try:
+            import locale
+            text = text.decode(locale.getpreferredencoding())
+        except (UnicodeDecodeError, LookupError):
+            text = text.decode('latin1')
+    else:
+        if text.startswith(u'\ufeff'):
+            text = text[len(u'\ufeff'):]
+    return text
+
+
 # Python 2/3 compatibility
 
 if sys.version_info < (3, 0):
author	Georg Brandl <georg@python.org>	2014-10-07 14:45:58 +0200
committer	Georg Brandl <georg@python.org>	2014-10-07 14:45:58 +0200
commit	e3b37e0dc370ec4671ffb09030cf7e723438b83b (patch)
tree	9987f1dfd5637c17186d6d3b9cd251cf5afbc97e
parent	9a51e6a6df8a56aebede133687e91e519a186122 (diff)
download	pygments-e3b37e0dc370ec4671ffb09030cf7e723438b83b.tar.gz