Overhaul encoding handling in cmdline even more.

Now the encoding guessed for the input file will be used for an output file. We now always read and write to the terminal .buffer on Python 3, which allows us to override the terminal encoding and use our guessing algorithm.
author: Georg Brandl <georg@python.org> 2014-10-08 01:20:11 +0200
committer: Georg Brandl <georg@python.org> 2014-10-08 01:20:11 +0200
commit: 484583e428efde3dbea4980ffeafc53d4fe37935 (patch)
tree: 4102a9b4462a6069eb55bff5009a52e8e35f2314
parent: c0ffb8a5babc8e6d1c58b92810f1cc11ae96ff85 (diff)
download: pygments-484583e428efde3dbea4980ffeafc53d4fe37935.tar.gz
4 files changed, 75 insertions, 50 deletions
diff --git a/pygments/cmdline.py b/pygments/cmdline.py
index 20a3ae04..c687e12b 100644
--- a/pygments/cmdline.py
+++ b/pygments/cmdline.py
@@ -17,7 +17,7 @@ from textwrap import dedent
 
 from pygments import __version__, highlight
 from pygments.util import ClassNotFound, OptionError, docstring_headline, \
-    text_type, guess_decode
+    guess_decode, guess_decode_from_terminal, terminal_encoding
 from pygments.lexers import get_all_lexers, get_lexer_by_name, guess_lexer, \
     get_lexer_for_filename, find_lexer_class, TextLexer
 from pygments.formatters.latex import LatexEmbeddedLexer, LatexFormatter
@@ -189,18 +189,6 @@ def _print_list(what):
             print("    %s" % docstring_headline(cls))
 
 
-def _get_termencoding():
-    """Return terminal encoding for stdin/stdout.
-
-    Defaults to preferred locale encoding.
-    """
-    import locale
-    defencoding = locale.getpreferredencoding()
-    inencoding = getattr(sys.stdin, 'encoding', None) or defencoding
-    outencoding = getattr(sys.stdout, 'encoding', None) or defencoding
-    return inencoding, outencoding
-
-
 def main(args=sys.argv):
     """
     Main command line entry point.
@@ -287,6 +275,10 @@ def main(args=sys.argv):
             parsed_opts[name] = value
     opts.pop('-P', None)
 
+    # encodings
+    inencoding  = parsed_opts.get('inencoding', parsed_opts.get('encoding'))
+    outencoding = parsed_opts.get('outencoding', parsed_opts.get('encoding'))
+
     # handle ``pygmentize -N``
     infn = opts.pop('-N', None)
     if infn is not None:
@@ -362,7 +354,11 @@ def main(args=sys.argv):
     else:
         if not fmter:
             fmter = TerminalFormatter(**parsed_opts)
-        outfile = sys.stdout
+        if sys.version_info > (3,):
+            # Python 3: we have to use .buffer to get a binary stream
+            outfile = sys.stdout.buffer
+        else:
+            outfile = sys.stdout
 
     # select lexer
     lexer = opts.pop('-l', None)
@@ -373,6 +369,7 @@ def main(args=sys.argv):
             print('Error:', err, file=sys.stderr)
             return 1
 
+    # read input code
     if args:
         if len(args) > 1:
             print(usage, file=sys.stderr)
@@ -385,9 +382,10 @@ def main(args=sys.argv):
         except Exception as err:
             print('Error: cannot read infile:', err, file=sys.stderr)
             return 1
-        if 'encoding' not in parsed_opts:
-            code = guess_decode(code)
+        if not inencoding:
+            code, inencoding = guess_decode(code)
 
+        # do we have to guess the lexer?
         if not lexer:
             try:
                 lexer = get_lexer_for_filename(infn, code, **parsed_opts)
@@ -405,18 +403,16 @@ def main(args=sys.argv):
                 return 1
 
     else:
-        if 'encoding' in parsed_opts:
-            if sys.version_info > (3,):
-                # Python 3: we have to use .buffer
-                code = sys.stdin.buffer.read()
-            else:
-                code = sys.stdin.read()
-            # the lexer will do the decoding
+        # read code from terminal, always in binary mode since we want to
+        # decode ourselves and be tolerant with it
+        if sys.version_info > (3,):
+            # Python 3: we have to use .buffer to get a binary stream
+            code = sys.stdin.buffer.read()
         else:
             code = sys.stdin.read()
-            if not isinstance(code, text_type):
-                # Python 2; Python 3's terminal is already fine
-                code = code.decode(_get_termencoding()[0])
+        if not inencoding:
+            code, inencoding = guess_decode_from_terminal(code, sys.stdin)
+        # (if an input encoding was given, the lexer will do the decoding)
         if not lexer:
             try:
                 lexer = guess_lexer(code, **parsed_opts)
@@ -432,20 +428,14 @@ def main(args=sys.argv):
         right = escapeinside[1]
         lexer = LatexEmbeddedLexer(left, right, lexer)
 
-    # No encoding given? Use latin1 if output file given,
-    # stdin/stdout encoding otherwise.
-    # (This is a compromise, I'm not too happy with it...)
-    if 'encoding' not in parsed_opts and 'outencoding' not in parsed_opts:
+    # determine output encoding if not explicitly selected
+    if not outencoding:
         if outfn:
-            # encoding pass-through
-            fmter.encoding = 'latin1'
+            # output file? -> encoding pass-through
+            fmter.encoding = inencoding
         else:
-            if sys.version_info < (3,):
-                # use terminal encoding; Python 3's terminals already do that
-                lexer.encoding, fmter.encoding = _get_termencoding()
-    elif not outfn and sys.version_info > (3,):
-        # output to terminal with encoding -> use .buffer
-        outfile = sys.stdout.buffer
+            # else use terminal encoding
+            fmter.encoding = terminal_encoding(sys.stdout)
 
     # ... and do it!
     try:
diff --git a/pygments/lexer.py b/pygments/lexer.py
index d93cb284..5b3ad358 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -149,7 +149,7 @@ class Lexer(object):
         """
         if not isinstance(text, text_type):
             if self.encoding == 'guess':
-                text = guess_decode(text)
+                text, _ = guess_decode(text)
             elif self.encoding == 'chardet':
                 try:
                     import chardet
diff --git a/pygments/util.py b/pygments/util.py
index abf1cab8..8376a67f 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -282,16 +282,41 @@ def guess_decode(text):
     """
     try:
         text = text.decode('utf-8')
+        return text, 'utf-8'
     except UnicodeDecodeError:
         try:
             import locale
-            text = text.decode(locale.getpreferredencoding())
+            prefencoding = locale.getpreferredencoding()
+            text = text.decode(prefencoding)  # codec must match returned name
+            return text, prefencoding
         except (UnicodeDecodeError, LookupError):
             text = text.decode('latin1')
-    else:
-        if text.startswith(u'\ufeff'):
-            text = text[len(u'\ufeff'):]
-    return text
+            return text, 'latin1'
+
+
+def guess_decode_from_terminal(text, term):
+    """Decode bytes *text* read from the terminal stream *term*.
+
+    Try the stream's own ``encoding`` first, if it has one; otherwise, or
+    if that fails, defer to ``guess_decode``.
+    """
+    term_encoding = getattr(term, 'encoding', None)
+    if term_encoding:
+        try:
+            decoded = text.decode(term_encoding)
+        except UnicodeDecodeError:
+            pass
+        else:
+            return decoded, term_encoding
+    return guess_decode(text)
+
+
+def terminal_encoding(term):
+    """Return *term*'s encoding, or the preferred locale encoding if unset."""
+    import locale
+    stream_encoding = getattr(term, 'encoding', None)
+    # a missing, None or empty ``encoding`` attribute all count as unset
+    return stream_encoding or locale.getpreferredencoding()
 
 
 # Python 2/3 compatibility
diff --git a/tests/test_cmdline.py b/tests/test_cmdline.py
index e4953f7e..9e26ce17 100644
--- a/tests/test_cmdline.py
+++ b/tests/test_cmdline.py
@@ -7,14 +7,14 @@
     :license: BSD, see LICENSE for details.
 """
 
-# Test the command line interface
+from __future__ import print_function
 
 import io
 import sys
 import unittest
 
 from pygments import highlight
-from pygments.util import StringIO
+from pygments.util import StringIO, BytesIO
 from pygments.cmdline import main as cmdline_main
 
 import support
@@ -25,14 +25,24 @@ TESTFILE, TESTDIR = support.location(__file__)
 def run_cmdline(*args):
     saved_stdout = sys.stdout
     saved_stderr = sys.stderr
-    new_stdout = sys.stdout = StringIO()
-    new_stderr = sys.stderr = StringIO()
+    if sys.version_info > (3,):
+        stdout_buffer = BytesIO()
+        stderr_buffer = BytesIO()
+        new_stdout = sys.stdout = io.TextIOWrapper(stdout_buffer)
+        new_stderr = sys.stderr = io.TextIOWrapper(stderr_buffer)
+    else:
+        stdout_buffer = new_stdout = sys.stdout = StringIO()
+        stderr_buffer = new_stderr = sys.stderr = StringIO()
     try:
         ret = cmdline_main(["pygmentize"] + list(args))
     finally:
         sys.stdout = saved_stdout
         sys.stderr = saved_stderr
-    return (ret, new_stdout.getvalue(), new_stderr.getvalue())
+    new_stdout.flush()
+    new_stderr.flush()
+    out, err = stdout_buffer.getvalue().decode('utf-8'), \
+        stderr_buffer.getvalue().decode('utf-8')
+    return (ret, out, err)
 
 
 class CmdLineTest(unittest.TestCase):
@@ -83,7 +93,7 @@ class CmdLineTest(unittest.TestCase):
     def test_invalid_opts(self):
         for opts in [("-L", "-lpy"), ("-L", "-fhtml"), ("-L", "-Ox"),
                      ("-a",), ("-Sst", "-lpy"), ("-H",),
-                     ("-H", "formatter"),]:
+                     ("-H", "formatter")]:
             self.assertTrue(run_cmdline(*opts)[0] == 2)
 
     def test_normal(self):
author	Georg Brandl <georg@python.org>	2014-10-08 01:20:11 +0200
committer	Georg Brandl <georg@python.org>	2014-10-08 01:20:11 +0200
commit	484583e428efde3dbea4980ffeafc53d4fe37935 (patch)
tree	4102a9b4462a6069eb55bff5009a52e8e35f2314
parent	c0ffb8a5babc8e6d1c58b92810f1cc11ae96ff85 (diff)
download	pygments-484583e428efde3dbea4980ffeafc53d4fe37935.tar.gz