4 files changed, 75 insertions, 50 deletions
diff --git a/pygments/cmdline.py b/pygments/cmdline.py
index 20a3ae04..c687e12b 100644
--- a/pygments/cmdline.py
+++ b/pygments/cmdline.py
@@ -17,7 +17,7 @@ from textwrap import dedent
 
 from pygments import __version__, highlight
 from pygments.util import ClassNotFound, OptionError, docstring_headline, \
-    text_type, guess_decode
+    guess_decode, guess_decode_from_terminal, terminal_encoding
 from pygments.lexers import get_all_lexers, get_lexer_by_name, guess_lexer, \
     get_lexer_for_filename, find_lexer_class, TextLexer
 from pygments.formatters.latex import LatexEmbeddedLexer, LatexFormatter
@@ -189,18 +189,6 @@ def _print_list(what):
             print("    %s" % docstring_headline(cls))
 
 
-def _get_termencoding():
-    """Return terminal encoding for stdin/stdout.
-
-    Defaults to preferred locale encoding.
-    """
-    import locale
-    defencoding = locale.getpreferredencoding()
-    inencoding = getattr(sys.stdin, 'encoding', None) or defencoding
-    outencoding = getattr(sys.stdout, 'encoding', None) or defencoding
-    return inencoding, outencoding
-
-
 def main(args=sys.argv):
     """
     Main command line entry point.
@@ -287,6 +275,10 @@ def main(args=sys.argv):
             parsed_opts[name] = value
     opts.pop('-P', None)
 
+    # encodings
+    inencoding  = parsed_opts.get('inencoding', parsed_opts.get('encoding'))
+    outencoding = parsed_opts.get('outencoding', parsed_opts.get('encoding'))
+
     # handle ``pygmentize -N``
     infn = opts.pop('-N', None)
     if infn is not None:
@@ -362,7 +354,11 @@ def main(args=sys.argv):
     else:
         if not fmter:
             fmter = TerminalFormatter(**parsed_opts)
-        outfile = sys.stdout
+        if sys.version_info > (3,):
+            # Python 3: we have to use .buffer to get a binary stream
+            outfile = sys.stdout.buffer
+        else:
+            outfile = sys.stdout
 
     # select lexer
     lexer = opts.pop('-l', None)
@@ -373,6 +369,7 @@ def main(args=sys.argv):
             print('Error:', err, file=sys.stderr)
             return 1
 
+    # read input code
     if args:
         if len(args) > 1:
             print(usage, file=sys.stderr)
@@ -385,9 +382,10 @@ def main(args=sys.argv):
         except Exception as err:
             print('Error: cannot read infile:', err, file=sys.stderr)
             return 1
-        if 'encoding' not in parsed_opts:
-            code = guess_decode(code)
+        if not inencoding:
+            code, inencoding = guess_decode(code)
 
+        # do we have to guess the lexer?
         if not lexer:
             try:
                 lexer = get_lexer_for_filename(infn, code, **parsed_opts)
@@ -405,18 +403,16 @@ def main(args=sys.argv):
                 return 1
 
     else:
-        if 'encoding' in parsed_opts:
-            if sys.version_info > (3,):
-                # Python 3: we have to use .buffer
-                code = sys.stdin.buffer.read()
-            else:
-                code = sys.stdin.read()
-            # the lexer will do the decoding
+        # read code from terminal, always in binary mode since we want to
+        # decode ourselves and be tolerant with it
+        if sys.version_info > (3,):
+            # Python 3: we have to use .buffer to get a binary stream
+            code = sys.stdin.buffer.read()
         else:
             code = sys.stdin.read()
-            if not isinstance(code, text_type):
-                # Python 2; Python 3's terminal is already fine
-                code = code.decode(_get_termencoding()[0])
+        if not inencoding:
+            code, inencoding = guess_decode_from_terminal(code, sys.stdin)
+        # otherwise (explicit input encoding) the lexer will do the decoding
         if not lexer:
             try:
                 lexer = guess_lexer(code, **parsed_opts)
@@ -432,20 +428,14 @@ def main(args=sys.argv):
         right = escapeinside[1]
         lexer = LatexEmbeddedLexer(left, right, lexer)
 
-    # No encoding given? Use latin1 if output file given,
-    # stdin/stdout encoding otherwise.
-    # (This is a compromise, I'm not too happy with it...)
-    if 'encoding' not in parsed_opts and 'outencoding' not in parsed_opts:
+    # determine output encoding if not explicitly selected
+    if not outencoding:
         if outfn:
-            # encoding pass-through
-            fmter.encoding = 'latin1'
+            # output file? -> encoding pass-through
+            fmter.encoding = inencoding
         else:
-            if sys.version_info < (3,):
-                # use terminal encoding; Python 3's terminals already do that
-                lexer.encoding, fmter.encoding = _get_termencoding()
-    elif not outfn and sys.version_info > (3,):
-        # output to terminal with encoding -> use .buffer
-        outfile = sys.stdout.buffer
+            # else use terminal encoding
+            fmter.encoding = terminal_encoding(sys.stdout)
 
     # ... and do it!
     try:
diff --git a/pygments/lexer.py b/pygments/lexer.py
index d93cb284..5b3ad358 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -149,7 +149,7 @@ class Lexer(object):
         """
         if not isinstance(text, text_type):
             if self.encoding == 'guess':
-                text = guess_decode(text)
+                text, _ = guess_decode(text)
             elif self.encoding == 'chardet':
                 try:
                     import chardet
diff --git a/pygments/util.py b/pygments/util.py
index abf1cab8..8376a67f 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -282,16 +282,41 @@ def guess_decode(text):
     """
     try:
         text = text.decode('utf-8')
+        return text, 'utf-8'
     except UnicodeDecodeError:
         try:
             import locale
-            text = text.decode(locale.getpreferredencoding())
+            prefencoding = locale.getpreferredencoding()
+            text = text.decode(prefencoding)  # decode with what we report
+            return text, prefencoding
         except (UnicodeDecodeError, LookupError):
             text = text.decode('latin1')
-    else:
-        if text.startswith(u'\ufeff'):
-            text = text[len(u'\ufeff'):]
-    return text
+            return text, 'latin1'
+
+
+def guess_decode_from_terminal(text, term):
+    """Decode *text* coming from terminal *term*; return (text, encoding).
+
+    First try the terminal encoding, if given (and known to Python).
+    Then try UTF-8.  Then try the preferred locale encoding.
+    Fall back to latin-1, which always works.
+    """
+    if getattr(term, 'encoding', None):
+        try:
+            text = text.decode(term.encoding)
+        except (UnicodeDecodeError, LookupError):
+            pass
+        else:
+            return text, term.encoding
+    return guess_decode(text)
+
+
+def terminal_encoding(term):
+    """Pick an encoding for *term*: its own, else the locale's preferred."""
+    import locale
+    declared = getattr(term, 'encoding', None)
+    # a missing, None or empty attribute all mean "the stream didn't say"
+    return declared or locale.getpreferredencoding()
 
 
 # Python 2/3 compatibility
diff --git a/tests/test_cmdline.py b/tests/test_cmdline.py
index e4953f7e..9e26ce17 100644
--- a/tests/test_cmdline.py
+++ b/tests/test_cmdline.py
@@ -7,14 +7,14 @@
     :license: BSD, see LICENSE for details.
 """
 
-# Test the command line interface
+from __future__ import print_function
 
 import io
 import sys
 import unittest
 
 from pygments import highlight
-from pygments.util import StringIO
+from pygments.util import StringIO, BytesIO
 from pygments.cmdline import main as cmdline_main
 
 import support
@@ -25,14 +25,24 @@ TESTFILE, TESTDIR = support.location(__file__)
 def run_cmdline(*args):
     saved_stdout = sys.stdout
     saved_stderr = sys.stderr
-    new_stdout = sys.stdout = StringIO()
-    new_stderr = sys.stderr = StringIO()
+    if sys.version_info > (3,):
+        stdout_buffer = BytesIO()
+        stderr_buffer = BytesIO()
+        new_stdout = sys.stdout = io.TextIOWrapper(stdout_buffer, 'utf-8')
+        new_stderr = sys.stderr = io.TextIOWrapper(stderr_buffer, 'utf-8')
+    else:
+        stdout_buffer = new_stdout = sys.stdout = StringIO()
+        stderr_buffer = new_stderr = sys.stderr = StringIO()
     try:
         ret = cmdline_main(["pygmentize"] + list(args))
     finally:
         sys.stdout = saved_stdout
         sys.stderr = saved_stderr
-    return (ret, new_stdout.getvalue(), new_stderr.getvalue())
+    new_stdout.flush()  # push pending wrapper text into the byte buffer
+    new_stderr.flush()
+    out, err = stdout_buffer.getvalue().decode('utf-8'), \
+        stderr_buffer.getvalue().decode('utf-8')
+    return (ret, out, err)
 
 
 class CmdLineTest(unittest.TestCase):
@@ -83,7 +93,7 @@ class CmdLineTest(unittest.TestCase):
     def test_invalid_opts(self):
         for opts in [("-L", "-lpy"), ("-L", "-fhtml"), ("-L", "-Ox"),
                      ("-a",), ("-Sst", "-lpy"), ("-H",),
-                     ("-H", "formatter"),]:
+                     ("-H", "formatter")]:
             self.assertTrue(run_cmdline(*opts)[0] == 2)
 
     def test_normal(self):