summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- pygments/cmdline.py 66
-rw-r--r-- pygments/lexer.py 2
-rw-r--r-- pygments/util.py 35
-rw-r--r-- tests/test_cmdline.py 22
4 files changed, 75 insertions, 50 deletions
diff --git a/pygments/cmdline.py b/pygments/cmdline.py
index 20a3ae04..c687e12b 100644
--- a/pygments/cmdline.py
+++ b/pygments/cmdline.py
@@ -17,7 +17,7 @@ from textwrap import dedent
from pygments import __version__, highlight
from pygments.util import ClassNotFound, OptionError, docstring_headline, \
- text_type, guess_decode
+ guess_decode, guess_decode_from_terminal, terminal_encoding
from pygments.lexers import get_all_lexers, get_lexer_by_name, guess_lexer, \
get_lexer_for_filename, find_lexer_class, TextLexer
from pygments.formatters.latex import LatexEmbeddedLexer, LatexFormatter
@@ -189,18 +189,6 @@ def _print_list(what):
print(" %s" % docstring_headline(cls))
-def _get_termencoding():
- """Return terminal encoding for stdin/stdout.
-
- Defaults to preferred locale encoding.
- """
- import locale
- defencoding = locale.getpreferredencoding()
- inencoding = getattr(sys.stdin, 'encoding', None) or defencoding
- outencoding = getattr(sys.stdout, 'encoding', None) or defencoding
- return inencoding, outencoding
-
-
def main(args=sys.argv):
"""
Main command line entry point.
@@ -287,6 +275,10 @@ def main(args=sys.argv):
parsed_opts[name] = value
opts.pop('-P', None)
+ # encodings
+ inencoding = parsed_opts.get('inencoding', parsed_opts.get('encoding'))
+ outencoding = parsed_opts.get('outencoding', parsed_opts.get('encoding'))
+
# handle ``pygmentize -N``
infn = opts.pop('-N', None)
if infn is not None:
@@ -362,7 +354,11 @@ def main(args=sys.argv):
else:
if not fmter:
fmter = TerminalFormatter(**parsed_opts)
- outfile = sys.stdout
+ if sys.version_info > (3,):
+ # Python 3: we have to use .buffer to get a binary stream
+ outfile = sys.stdout.buffer
+ else:
+ outfile = sys.stdout
# select lexer
lexer = opts.pop('-l', None)
@@ -373,6 +369,7 @@ def main(args=sys.argv):
print('Error:', err, file=sys.stderr)
return 1
+ # read input code
if args:
if len(args) > 1:
print(usage, file=sys.stderr)
@@ -385,9 +382,10 @@ def main(args=sys.argv):
except Exception as err:
print('Error: cannot read infile:', err, file=sys.stderr)
return 1
- if 'encoding' not in parsed_opts:
- code = guess_decode(code)
+ if not inencoding:
+ code, inencoding = guess_decode(code)
+ # do we have to guess the lexer?
if not lexer:
try:
lexer = get_lexer_for_filename(infn, code, **parsed_opts)
@@ -405,18 +403,16 @@ def main(args=sys.argv):
return 1
else:
- if 'encoding' in parsed_opts:
- if sys.version_info > (3,):
- # Python 3: we have to use .buffer
- code = sys.stdin.buffer.read()
- else:
- code = sys.stdin.read()
- # the lexer will do the decoding
+ # read code from terminal, always in binary mode since we want to
+ # decode ourselves and be tolerant with it
+ if sys.version_info > (3,):
+ # Python 3: we have to use .buffer to get a binary stream
+ code = sys.stdin.buffer.read()
else:
code = sys.stdin.read()
- if not isinstance(code, text_type):
- # Python 2; Python 3's terminal is already fine
- code = code.decode(_get_termencoding()[0])
+ if not inencoding:
+ code, inencoding = guess_decode_from_terminal(code, sys.stdin)
+ # else the lexer will do the decoding
if not lexer:
try:
lexer = guess_lexer(code, **parsed_opts)
@@ -432,20 +428,14 @@ def main(args=sys.argv):
right = escapeinside[1]
lexer = LatexEmbeddedLexer(left, right, lexer)
- # No encoding given? Use latin1 if output file given,
- # stdin/stdout encoding otherwise.
- # (This is a compromise, I'm not too happy with it...)
- if 'encoding' not in parsed_opts and 'outencoding' not in parsed_opts:
+ # determine output encoding if not explicitly selected
+ if not outencoding:
if outfn:
- # encoding pass-through
- fmter.encoding = 'latin1'
+ # output file? -> encoding pass-through
+ fmter.encoding = inencoding
else:
- if sys.version_info < (3,):
- # use terminal encoding; Python 3's terminals already do that
- lexer.encoding, fmter.encoding = _get_termencoding()
- elif not outfn and sys.version_info > (3,):
- # output to terminal with encoding -> use .buffer
- outfile = sys.stdout.buffer
+ # else use terminal encoding
+ fmter.encoding = terminal_encoding(sys.stdout)
# ... and do it!
try:
diff --git a/pygments/lexer.py b/pygments/lexer.py
index d93cb284..5b3ad358 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -149,7 +149,7 @@ class Lexer(object):
"""
if not isinstance(text, text_type):
if self.encoding == 'guess':
- text = guess_decode(text)
+ text, _ = guess_decode(text)
elif self.encoding == 'chardet':
try:
import chardet
diff --git a/pygments/util.py b/pygments/util.py
index abf1cab8..8376a67f 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -282,16 +282,41 @@ def guess_decode(text):
"""
try:
text = text.decode('utf-8')
+ return text, 'utf-8'
except UnicodeDecodeError:
try:
import locale
- text = text.decode(locale.getpreferredencoding())
+ prefencoding = locale.getpreferredencoding()
+ text = text.decode(prefencoding)
+ return text, prefencoding
except (UnicodeDecodeError, LookupError):
text = text.decode('latin1')
- else:
- if text.startswith(u'\ufeff'):
- text = text[len(u'\ufeff'):]
- return text
+ return text, 'latin1'
+
+
+def guess_decode_from_terminal(text, term):
+ """Decode *text* coming from terminal *term*.
+
+ First try the terminal encoding, if given.
+ Then try UTF-8. Then try the preferred locale encoding.
+ Fall back to latin-1, which always works.
+ """
+ if getattr(term, 'encoding', None):
+ try:
+ text = text.decode(term.encoding)
+ except UnicodeDecodeError:
+ pass
+ else:
+ return text, term.encoding
+ return guess_decode(text)
+
+
+def terminal_encoding(term):
+ """Return our best guess of encoding for the given *term*."""
+ if getattr(term, 'encoding', None):
+ return term.encoding
+ import locale
+ return locale.getpreferredencoding()
# Python 2/3 compatibility
diff --git a/tests/test_cmdline.py b/tests/test_cmdline.py
index e4953f7e..9e26ce17 100644
--- a/tests/test_cmdline.py
+++ b/tests/test_cmdline.py
@@ -7,14 +7,14 @@
:license: BSD, see LICENSE for details.
"""
-# Test the command line interface
+from __future__ import print_function
import io
import sys
import unittest
from pygments import highlight
-from pygments.util import StringIO
+from pygments.util import StringIO, BytesIO
from pygments.cmdline import main as cmdline_main
import support
@@ -25,14 +25,24 @@ TESTFILE, TESTDIR = support.location(__file__)
def run_cmdline(*args):
saved_stdout = sys.stdout
saved_stderr = sys.stderr
- new_stdout = sys.stdout = StringIO()
- new_stderr = sys.stderr = StringIO()
+ if sys.version_info > (3,):
+ stdout_buffer = BytesIO()
+ stderr_buffer = BytesIO()
+ new_stdout = sys.stdout = io.TextIOWrapper(stdout_buffer, 'utf-8')
+ new_stderr = sys.stderr = io.TextIOWrapper(stderr_buffer, 'utf-8')
+ else:
+ stdout_buffer = new_stdout = sys.stdout = StringIO()
+ stderr_buffer = new_stderr = sys.stderr = StringIO()
try:
ret = cmdline_main(["pygmentize"] + list(args))
finally:
sys.stdout = saved_stdout
sys.stderr = saved_stderr
- return (ret, new_stdout.getvalue(), new_stderr.getvalue())
+ new_stdout.flush()
+ new_stderr.flush()
+ out, err = stdout_buffer.getvalue().decode('utf-8'), \
+ stderr_buffer.getvalue().decode('utf-8')
+ return (ret, out, err)
class CmdLineTest(unittest.TestCase):
@@ -83,7 +93,7 @@ class CmdLineTest(unittest.TestCase):
def test_invalid_opts(self):
for opts in [("-L", "-lpy"), ("-L", "-fhtml"), ("-L", "-Ox"),
("-a",), ("-Sst", "-lpy"), ("-H",),
- ("-H", "formatter"),]:
+ ("-H", "formatter")]:
self.assertTrue(run_cmdline(*opts)[0] == 2)
def test_normal(self):