summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Georg Brandl <georg@python.org> 2014-10-08 01:20:11 +0200
committer Georg Brandl <georg@python.org> 2014-10-08 01:20:11 +0200
commit 484583e428efde3dbea4980ffeafc53d4fe37935 (patch)
tree 4102a9b4462a6069eb55bff5009a52e8e35f2314
parent c0ffb8a5babc8e6d1c58b92810f1cc11ae96ff85 (diff)
download pygments-484583e428efde3dbea4980ffeafc53d4fe37935.tar.gz
Overhaul encoding handling in cmdline even more.
The encoding guessed for the input file is now also used for the output file when no output encoding is given explicitly. On Python 3 we now always read from and write to the terminal's binary `.buffer` streams, which allows us to override the terminal encoding and use our own guessing algorithm.
-rw-r--r-- pygments/cmdline.py 66
-rw-r--r-- pygments/lexer.py 2
-rw-r--r-- pygments/util.py 35
-rw-r--r-- tests/test_cmdline.py 22
4 files changed, 75 insertions, 50 deletions
diff --git a/pygments/cmdline.py b/pygments/cmdline.py
index 20a3ae04..c687e12b 100644
--- a/pygments/cmdline.py
+++ b/pygments/cmdline.py
@@ -17,7 +17,7 @@ from textwrap import dedent
from pygments import __version__, highlight
from pygments.util import ClassNotFound, OptionError, docstring_headline, \
- text_type, guess_decode
+ guess_decode, guess_decode_from_terminal, terminal_encoding
from pygments.lexers import get_all_lexers, get_lexer_by_name, guess_lexer, \
get_lexer_for_filename, find_lexer_class, TextLexer
from pygments.formatters.latex import LatexEmbeddedLexer, LatexFormatter
@@ -189,18 +189,6 @@ def _print_list(what):
print(" %s" % docstring_headline(cls))
-def _get_termencoding():
- """Return terminal encoding for stdin/stdout.
-
- Defaults to preferred locale encoding.
- """
- import locale
- defencoding = locale.getpreferredencoding()
- inencoding = getattr(sys.stdin, 'encoding', None) or defencoding
- outencoding = getattr(sys.stdout, 'encoding', None) or defencoding
- return inencoding, outencoding
-
-
def main(args=sys.argv):
"""
Main command line entry point.
@@ -287,6 +275,10 @@ def main(args=sys.argv):
parsed_opts[name] = value
opts.pop('-P', None)
+ # encodings
+ inencoding = parsed_opts.get('inencoding', parsed_opts.get('encoding'))
+ outencoding = parsed_opts.get('outencoding', parsed_opts.get('encoding'))
+
# handle ``pygmentize -N``
infn = opts.pop('-N', None)
if infn is not None:
@@ -362,7 +354,11 @@ def main(args=sys.argv):
else:
if not fmter:
fmter = TerminalFormatter(**parsed_opts)
- outfile = sys.stdout
+ if sys.version_info > (3,):
+ # Python 3: we have to use .buffer to get a binary stream
+ outfile = sys.stdout.buffer
+ else:
+ outfile = sys.stdout
# select lexer
lexer = opts.pop('-l', None)
@@ -373,6 +369,7 @@ def main(args=sys.argv):
print('Error:', err, file=sys.stderr)
return 1
+ # read input code
if args:
if len(args) > 1:
print(usage, file=sys.stderr)
@@ -385,9 +382,10 @@ def main(args=sys.argv):
except Exception as err:
print('Error: cannot read infile:', err, file=sys.stderr)
return 1
- if 'encoding' not in parsed_opts:
- code = guess_decode(code)
+ if not inencoding:
+ code, inencoding = guess_decode(code)
+ # do we have to guess the lexer?
if not lexer:
try:
lexer = get_lexer_for_filename(infn, code, **parsed_opts)
@@ -405,18 +403,16 @@ def main(args=sys.argv):
return 1
else:
- if 'encoding' in parsed_opts:
- if sys.version_info > (3,):
- # Python 3: we have to use .buffer
- code = sys.stdin.buffer.read()
- else:
- code = sys.stdin.read()
- # the lexer will do the decoding
+ # read code from terminal, always in binary mode since we want to
+ # decode ourselves and be tolerant with it
+ if sys.version_info > (3,):
+ # Python 3: we have to use .buffer to get a binary stream
+ code = sys.stdin.buffer.read()
else:
code = sys.stdin.read()
- if not isinstance(code, text_type):
- # Python 2; Python 3's terminal is already fine
- code = code.decode(_get_termencoding()[0])
+ if not inencoding:
+ code, inencoding = guess_decode_from_terminal(code, sys.stdin)
+ # else the lexer will do the decoding
if not lexer:
try:
lexer = guess_lexer(code, **parsed_opts)
@@ -432,20 +428,14 @@ def main(args=sys.argv):
right = escapeinside[1]
lexer = LatexEmbeddedLexer(left, right, lexer)
- # No encoding given? Use latin1 if output file given,
- # stdin/stdout encoding otherwise.
- # (This is a compromise, I'm not too happy with it...)
- if 'encoding' not in parsed_opts and 'outencoding' not in parsed_opts:
+ # determine output encoding if not explicitly selected
+ if not outencoding:
if outfn:
- # encoding pass-through
- fmter.encoding = 'latin1'
+ # output file? -> encoding pass-through
+ fmter.encoding = inencoding
else:
- if sys.version_info < (3,):
- # use terminal encoding; Python 3's terminals already do that
- lexer.encoding, fmter.encoding = _get_termencoding()
- elif not outfn and sys.version_info > (3,):
- # output to terminal with encoding -> use .buffer
- outfile = sys.stdout.buffer
+ # else use terminal encoding
+ fmter.encoding = terminal_encoding(sys.stdout)
# ... and do it!
try:
diff --git a/pygments/lexer.py b/pygments/lexer.py
index d93cb284..5b3ad358 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -149,7 +149,7 @@ class Lexer(object):
"""
if not isinstance(text, text_type):
if self.encoding == 'guess':
- text = guess_decode(text)
+ text, _ = guess_decode(text)
elif self.encoding == 'chardet':
try:
import chardet
diff --git a/pygments/util.py b/pygments/util.py
index abf1cab8..8376a67f 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -282,16 +282,41 @@ def guess_decode(text):
"""
try:
text = text.decode('utf-8')
+ return text, 'utf-8'
except UnicodeDecodeError:
try:
import locale
- text = text.decode(locale.getpreferredencoding())
+ prefencoding = locale.getpreferredencoding()
+ text = text.decode()
+ return text, prefencoding
except (UnicodeDecodeError, LookupError):
text = text.decode('latin1')
- else:
- if text.startswith(u'\ufeff'):
- text = text[len(u'\ufeff'):]
- return text
+ return text, 'latin1'
+
+
+def guess_decode_from_terminal(text, term):
+ """Decode *text* coming from terminal *term*.
+
+ First try the terminal encoding, if given.
+ Then try UTF-8. Then try the preferred locale encoding.
+ Fall back to latin-1, which always works.
+ """
+ if getattr(term, 'encoding', None):
+ try:
+ text = text.decode(term.encoding)
+ except UnicodeDecodeError:
+ pass
+ else:
+ return text, term.encoding
+ return guess_decode(text)
+
+
+def terminal_encoding(term):
+ """Return our best guess of encoding for the given *term*."""
+ if getattr(term, 'encoding', None):
+ return term.encoding
+ import locale
+ return locale.getpreferredencoding()
# Python 2/3 compatibility
diff --git a/tests/test_cmdline.py b/tests/test_cmdline.py
index e4953f7e..9e26ce17 100644
--- a/tests/test_cmdline.py
+++ b/tests/test_cmdline.py
@@ -7,14 +7,14 @@
:license: BSD, see LICENSE for details.
"""
-# Test the command line interface
+from __future__ import print_function
import io
import sys
import unittest
from pygments import highlight
-from pygments.util import StringIO
+from pygments.util import StringIO, BytesIO
from pygments.cmdline import main as cmdline_main
import support
@@ -25,14 +25,24 @@ TESTFILE, TESTDIR = support.location(__file__)
def run_cmdline(*args):
saved_stdout = sys.stdout
saved_stderr = sys.stderr
- new_stdout = sys.stdout = StringIO()
- new_stderr = sys.stderr = StringIO()
+ if sys.version_info > (3,):
+ stdout_buffer = BytesIO()
+ stderr_buffer = BytesIO()
+ new_stdout = sys.stdout = io.TextIOWrapper(stdout_buffer)
+ new_stderr = sys.stderr = io.TextIOWrapper(stderr_buffer)
+ else:
+ stdout_buffer = new_stdout = sys.stdout = StringIO()
+ stderr_buffer = new_stderr = sys.stderr = StringIO()
try:
ret = cmdline_main(["pygmentize"] + list(args))
finally:
sys.stdout = saved_stdout
sys.stderr = saved_stderr
- return (ret, new_stdout.getvalue(), new_stderr.getvalue())
+ new_stdout.flush()
+ new_stderr.flush()
+ out, err = stdout_buffer.getvalue().decode('utf-8'), \
+ stderr_buffer.getvalue().decode('utf-8')
+ return (ret, out, err)
class CmdLineTest(unittest.TestCase):
@@ -83,7 +93,7 @@ class CmdLineTest(unittest.TestCase):
def test_invalid_opts(self):
for opts in [("-L", "-lpy"), ("-L", "-fhtml"), ("-L", "-Ox"),
("-a",), ("-Sst", "-lpy"), ("-H",),
- ("-H", "formatter"),]:
+ ("-H", "formatter")]:
self.assertTrue(run_cmdline(*opts)[0] == 2)
def test_normal(self):