diff options
-rw-r--r-- | pygments/cmdline.py | 66 | ||||
-rw-r--r-- | pygments/lexer.py | 2 | ||||
-rw-r--r-- | pygments/util.py | 35 | ||||
-rw-r--r-- | tests/test_cmdline.py | 22 |
4 files changed, 75 insertions, 50 deletions
diff --git a/pygments/cmdline.py b/pygments/cmdline.py index 20a3ae04..c687e12b 100644 --- a/pygments/cmdline.py +++ b/pygments/cmdline.py @@ -17,7 +17,7 @@ from textwrap import dedent from pygments import __version__, highlight from pygments.util import ClassNotFound, OptionError, docstring_headline, \ - text_type, guess_decode + guess_decode, guess_decode_from_terminal, terminal_encoding from pygments.lexers import get_all_lexers, get_lexer_by_name, guess_lexer, \ get_lexer_for_filename, find_lexer_class, TextLexer from pygments.formatters.latex import LatexEmbeddedLexer, LatexFormatter @@ -189,18 +189,6 @@ def _print_list(what): print(" %s" % docstring_headline(cls)) -def _get_termencoding(): - """Return terminal encoding for stdin/stdout. - - Defaults to preferred locale encoding. - """ - import locale - defencoding = locale.getpreferredencoding() - inencoding = getattr(sys.stdin, 'encoding', None) or defencoding - outencoding = getattr(sys.stdout, 'encoding', None) or defencoding - return inencoding, outencoding - - def main(args=sys.argv): """ Main command line entry point. 
@@ -287,6 +275,10 @@ def main(args=sys.argv): parsed_opts[name] = value opts.pop('-P', None) + # encodings + inencoding = parsed_opts.get('inencoding', parsed_opts.get('encoding')) + outencoding = parsed_opts.get('outencoding', parsed_opts.get('encoding')) + # handle ``pygmentize -N`` infn = opts.pop('-N', None) if infn is not None: @@ -362,7 +354,11 @@ def main(args=sys.argv): else: if not fmter: fmter = TerminalFormatter(**parsed_opts) - outfile = sys.stdout + if sys.version_info > (3,): + # Python 3: we have to use .buffer to get a binary stream + outfile = sys.stdout.buffer + else: + outfile = sys.stdout # select lexer lexer = opts.pop('-l', None) @@ -373,6 +369,7 @@ def main(args=sys.argv): print('Error:', err, file=sys.stderr) return 1 + # read input code if args: if len(args) > 1: print(usage, file=sys.stderr) @@ -385,9 +382,10 @@ def main(args=sys.argv): except Exception as err: print('Error: cannot read infile:', err, file=sys.stderr) return 1 - if 'encoding' not in parsed_opts: - code = guess_decode(code) + if not inencoding: + code, inencoding = guess_decode(code) + # do we have to guess the lexer? 
if not lexer: try: lexer = get_lexer_for_filename(infn, code, **parsed_opts) @@ -405,18 +403,16 @@ def main(args=sys.argv): return 1 else: - if 'encoding' in parsed_opts: - if sys.version_info > (3,): - # Python 3: we have to use .buffer - code = sys.stdin.buffer.read() - else: - code = sys.stdin.read() - # the lexer will do the decoding + # read code from terminal, always in binary mode since we want to + # decode ourselves and be tolerant with it + if sys.version_info > (3,): + # Python 3: we have to use .buffer to get a binary stream + code = sys.stdin.buffer.read() else: code = sys.stdin.read() - if not isinstance(code, text_type): - # Python 2; Python 3's terminal is already fine - code = code.decode(_get_termencoding()[0]) + if not inencoding: + code, inencoding = guess_decode_from_terminal(code, sys.stdin) + # else the lexer will do the decoding if not lexer: try: lexer = guess_lexer(code, **parsed_opts) @@ -432,20 +428,14 @@ def main(args=sys.argv): right = escapeinside[1] lexer = LatexEmbeddedLexer(left, right, lexer) - # No encoding given? Use latin1 if output file given, - # stdin/stdout encoding otherwise. - # (This is a compromise, I'm not too happy with it...) - if 'encoding' not in parsed_opts and 'outencoding' not in parsed_opts: + # determine output encoding if not explicitly selected + if not outencoding: if outfn: - # encoding pass-through - fmter.encoding = 'latin1' + # output file? -> encoding pass-through + fmter.encoding = inencoding else: - if sys.version_info < (3,): - # use terminal encoding; Python 3's terminals already do that - lexer.encoding, fmter.encoding = _get_termencoding() - elif not outfn and sys.version_info > (3,): - # output to terminal with encoding -> use .buffer - outfile = sys.stdout.buffer + # else use terminal encoding + fmter.encoding = terminal_encoding(sys.stdout) # ... and do it! 
def guess_decode(text):
    """Decode the byte string *text* with a guessed encoding.

    First try UTF-8.  Then try the preferred locale encoding.
    Fall back to latin-1, which always works.

    Returns a tuple ``(decoded_text, encoding_name)`` where
    *encoding_name* is the encoding that was actually used to decode.
    """
    try:
        return text.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        pass
    try:
        import locale
        prefencoding = locale.getpreferredencoding()
        # Decode with the very encoding we report back.  A bare
        # ``text.decode()`` would use the interpreter default instead,
        # so the returned name could disagree with the decoding applied.
        return text.decode(prefencoding), prefencoding
    except (UnicodeDecodeError, LookupError):
        return text.decode('latin1'), 'latin1'


def guess_decode_from_terminal(text, term):
    """Decode *text* coming from terminal *term*.

    First try the terminal encoding, if given.
    Then try UTF-8.  Then try the preferred locale encoding.
    Fall back to latin-1, which always works.

    Returns a tuple ``(decoded_text, encoding_name)``.
    """
    encoding = getattr(term, 'encoding', None)
    if encoding:
        try:
            return text.decode(encoding), encoding
        except (UnicodeDecodeError, LookupError):
            # Undecodable input or an encoding name Python does not know:
            # fall through to the generic guessing below.
            pass
    return guess_decode(text)


def terminal_encoding(term):
    """Return our best guess of encoding for the given *term*.

    Uses ``term.encoding`` when it is set to a non-empty value, otherwise
    the preferred locale encoding.
    """
    encoding = getattr(term, 'encoding', None)
    if encoding:
        return encoding
    import locale
    return locale.getpreferredencoding()
def run_cmdline(*args):
    """Run the pygmentize command line with *args* and capture its output.

    ``sys.stdout`` and ``sys.stderr`` are replaced for the duration of the
    call and restored afterwards, even if the call raises.

    Returns a tuple ``(return_code, stdout_text, stderr_text)``.
    """
    saved_stdout = sys.stdout
    saved_stderr = sys.stderr
    if sys.version_info > (3,):
        # Python 3: wrap byte buffers in text streams so the replacement
        # streams expose a ``.buffer`` attribute like the real ones.
        stdout_buffer = BytesIO()
        stderr_buffer = BytesIO()
        # Pin the wrapper encoding to UTF-8: the captured bytes are decoded
        # as UTF-8 below, and the wrapper default is locale dependent.
        new_stdout = sys.stdout = io.TextIOWrapper(stdout_buffer,
                                                   encoding='utf-8')
        new_stderr = sys.stderr = io.TextIOWrapper(stderr_buffer,
                                                   encoding='utf-8')
    else:
        stdout_buffer = new_stdout = sys.stdout = StringIO()
        stderr_buffer = new_stderr = sys.stderr = StringIO()
    try:
        ret = cmdline_main(["pygmentize"] + list(args))
    finally:
        sys.stdout = saved_stdout
        sys.stderr = saved_stderr
    # Push any text still held by the wrappers into the underlying buffers
    # before reading them back.
    new_stdout.flush()
    new_stderr.flush()
    out = stdout_buffer.getvalue().decode('utf-8')
    err = stderr_buffer.getvalue().decode('utf-8')
    return (ret, out, err)