diff options
author | Georg Brandl <georg@python.org> | 2014-10-08 01:20:11 +0200 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2014-10-08 01:20:11 +0200 |
commit | 484583e428efde3dbea4980ffeafc53d4fe37935 (patch) | |
tree | 4102a9b4462a6069eb55bff5009a52e8e35f2314 | |
parent | c0ffb8a5babc8e6d1c58b92810f1cc11ae96ff85 (diff) | |
download | pygments-484583e428efde3dbea4980ffeafc53d4fe37935.tar.gz |
Overhaul encoding handling in cmdline even more.
The encoding guessed for the input file is now also used for the output file.
We now always read from and write to the terminal's .buffer on Python 3, which
allows us to override the terminal encoding and use our own guessing algorithm.
-rw-r--r-- | pygments/cmdline.py | 66 | ||||
-rw-r--r-- | pygments/lexer.py | 2 | ||||
-rw-r--r-- | pygments/util.py | 35 | ||||
-rw-r--r-- | tests/test_cmdline.py | 22 |
4 files changed, 75 insertions, 50 deletions
diff --git a/pygments/cmdline.py b/pygments/cmdline.py index 20a3ae04..c687e12b 100644 --- a/pygments/cmdline.py +++ b/pygments/cmdline.py @@ -17,7 +17,7 @@ from textwrap import dedent from pygments import __version__, highlight from pygments.util import ClassNotFound, OptionError, docstring_headline, \ - text_type, guess_decode + guess_decode, guess_decode_from_terminal, terminal_encoding from pygments.lexers import get_all_lexers, get_lexer_by_name, guess_lexer, \ get_lexer_for_filename, find_lexer_class, TextLexer from pygments.formatters.latex import LatexEmbeddedLexer, LatexFormatter @@ -189,18 +189,6 @@ def _print_list(what): print(" %s" % docstring_headline(cls)) -def _get_termencoding(): - """Return terminal encoding for stdin/stdout. - - Defaults to preferred locale encoding. - """ - import locale - defencoding = locale.getpreferredencoding() - inencoding = getattr(sys.stdin, 'encoding', None) or defencoding - outencoding = getattr(sys.stdout, 'encoding', None) or defencoding - return inencoding, outencoding - - def main(args=sys.argv): """ Main command line entry point. 
@@ -287,6 +275,10 @@ def main(args=sys.argv): parsed_opts[name] = value opts.pop('-P', None) + # encodings + inencoding = parsed_opts.get('inencoding', parsed_opts.get('encoding')) + outencoding = parsed_opts.get('outencoding', parsed_opts.get('encoding')) + # handle ``pygmentize -N`` infn = opts.pop('-N', None) if infn is not None: @@ -362,7 +354,11 @@ def main(args=sys.argv): else: if not fmter: fmter = TerminalFormatter(**parsed_opts) - outfile = sys.stdout + if sys.version_info > (3,): + # Python 3: we have to use .buffer to get a binary stream + outfile = sys.stdout.buffer + else: + outfile = sys.stdout # select lexer lexer = opts.pop('-l', None) @@ -373,6 +369,7 @@ def main(args=sys.argv): print('Error:', err, file=sys.stderr) return 1 + # read input code if args: if len(args) > 1: print(usage, file=sys.stderr) @@ -385,9 +382,10 @@ def main(args=sys.argv): except Exception as err: print('Error: cannot read infile:', err, file=sys.stderr) return 1 - if 'encoding' not in parsed_opts: - code = guess_decode(code) + if not inencoding: + code, inencoding = guess_decode(code) + # do we have to guess the lexer? 
if not lexer: try: lexer = get_lexer_for_filename(infn, code, **parsed_opts) @@ -405,18 +403,16 @@ def main(args=sys.argv): return 1 else: - if 'encoding' in parsed_opts: - if sys.version_info > (3,): - # Python 3: we have to use .buffer - code = sys.stdin.buffer.read() - else: - code = sys.stdin.read() - # the lexer will do the decoding + # read code from terminal, always in binary mode since we want to + # decode ourselves and be tolerant with it + if sys.version_info > (3,): + # Python 3: we have to use .buffer to get a binary stream + code = sys.stdin.buffer.read() else: code = sys.stdin.read() - if not isinstance(code, text_type): - # Python 2; Python 3's terminal is already fine - code = code.decode(_get_termencoding()[0]) + if not inencoding: + code, inencoding = guess_decode_from_terminal(code, sys.stdin) + # else the lexer will do the decoding if not lexer: try: lexer = guess_lexer(code, **parsed_opts) @@ -432,20 +428,14 @@ def main(args=sys.argv): right = escapeinside[1] lexer = LatexEmbeddedLexer(left, right, lexer) - # No encoding given? Use latin1 if output file given, - # stdin/stdout encoding otherwise. - # (This is a compromise, I'm not too happy with it...) - if 'encoding' not in parsed_opts and 'outencoding' not in parsed_opts: + # determine output encoding if not explicitly selected + if not outencoding: if outfn: - # encoding pass-through - fmter.encoding = 'latin1' + # output file? -> encoding pass-through + fmter.encoding = inencoding else: - if sys.version_info < (3,): - # use terminal encoding; Python 3's terminals already do that - lexer.encoding, fmter.encoding = _get_termencoding() - elif not outfn and sys.version_info > (3,): - # output to terminal with encoding -> use .buffer - outfile = sys.stdout.buffer + # else use terminal encoding + fmter.encoding = terminal_encoding(sys.stdout) # ... and do it! 
try: diff --git a/pygments/lexer.py b/pygments/lexer.py index d93cb284..5b3ad358 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -149,7 +149,7 @@ class Lexer(object): """ if not isinstance(text, text_type): if self.encoding == 'guess': - text = guess_decode(text) + text, _ = guess_decode(text) elif self.encoding == 'chardet': try: import chardet diff --git a/pygments/util.py b/pygments/util.py index abf1cab8..8376a67f 100644 --- a/pygments/util.py +++ b/pygments/util.py @@ -282,16 +282,41 @@ def guess_decode(text): """ try: text = text.decode('utf-8') + return text, 'utf-8' except UnicodeDecodeError: try: import locale - text = text.decode(locale.getpreferredencoding()) + prefencoding = locale.getpreferredencoding() + text = text.decode() + return text, prefencoding except (UnicodeDecodeError, LookupError): text = text.decode('latin1') - else: - if text.startswith(u'\ufeff'): - text = text[len(u'\ufeff'):] - return text + return text, 'latin1' + + +def guess_decode_from_terminal(text, term): + """Decode *text* coming from terminal *term*. + + First try the terminal encoding, if given. + Then try UTF-8. Then try the preferred locale encoding. + Fall back to latin-1, which always works. + """ + if getattr(term, 'encoding', None): + try: + text = text.decode(term.encoding) + except UnicodeDecodeError: + pass + else: + return text, term.encoding + return guess_decode(text) + + +def terminal_encoding(term): + """Return our best guess of encoding for the given *term*.""" + if getattr(term, 'encoding', None): + return term.encoding + import locale + return locale.getpreferredencoding() # Python 2/3 compatibility diff --git a/tests/test_cmdline.py b/tests/test_cmdline.py index e4953f7e..9e26ce17 100644 --- a/tests/test_cmdline.py +++ b/tests/test_cmdline.py @@ -7,14 +7,14 @@ :license: BSD, see LICENSE for details. 
""" -# Test the command line interface +from __future__ import print_function import io import sys import unittest from pygments import highlight -from pygments.util import StringIO +from pygments.util import StringIO, BytesIO from pygments.cmdline import main as cmdline_main import support @@ -25,14 +25,24 @@ TESTFILE, TESTDIR = support.location(__file__) def run_cmdline(*args): saved_stdout = sys.stdout saved_stderr = sys.stderr - new_stdout = sys.stdout = StringIO() - new_stderr = sys.stderr = StringIO() + if sys.version_info > (3,): + stdout_buffer = BytesIO() + stderr_buffer = BytesIO() + new_stdout = sys.stdout = io.TextIOWrapper(stdout_buffer) + new_stderr = sys.stderr = io.TextIOWrapper(stderr_buffer) + else: + stdout_buffer = new_stdout = sys.stdout = StringIO() + stderr_buffer = new_stderr = sys.stderr = StringIO() try: ret = cmdline_main(["pygmentize"] + list(args)) finally: sys.stdout = saved_stdout sys.stderr = saved_stderr - return (ret, new_stdout.getvalue(), new_stderr.getvalue()) + new_stdout.flush() + new_stderr.flush() + out, err = stdout_buffer.getvalue().decode('utf-8'), \ + stderr_buffer.getvalue().decode('utf-8') + return (ret, out, err) class CmdLineTest(unittest.TestCase): @@ -83,7 +93,7 @@ class CmdLineTest(unittest.TestCase): def test_invalid_opts(self): for opts in [("-L", "-lpy"), ("-L", "-fhtml"), ("-L", "-Ox"), ("-a",), ("-Sst", "-lpy"), ("-H",), - ("-H", "formatter"),]: + ("-H", "formatter")]: self.assertTrue(run_cmdline(*opts)[0] == 2) def test_normal(self): |