diff options
author | Georg Brandl <georg@python.org> | 2014-10-07 14:45:58 +0200 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2014-10-07 14:45:58 +0200 |
commit | e3b37e0dc370ec4671ffb09030cf7e723438b83b (patch) | |
tree | 9987f1dfd5637c17186d6d3b9cd251cf5afbc97e /pygments/cmdline.py | |
parent | 9a51e6a6df8a56aebede133687e91e519a186122 (diff) | |
download | pygments-e3b37e0dc370ec4671ffb09030cf7e723438b83b.tar.gz |
Closes #979: improve encoding behavior of cmdline invocations
Now the preferred locale encoding is used for stdin/stdout if the encoding is not set
on the file objects.
Also, if no encoding option is given, code read from input files is decoded by trying
UTF-8 first, then the locale encoding, and finally latin-1 as a last resort.
Diffstat (limited to 'pygments/cmdline.py')
-rw-r--r-- | pygments/cmdline.py | 37 |
1 files changed, 26 insertions, 11 deletions
diff --git a/pygments/cmdline.py b/pygments/cmdline.py index 035f8c87..b6e83c24 100644 --- a/pygments/cmdline.py +++ b/pygments/cmdline.py @@ -16,13 +16,14 @@ import getopt from textwrap import dedent from pygments import __version__, highlight -from pygments.util import ClassNotFound, OptionError, docstring_headline -from pygments.lexers import get_all_lexers, get_lexer_by_name, get_lexer_for_filename, \ - find_lexer_class, guess_lexer, TextLexer +from pygments.util import ClassNotFound, OptionError, docstring_headline, \ + text_type, guess_decode +from pygments.lexers import get_all_lexers, get_lexer_by_name, guess_lexer, \ + get_lexer_for_filename, find_lexer_class, TextLexer from pygments.formatters.latex import LatexEmbeddedLexer, LatexFormatter from pygments.formatters import get_all_formatters, get_formatter_by_name, \ - get_formatter_for_filename, find_formatter_class, \ - TerminalFormatter # pylint:disable-msg=E0611 + get_formatter_for_filename, find_formatter_class, \ + TerminalFormatter # pylint:disable-msg=E0611 from pygments.filters import get_all_filters, find_filter_class from pygments.styles import get_all_styles, get_style_by_name @@ -188,6 +189,18 @@ def _print_list(what): print(" %s" % docstring_headline(cls)) +def _get_termencoding(): + """Return terminal encoding for stdin/stdout. + + Defaults to preferred locale encoding. + """ + import locale + defencoding = locale.getpreferredencoding() + inencoding = getattr(sys.stdin, 'encoding', None) or defencoding + outencoding = getattr(sys.stdout, 'encoding', None) or defencoding + return inencoding, outencoding + + def main(args=sys.argv): """ Main command line entry point. 
@@ -376,6 +389,8 @@ def main(args=sys.argv): except Exception as err: print('Error: cannot read infile:', err, file=sys.stderr) return 1 + if 'encoding' not in parsed_opts: + code = guess_decode(code) if not lexer: try: @@ -401,11 +416,14 @@ def main(args=sys.argv): except ClassNotFound: lexer = TextLexer(**parsed_opts) elif not lexer: - print('Error: no lexer name given and reading ' + \ - 'from stdin (try using -g or -l <lexer>)', file=sys.stderr) + print('Error: no lexer name given and reading ' + 'from stdin (try using -g or -l <lexer>)', file=sys.stderr) return 2 else: code = sys.stdin.read() + if not isinstance(code, text_type): + # Python 2; Python 3's terminal is already fine + code = code.decode(_get_termencoding()[0]) # When using the LaTeX formatter and the option `escapeinside` is # specified, we need a special lexer which collects escaped text @@ -426,10 +444,7 @@ def main(args=sys.argv): else: if sys.version_info < (3,): # use terminal encoding; Python 3's terminals already do that - lexer.encoding = getattr(sys.stdin, 'encoding', - None) or 'ascii' - fmter.encoding = getattr(sys.stdout, 'encoding', - None) or 'ascii' + lexer.encoding, fmter.encoding = _get_termencoding() elif not outfn and sys.version_info > (3,): # output to terminal with encoding -> use .buffer outfile = sys.stdout.buffer |