diff options
author | Georg Brandl <georg@python.org> | 2014-10-07 14:45:58 +0200 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2014-10-07 14:45:58 +0200 |
commit | e3b37e0dc370ec4671ffb09030cf7e723438b83b (patch) | |
tree | 9987f1dfd5637c17186d6d3b9cd251cf5afbc97e | |
parent | 9a51e6a6df8a56aebede133687e91e519a186122 (diff) | |
download | pygments-e3b37e0dc370ec4671ffb09030cf7e723438b83b.tar.gz |
Closes #979: improve encoding behavior of cmdline invocations
Now the preferred locale encoding is used for stdin/stdout if the encoding is not set
on the file objects.
Also, if no encoding option is given, code from input files is now decoded by trying
UTF-8 first, then the preferred locale encoding, and finally latin-1 as a last resort.
-rw-r--r-- | pygments/cmdline.py | 37 | ||||
-rw-r--r-- | pygments/lexer.py | 14 | ||||
-rw-r--r-- | pygments/util.py | 21 |
3 files changed, 52 insertions, 20 deletions
diff --git a/pygments/cmdline.py b/pygments/cmdline.py index 035f8c87..b6e83c24 100644 --- a/pygments/cmdline.py +++ b/pygments/cmdline.py @@ -16,13 +16,14 @@ import getopt from textwrap import dedent from pygments import __version__, highlight -from pygments.util import ClassNotFound, OptionError, docstring_headline -from pygments.lexers import get_all_lexers, get_lexer_by_name, get_lexer_for_filename, \ - find_lexer_class, guess_lexer, TextLexer +from pygments.util import ClassNotFound, OptionError, docstring_headline, \ + text_type, guess_decode +from pygments.lexers import get_all_lexers, get_lexer_by_name, guess_lexer, \ + get_lexer_for_filename, find_lexer_class, TextLexer from pygments.formatters.latex import LatexEmbeddedLexer, LatexFormatter from pygments.formatters import get_all_formatters, get_formatter_by_name, \ - get_formatter_for_filename, find_formatter_class, \ - TerminalFormatter # pylint:disable-msg=E0611 + get_formatter_for_filename, find_formatter_class, \ + TerminalFormatter # pylint:disable-msg=E0611 from pygments.filters import get_all_filters, find_filter_class from pygments.styles import get_all_styles, get_style_by_name @@ -188,6 +189,18 @@ def _print_list(what): print(" %s" % docstring_headline(cls)) +def _get_termencoding(): + """Return terminal encoding for stdin/stdout. + + Defaults to preferred locale encoding. + """ + import locale + defencoding = locale.getpreferredencoding() + inencoding = getattr(sys.stdin, 'encoding', None) or defencoding + outencoding = getattr(sys.stdout, 'encoding', None) or defencoding + return inencoding, outencoding + + def main(args=sys.argv): """ Main command line entry point. 
@@ -376,6 +389,8 @@ def main(args=sys.argv): except Exception as err: print('Error: cannot read infile:', err, file=sys.stderr) return 1 + if 'encoding' not in parsed_opts: + code = guess_decode(code) if not lexer: try: @@ -401,11 +416,14 @@ def main(args=sys.argv): except ClassNotFound: lexer = TextLexer(**parsed_opts) elif not lexer: - print('Error: no lexer name given and reading ' + \ - 'from stdin (try using -g or -l <lexer>)', file=sys.stderr) + print('Error: no lexer name given and reading ' + 'from stdin (try using -g or -l <lexer>)', file=sys.stderr) return 2 else: code = sys.stdin.read() + if not isinstance(code, text_type): + # Python 2; Python 3's terminal is already fine + code = code.decode(_get_termencoding()[0]) # When using the LaTeX formatter and the option `escapeinside` is # specified, we need a special lexer which collects escaped text @@ -426,10 +444,7 @@ def main(args=sys.argv): else: if sys.version_info < (3,): # use terminal encoding; Python 3's terminals already do that - lexer.encoding = getattr(sys.stdin, 'encoding', - None) or 'ascii' - fmter.encoding = getattr(sys.stdout, 'encoding', - None) or 'ascii' + lexer.encoding, fmter.encoding = _get_termencoding() elif not outfn and sys.version_info > (3,): # output to terminal with encoding -> use .buffer outfile = sys.stdout.buffer diff --git a/pygments/lexer.py b/pygments/lexer.py index 4a041523..8d781e85 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -20,7 +20,7 @@ from pygments.filter import apply_filters, Filter from pygments.filters import get_filter_by_name from pygments.token import Error, Text, Other, _TokenType from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \ - make_analysator, text_type, add_metaclass, iteritems, Future + make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode from pygments.regexopt import regex_opt __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer', @@ -72,8 +72,9 @@ class Lexer(object): If 
given, must be an encoding name. This encoding will be used to convert the input string to Unicode, if it is not already a Unicode string (default: ``'latin1'``). - Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or - ``'chardet'`` to use the chardet library, if it is installed. + Can also be ``'guess'`` to use a simple UTF-8 / Locale / Latin1 + detection, or ``'chardet'`` to use the chardet library, if it is + installed. """ #: Name of the lexer @@ -146,12 +147,7 @@ class Lexer(object): """ if not isinstance(text, text_type): if self.encoding == 'guess': - try: - text = text.decode('utf-8') - if text.startswith(u'\ufeff'): - text = text[len(u'\ufeff'):] - except UnicodeDecodeError: - text = text.decode('latin1') + text = guess_decode(text) elif self.encoding == 'chardet': try: import chardet diff --git a/pygments/util.py b/pygments/util.py index 6f0c4148..486be579 100644 --- a/pygments/util.py +++ b/pygments/util.py @@ -288,6 +288,27 @@ class Future(object): raise NotImplementedError +def guess_decode(text): + """Decode *text* with guessed encoding. + + First try UTF-8; this should fail for non-UTF-8 encodings. + Then try the preferred locale encoding. + Fall back to latin-1, which always works. + """ + try: + text = text.decode('utf-8') + except UnicodeDecodeError: + try: + import locale + text = text.decode(locale.getpreferredencoding()) + except (UnicodeDecodeError, LookupError): + text = text.decode('latin1') + else: + if text.startswith(u'\ufeff'): + text = text[len(u'\ufeff'):] + return text + + # Python 2/3 compatibility if sys.version_info < (3, 0): |