diff options
author | Georg Brandl <georg@python.org> | 2014-11-06 13:18:19 +0100 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2014-11-06 13:18:19 +0100 |
commit | 69e83eb0856666d2594c96b1e8fae42dbeb92318 (patch) | |
tree | 818573e86d101542b1f7536c313443d5baf77dd5 | |
parent | 9053d1b7a1c3ac2c90944fe9d9564e0351dac74f (diff) | |
download | pygments-69e83eb0856666d2594c96b1e8fae42dbeb92318.tar.gz |
Default lexer encoding is now "guess", i.e. UTF-8 / Locale / Latin1 are
tried in that order.
-rw-r--r-- | CHANGES | 3 | ||||
-rw-r--r-- | pygments/formatters/latex.py | 2 | ||||
-rw-r--r-- | pygments/lexer.py | 9 | ||||
-rw-r--r-- | pygments/lexers/__init__.py | 4 |
4 files changed, 10 insertions, 8 deletions
@@ -74,6 +74,9 @@ Version 2.0rc1 * Todo.txt todo lists * Twig (PR#404) +- Default lexer encoding is now "guess", i.e. UTF-8 / Locale / Latin1 are + tried in that order. + - Added a helper to "optimize" regular expressions that match one of many literal words; this can save 20% and more lexing time with lexers that highlight many keywords or builtins. diff --git a/pygments/formatters/latex.py b/pygments/formatters/latex.py index 4b22215f..7a4eeca8 100644 --- a/pygments/formatters/latex.py +++ b/pygments/formatters/latex.py @@ -417,7 +417,7 @@ class LatexFormatter(Formatter): dict(docclass = self.docclass, preamble = self.preamble, title = self.title, - encoding = self.encoding or 'latin1', + encoding = self.encoding or 'utf8', styledefs = self.get_style_defs(), code = outfile.getvalue())) diff --git a/pygments/lexer.py b/pygments/lexer.py index 687d19e3..e6c60015 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -71,10 +71,9 @@ class Lexer(object): ``encoding`` If given, must be an encoding name. This encoding will be used to convert the input string to Unicode, if it is not already a Unicode - string (default: ``'latin1'``). - Can also be ``'guess'`` to use a simple UTF-8 / Locale / Latin1 - detection, or ``'chardet'`` to use the chardet library, if it is - installed. + string (default: ``'guess'``, which uses a simple UTF-8 / Locale / + Latin1 detection). Can also be ``'chardet'`` to use the chardet + library, if it is installed. ``inencoding`` Overrides the ``encoding`` if given. 
""" @@ -103,7 +102,7 @@ class Lexer(object): self.stripall = get_bool_opt(options, 'stripall', False) self.ensurenl = get_bool_opt(options, 'ensurenl', True) self.tabsize = get_int_opt(options, 'tabsize', 0) - self.encoding = options.get('encoding', 'latin1') + self.encoding = options.get('encoding', 'guess') self.encoding = options.get('inencoding') or self.encoding self.filters = [] for filter_ in get_list_opt(options, 'filters', ()): diff --git a/pygments/lexers/__init__.py b/pygments/lexers/__init__.py index 578a9101..039090c5 100644 --- a/pygments/lexers/__init__.py +++ b/pygments/lexers/__init__.py @@ -18,7 +18,7 @@ from os.path import basename from pygments.lexers._mapping import LEXERS from pygments.modeline import get_filetype_from_buffer from pygments.plugin import find_plugin_lexers -from pygments.util import ClassNotFound, itervalues +from pygments.util import ClassNotFound, itervalues, guess_decode __all__ = ['get_lexer_by_name', 'get_lexer_for_filename', 'find_lexer_class', @@ -116,7 +116,7 @@ def find_lexer_class_for_filename(_fn, code=None): if sys.version_info > (3,) and isinstance(code, bytes): # decode it, since all analyse_text functions expect unicode - code = code.decode('latin1') + code = guess_decode(code) def get_rating(info): cls, filename = info |