summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Georg Brandl <georg@python.org> 2014-11-06 13:18:19 +0100
committer Georg Brandl <georg@python.org> 2014-11-06 13:18:19 +0100
commit 69e83eb0856666d2594c96b1e8fae42dbeb92318 (patch)
tree 818573e86d101542b1f7536c313443d5baf77dd5
parent 9053d1b7a1c3ac2c90944fe9d9564e0351dac74f (diff)
download pygments-69e83eb0856666d2594c96b1e8fae42dbeb92318.tar.gz
Default lexer encoding is now "guess", i.e. UTF-8 / Locale / Latin1 is
tried in that order.
-rw-r--r-- CHANGES 3
-rw-r--r-- pygments/formatters/latex.py 2
-rw-r--r-- pygments/lexer.py 9
-rw-r--r-- pygments/lexers/__init__.py 4
4 files changed, 10 insertions, 8 deletions
diff --git a/CHANGES b/CHANGES
index 95f975ac..e8c12970 100644
--- a/CHANGES
+++ b/CHANGES
@@ -74,6 +74,9 @@ Version 2.0rc1
* Todo.txt todo lists
* Twig (PR#404)
+- Default lexer encoding is now "guess", i.e. UTF-8 / Locale / Latin1 is
+ tried in that order.
+
- Added a helper to "optimize" regular expressions that match one of many
literal words; this can save 20% and more lexing time with lexers that
highlight many keywords or builtins.
diff --git a/pygments/formatters/latex.py b/pygments/formatters/latex.py
index 4b22215f..7a4eeca8 100644
--- a/pygments/formatters/latex.py
+++ b/pygments/formatters/latex.py
@@ -417,7 +417,7 @@ class LatexFormatter(Formatter):
dict(docclass = self.docclass,
preamble = self.preamble,
title = self.title,
- encoding = self.encoding or 'latin1',
+ encoding = self.encoding or 'utf8',
styledefs = self.get_style_defs(),
code = outfile.getvalue()))
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 687d19e3..e6c60015 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -71,10 +71,9 @@ class Lexer(object):
``encoding``
If given, must be an encoding name. This encoding will be used to
convert the input string to Unicode, if it is not already a Unicode
- string (default: ``'latin1'``).
- Can also be ``'guess'`` to use a simple UTF-8 / Locale / Latin1
- detection, or ``'chardet'`` to use the chardet library, if it is
- installed.
+ string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
+ Latin1 detection). Can also be ``'chardet'`` to use the chardet
+ library, if it is installed.
``inencoding``
Overrides the ``encoding`` if given.
"""
@@ -103,7 +102,7 @@ class Lexer(object):
self.stripall = get_bool_opt(options, 'stripall', False)
self.ensurenl = get_bool_opt(options, 'ensurenl', True)
self.tabsize = get_int_opt(options, 'tabsize', 0)
- self.encoding = options.get('encoding', 'latin1')
+ self.encoding = options.get('encoding', 'guess')
self.encoding = options.get('inencoding') or self.encoding
self.filters = []
for filter_ in get_list_opt(options, 'filters', ()):
diff --git a/pygments/lexers/__init__.py b/pygments/lexers/__init__.py
index 578a9101..039090c5 100644
--- a/pygments/lexers/__init__.py
+++ b/pygments/lexers/__init__.py
@@ -18,7 +18,7 @@ from os.path import basename
from pygments.lexers._mapping import LEXERS
from pygments.modeline import get_filetype_from_buffer
from pygments.plugin import find_plugin_lexers
-from pygments.util import ClassNotFound, itervalues
+from pygments.util import ClassNotFound, itervalues, guess_decode
__all__ = ['get_lexer_by_name', 'get_lexer_for_filename', 'find_lexer_class',
@@ -116,7 +116,7 @@ def find_lexer_class_for_filename(_fn, code=None):
if sys.version_info > (3,) and isinstance(code, bytes):
# decode it, since all analyse_text functions expect unicode
- code = code.decode('latin1')
+ code = guess_decode(code)
def get_rating(info):
cls, filename = info