summary | refs | log | tree | commit | diff
path: root/pygments/lexer.py
diff options
context:
space:
mode:
author gbrandl <devnull@localhost> 2006-11-01 17:15:13 +0100
committer gbrandl <devnull@localhost> 2006-11-01 17:15:13 +0100
commit aa1e3486962c93755e98ade258a4165d87a3e997 (patch)
tree 4f9a7e424c6f7d3b041dfb19f4c745f647331b13 /pygments/lexer.py
parent 265d6f0dcd3cc74cdc0376c10d371824b486c0da (diff)
download pygments-aa1e3486962c93755e98ade258a4165d87a3e997.tar.gz
[svn] Support for encoding guessing.
Diffstat (limited to 'pygments/lexer.py')
-rw-r--r-- pygments/lexer.py 20
1 files changed, 19 insertions, 1 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index d3a05f0b..ffaeeb55 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -54,6 +54,8 @@ class Lexer(object):
If given, must be an encoding name. This encoding will be used to
convert the input string to Unicode, if it is not already a Unicode
string. The default is to use latin1 (default: 'latin1').
+ Can also be 'guess' to use a simple UTF-8 / Latin1 detection, or
+ 'chardet' to use the chardet library, if it is installed.
"""
#: Name of the lexer
@@ -110,7 +112,23 @@ class Lexer(object):
if isinstance(text, unicode):
text = u'\n'.join(text.splitlines())
else:
- text = '\n'.join(text.splitlines()).decode(self.encoding)
+ text = '\n'.join(text.splitlines())
+ if self.encoding == 'guess':
+ try:
+ text = text.decode('utf-8-sig')
+ except UnicodeDecodeError:
+ text = text.decode('latin1')
+ elif self.encoding == 'chardet':
+ try:
+ import chardet
+ except ImportError:
+ raise ImportError('To enable chardet encoding guessing, please '
+ 'install the chardet library from '
+ 'http://chardet.feedparser.org/')
+ enc = chardet.detect(text)
+ text = text.decode(enc['encoding'])
+ else:
+ text = text.decode(self.encoding)
if self.stripall:
text = text.strip()
elif self.stripnl: