[svn] Support for encoding guessing.

author: gbrandl <devnull@localhost> 2006-11-01 17:15:13 +0100
committer: gbrandl <devnull@localhost> 2006-11-01 17:15:13 +0100
commit: aa1e3486962c93755e98ade258a4165d87a3e997 (patch)
tree: 4f9a7e424c6f7d3b041dfb19f4c745f647331b13 /pygments/lexer.py
parent: 265d6f0dcd3cc74cdc0376c10d371824b486c0da (diff)
download: pygments-aa1e3486962c93755e98ade258a4165d87a3e997.tar.gz
1 files changed, 19 insertions, 1 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index d3a05f0b..ffaeeb55 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -54,6 +54,8 @@ class Lexer(object):
         If given, must be an encoding name. This encoding will be used to
         convert the input string to Unicode, if it is not already a Unicode
         string. The default is to use latin1 (default: 'latin1').
+        Can also be 'guess' to use a simple UTF-8 / Latin1 detection, or
+        'chardet' to use the chardet library, if it is installed.
     """
 
     #: Name of the lexer
@@ -110,7 +112,23 @@ class Lexer(object):
         if isinstance(text, unicode):
             text = u'\n'.join(text.splitlines())
         else:
-            text = '\n'.join(text.splitlines()).decode(self.encoding)
+            text = '\n'.join(text.splitlines())
+            if self.encoding == 'guess':
+                try:
+                    text = text.decode('utf-8-sig')
+                except UnicodeDecodeError:
+                    text = text.decode('latin1')
+            elif self.encoding == 'chardet':
+                try:
+                    import chardet
+                except ImportError:
+                    raise ImportError('To enable chardet encoding guessing, please '
+                                      'install the chardet library from '
+                                      'http://chardet.feedparser.org/')
+                enc = chardet.detect(text)
+                text = text.decode(enc['encoding'])
+            else:
+                text = text.decode(self.encoding)
         if self.stripall:
             text = text.strip()
         elif self.stripnl:
author	gbrandl <devnull@localhost>	2006-11-01 17:15:13 +0100
committer	gbrandl <devnull@localhost>	2006-11-01 17:15:13 +0100
commit	aa1e3486962c93755e98ade258a4165d87a3e997 (patch)
tree	4f9a7e424c6f7d3b041dfb19f4c745f647331b13 /pygments/lexer.py
parent	265d6f0dcd3cc74cdc0376c10d371824b486c0da (diff)
download	pygments-aa1e3486962c93755e98ade258a4165d87a3e997.tar.gz