12 files changed, 48 insertions, 23 deletions
diff --git a/TODO b/TODO
index d6052264..663938b5 100644
--- a/TODO
+++ b/TODO
@@ -4,11 +4,15 @@ Todo
 for 0.6
 -------
 
-- encoding, unicode support
+- document encodings
+
+- guess encoding support?
+
+- html formatter: full document, external css file?
 
 - improve guess_lexer heuristics (esp. for template langs)
 
-- more unit tests (pygmentize...)
+- more unit tests (pygmentize, all formatters)
 
 - help for -O and -a cmdline options
 
diff --git a/pygments/__init__.py b/pygments/__init__.py
index 280fe0bb..6a7f9f9b 100644
--- a/pygments/__init__.py
+++ b/pygments/__init__.py
@@ -33,8 +33,7 @@ __all__ = ['lex', 'format', 'highlight']
 
 
 import sys, os
-# using StringIO because it can handle Unicode strings
-from StringIO import StringIO
+from cStringIO import StringIO
 
 from pygments.util import OptionError
 from pygments.lexers import LEXERS, get_lexer_by_name, get_lexer_for_filename
diff --git a/pygments/formatter.py b/pygments/formatter.py
index d6b48475..4082150b 100644
--- a/pygments/formatter.py
+++ b/pygments/formatter.py
@@ -38,12 +38,17 @@ class Formatter(object):
     ``title``
         If ``full`` is true, the title that should be used to
         caption the document (default: '').
+    ``encoding``
+        If given, must be an encoding name. This will be used to
+        convert the Unicode token strings to byte strings in the
+        output (default: 'latin1').
     """
 
     def __init__(self, **options):
         self.style = _lookup_style(options.get('style', 'default'))
         self.full  = get_bool_opt(options, 'full', False)
         self.title = options.get('title', '')
+        self.encoding = options.get('encoding', 'latin1') 
         self.options = options
 
     def get_style_defs(self, arg=''):
diff --git a/pygments/formatters/bbcode.py b/pygments/formatters/bbcode.py
index 228c0a4d..a8663ea9 100644
--- a/pygments/formatters/bbcode.py
+++ b/pygments/formatters/bbcode.py
@@ -76,6 +76,7 @@ class BBCodeFormatter(Formatter):
         lasttype = None
 
         for ttype, value in tokensource:
+            value = value.encode(self.encoding)
             while ttype not in self.styles:
                 ttype = ttype.parent
             if ttype == lasttype:
diff --git a/pygments/formatters/html.py b/pygments/formatters/html.py
index 00723ede..33aada8d 100644
--- a/pygments/formatters/html.py
+++ b/pygments/formatters/html.py
@@ -8,7 +8,7 @@
     :copyright: 2006 by Georg Brandl, Armin Ronacher.
     :license: GNU LGPL, see LICENSE for more details.
 """
-import StringIO
+import cStringIO
 
 from pygments.formatter import Formatter
 from pygments.token import Token, Text, STANDARD_TYPES
@@ -57,6 +57,7 @@ DOC_TEMPLATE = '''\
 <html>
 <head>
   <title>%(title)s</title>
+  <meta http-equiv="content-type" content="text/html; charset=%(encoding)s">
   <style type="text/css">
 td.linenos { background-color: #f0f0f0; padding-right: 10px; }
 %(styledefs)s
@@ -191,7 +192,7 @@ class HtmlFormatter(Formatter):
         write = outfile.write
         lspan = ''
         for ttype, value in tokensource:
-            htmlvalue = escape_html(value)
+            htmlvalue = escape_html(value.encode(self.encoding))
             if lnos:
                 lncount += value.count("\n")
 
@@ -235,7 +236,7 @@ class HtmlFormatter(Formatter):
         div = ('<div' + (self.cssclass and ' class="%s" ' % self.cssclass)
                + (self.cssstyles and ' style="%s"' % self.cssstyles) + '>')
         if full or lnos:
-            outfile = StringIO.StringIO()
+            outfile = cStringIO.StringIO()
         else:
             outfile.write(div)
 
@@ -271,6 +272,7 @@ class HtmlFormatter(Formatter):
             realoutfile.write(DOC_TEMPLATE %
                 dict(title     = self.title,
                      styledefs = self.get_style_defs('body'),
+                     encoding  = self.encoding,
                      code      = ret))
         elif lnos:
             realoutfile.write(ret + '</div>\n')
diff --git a/pygments/formatters/latex.py b/pygments/formatters/latex.py
index 3cc9d219..27b0c5fe 100644
--- a/pygments/formatters/latex.py
+++ b/pygments/formatters/latex.py
@@ -8,7 +8,7 @@
     :copyright: 2006 by Georg Brandl.
     :license: GNU LGPL, see LICENSE for more details.
 """
-import StringIO
+import cStringIO
 
 from pygments.formatter import Formatter
 from pygments.token import Token
@@ -31,6 +31,7 @@ DOC_TEMPLATE = r'''
 \documentclass{%(docclass)s}
 \usepackage{fancyvrb}
 \usepackage{color}
+\usepackage[%(encoding)s]{inputenc}
 %(preamble)s
 
 %(styledefs)s
@@ -151,7 +152,7 @@ class LatexFormatter(Formatter):
 
         if self.full:
             realoutfile = outfile
-            outfile = StringIO.StringIO()
+            outfile = cStringIO.StringIO()
 
         outfile.write(r'\begin{Verbatim}[commandchars=@\[\]')
         if self.linenos:
@@ -164,7 +165,7 @@ class LatexFormatter(Formatter):
         outfile.write(']\n')
 
         for ttype, value in tokensource:
-            value = escape_tex(value)
+            value = escape_tex(value.encode(self.encoding))
             cmd = self.ttype2cmd.get(ttype)
             while cmd is None:
                 ttype = ttype.parent
@@ -187,5 +188,6 @@ class LatexFormatter(Formatter):
                 dict(docclass  = self.docclass,
                      preamble  = self.preamble,
                      title     = self.title,
+                     encoding  = self.encoding,
                      styledefs = self.get_style_defs(),
                      code      = outfile.getvalue()))
diff --git a/pygments/formatters/other.py b/pygments/formatters/other.py
index a3657bbb..affbbfc2 100644
--- a/pygments/formatters/other.py
+++ b/pygments/formatters/other.py
@@ -21,7 +21,7 @@ class NullFormatter(Formatter):
     """
     def format(self, tokensource, outfile):
         for ttype, value in tokensource:
-            outfile.write(value)
+            outfile.write(value.encode(self.encoding))
 
 
 class RawTokenFormatter(Formatter):
@@ -60,8 +60,9 @@ class RawTokenFormatter(Formatter):
             flush = outfile.flush
 
         lasttype = None
-        lastval = ''
+        lastval = u''
         for ttype, value in tokensource:
+            value = value.encode(self.encoding)
             if ttype is lasttype:
                 lastval += value
             else:
diff --git a/pygments/formatters/terminal.py b/pygments/formatters/terminal.py
index b1756c54..11f0b26a 100644
--- a/pygments/formatters/terminal.py
+++ b/pygments/formatters/terminal.py
@@ -79,6 +79,7 @@ class TerminalFormatter(Formatter):
     def format(self, tokensource, outfile):
         dbg = self.debug
         for ttype, value in tokensource:
+            value = value.encode(self.encoding)
             color = self.colorscheme.get(ttype)
             while color is None:
                 ttype = ttype[:-1]
diff --git a/pygments/lexer.py b/pygments/lexer.py
index cd9671fd..e19526b8 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -50,6 +50,10 @@ class Lexer(object):
         (default: False).
     ``tabsize``
         If given and greater than 0, expand tabs in the input (default: 0).
+    ``encoding``
+        If given, must be an encoding name. This encoding will be used to
+        convert the input string to Unicode, if it is not already a Unicode
+        string (default: 'latin1').
     """
 
     #: Name of the lexer
@@ -74,7 +78,7 @@ class Lexer(object):
         self.stripnl = get_bool_opt(options, 'stripnl', True)
         self.stripall = get_bool_opt(options, 'stripall', False)
         self.tabsize = get_int_opt(options, 'tabsize', 0)
-        self.encoding = options.get('encoding', '')
+        self.encoding = options.get('encoding', 'latin1')
 
     def __repr__(self):
         if self.options:
@@ -103,7 +107,10 @@ class Lexer(object):
 
         Also preprocess the text, i.e. expand tabs and strip it if wanted.
         """
-        text = type(text)('\n').join(text.splitlines())
+        if isinstance(text, unicode):
+            text = u'\n'.join(text.splitlines())
+        else:
+            text = '\n'.join(text.splitlines()).decode(self.encoding)
         if self.stripall:
             text = text.strip()
         elif self.stripnl:
@@ -411,7 +418,7 @@ class RegexLexer(Lexer):
                         pos += 1
                         statestack = ['root']
                         statetokens = self._tokens['root']
-                        yield pos, Text, '\n'
+                        yield pos, Text, u'\n'
                         continue
                     yield pos, Error, text[pos]
                     pos += 1
@@ -488,7 +495,7 @@ class ExtendedRegexLexer(RegexLexer):
                         ctx.pos += 1
                         ctx.stack = ['root']
                         statetokens = self._tokens['root']
-                        yield ctx.pos, Text, '\n'
+                        yield ctx.pos, Text, u'\n'
                         continue
                     yield ctx.pos, Error, text[ctx.pos]
                     ctx.pos += 1
diff --git a/pygments/lexers/agile.py b/pygments/lexers/agile.py
index ee0cd368..bad230d6 100644
--- a/pygments/lexers/agile.py
+++ b/pygments/lexers/agile.py
@@ -780,7 +780,7 @@ class LuaLexer(RegexLexer):
                 elif '.' in value:
                     a, b = value.split('.')
                     yield index, Name, a
-                    yield index + len(a), Text, '.'
+                    yield index + len(a), Text, u'.'
                     yield index + len(a) + 1, Name, b
                     continue
             yield index, token, value
diff --git a/tests/test_basic_api.py b/tests/test_basic_api.py
index e5ba623d..41aa01dc 100644
--- a/tests/test_basic_api.py
+++ b/tests/test_basic_api.py
@@ -22,8 +22,8 @@ class LexersTest(unittest.TestCase):
 
     def test_import_all(self):
         # instantiate every lexer, to see if the token type defs are correct
-        for x in pygments.lexers.LEXERS.keys():
-            c = getattr(pygments.lexers, x)()
+        for x in lexers.LEXERS.keys():
+            c = getattr(lexers, x)()
 
     def test_lexer_classes(self):
         a = self.assert_
@@ -41,7 +41,9 @@ class LexersTest(unittest.TestCase):
             for token in tokens:
                 a(isinstance(token, tuple))
                 a(isinstance(token[0], _TokenType))
-                a(isinstance(token[1], str))
+                if isinstance(token[1], str):
+                    print repr(token[1])
+                a(isinstance(token[1], unicode))
                 txt += token[1]
             ae(txt, test_content, "%s lexer roundtrip failed: %r != %r" %
                     (lexer.name, test_content, txt))
diff --git a/tests/test_examplefiles.py b/tests/test_examplefiles.py
index 6347ab88..247f986d 100644
--- a/tests/test_examplefiles.py
+++ b/tests/test_examplefiles.py
@@ -38,11 +38,12 @@ for fn in os.listdir(os.path.join(testdir, 'examplefiles')):
     def test(self, lx=lx, absfn=absfn):
         text = file(absfn, 'U').read()
         text = text.strip('\n') + '\n'
-        ntext = ''
+        text = text.decode('latin1')
+        ntext = []
         for type, val in lx.get_tokens(text):
-            ntext += val
+            ntext.append(val)
             self.failIf(type == Error, 'lexer generated error token for '+absfn)
-        if ntext != text:
+        if u''.join(ntext) != text:
             self.fail('round trip failed for '+absfn)
 
     setattr(ExampleFileTest, 'test_%i' % lfd, test)