summary refs log tree commit diff
diff options
context:
space:
mode:
author gbrandl <devnull@localhost> 2006-10-31 23:46:24 +0100
committer gbrandl <devnull@localhost> 2006-10-31 23:46:24 +0100
commit 729df9d55ee975be89f18fa90964f19bead60feb (patch)
tree 641a0511cbcfa260d1d4e2287f0ede38e50504d8
parent 026d59d92a7b574323484fae8a21c9bcc2401517 (diff)
download pygments-729df9d55ee975be89f18fa90964f19bead60feb.tar.gz
[svn] Add encoding support. All processing is now done with unicode strings.
-rw-r--r-- TODO 8
-rw-r--r-- pygments/__init__.py 3
-rw-r--r-- pygments/formatter.py 5
-rw-r--r-- pygments/formatters/bbcode.py 1
-rw-r--r-- pygments/formatters/html.py 8
-rw-r--r-- pygments/formatters/latex.py 8
-rw-r--r-- pygments/formatters/other.py 5
-rw-r--r-- pygments/formatters/terminal.py 1
-rw-r--r-- pygments/lexer.py 15
-rw-r--r-- pygments/lexers/agile.py 2
-rw-r--r-- tests/test_basic_api.py 8
-rw-r--r-- tests/test_examplefiles.py 7
12 files changed, 48 insertions, 23 deletions
diff --git a/TODO b/TODO
index d6052264..663938b5 100644
--- a/TODO
+++ b/TODO
@@ -4,11 +4,15 @@ Todo
for 0.6
-------
-- encoding, unicode support
+- document encodings
+
+- guess encoding support?
+
+- html formatter: full document, external css file?
- improve guess_lexer heuristics (esp. for template langs)
-- more unit tests (pygmentize...)
+- more unit tests (pygmentize, all formatters)
- help for -O and -a cmdline options
diff --git a/pygments/__init__.py b/pygments/__init__.py
index 280fe0bb..6a7f9f9b 100644
--- a/pygments/__init__.py
+++ b/pygments/__init__.py
@@ -33,8 +33,7 @@ __all__ = ['lex', 'format', 'highlight']
import sys, os
-# using StringIO because it can handle Unicode strings
-from StringIO import StringIO
+from cStringIO import StringIO
from pygments.util import OptionError
from pygments.lexers import LEXERS, get_lexer_by_name, get_lexer_for_filename
diff --git a/pygments/formatter.py b/pygments/formatter.py
index d6b48475..4082150b 100644
--- a/pygments/formatter.py
+++ b/pygments/formatter.py
@@ -38,12 +38,17 @@ class Formatter(object):
``title``
If ``full`` is true, the title that should be used to
caption the document (default: '').
+ ``encoding``
+ If given, must be an encoding name. This will be used to
+ convert the Unicode token strings to byte strings in the
+ output (default: 'latin1').
"""
def __init__(self, **options):
self.style = _lookup_style(options.get('style', 'default'))
self.full = get_bool_opt(options, 'full', False)
self.title = options.get('title', '')
+ self.encoding = options.get('encoding', 'latin1')
self.options = options
def get_style_defs(self, arg=''):
diff --git a/pygments/formatters/bbcode.py b/pygments/formatters/bbcode.py
index 228c0a4d..a8663ea9 100644
--- a/pygments/formatters/bbcode.py
+++ b/pygments/formatters/bbcode.py
@@ -76,6 +76,7 @@ class BBCodeFormatter(Formatter):
lasttype = None
for ttype, value in tokensource:
+ value = value.encode(self.encoding)
while ttype not in self.styles:
ttype = ttype.parent
if ttype == lasttype:
diff --git a/pygments/formatters/html.py b/pygments/formatters/html.py
index 00723ede..33aada8d 100644
--- a/pygments/formatters/html.py
+++ b/pygments/formatters/html.py
@@ -8,7 +8,7 @@
:copyright: 2006 by Georg Brandl, Armin Ronacher.
:license: GNU LGPL, see LICENSE for more details.
"""
-import StringIO
+import cStringIO
from pygments.formatter import Formatter
from pygments.token import Token, Text, STANDARD_TYPES
@@ -57,6 +57,7 @@ DOC_TEMPLATE = '''\
<html>
<head>
<title>%(title)s</title>
+ <meta http-equiv="content-type" content="text/html; charset=%(encoding)s">
<style type="text/css">
td.linenos { background-color: #f0f0f0; padding-right: 10px; }
%(styledefs)s
@@ -191,7 +192,7 @@ class HtmlFormatter(Formatter):
write = outfile.write
lspan = ''
for ttype, value in tokensource:
- htmlvalue = escape_html(value)
+ htmlvalue = escape_html(value.encode(self.encoding))
if lnos:
lncount += value.count("\n")
@@ -235,7 +236,7 @@ class HtmlFormatter(Formatter):
div = ('<div' + (self.cssclass and ' class="%s" ' % self.cssclass)
+ (self.cssstyles and ' style="%s"' % self.cssstyles) + '>')
if full or lnos:
- outfile = StringIO.StringIO()
+ outfile = cStringIO.StringIO()
else:
outfile.write(div)
@@ -271,6 +272,7 @@ class HtmlFormatter(Formatter):
realoutfile.write(DOC_TEMPLATE %
dict(title = self.title,
styledefs = self.get_style_defs('body'),
+ encoding = self.encoding,
code = ret))
elif lnos:
realoutfile.write(ret + '</div>\n')
diff --git a/pygments/formatters/latex.py b/pygments/formatters/latex.py
index 3cc9d219..27b0c5fe 100644
--- a/pygments/formatters/latex.py
+++ b/pygments/formatters/latex.py
@@ -8,7 +8,7 @@
:copyright: 2006 by Georg Brandl.
:license: GNU LGPL, see LICENSE for more details.
"""
-import StringIO
+import cStringIO
from pygments.formatter import Formatter
from pygments.token import Token
@@ -31,6 +31,7 @@ DOC_TEMPLATE = r'''
\documentclass{%(docclass)s}
\usepackage{fancyvrb}
\usepackage{color}
+\usepackage[%(encoding)s]{inputenc}
%(preamble)s
%(styledefs)s
@@ -151,7 +152,7 @@ class LatexFormatter(Formatter):
if self.full:
realoutfile = outfile
- outfile = StringIO.StringIO()
+ outfile = cStringIO.StringIO()
outfile.write(r'\begin{Verbatim}[commandchars=@\[\]')
if self.linenos:
@@ -164,7 +165,7 @@ class LatexFormatter(Formatter):
outfile.write(']\n')
for ttype, value in tokensource:
- value = escape_tex(value)
+ value = escape_tex(value.encode(self.encoding))
cmd = self.ttype2cmd.get(ttype)
while cmd is None:
ttype = ttype.parent
@@ -187,5 +188,6 @@ class LatexFormatter(Formatter):
dict(docclass = self.docclass,
preamble = self.preamble,
title = self.title,
+ encoding = self.encoding,
styledefs = self.get_style_defs(),
code = outfile.getvalue()))
diff --git a/pygments/formatters/other.py b/pygments/formatters/other.py
index a3657bbb..affbbfc2 100644
--- a/pygments/formatters/other.py
+++ b/pygments/formatters/other.py
@@ -21,7 +21,7 @@ class NullFormatter(Formatter):
"""
def format(self, tokensource, outfile):
for ttype, value in tokensource:
- outfile.write(value)
+ outfile.write(value.encode(self.encoding))
class RawTokenFormatter(Formatter):
@@ -60,8 +60,9 @@ class RawTokenFormatter(Formatter):
flush = outfile.flush
lasttype = None
- lastval = ''
+ lastval = u''
for ttype, value in tokensource:
+ value = value.encode(self.encoding)
if ttype is lasttype:
lastval += value
else:
diff --git a/pygments/formatters/terminal.py b/pygments/formatters/terminal.py
index b1756c54..11f0b26a 100644
--- a/pygments/formatters/terminal.py
+++ b/pygments/formatters/terminal.py
@@ -79,6 +79,7 @@ class TerminalFormatter(Formatter):
def format(self, tokensource, outfile):
dbg = self.debug
for ttype, value in tokensource:
+ value = value.encode(self.encoding)
color = self.colorscheme.get(ttype)
while color is None:
ttype = ttype[:-1]
diff --git a/pygments/lexer.py b/pygments/lexer.py
index cd9671fd..e19526b8 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -50,6 +50,10 @@ class Lexer(object):
(default: False).
``tabsize``
If given and greater than 0, expand tabs in the input (default: 0).
+ ``encoding``
+ If given, must be an encoding name. This encoding will be used to
+ convert the input string to Unicode, if it is not already a Unicode
+ string. The default is to use latin1 (default: 'latin1').
"""
#: Name of the lexer
@@ -74,7 +78,7 @@ class Lexer(object):
self.stripnl = get_bool_opt(options, 'stripnl', True)
self.stripall = get_bool_opt(options, 'stripall', False)
self.tabsize = get_int_opt(options, 'tabsize', 0)
- self.encoding = options.get('encoding', '')
+ self.encoding = options.get('encoding', 'latin1')
def __repr__(self):
if self.options:
@@ -103,7 +107,10 @@ class Lexer(object):
Also preprocess the text, i.e. expand tabs and strip it if wanted.
"""
- text = type(text)('\n').join(text.splitlines())
+ if isinstance(text, unicode):
+ text = u'\n'.join(text.splitlines())
+ else:
+ text = '\n'.join(text.splitlines()).decode(self.encoding)
if self.stripall:
text = text.strip()
elif self.stripnl:
@@ -411,7 +418,7 @@ class RegexLexer(Lexer):
pos += 1
statestack = ['root']
statetokens = self._tokens['root']
- yield pos, Text, '\n'
+ yield pos, Text, u'\n'
continue
yield pos, Error, text[pos]
pos += 1
@@ -488,7 +495,7 @@ class ExtendedRegexLexer(RegexLexer):
ctx.pos += 1
ctx.stack = ['root']
statetokens = self._tokens['root']
- yield ctx.pos, Text, '\n'
+ yield ctx.pos, Text, u'\n'
continue
yield ctx.pos, Error, text[ctx.pos]
ctx.pos += 1
diff --git a/pygments/lexers/agile.py b/pygments/lexers/agile.py
index ee0cd368..bad230d6 100644
--- a/pygments/lexers/agile.py
+++ b/pygments/lexers/agile.py
@@ -780,7 +780,7 @@ class LuaLexer(RegexLexer):
elif '.' in value:
a, b = value.split('.')
yield index, Name, a
- yield index + len(a), Text, '.'
+ yield index + len(a), Text, u'.'
yield index + len(a) + 1, Name, b
continue
yield index, token, value
diff --git a/tests/test_basic_api.py b/tests/test_basic_api.py
index e5ba623d..41aa01dc 100644
--- a/tests/test_basic_api.py
+++ b/tests/test_basic_api.py
@@ -22,8 +22,8 @@ class LexersTest(unittest.TestCase):
def test_import_all(self):
# instantiate every lexer, to see if the token type defs are correct
- for x in pygments.lexers.LEXERS.keys():
- c = getattr(pygments.lexers, x)()
+ for x in lexers.LEXERS.keys():
+ c = getattr(lexers, x)()
def test_lexer_classes(self):
a = self.assert_
@@ -41,7 +41,9 @@ class LexersTest(unittest.TestCase):
for token in tokens:
a(isinstance(token, tuple))
a(isinstance(token[0], _TokenType))
- a(isinstance(token[1], str))
+ if isinstance(token[1], str):
+ print repr(token[1])
+ a(isinstance(token[1], unicode))
txt += token[1]
ae(txt, test_content, "%s lexer roundtrip failed: %r != %r" %
(lexer.name, test_content, txt))
diff --git a/tests/test_examplefiles.py b/tests/test_examplefiles.py
index 6347ab88..247f986d 100644
--- a/tests/test_examplefiles.py
+++ b/tests/test_examplefiles.py
@@ -38,11 +38,12 @@ for fn in os.listdir(os.path.join(testdir, 'examplefiles')):
def test(self, lx=lx, absfn=absfn):
text = file(absfn, 'U').read()
text = text.strip('\n') + '\n'
- ntext = ''
+ text = text.decode('latin1')
+ ntext = []
for type, val in lx.get_tokens(text):
- ntext += val
+ ntext.append(val)
self.failIf(type == Error, 'lexer generated error token for '+absfn)
- if ntext != text:
+ if u''.join(ntext) != text:
self.fail('round trip failed for '+absfn)
setattr(ExampleFileTest, 'test_%i' % lfd, test)