diff options
-rw-r--r-- | TODO | 8 | ||||
-rw-r--r-- | pygments/__init__.py | 3 | ||||
-rw-r--r-- | pygments/formatter.py | 5 | ||||
-rw-r--r-- | pygments/formatters/bbcode.py | 1 | ||||
-rw-r--r-- | pygments/formatters/html.py | 8 | ||||
-rw-r--r-- | pygments/formatters/latex.py | 8 | ||||
-rw-r--r-- | pygments/formatters/other.py | 5 | ||||
-rw-r--r-- | pygments/formatters/terminal.py | 1 | ||||
-rw-r--r-- | pygments/lexer.py | 15 | ||||
-rw-r--r-- | pygments/lexers/agile.py | 2 | ||||
-rw-r--r-- | tests/test_basic_api.py | 8 | ||||
-rw-r--r-- | tests/test_examplefiles.py | 7 |
12 files changed, 48 insertions, 23 deletions
@@ -4,11 +4,15 @@ Todo for 0.6 ------- -- encoding, unicode support +- document encodings + +- guess encoding support? + +- html formatter: full document, external css file? - improve guess_lexer heuristics (esp. for template langs) -- more unit tests (pygmentize...) +- more unit tests (pygmentize, all formatters) - help for -O and -a cmdline options diff --git a/pygments/__init__.py b/pygments/__init__.py index 280fe0bb..6a7f9f9b 100644 --- a/pygments/__init__.py +++ b/pygments/__init__.py @@ -33,8 +33,7 @@ __all__ = ['lex', 'format', 'highlight'] import sys, os -# using StringIO because it can handle Unicode strings -from StringIO import StringIO +from cStringIO import StringIO from pygments.util import OptionError from pygments.lexers import LEXERS, get_lexer_by_name, get_lexer_for_filename diff --git a/pygments/formatter.py b/pygments/formatter.py index d6b48475..4082150b 100644 --- a/pygments/formatter.py +++ b/pygments/formatter.py @@ -38,12 +38,17 @@ class Formatter(object): ``title`` If ``full`` is true, the title that should be used to caption the document (default: ''). + ``encoding`` + If given, must be an encoding name. This will be used to + convert the Unicode token strings to byte strings in the + output (default: 'latin1'). """ def __init__(self, **options): self.style = _lookup_style(options.get('style', 'default')) self.full = get_bool_opt(options, 'full', False) self.title = options.get('title', '') + self.encoding = options.get('encoding', 'latin1') self.options = options def get_style_defs(self, arg=''): diff --git a/pygments/formatters/bbcode.py b/pygments/formatters/bbcode.py index 228c0a4d..a8663ea9 100644 --- a/pygments/formatters/bbcode.py +++ b/pygments/formatters/bbcode.py @@ -76,6 +76,7 @@ class BBCodeFormatter(Formatter): lasttype = None for ttype, value in tokensource: + value = value.encode(self.encoding) while ttype not in self.styles: ttype = ttype.parent if ttype == lasttype: diff --git a/pygments/formatters/html.py b/pygments/formatters/html.py index 00723ede..33aada8d 100644 --- a/pygments/formatters/html.py +++ b/pygments/formatters/html.py @@ -8,7 +8,7 @@ :copyright: 2006 by Georg Brandl, Armin Ronacher. :license: GNU LGPL, see LICENSE for more details. """ -import StringIO +import cStringIO from pygments.formatter import Formatter from pygments.token import Token, Text, STANDARD_TYPES @@ -57,6 +57,7 @@ DOC_TEMPLATE = '''\ <html> <head> <title>%(title)s</title> + <meta http-equiv="content-type" content="text/html; charset=%(encoding)s"> <style type="text/css"> td.linenos { background-color: #f0f0f0; padding-right: 10px; } %(styledefs)s @@ -191,7 +192,7 @@ class HtmlFormatter(Formatter): write = outfile.write lspan = '' for ttype, value in tokensource: - htmlvalue = escape_html(value) + htmlvalue = escape_html(value.encode(self.encoding)) if lnos: lncount += value.count("\n") @@ -235,7 +236,7 @@ class HtmlFormatter(Formatter): div = ('<div' + (self.cssclass and ' class="%s" ' % self.cssclass) + (self.cssstyles and ' style="%s"' % self.cssstyles) + '>') if full or lnos: - outfile = StringIO.StringIO() + outfile = cStringIO.StringIO() else: outfile.write(div) @@ -271,6 +272,7 @@ class HtmlFormatter(Formatter): realoutfile.write(DOC_TEMPLATE % dict(title = self.title, styledefs = self.get_style_defs('body'), + encoding = self.encoding, code = ret)) elif lnos: realoutfile.write(ret + '</div>\n') diff --git a/pygments/formatters/latex.py b/pygments/formatters/latex.py index 3cc9d219..27b0c5fe 100644 --- a/pygments/formatters/latex.py +++ b/pygments/formatters/latex.py @@ -8,7 +8,7 @@ :copyright: 2006 by Georg Brandl. :license: GNU LGPL, see LICENSE for more details. """ -import StringIO +import cStringIO from pygments.formatter import Formatter from pygments.token import Token @@ -31,6 +31,7 @@ DOC_TEMPLATE = r''' \documentclass{%(docclass)s} \usepackage{fancyvrb} \usepackage{color} +\usepackage[%(encoding)s]{inputenc} %(preamble)s %(styledefs)s @@ -151,7 +152,7 @@ class LatexFormatter(Formatter): if self.full: realoutfile = outfile - outfile = StringIO.StringIO() + outfile = cStringIO.StringIO() outfile.write(r'\begin{Verbatim}[commandchars=@\[\]') if self.linenos: @@ -164,7 +165,7 @@ class LatexFormatter(Formatter): outfile.write(']\n') for ttype, value in tokensource: - value = escape_tex(value) + value = escape_tex(value.encode(self.encoding)) cmd = self.ttype2cmd.get(ttype) while cmd is None: ttype = ttype.parent @@ -187,5 +188,6 @@ class LatexFormatter(Formatter): dict(docclass = self.docclass, preamble = self.preamble, title = self.title, + encoding = self.encoding, styledefs = self.get_style_defs(), code = outfile.getvalue())) diff --git a/pygments/formatters/other.py b/pygments/formatters/other.py index a3657bbb..affbbfc2 100644 --- a/pygments/formatters/other.py +++ b/pygments/formatters/other.py @@ -21,7 +21,7 @@ class NullFormatter(Formatter): """ def format(self, tokensource, outfile): for ttype, value in tokensource: - outfile.write(value) + outfile.write(value.encode(self.encoding)) class RawTokenFormatter(Formatter): @@ -60,8 +60,9 @@ class RawTokenFormatter(Formatter): flush = outfile.flush lasttype = None - lastval = '' + lastval = u'' for ttype, value in tokensource: + value = value.encode(self.encoding) if ttype is lasttype: lastval += value else: diff --git a/pygments/formatters/terminal.py b/pygments/formatters/terminal.py index b1756c54..11f0b26a 100644 --- a/pygments/formatters/terminal.py +++ b/pygments/formatters/terminal.py @@ -79,6 +79,7 @@ class TerminalFormatter(Formatter): def format(self, tokensource, outfile): dbg = self.debug for ttype, value in tokensource: + value = value.encode(self.encoding) color = self.colorscheme.get(ttype) while color is None: ttype = ttype[:-1] diff --git a/pygments/lexer.py b/pygments/lexer.py index cd9671fd..e19526b8 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -50,6 +50,10 @@ class Lexer(object): (default: False). ``tabsize`` If given and greater than 0, expand tabs in the input (default: 0). + ``encoding`` + If given, must be an encoding name. This encoding will be used to + convert the input string to Unicode, if it is not already a Unicode + string. The default is to use latin1 (default: 'latin1'). """ #: Name of the lexer @@ -74,7 +78,7 @@ class Lexer(object): self.stripnl = get_bool_opt(options, 'stripnl', True) self.stripall = get_bool_opt(options, 'stripall', False) self.tabsize = get_int_opt(options, 'tabsize', 0) - self.encoding = options.get('encoding', '') + self.encoding = options.get('encoding', 'latin1') def __repr__(self): if self.options: @@ -103,7 +107,10 @@ class Lexer(object): Also preprocess the text, i.e. expand tabs and strip it if wanted. """ - text = type(text)('\n').join(text.splitlines()) + if isinstance(text, unicode): + text = u'\n'.join(text.splitlines()) + else: + text = '\n'.join(text.splitlines()).decode(self.encoding) if self.stripall: text = text.strip() elif self.stripnl: @@ -411,7 +418,7 @@ class RegexLexer(Lexer): pos += 1 statestack = ['root'] statetokens = self._tokens['root'] - yield pos, Text, '\n' + yield pos, Text, u'\n' continue yield pos, Error, text[pos] pos += 1 @@ -488,7 +495,7 @@ class ExtendedRegexLexer(RegexLexer): ctx.pos += 1 ctx.stack = ['root'] statetokens = self._tokens['root'] - yield ctx.pos, Text, '\n' + yield ctx.pos, Text, u'\n' continue yield ctx.pos, Error, text[ctx.pos] ctx.pos += 1 diff --git a/pygments/lexers/agile.py b/pygments/lexers/agile.py index ee0cd368..bad230d6 100644 --- a/pygments/lexers/agile.py +++ b/pygments/lexers/agile.py @@ -780,7 +780,7 @@ class LuaLexer(RegexLexer): elif '.' in value: a, b = value.split('.') yield index, Name, a - yield index + len(a), Text, '.' + yield index + len(a), Text, u'.' yield index + len(a) + 1, Name, b continue yield index, token, value diff --git a/tests/test_basic_api.py b/tests/test_basic_api.py index e5ba623d..41aa01dc 100644 --- a/tests/test_basic_api.py +++ b/tests/test_basic_api.py @@ -22,8 +22,8 @@ class LexersTest(unittest.TestCase): def test_import_all(self): # instantiate every lexer, to see if the token type defs are correct - for x in pygments.lexers.LEXERS.keys(): - c = getattr(pygments.lexers, x)() + for x in lexers.LEXERS.keys(): + c = getattr(lexers, x)() def test_lexer_classes(self): a = self.assert_ @@ -41,7 +41,9 @@ class LexersTest(unittest.TestCase): for token in tokens: a(isinstance(token, tuple)) a(isinstance(token[0], _TokenType)) - a(isinstance(token[1], str)) + if isinstance(token[1], str): + print repr(token[1]) + a(isinstance(token[1], unicode)) txt += token[1] ae(txt, test_content, "%s lexer roundtrip failed: %r != %r" % (lexer.name, test_content, txt)) diff --git a/tests/test_examplefiles.py b/tests/test_examplefiles.py index 6347ab88..247f986d 100644 --- a/tests/test_examplefiles.py +++ b/tests/test_examplefiles.py @@ -38,11 +38,12 @@ for fn in os.listdir(os.path.join(testdir, 'examplefiles')): def test(self, lx=lx, absfn=absfn): text = file(absfn, 'U').read() text = text.strip('\n') + '\n' - ntext = '' + text = text.decode('latin1') + ntext = [] for type, val in lx.get_tokens(text): - ntext += val + ntext.append(val) self.failIf(type == Error, 'lexer generated error token for '+absfn) - if ntext != text: + if u''.join(ntext) != text: self.fail('round trip failed for '+absfn) setattr(ExampleFileTest, 'test_%i' % lfd, test) |