summary | refs | log | tree | commit | diff
path: root/pygments/lexer.py
diff options
context:
space:
mode:
Diffstat (limited to 'pygments/lexer.py')
-rw-r--r-- pygments/lexer.py 15
1 files changed, 11 insertions, 4 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index cd9671fd..e19526b8 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -50,6 +50,10 @@ class Lexer(object):
(default: False).
``tabsize``
If given and greater than 0, expand tabs in the input (default: 0).
+ ``encoding``
+ If given, must be an encoding name. This encoding will be used to
+ convert the input string to Unicode, if it is not already a Unicode
+ string (default: 'latin1').
"""
#: Name of the lexer
@@ -74,7 +78,7 @@ class Lexer(object):
self.stripnl = get_bool_opt(options, 'stripnl', True)
self.stripall = get_bool_opt(options, 'stripall', False)
self.tabsize = get_int_opt(options, 'tabsize', 0)
- self.encoding = options.get('encoding', '')
+ self.encoding = options.get('encoding', 'latin1')
def __repr__(self):
if self.options:
@@ -103,7 +107,10 @@ class Lexer(object):
Also preprocess the text, i.e. expand tabs and strip it if wanted.
"""
- text = type(text)('\n').join(text.splitlines())
+ if isinstance(text, unicode):
+ text = u'\n'.join(text.splitlines())
+ else:
+ text = '\n'.join(text.splitlines()).decode(self.encoding)
if self.stripall:
text = text.strip()
elif self.stripnl:
@@ -411,7 +418,7 @@ class RegexLexer(Lexer):
pos += 1
statestack = ['root']
statetokens = self._tokens['root']
- yield pos, Text, '\n'
+ yield pos, Text, u'\n'
continue
yield pos, Error, text[pos]
pos += 1
@@ -488,7 +495,7 @@ class ExtendedRegexLexer(RegexLexer):
ctx.pos += 1
ctx.stack = ['root']
statetokens = self._tokens['root']
- yield ctx.pos, Text, '\n'
+ yield ctx.pos, Text, u'\n'
continue
yield ctx.pos, Error, text[ctx.pos]
ctx.pos += 1