diff options
Diffstat (limited to 'pygments/lexer.py')
-rw-r--r-- | pygments/lexer.py | 15 |
1 file changed, 11 insertions, 4 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index cd9671fd..e19526b8 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -50,6 +50,10 @@ class Lexer(object):
         (default: False).
     ``tabsize``
         If given and greater than 0, expand tabs in the input (default: 0).
+    ``encoding``
+        If given, must be an encoding name. This encoding will be used to
+        convert the input string to Unicode, if it is not already a Unicode
+        string. The default is to use latin1 (default: 'latin1').
     """
 
     #: Name of the lexer
@@ -74,7 +78,7 @@ class Lexer(object):
         self.stripnl = get_bool_opt(options, 'stripnl', True)
         self.stripall = get_bool_opt(options, 'stripall', False)
         self.tabsize = get_int_opt(options, 'tabsize', 0)
-        self.encoding = options.get('encoding', '')
+        self.encoding = options.get('encoding', 'latin1')
 
     def __repr__(self):
         if self.options:
@@ -103,7 +107,10 @@ class Lexer(object):
 
         Also preprocess the text, i.e. expand tabs and strip it if wanted.
         """
-        text = type(text)('\n').join(text.splitlines())
+        if isinstance(text, unicode):
+            text = u'\n'.join(text.splitlines())
+        else:
+            text = '\n'.join(text.splitlines()).decode(self.encoding)
         if self.stripall:
             text = text.strip()
         elif self.stripnl:
@@ -411,7 +418,7 @@ class RegexLexer(Lexer):
                         pos += 1
                         statestack = ['root']
                         statetokens = self._tokens['root']
-                        yield pos, Text, '\n'
+                        yield pos, Text, u'\n'
                         continue
                     yield pos, Error, text[pos]
                     pos += 1
@@ -488,7 +495,7 @@ class ExtendedRegexLexer(RegexLexer):
                         ctx.pos += 1
                         ctx.stack = ['root']
                         statetokens = self._tokens['root']
-                        yield ctx.pos, Text, '\n'
+                        yield ctx.pos, Text, u'\n'
                         continue
                     yield ctx.pos, Error, text[ctx.pos]
                     ctx.pos += 1