author     Georg Brandl <georg@python.org>   2014-10-08 09:09:18 +0200
committer  Georg Brandl <georg@python.org>   2014-10-08 09:09:18 +0200
commit     6d063d8d28bda60f37b03c7fe130074f92932398 (patch)
tree       946af08ab0d46558c58b246dd5d13fee9bcdfae3 /pygments/lexer.py
parent     342f9b5f2720ab257cea0ca934f1c82d9cbfab72 (diff)
parent     ac3a01c3b86d36b4dc88a11dfe2dfe042ea1c208 (diff)
download   pygments-6d063d8d28bda60f37b03c7fe130074f92932398.tar.gz
Merged in protz/pygments-main/add-envname (pull request #235)
Diffstat (limited to 'pygments/lexer.py')
-rw-r--r--   pygments/lexer.py   212
1 file changed, 155 insertions, 57 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 8f88dfda..5b3ad358 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -5,27 +5,34 @@
Base lexer classes.
- :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS.
+ :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
-import re, itertools
+
+from __future__ import print_function
+
+import re
+import sys
+import time
+import itertools
from pygments.filter import apply_filters, Filter
from pygments.filters import get_filter_by_name
from pygments.token import Error, Text, Other, _TokenType
from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
- make_analysator
-
+ make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode
+from pygments.regexopt import regex_opt
__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
- 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this']
+ 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
+ 'default', 'words']
-_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
- ('\xff\xfe\0\0', 'utf-32'),
- ('\0\0\xfe\xff', 'utf-32be'),
- ('\xff\xfe', 'utf-16'),
- ('\xfe\xff', 'utf-16be')]
+_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
+ (b'\xff\xfe\0\0', 'utf-32'),
+ (b'\0\0\xfe\xff', 'utf-32be'),
+ (b'\xff\xfe', 'utf-16'),
+ (b'\xfe\xff', 'utf-16be')]
_default_analyse = staticmethod(lambda x: 0.0)
@@ -42,6 +49,7 @@ class LexerMeta(type):
return type.__new__(cls, name, bases, d)
+@add_metaclass(LexerMeta)
class Lexer(object):
"""
Lexer for a specific language.
@@ -55,15 +63,20 @@ class Lexer(object):
``ensurenl``
Make sure that the input ends with a newline (default: True). This
is required for some lexers that consume input linewise.
- *New in Pygments 1.3.*
+
+ .. versionadded:: 1.3
+
``tabsize``
If given and greater than 0, expand tabs in the input (default: 0).
``encoding``
If given, must be an encoding name. This encoding will be used to
convert the input string to Unicode, if it is not already a Unicode
string (default: ``'latin1'``).
- Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
- ``'chardet'`` to use the chardet library, if it is installed.
+ Can also be ``'guess'`` to use a simple UTF-8 / Locale / Latin1
+ detection, or ``'chardet'`` to use the chardet library, if it is
+ installed.
+ ``inencoding``
+ Overrides the ``encoding`` if given.
"""
#: Name of the lexer
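
A rough sketch of how these options fit together (this example is not part of the diff and assumes the stock PythonLexer): a byte string handed to get_tokens() is decoded according to ``encoding``, with ``inencoding`` taking precedence when both are given:

    from pygments.lexers import PythonLexer

    # 'guess' tries UTF-8 first and falls back per guess_decode();
    # 'chardet' would use the chardet library instead.
    lexer = PythonLexer(encoding='guess')
    # inencoding, if given, would override encoding entirely:
    # lexer = PythonLexer(encoding='latin1', inencoding='utf-8')
    tokens = list(lexer.get_tokens(b'print("caf\xc3\xa9")\n'))
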
@@ -84,8 +97,6 @@ class Lexer(object):
#: Priority, should multiple lexers match and no content is provided
priority = 0
- __metaclass__ = LexerMeta
-
def __init__(self, **options):
self.options = options
self.stripnl = get_bool_opt(options, 'stripnl', True)
@@ -93,7 +104,7 @@ class Lexer(object):
self.ensurenl = get_bool_opt(options, 'ensurenl', True)
self.tabsize = get_int_opt(options, 'tabsize', 0)
self.encoding = options.get('encoding', 'latin1')
- # self.encoding = options.get('inencoding', None) or self.encoding
+ self.encoding = options.get('inencoding') or self.encoding
self.filters = []
for filter_ in get_list_opt(options, 'filters', ()):
self.add_filter(filter_)
@@ -136,14 +147,9 @@ class Lexer(object):
Also preprocess the text, i.e. expand tabs and strip it if
wanted and applies registered filters.
"""
- if not isinstance(text, unicode):
+ if not isinstance(text, text_type):
if self.encoding == 'guess':
- try:
- text = text.decode('utf-8')
- if text.startswith(u'\ufeff'):
- text = text[len(u'\ufeff'):]
- except UnicodeDecodeError:
- text = text.decode('latin1')
+ text, _ = guess_decode(text)
elif self.encoding == 'chardet':
try:
import chardet
@@ -155,17 +161,18 @@ class Lexer(object):
decoded = None
for bom, encoding in _encoding_map:
if text.startswith(bom):
- decoded = unicode(text[len(bom):], encoding,
- errors='replace')
+ decoded = text[len(bom):].decode(encoding, 'replace')
break
# no BOM found, so use chardet
if decoded is None:
- enc = chardet.detect(text[:1024]) # Guess using first 1KB
- decoded = unicode(text, enc.get('encoding') or 'utf-8',
- errors='replace')
+ enc = chardet.detect(text[:1024]) # Guess using first 1KB
+ decoded = text.decode(enc.get('encoding') or 'utf-8',
+ 'replace')
text = decoded
else:
text = text.decode(self.encoding)
+ if text.startswith(u'\ufeff'):
+ text = text[len(u'\ufeff'):]
else:
if text.startswith(u'\ufeff'):
text = text[len(u'\ufeff'):]
@@ -192,7 +199,9 @@ class Lexer(object):
def get_tokens_unprocessed(self, text):
"""
- Return an iterable of (tokentype, value) pairs.
+ Return an iterable of (index, tokentype, value) pairs where "index"
+ is the starting position of the token within the input text.
+
In subclasses, implement this method as a generator to
maximize effectiveness.
"""
@@ -233,7 +242,7 @@ class DelegatingLexer(Lexer):
self.root_lexer.get_tokens_unprocessed(buffered))
-#-------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#
@@ -379,20 +388,50 @@ def using(_other, **kwargs):
return callback
+class default:
+ """
+ Indicates a state or state action (e.g. #pop) to apply.
+ For example default('#pop') is equivalent to ('', Token, '#pop')
+ Note that state tuples may be used as well.
+
+ .. versionadded:: 2.0
+ """
+ def __init__(self, state):
+ self.state = state
+
+
+class words(Future):
+ """
+ Indicates a list of literal words that is transformed into an optimized
+ regex that matches any of the words.
+
+ .. versionadded:: 2.0
+ """
+ def __init__(self, words, prefix='', suffix=''):
+ self.words = words
+ self.prefix = prefix
+ self.suffix = suffix
+
+ def get(self):
+ return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
+
+
class RegexLexerMeta(LexerMeta):
"""
Metaclass for RegexLexer, creates the self._tokens attribute from
self.tokens on the first instantiation.
"""
- def _process_regex(cls, regex, rflags):
+ def _process_regex(cls, regex, rflags, state):
"""Preprocess the regular expression component of a token definition."""
+ if isinstance(regex, Future):
+ regex = regex.get()
return re.compile(regex, rflags).match
def _process_token(cls, token):
"""Preprocess the token component of a token definition."""
assert type(token) is _TokenType or callable(token), \
- 'token type must be simple type or callable, not %r' % (token,)
+ 'token type must be simple type or callable, not %r' % (token,)
return token
def _process_new_state(cls, new_state, unprocessed, processed):
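
A sketch of how the two new helpers are meant to appear in a lexer's token table; the MiniLexer below and its rules are invented for illustration and are not part of this commit:

    from pygments.lexer import RegexLexer, words, default
    from pygments.token import Keyword, Name, Punctuation, Text

    class MiniLexer(RegexLexer):
        tokens = {
            'root': [
                # words(...) is compiled into a single optimized alternation regex
                (words(('if', 'else', 'while'), suffix=r'\b'), Keyword),
                (r'[a-zA-Z_]\w*', Name, 'after-name'),
                (r'\s+', Text),
            ],
            'after-name': [
                (r'\(', Punctuation, '#pop'),
                # default(...) matches the empty string and only switches state;
                # it replaces the old idiom ('', Token, '#pop')
                default('#pop'),
            ],
        }

    # e.g. MiniLexer().get_tokens(u'if foo(') classifies 'if' as Keyword,
    # 'foo' as Name and '(' as Punctuation.
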
@@ -425,7 +464,7 @@ class RegexLexerMeta(LexerMeta):
for istate in new_state:
assert (istate in unprocessed or
istate in ('#pop', '#push')), \
- 'unknown new state ' + istate
+ 'unknown new state ' + istate
return new_state
else:
assert False, 'unknown new state def %r' % new_state
@@ -448,12 +487,16 @@ class RegexLexerMeta(LexerMeta):
if isinstance(tdef, _inherit):
# processed already
continue
+ if isinstance(tdef, default):
+ new_state = cls._process_new_state(tdef.state, unprocessed, processed)
+ tokens.append((re.compile('').match, None, new_state))
+ continue
assert type(tdef) is tuple, "wrong rule def %r" % tdef
try:
- rex = cls._process_regex(tdef[0], rflags)
- except Exception, err:
+ rex = cls._process_regex(tdef[0], rflags, state)
+ except Exception as err:
raise ValueError("uncompilable regex %r in state %r of %r: %s" %
(tdef[0], state, cls, err))
@@ -472,7 +515,7 @@ class RegexLexerMeta(LexerMeta):
"""Preprocess a dictionary of token definitions."""
processed = cls._all_tokens[name] = {}
tokendefs = tokendefs or cls.tokens[name]
- for state in tokendefs.keys():
+ for state in list(tokendefs):
cls._process_state(tokendefs, processed, state)
return processed
@@ -493,7 +536,7 @@ class RegexLexerMeta(LexerMeta):
for c in itertools.chain((cls,), cls.__mro__):
toks = c.__dict__.get('tokens', {})
- for state, items in toks.iteritems():
+ for state, items in iteritems(toks):
curitems = tokens.get(state)
if curitems is None:
tokens[state] = items
@@ -533,13 +576,13 @@ class RegexLexerMeta(LexerMeta):
return type.__call__(cls, *args, **kwds)
+@add_metaclass(RegexLexerMeta)
class RegexLexer(Lexer):
"""
Base for simple stateful regular expression-based lexers.
Simplifies the lexing process so that you need only
provide a list of states and regular expressions.
"""
- __metaclass__ = RegexLexerMeta
#: Flags for compiling the regular expressions.
#: Defaults to MULTILINE.
@@ -578,11 +621,12 @@ class RegexLexer(Lexer):
for rexmatch, action, new_state in statetokens:
m = rexmatch(text, pos)
if m:
- if type(action) is _TokenType:
- yield pos, action, m.group()
- else:
- for item in action(self, m):
- yield item
+ if action is not None:
+ if type(action) is _TokenType:
+ yield pos, action, m.group()
+ else:
+ for item in action(self, m):
+ yield item
pos = m.end()
if new_state is not None:
# state transition
@@ -626,7 +670,7 @@ class LexerContext(object):
def __init__(self, text, pos, stack=None, end=None):
self.text = text
self.pos = pos
- self.end = end or len(text) # end=0 not supported ;-)
+ self.end = end or len(text) # end=0 not supported ;-)
self.stack = stack or ['root']
def __repr__(self):
@@ -656,15 +700,16 @@ class ExtendedRegexLexer(RegexLexer):
for rexmatch, action, new_state in statetokens:
m = rexmatch(text, ctx.pos, ctx.end)
if m:
- if type(action) is _TokenType:
- yield ctx.pos, action, m.group()
- ctx.pos = m.end()
- else:
- for item in action(self, m, ctx):
- yield item
- if not new_state:
- # altered the state stack?
- statetokens = tokendefs[ctx.stack[-1]]
+ if action is not None:
+ if type(action) is _TokenType:
+ yield ctx.pos, action, m.group()
+ ctx.pos = m.end()
+ else:
+ for item in action(self, m, ctx):
+ yield item
+ if not new_state:
+ # altered the state stack?
+ statetokens = tokendefs[ctx.stack[-1]]
# CAUTION: callback must set ctx.pos!
if new_state is not None:
# state transition
@@ -673,7 +718,7 @@ class ExtendedRegexLexer(RegexLexer):
if state == '#pop':
ctx.stack.pop()
elif state == '#push':
- ctx.stack.append(statestack[-1])
+ ctx.stack.append(ctx.stack[-1])
else:
ctx.stack.append(state)
elif isinstance(new_state, int):
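
The '#push' action re-enters the state currently on top of the stack (the line fixed above now duplicates the top of ctx.stack). A classic use, sketched here with an invented lexer rather than anything from this diff, is nested block comments:

    from pygments.lexer import RegexLexer
    from pygments.token import Comment, Text

    class NestedCommentLexer(RegexLexer):
        tokens = {
            'root': [
                (r'/\*', Comment.Multiline, 'comment'),
                (r'[^/]+', Text),
                (r'/', Text),
            ],
            'comment': [
                (r'/\*', Comment.Multiline, '#push'),   # one level deeper
                (r'\*/', Comment.Multiline, '#pop'),    # close one level
                (r'[^*/]+', Comment.Multiline),
                (r'[*/]', Comment.Multiline),
            ],
        }
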
@@ -718,7 +763,7 @@ def do_insertions(insertions, tokens):
"""
insertions = iter(insertions)
try:
- index, itokens = insertions.next()
+ index, itokens = next(insertions)
except StopIteration:
# no insertions
for item in tokens:
@@ -744,7 +789,7 @@ def do_insertions(insertions, tokens):
realpos += len(it_value)
oldi = index - i
try:
- index, itokens = insertions.next()
+ index, itokens = next(insertions)
except StopIteration:
insleft = False
break # not strictly necessary
@@ -759,7 +804,60 @@ def do_insertions(insertions, tokens):
yield realpos, t, v
realpos += len(v)
try:
- index, itokens = insertions.next()
+ index, itokens = next(insertions)
except StopIteration:
insleft = False
break # not strictly necessary
+
+
+class ProfilingRegexLexerMeta(RegexLexerMeta):
+ """Metaclass for ProfilingRegexLexer, collects regex timing info."""
+
+ def _process_regex(cls, regex, rflags, state):
+ if isinstance(regex, words):
+ rex = regex_opt(regex.words, prefix=regex.prefix,
+ suffix=regex.suffix)
+ else:
+ rex = regex
+ compiled = re.compile(rex, rflags)
+
+ def match_func(text, pos, endpos=sys.maxsize):
+ info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
+ t0 = time.time()
+ res = compiled.match(text, pos, endpos)
+ t1 = time.time()
+ info[0] += 1
+ info[1] += t1 - t0
+ return res
+ return match_func
+
+
+@add_metaclass(ProfilingRegexLexerMeta)
+class ProfilingRegexLexer(RegexLexer):
+ """Drop-in replacement for RegexLexer that does profiling of its regexes."""
+
+ _prof_data = []
+ _prof_sort_index = 4 # defaults to time per call
+
+ def get_tokens_unprocessed(self, text, stack=('root',)):
+ # this needs to be a stack, since using(this) will produce nested calls
+ self.__class__._prof_data.append({})
+ for tok in RegexLexer.get_tokens_unprocessed(self, text, stack):
+ yield tok
+ rawdata = self.__class__._prof_data.pop()
+ data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
+ n, 1000 * t, 1000 * t / n)
+ for ((s, r), (n, t)) in rawdata.items()),
+ key=lambda x: x[self._prof_sort_index],
+ reverse=True)
+ sum_total = sum(x[3] for x in data)
+
+ print()
+ print('Profiling result for %s lexing %d chars in %.3f ms' %
+ (self.__class__.__name__, len(text), sum_total))
+ print('=' * 110)
+ print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
+ print('-' * 110)
+ for d in data:
+ print('%-20s %-65s %5d %8.4f %8.4f' % d)
+ print('=' * 110)
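
To try the profiler, a lexer only needs to swap its base class; the ToyLexer below is a made-up example, not something added by this commit:

    from pygments.lexer import ProfilingRegexLexer
    from pygments.token import Name, Text

    class ToyLexer(ProfilingRegexLexer):
        tokens = {
            'root': [
                (r'[a-zA-Z_]\w*', Name),
                (r'\s+', Text),
            ],
        }

    # Exhausting the token stream prints the per-regex call counts and timings
    # collected by ProfilingRegexLexerMeta._process_regex.
    list(ToyLexer().get_tokens(u'spam eggs ' * 1000))
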