Diffstat (limited to 'pygments/lexer.py')
-rw-r--r--  pygments/lexer.py | 153
1 file changed, 122 insertions(+), 31 deletions(-)
diff --git a/pygments/lexer.py b/pygments/lexer.py
index a22768a3..567e85f8 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -5,22 +5,28 @@
Base lexer classes.
- :copyright: Copyright 2006-2010 by the Pygments team, see AUTHORS.
+ :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
-import re
+import re, itertools
from pygments.filter import apply_filters, Filter
from pygments.filters import get_filter_by_name
from pygments.token import Error, Text, Other, _TokenType
from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
- make_analysator
+ make_analysator, text_type, add_metaclass, iteritems
__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
- 'LexerContext', 'include', 'bygroups', 'using', 'this']
+ 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this']
+_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
+ (b'\xff\xfe\0\0', 'utf-32'),
+ (b'\0\0\xfe\xff', 'utf-32be'),
+ (b'\xff\xfe', 'utf-16'),
+ (b'\xfe\xff', 'utf-16be')]
+
_default_analyse = staticmethod(lambda x: 0.0)
@@ -36,6 +42,7 @@ class LexerMeta(type):
return type.__new__(cls, name, bases, d)
+@add_metaclass(LexerMeta)
class Lexer(object):
"""
Lexer for a specific language.
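The add_metaclass decorator applied above (imported from pygments.util) replaces the Python-2-only __metaclass__ attribute removed further down. A minimal sketch of what such a six-style helper does, assuming the usual implementation; the actual pygments.util version may differ in detail:

    def add_metaclass(metaclass):
        """Class decorator that re-creates the class through the given metaclass."""
        def wrapper(cls):
            body = cls.__dict__.copy()
            body.pop('__dict__', None)
            body.pop('__weakref__', None)
            return metaclass(cls.__name__, cls.__bases__, body)
        return wrapper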
@@ -49,7 +56,9 @@ class Lexer(object):
``ensurenl``
Make sure that the input ends with a newline (default: True). This
is required for some lexers that consume input linewise.
- *New in Pygments 1.3.*
+
+ .. versionadded:: 1.3
+
``tabsize``
If given and greater than 0, expand tabs in the input (default: 0).
``encoding``
@@ -66,16 +75,17 @@ class Lexer(object):
#: Shortcuts for the lexer
aliases = []
- #: fn match rules
+ #: File name globs
filenames = []
- #: fn alias filenames
+ #: Secondary file name globs
alias_filenames = []
- #: mime types
+ #: MIME types
mimetypes = []
- __metaclass__ = LexerMeta
+ #: Priority, should multiple lexers match and no content is provided
+ priority = 0
def __init__(self, **options):
self.options = options
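For orientation, a hypothetical lexer declaring these class attributes could look like this (all names and globs are made up):

    from pygments.lexer import RegexLexer
    from pygments.token import Text

    class FooLexer(RegexLexer):
        name = 'Foo'
        aliases = ['foo']              # shortcuts for get_lexer_by_name()
        filenames = ['*.foo']          # primary file name globs
        alias_filenames = ['*.txt']    # secondary globs, only used with content checks
        mimetypes = ['text/x-foo']
        priority = 0.1                 # tie-breaker when several lexers claim a match
        tokens = {'root': [(r'.+\n?', Text)]}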
@@ -127,7 +137,7 @@ class Lexer(object):
Also preprocess the text, i.e. expand tabs and strip it if
wanted and applies registered filters.
"""
- if not isinstance(text, unicode):
+ if not isinstance(text, text_type):
if self.encoding == 'guess':
try:
text = text.decode('utf-8')
@@ -142,10 +152,26 @@ class Lexer(object):
raise ImportError('To enable chardet encoding guessing, '
'please install the chardet library '
'from http://chardet.feedparser.org/')
- enc = chardet.detect(text)
- text = text.decode(enc['encoding'])
+ # check for BOM first
+ decoded = None
+ for bom, encoding in _encoding_map:
+ if text.startswith(bom):
+ decoded = text[len(bom):].decode(encoding, 'replace')
+ break
+ # no BOM found, so use chardet
+ if decoded is None:
+ enc = chardet.detect(text[:1024]) # Guess using first 1KB
+ decoded = text.decode(enc.get('encoding') or 'utf-8',
+ 'replace')
+ text = decoded
else:
text = text.decode(self.encoding)
+ if text.startswith(u'\ufeff'):
+ text = text[len(u'\ufeff'):]
+ else:
+ if text.startswith(u'\ufeff'):
+ text = text[len(u'\ufeff'):]
+
# text now *is* a unicode string
text = text.replace('\r\n', '\n')
text = text.replace('\r', '\n')
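The guessing branch above now looks for a BOM first and only asks chardet if none is found, sampling just the first kilobyte. A standalone sketch of the same strategy; the helper name is made up and chardet is assumed to be installed:

    import chardet

    def guess_decode(raw):
        """Decode bytes: honour a BOM if present, otherwise guess with chardet."""
        for bom, encoding in _encoding_map:   # the table added near the top of this patch
            if raw.startswith(bom):
                return raw[len(bom):].decode(encoding, 'replace')
        enc = chardet.detect(raw[:1024])      # guess from the first 1 KB
        return raw.decode(enc.get('encoding') or 'utf-8', 'replace')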
@@ -168,7 +194,9 @@ class Lexer(object):
def get_tokens_unprocessed(self, text):
"""
- Return an iterable of (tokentype, value) pairs.
+ Return an iterable of (index, tokentype, value) pairs where "index"
+ is the starting position of the token within the input text.
+
In subclasses, implement this method as a generator to
maximize effectiveness.
"""
@@ -221,6 +249,16 @@ class include(str):
pass
+class _inherit(object):
+ """
+    Indicates that a state should inherit from its superclass.
+ """
+ def __repr__(self):
+ return 'inherit'
+
+inherit = _inherit()
+
+
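A hypothetical pair of lexers showing how the new "inherit" marker is meant to be used (all names are made up):

    from pygments.lexer import RegexLexer, inherit
    from pygments.token import Comment, Keyword, Text

    class BaseFooLexer(RegexLexer):
        name = 'BaseFoo'
        tokens = {
            'root': [
                (r'#.*?$', Comment),
                (r'\s+', Text),
            ],
        }

    class ExtendedFooLexer(BaseFooLexer):
        name = 'ExtendedFoo'
        tokens = {
            'root': [
                (r'\b(?:if|else)\b', Keyword),
                inherit,   # splice BaseFooLexer's 'root' rules in at this point
            ],
        }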
class combined(tuple):
"""
Indicates a state combined from multiple states.
@@ -274,12 +312,14 @@ def bygroups(*args):
if data:
yield match.start(i + 1), action, data
else:
- if ctx:
- ctx.pos = match.start(i + 1)
- for item in action(lexer, _PseudoMatch(match.start(i + 1),
- match.group(i + 1)), ctx):
- if item:
- yield item
+ data = match.group(i + 1)
+ if data is not None:
+ if ctx:
+ ctx.pos = match.start(i + 1)
+ for item in action(lexer, _PseudoMatch(match.start(i + 1),
+ data), ctx):
+ if item:
+ yield item
if ctx:
ctx.pos = match.end()
return callback
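The added "data is not None" check matters when a group whose action is a callback (for example using(this)) did not take part in the match. A hypothetical rule with an optional group that can now hit this branch safely:

    from pygments.lexer import RegexLexer, bygroups, using, this
    from pygments.token import Punctuation, Text

    class TagLexer(RegexLexer):
        name = 'TagExample'   # made-up lexer
        tokens = {
            'root': [
                # for the input '<>' the middle group is None; the callback
                # for it is now skipped instead of being called with no data
                (r'(<)([^<>]+)?(>)', bygroups(Punctuation, using(this), Punctuation)),
                (r'[^<>]+', Text),
            ],
        }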
@@ -409,12 +449,15 @@ class RegexLexerMeta(LexerMeta):
tokens.extend(cls._process_state(unprocessed, processed,
str(tdef)))
continue
+ if isinstance(tdef, _inherit):
+ # processed already
+ continue
assert type(tdef) is tuple, "wrong rule def %r" % tdef
try:
rex = cls._process_regex(tdef[0], rflags)
- except Exception, err:
+ except Exception as err:
raise ValueError("uncompilable regex %r in state %r of %r: %s" %
(tdef[0], state, cls, err))
@@ -433,31 +476,74 @@ class RegexLexerMeta(LexerMeta):
"""Preprocess a dictionary of token definitions."""
processed = cls._all_tokens[name] = {}
tokendefs = tokendefs or cls.tokens[name]
- for state in tokendefs.keys():
+ for state in list(tokendefs):
cls._process_state(tokendefs, processed, state)
return processed
+ def get_tokendefs(cls):
+ """
+ Merge tokens from superclasses in MRO order, returning a single tokendef
+ dictionary.
+
+ Any state that is not defined by a subclass will be inherited
+ automatically. States that *are* defined by subclasses will, by
+ default, override that state in the superclass. If a subclass wishes to
+ inherit definitions from a superclass, it can use the special value
+ "inherit", which will cause the superclass' state definition to be
+ included at that point in the state.
+ """
+ tokens = {}
+ inheritable = {}
+ for c in itertools.chain((cls,), cls.__mro__):
+ toks = c.__dict__.get('tokens', {})
+
+ for state, items in iteritems(toks):
+ curitems = tokens.get(state)
+ if curitems is None:
+ tokens[state] = items
+ try:
+ inherit_ndx = items.index(inherit)
+ except ValueError:
+ continue
+ inheritable[state] = inherit_ndx
+ continue
+
+ inherit_ndx = inheritable.pop(state, None)
+ if inherit_ndx is None:
+ continue
+
+ # Replace the "inherit" value with the items
+ curitems[inherit_ndx:inherit_ndx+1] = items
+ try:
+ new_inh_ndx = items.index(inherit)
+ except ValueError:
+ pass
+ else:
+ inheritable[state] = inherit_ndx + new_inh_ndx
+
+ return tokens
+
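Continuing the hypothetical BaseFoo/ExtendedFoo pair shown earlier, the merged definition returned here for ExtendedFooLexer is equivalent to writing out:

    tokens = {
        'root': [
            (r'\b(?:if|else)\b', Keyword),
            (r'#.*?$', Comment),   # spliced in where "inherit" appeared
            (r'\s+', Text),
        ],
    }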
def __call__(cls, *args, **kwds):
"""Instantiate cls after preprocessing its token definitions."""
- if not hasattr(cls, '_tokens'):
+ if '_tokens' not in cls.__dict__:
cls._all_tokens = {}
cls._tmpname = 0
if hasattr(cls, 'token_variants') and cls.token_variants:
# don't process yet
pass
else:
- cls._tokens = cls.process_tokendef('', cls.tokens)
+ cls._tokens = cls.process_tokendef('', cls.get_tokendefs())
return type.__call__(cls, *args, **kwds)
+@add_metaclass(RegexLexerMeta)
class RegexLexer(Lexer):
"""
Base for simple stateful regular expression-based lexers.
Simplifies the lexing process so that you need only
provide a list of states and regular expressions.
"""
- __metaclass__ = RegexLexerMeta
#: Flags for compiling the regular expressions.
#: Defaults to MULTILINE.
@@ -525,10 +611,10 @@ class RegexLexer(Lexer):
try:
if text[pos] == '\n':
# at EOL, reset state to "root"
- pos += 1
statestack = ['root']
statetokens = tokendefs['root']
yield pos, Text, u'\n'
+ pos += 1
continue
yield pos, Error, text[pos]
pos += 1
@@ -587,7 +673,13 @@ class ExtendedRegexLexer(RegexLexer):
if new_state is not None:
# state transition
if isinstance(new_state, tuple):
- ctx.stack.extend(new_state)
+ for state in new_state:
+ if state == '#pop':
+ ctx.stack.pop()
+ elif state == '#push':
+ ctx.stack.append(ctx.stack[-1])
+ else:
+ ctx.stack.append(state)
elif isinstance(new_state, int):
# pop
del ctx.stack[new_state:]
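Tuple transitions are now interpreted element by element, as RegexLexer already did, so '#pop' and '#push' work inside them instead of being pushed onto the context stack verbatim. A hypothetical state definition relying on that:

    from pygments.lexer import ExtendedRegexLexer
    from pygments.token import Name, Punctuation, Text

    class BraceLexer(ExtendedRegexLexer):
        name = 'BraceExample'   # made-up lexer
        tokens = {
            'root': [
                (r'\{', Punctuation, 'inner'),
                (r'[^{}]+', Text),
            ],
            'inner': [
                # pop 'inner' and push 'other' with a single rule
                (r'\}', Punctuation, ('#pop', 'other')),
                (r'[^{}]+', Name),
            ],
            'other': [
                (r'[^{}]+', Text),
            ],
        }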
@@ -603,10 +695,10 @@ class ExtendedRegexLexer(RegexLexer):
break
if text[ctx.pos] == '\n':
# at EOL, reset state to "root"
- ctx.pos += 1
ctx.stack = ['root']
statetokens = tokendefs['root']
yield ctx.pos, Text, u'\n'
+ ctx.pos += 1
continue
yield ctx.pos, Error, text[ctx.pos]
ctx.pos += 1
@@ -630,7 +722,7 @@ def do_insertions(insertions, tokens):
"""
insertions = iter(insertions)
try:
- index, itokens = insertions.next()
+ index, itokens = next(insertions)
except StopIteration:
# no insertions
for item in tokens:
@@ -656,7 +748,7 @@ def do_insertions(insertions, tokens):
realpos += len(it_value)
oldi = index - i
try:
- index, itokens = insertions.next()
+ index, itokens = next(insertions)
except StopIteration:
insleft = False
break # not strictly necessary
@@ -671,8 +763,7 @@ def do_insertions(insertions, tokens):
yield realpos, t, v
realpos += len(v)
try:
- index, itokens = insertions.next()
+ index, itokens = next(insertions)
except StopIteration:
insleft = False
break # not strictly necessary
-
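do_insertions itself is unchanged apart from the next() calls. A typical use, in the style of the console lexers; the code and prompt strings here are illustrative:

    from pygments.lexer import do_insertions
    from pygments.lexers import PythonLexer
    from pygments.token import Generic

    code = u'print(1 + 1)\n'
    # splice a prompt token in front of the sub-lexed code
    insertions = [(0, [(0, Generic.Prompt, u'>>> ')])]
    for index, token, value in do_insertions(
            insertions, PythonLexer().get_tokens_unprocessed(code)):
        print(index, token, value)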