summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHiroaki Itoh <xwhhsprings@gmail.com>2015-11-12 15:40:10 +0900
committerHiroaki Itoh <xwhhsprings@gmail.com>2015-11-12 15:40:10 +0900
commitd5d2277a9c912dcff7df08a996a58cccdd18d39d (patch)
tree6a4520eaac6ab5cdd0e76dd06f162a562bd2c9d8
parentbcae575b929fd3bb87dcbcfbabfd7fb634f3d6c9 (diff)
parentac2fae04034e82f39eb401837893973a5a90af48 (diff)
downloadpygments-d5d2277a9c912dcff7df08a996a58cccdd18d39d.tar.gz
Merged default into add-wdiff-lexer
-rw-r--r--pygments/lexers/_mapping.py1
-rw-r--r--pygments/lexers/diff.py61
-rw-r--r--tests/examplefiles/wdiff_example1.wdiff731
-rw-r--r--tests/examplefiles/wdiff_example2.wdiff758
-rw-r--r--tests/examplefiles/wdiff_example3.wdiff10
5 files changed, 1560 insertions, 1 deletions
diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py
index c2251e51..1f3c6c27 100644
--- a/pygments/lexers/_mapping.py
+++ b/pygments/lexers/_mapping.py
@@ -403,6 +403,7 @@ LEXERS = {
'VerilogLexer': ('pygments.lexers.hdl', 'verilog', ('verilog', 'v'), ('*.v',), ('text/x-verilog',)),
'VhdlLexer': ('pygments.lexers.hdl', 'vhdl', ('vhdl',), ('*.vhdl', '*.vhd'), ('text/x-vhdl',)),
'VimLexer': ('pygments.lexers.textedit', 'VimL', ('vim',), ('*.vim', '.vimrc', '.exrc', '.gvimrc', '_vimrc', '_exrc', '_gvimrc', 'vimrc', 'gvimrc'), ('text/x-vim',)),
+ 'WDiffLexer': ('pygments.lexers.diff', 'WDiff', ('wdiff',), ('*.wdiff',), ()),
'X10Lexer': ('pygments.lexers.x10', 'X10', ('x10', 'xten'), ('*.x10',), ('text/x-x10',)),
'XQueryLexer': ('pygments.lexers.webmisc', 'XQuery', ('xquery', 'xqy', 'xq', 'xql', 'xqm'), ('*.xqy', '*.xquery', '*.xq', '*.xql', '*.xqm'), ('text/xquery', 'application/xquery')),
'XmlDjangoLexer': ('pygments.lexers.templates', 'XML+Django/Jinja', ('xml+django', 'xml+jinja'), (), ('application/xml+django', 'application/xml+jinja')),
diff --git a/pygments/lexers/diff.py b/pygments/lexers/diff.py
index d3b1589d..2056fbff 100644
--- a/pygments/lexers/diff.py
+++ b/pygments/lexers/diff.py
@@ -9,11 +9,13 @@
:license: BSD, see LICENSE for details.
"""
+import re
+
from pygments.lexer import RegexLexer, include, bygroups
from pygments.token import Text, Comment, Operator, Keyword, Name, Generic, \
Literal
-__all__ = ['DiffLexer', 'DarcsPatchLexer']
+__all__ = ['DiffLexer', 'DarcsPatchLexer', 'WDiffLexer']
class DiffLexer(RegexLexer):
@@ -104,3 +106,60 @@ class DarcsPatchLexer(RegexLexer):
(r'[^\n\[]+', Generic.Deleted),
],
}
+
+
+class WDiffLexer(RegexLexer):
+ """
+ A `wdiff <https://www.gnu.org/software/wdiff/>`_ lexer.
+
+ Note that:
+
+ * only to normal output (without option like -l).
+ * if target files of wdiff contain "[-", "-]", "{+", "+}",
+ especially they are unbalanced, this lexer will get confusing.
+
+ .. versionadded:: 2.1
+ """
+
+ name = 'WDiff'
+ aliases = ['wdiff',]
+ filenames = ['*.wdiff',]
+ mimetypes = []
+
+ flags = re.MULTILINE | re.DOTALL
+
+ # We can only assume "[-" after "[-" before "-]" is `nested`,
+ # for instance wdiff to wdiff outputs. We have no way to
+ # distinct these marker is of wdiff output from original text.
+
+ ins_op = r"\{\+"
+ ins_cl = r"\+\}"
+ del_op = r"\[\-"
+ del_cl = r"\-\]"
+ normal = r'[^{}[\]+-]+' # for performance
+ tokens = {
+ 'root': [
+ (ins_op, Generic.Inserted, 'inserted'),
+ (del_op, Generic.Deleted, 'deleted'),
+ (normal, Text),
+ (r'.', Text),
+ ],
+ 'inserted': [
+ (ins_op, Generic.Inserted, '#push'),
+ (del_op, Generic.Inserted, '#push'),
+ (del_cl, Generic.Inserted, '#pop'),
+
+ (ins_cl, Generic.Inserted, '#pop'),
+ (normal, Generic.Inserted),
+ (r'.', Generic.Inserted),
+ ],
+ 'deleted': [
+ (del_op, Generic.Deleted, '#push'),
+ (ins_op, Generic.Deleted, '#push'),
+ (ins_cl, Generic.Deleted, '#pop'),
+
+ (del_cl, Generic.Deleted, '#pop'),
+ (normal, Generic.Deleted),
+ (r'.', Generic.Deleted),
+ ],
+ }
diff --git a/tests/examplefiles/wdiff_example1.wdiff b/tests/examplefiles/wdiff_example1.wdiff
new file mode 100644
index 00000000..ca760812
--- /dev/null
+++ b/tests/examplefiles/wdiff_example1.wdiff
@@ -0,0 +1,731 @@
+.. -*- mode: rst -*-
+
+{+.. highlight:: python+}
+
+====================
+Write your own lexer
+====================
+
+If a lexer for your favorite language is missing in the Pygments package, you
+can easily write your own and extend Pygments.
+
+All you need can be found inside the :mod:`pygments.lexer` module. As you can
+read in the :doc:`API documentation <api>`, a lexer is a class that is
+initialized with some keyword arguments (the lexer options) and that provides a
+:meth:`.get_tokens_unprocessed()` method which is given a string or unicode
+object with the data to [-parse.-] {+lex.+}
+
+The :meth:`.get_tokens_unprocessed()` method must return an iterator or iterable
+containing tuples in the form ``(index, token, value)``. Normally you don't
+need to do this since there are [-numerous-] base lexers {+that do most of the work and that+}
+you can subclass.
+
+
+RegexLexer
+==========
+
+[-A very powerful (but quite easy to use)-]
+
+{+The+} lexer {+base class used by almost all of Pygments' lexers+} is the
+:class:`RegexLexer`. This
+[-lexer base-] class allows you to define lexing rules in terms of
+*regular expressions* for different *states*.
+
+States are groups of regular expressions that are matched against the input
+string at the *current position*. If one of these expressions matches, a
+corresponding action is performed [-(normally-] {+(such as+} yielding a token with a specific
+[-type),-]
+{+type, or changing state),+} the current position is set to where the last match
+ended and the matching process continues with the first regex of the current
+state.
+
+Lexer states are kept [-in-] {+on+} a [-state-] stack: each time a new state is entered, the new
+state is pushed onto the stack. The most basic lexers (like the `DiffLexer`)
+just need one state.
+
+Each state is defined as a list of tuples in the form (`regex`, `action`,
+`new_state`) where the last item is optional. In the most basic form, `action`
+is a token type (like `Name.Builtin`). That means: When `regex` matches, emit a
+token with the match text and type `tokentype` and push `new_state` on the state
+stack. If the new state is ``'#pop'``, the topmost state is popped from the
+stack instead. [-(To-] {+To+} pop more than one state, use ``'#pop:2'`` and so [-on.)-] {+on.+}
+``'#push'`` is a synonym for pushing the current state on the stack.
+
+The following example shows the `DiffLexer` from the builtin lexers. Note that
+it contains some additional attributes `name`, `aliases` and `filenames` which
+aren't required for a lexer. They are used by the builtin lexer lookup
+functions.
+
+[-.. sourcecode:: python-] {+::+}
+
+ from pygments.lexer import RegexLexer
+ from pygments.token import *
+
+ class DiffLexer(RegexLexer):
+ name = 'Diff'
+ aliases = ['diff']
+ filenames = ['*.diff']
+
+ tokens = {
+ 'root': [
+ (r' .*\n', Text),
+ (r'\+.*\n', Generic.Inserted),
+ (r'-.*\n', Generic.Deleted),
+ (r'@.*\n', Generic.Subheading),
+ (r'Index.*\n', Generic.Heading),
+ (r'=.*\n', Generic.Heading),
+ (r'.*\n', Text),
+ ]
+ }
+
+As you can see this lexer only uses one state. When the lexer starts scanning
+the text, it first checks if the current character is a space. If this is true
+it scans everything until newline and returns the [-parsed-] data as {+a+} `Text` [-token.-] {+token (which
+is the "no special highlighting" token).+}
+
+If this rule doesn't match, it checks if the current char is a plus sign. And
+so on.
+
+If no rule matches at the current position, the current char is emitted as an
+`Error` token that indicates a [-parsing-] {+lexing+} error, and the position is increased by
+[-1.-]
+{+one.+}
+
+
+Adding and testing a new lexer
+==============================
+
+To make [-pygments-] {+Pygments+} aware of your new lexer, you have to perform the following
+steps:
+
+First, change to the current directory containing the [-pygments-] {+Pygments+} source code:
+
+.. [-sourcecode::-] {+code-block::+} console
+
+ $ cd .../pygments-main
+
+{+Select a matching module under ``pygments/lexers``, or create a new module for
+your lexer class.+}
+
+Next, make sure the lexer is known from outside of the module. All modules in
+the ``pygments.lexers`` specify ``__all__``. For example, [-``other.py`` sets:
+
+.. sourcecode:: python-] {+``esoteric.py`` sets::+}
+
+ __all__ = ['BrainfuckLexer', 'BefungeLexer', ...]
+
+Simply add the name of your lexer class to this list.
+
+Finally the lexer can be made [-publically-] {+publicly+} known by rebuilding the lexer mapping:
+
+.. [-sourcecode::-] {+code-block::+} console
+
+ $ make mapfiles
+
+To test the new lexer, store an example file with the proper extension in
+``tests/examplefiles``. For example, to test your ``DiffLexer``, add a
+``tests/examplefiles/example.diff`` containing a sample diff output.
+
+Now you can use pygmentize to render your example to HTML:
+
+.. [-sourcecode::-] {+code-block::+} console
+
+ $ ./pygmentize -O full -f html -o /tmp/example.html tests/examplefiles/example.diff
+
+Note that this [-explicitely-] {+explicitly+} calls the ``pygmentize`` in the current directory
+by preceding it with ``./``. This ensures your modifications are used.
+Otherwise a possibly already installed, unmodified version without your new
+lexer would have been called from the system search path (``$PATH``).
+
+To view the result, open ``/tmp/example.html`` in your browser.
+
+Once the example renders as expected, you should run the complete test suite:
+
+.. [-sourcecode::-] {+code-block::+} console
+
+ $ make test
+
+{+It also tests that your lexer fulfills the lexer API and certain invariants,
+such as that the concatenation of all token text is the same as the input text.+}
+
+
+Regex Flags
+===========
+
+You can either define regex flags {+locally+} in the regex (``r'(?x)foo bar'``) or
+{+globally+} by adding a `flags` attribute to your lexer class. If no attribute is
+defined, it defaults to `re.MULTILINE`. For more [-informations-] {+information+} about regular
+expression flags see the {+page about+} `regular expressions`_ [-help page-] in the [-python-] {+Python+}
+documentation.
+
+.. _regular expressions: [-http://docs.python.org/lib/re-syntax.html-] {+http://docs.python.org/library/re.html#regular-expression-syntax+}
+
+
+Scanning multiple tokens at once
+================================
+
+{+So far, the `action` element in the rule tuple of regex, action and state has
+been a single token type. Now we look at the first of several other possible
+values.+}
+
+Here is a more complex lexer that highlights INI files. INI files consist of
+sections, comments and [-key-] {+``key+} = [-value pairs:
+
+.. sourcecode:: python-] {+value`` pairs::+}
+
+ from pygments.lexer import RegexLexer, bygroups
+ from pygments.token import *
+
+ class IniLexer(RegexLexer):
+ name = 'INI'
+ aliases = ['ini', 'cfg']
+ filenames = ['*.ini', '*.cfg']
+
+ tokens = {
+ 'root': [
+ (r'\s+', Text),
+ (r';.*?$', Comment),
+ (r'\[.*?\]$', Keyword),
+ (r'(.*?)(\s*)(=)(\s*)(.*?)$',
+ bygroups(Name.Attribute, Text, Operator, Text, String))
+ ]
+ }
+
+The lexer first looks for whitespace, comments and section names. [-And later-] {+Later+} it
+looks for a line that looks like a key, value pair, separated by an ``'='``
+sign, and optional whitespace.
+
+The `bygroups` helper [-makes sure that-] {+yields+} each {+capturing+} group [-is yielded-] {+in the regex+} with a different
+token type. First the `Name.Attribute` token, then a `Text` token for the
+optional whitespace, after that a `Operator` token for the equals sign. Then a
+`Text` token for the whitespace again. The rest of the line is returned as
+`String`.
+
+Note that for this to work, every part of the match must be inside a capturing
+group (a ``(...)``), and there must not be any nested capturing groups. If you
+nevertheless need a group, use a non-capturing group defined using this syntax:
+[-``r'(?:some|words|here)'``-]
+{+``(?:some|words|here)``+} (note the ``?:`` after the beginning parenthesis).
+
+If you find yourself needing a capturing group inside the regex which shouldn't
+be part of the output but is used in the regular expressions for backreferencing
+(eg: ``r'(<(foo|bar)>)(.*?)(</\2>)'``), you can pass `None` to the bygroups
+function and [-it will skip-] that group will be skipped in the output.
+
+
+Changing states
+===============
+
+Many lexers need multiple states to work as expected. For example, some
+languages allow multiline comments to be nested. Since this is a recursive
+pattern it's impossible to lex just using regular expressions.
+
+Here is [-the solution:
+
+.. sourcecode:: python-] {+a lexer that recognizes C++ style comments (multi-line with ``/* */``
+and single-line with ``//`` until end of line)::+}
+
+ from pygments.lexer import RegexLexer
+ from pygments.token import *
+
+ class [-ExampleLexer(RegexLexer):-] {+CppCommentLexer(RegexLexer):+}
+ name = 'Example Lexer with states'
+
+ tokens = {
+ 'root': [
+ (r'[^/]+', Text),
+ (r'/\*', Comment.Multiline, 'comment'),
+ (r'//.*?$', Comment.Singleline),
+ (r'/', Text)
+ ],
+ 'comment': [
+ (r'[^*/]', Comment.Multiline),
+ (r'/\*', Comment.Multiline, '#push'),
+ (r'\*/', Comment.Multiline, '#pop'),
+ (r'[*/]', Comment.Multiline)
+ ]
+ }
+
+This lexer starts lexing in the ``'root'`` state. It tries to match as much as
+possible until it finds a slash (``'/'``). If the next character after the slash
+is [-a star-] {+an asterisk+} (``'*'``) the `RegexLexer` sends those two characters to the
+output stream marked as `Comment.Multiline` and continues [-parsing-] {+lexing+} with the rules
+defined in the ``'comment'`` state.
+
+If there wasn't [-a star-] {+an asterisk+} after the slash, the `RegexLexer` checks if it's a
+[-singleline-]
+{+Singleline+} comment [-(eg:-] {+(i.e.+} followed by a second slash). If this also wasn't the
+case it must be a single [-slash-] {+slash, which is not a comment starter+} (the separate
+regex for a single slash must also be given, else the slash would be marked as
+an error token).
+
+Inside the ``'comment'`` state, we do the same thing again. Scan until the
+lexer finds a star or slash. If it's the opening of a multiline comment, push
+the ``'comment'`` state on the stack and continue scanning, again in the
+``'comment'`` state. Else, check if it's the end of the multiline comment. If
+yes, pop one state from the stack.
+
+Note: If you pop from an empty stack you'll get an `IndexError`. (There is an
+easy way to prevent this from happening: don't ``'#pop'`` in the root state).
+
+If the `RegexLexer` encounters a newline that is flagged as an error token, the
+stack is emptied and the lexer continues scanning in the ``'root'`` state. This
+[-helps-]
+{+can help+} producing error-tolerant highlighting for erroneous input, e.g. when a
+single-line string is not closed.
+
+
+Advanced state tricks
+=====================
+
+There are a few more things you can do with states:
+
+- You can push multiple states onto the stack if you give a tuple instead of a
+ simple string as the third item in a rule tuple. For example, if you want to
+ match a comment containing a directive, something [-like::-] {+like:
+
+ .. code-block:: text+}
+
+ /* <processing directive> rest of comment */
+
+ you can use this [-rule:
+
+ .. sourcecode:: python-] {+rule::+}
+
+ tokens = {
+ 'root': [
+ (r'/\* <', Comment, ('comment', 'directive')),
+ ...
+ ],
+ 'directive': [
+ (r'[^>]*', Comment.Directive),
+ (r'>', Comment, '#pop'),
+ ],
+ 'comment': [
+ (r'[^*]+', Comment),
+ (r'\*/', Comment, '#pop'),
+ (r'\*', Comment),
+ ]
+ }
+
+ When this encounters the above sample, first ``'comment'`` and ``'directive'``
+ are pushed onto the stack, then the lexer continues in the directive state
+ until it finds the closing ``>``, then it continues in the comment state until
+ the closing ``*/``. Then, both states are popped from the stack again and
+ lexing continues in the root state.
+
+ .. versionadded:: 0.9
+ The tuple can contain the special ``'#push'`` and ``'#pop'`` (but not
+ ``'#pop:n'``) directives.
+
+
+- You can include the rules of a state in the definition of another. This is
+ done by using `include` from [-`pygments.lexer`:
+
+ .. sourcecode:: python-] {+`pygments.lexer`::+}
+
+ from pygments.lexer import RegexLexer, bygroups, include
+ from pygments.token import *
+
+ class ExampleLexer(RegexLexer):
+ tokens = {
+ 'comments': [
+ (r'/\*.*?\*/', Comment),
+ (r'//.*?\n', Comment),
+ ],
+ 'root': [
+ include('comments'),
+ (r'(function )(\w+)( {)',
+ bygroups(Keyword, Name, Keyword), 'function'),
+ (r'.', Text),
+ ],
+ 'function': [
+ (r'[^}/]+', Text),
+ include('comments'),
+ (r'/', Text),
+ [-(r'}',-]
+ {+(r'\}',+} Keyword, '#pop'),
+ ]
+ }
+
+ This is a hypothetical lexer for a language that consist of functions and
+ comments. Because comments can occur at toplevel and in functions, we need
+ rules for comments in both states. As you can see, the `include` helper saves
+ repeating rules that occur more than once (in this example, the state
+ ``'comment'`` will never be entered by the lexer, as it's only there to be
+ included in ``'root'`` and ``'function'``).
+
+- Sometimes, you may want to "combine" a state from existing ones. This is
+ possible with the [-`combine`-] {+`combined`+} helper from `pygments.lexer`.
+
+ If you, instead of a new state, write ``combined('state1', 'state2')`` as the
+ third item of a rule tuple, a new anonymous state will be formed from state1
+ and state2 and if the rule matches, the lexer will enter this state.
+
+ This is not used very often, but can be helpful in some cases, such as the
+ `PythonLexer`'s string literal processing.
+
+- If you want your lexer to start lexing in a different state you can modify the
+ stack by [-overloading-] {+overriding+} the `get_tokens_unprocessed()` [-method:
+
+ .. sourcecode:: python-] {+method::+}
+
+ from pygments.lexer import RegexLexer
+
+ class [-MyLexer(RegexLexer):-] {+ExampleLexer(RegexLexer):+}
+ tokens = {...}
+
+ def get_tokens_unprocessed(self, [-text):
+ stack = ['root', 'otherstate']-] {+text, stack=('root', 'otherstate')):+}
+ for item in RegexLexer.get_tokens_unprocessed(text, stack):
+ yield item
+
+ Some lexers like the `PhpLexer` use this to make the leading ``<?php``
+ preprocessor comments optional. Note that you can crash the lexer easily by
+ putting values into the stack that don't exist in the token map. Also
+ removing ``'root'`` from the stack can result in strange errors!
+
+- [-An-] {+In some lexers, a state should be popped if anything is encountered that isn't
+ matched by a rule in the state. You could use an+} empty regex at the end of [-a-]
+ {+the+} state list, [-combined with ``'#pop'``, can
+ act as-] {+but Pygments provides+} a [-return point-] {+more obvious way of spelling that:
+ ``default('#pop')`` is equivalent to ``('', Text, '#pop')``.
+
+ .. versionadded:: 2.0
+
+
+Subclassing lexers derived+} from {+RegexLexer
+==========================================
+
+.. versionadded:: 1.6
+
+Sometimes multiple languages are very similar, but should still be lexed by
+different lexer classes.
+
+When subclassing+} a {+lexer derived from RegexLexer, the ``tokens`` dictionaries
+defined in the parent and child class are merged. For example::
+
+ from pygments.lexer import RegexLexer, inherit
+ from pygments.token import *
+
+ class BaseLexer(RegexLexer):
+ tokens = {
+ 'root': [
+ ('[a-z]+', Name),
+ (r'/\*', Comment, 'comment'),
+ ('"', String, 'string'),
+ ('\s+', Text),
+ ],
+ 'string': [
+ ('[^"]+', String),
+ ('"', String, '#pop'),
+ ],
+ 'comment': [
+ ...
+ ],
+ }
+
+ class DerivedLexer(BaseLexer):
+ tokens = {
+ 'root': [
+ ('[0-9]+', Number),
+ inherit,
+ ],
+ 'string': [
+ (r'[^"\\]+', String),
+ (r'\\.', String.Escape),
+ ('"', String, '#pop'),
+ ],
+ }
+
+The `BaseLexer` defines two states, lexing names and strings. The
+`DerivedLexer` defines its own tokens dictionary, which extends the definitions
+of the base lexer:
+
+* The "root"+} state {+has an additional rule and then the special object `inherit`,
+ which tells Pygments to insert the token definitions of the parent class at+}
+ that [-doesn't have a clear end marker.-] {+point.
+
+* The "string" state is replaced entirely, since there is not `inherit` rule.
+
+* The "comment" state is inherited entirely.+}
+
+
+Using multiple lexers
+=====================
+
+Using multiple lexers for the same input can be tricky. One of the easiest
+combination techniques is shown here: You can replace the [-token type-] {+action+} entry in a rule
+tuple [-(the second item)-] with a lexer class. The matched text will then be lexed with that lexer,
+and the resulting tokens will be yielded.
+
+For example, look at this stripped-down HTML [-lexer:
+
+.. sourcecode:: python-] {+lexer::+}
+
+ from pygments.lexer import RegexLexer, bygroups, using
+ from pygments.token import *
+ from [-pygments.lexers.web-] {+pygments.lexers.javascript+} import JavascriptLexer
+
+ class HtmlLexer(RegexLexer):
+ name = 'HTML'
+ aliases = ['html']
+ filenames = ['*.html', '*.htm']
+
+ flags = re.IGNORECASE | re.DOTALL
+ tokens = {
+ 'root': [
+ ('[^<&]+', Text),
+ ('&.*?;', Name.Entity),
+ (r'<\s*script\s*', Name.Tag, ('script-content', 'tag')),
+ (r'<\s*[a-zA-Z0-9:]+', Name.Tag, 'tag'),
+ (r'<\s*/\s*[a-zA-Z0-9:]+\s*>', Name.Tag),
+ ],
+ 'script-content': [
+ (r'(.+?)(<\s*/\s*script\s*>)',
+ bygroups(using(JavascriptLexer), Name.Tag),
+ '#pop'),
+ ]
+ }
+
+Here the content of a ``<script>`` tag is passed to a newly created instance of
+a `JavascriptLexer` and not processed by the `HtmlLexer`. This is done using
+the `using` helper that takes the other lexer class as its parameter.
+
+Note the combination of `bygroups` and `using`. This makes sure that the
+content up to the ``</script>`` end tag is processed by the `JavascriptLexer`,
+while the end tag is yielded as a normal token with the `Name.Tag` type.
+
+[-As an additional goodie, if the lexer class is replaced by `this` (imported from
+`pygments.lexer`), the "other" lexer will be the current one (because you cannot
+refer to the current class within the code that runs at class definition time).-]
+
+Also note the ``(r'<\s*script\s*', Name.Tag, ('script-content', 'tag'))`` rule.
+Here, two states are pushed onto the state stack, ``'script-content'`` and
+``'tag'``. That means that first ``'tag'`` is processed, which will [-parse-] {+lex+}
+attributes and the closing ``>``, then the ``'tag'`` state is popped and the
+next state on top of the stack will be ``'script-content'``.
+
+{+Since you cannot refer to the class currently being defined, use `this`
+(imported from `pygments.lexer`) to refer to the current lexer class, i.e.
+``using(this)``. This construct may seem unnecessary, but this is often the
+most obvious way of lexing arbitrary syntax between fixed delimiters without
+introducing deeply nested states.+}
+
+The `using()` helper has a special keyword argument, `state`, which works as
+follows: if given, the lexer to use initially is not in the ``"root"`` state,
+but in the state given by this argument. This [-*only* works-] {+does not work+} with [-a `RegexLexer`.-] {+advanced
+`RegexLexer` subclasses such as `ExtendedRegexLexer` (see below).+}
+
+Any other keywords arguments passed to `using()` are added to the keyword
+arguments used to create the lexer.
+
+
+Delegating Lexer
+================
+
+Another approach for nested lexers is the `DelegatingLexer` which is for example
+used for the template engine lexers. It takes two lexers as arguments on
+initialisation: a `root_lexer` and a `language_lexer`.
+
+The input is processed as follows: First, the whole text is lexed with the
+`language_lexer`. All tokens yielded with [-a-] {+the special+} type of ``Other`` are
+then concatenated and given to the `root_lexer`. The language tokens of the
+`language_lexer` are then inserted into the `root_lexer`'s token stream at the
+appropriate positions.
+
+[-.. sourcecode:: python-] {+::+}
+
+ from pygments.lexer import DelegatingLexer
+ from pygments.lexers.web import HtmlLexer, PhpLexer
+
+ class HtmlPhpLexer(DelegatingLexer):
+ def __init__(self, **options):
+ super(HtmlPhpLexer, self).__init__(HtmlLexer, PhpLexer, **options)
+
+This procedure ensures that e.g. HTML with template tags in it is highlighted
+correctly even if the template tags are put into HTML tags or attributes.
+
+If you want to change the needle token ``Other`` to something else, you can give
+the lexer another token type as the third [-parameter:
+
+.. sourcecode:: python-] {+parameter::+}
+
+ DelegatingLexer.__init__(MyLexer, OtherLexer, Text, **options)
+
+
+Callbacks
+=========
+
+Sometimes the grammar of a language is so complex that a lexer would be unable
+to [-parse-] {+process+} it just by using regular expressions and stacks.
+
+For this, the `RegexLexer` allows callbacks to be given in rule tuples, instead
+of token types (`bygroups` and `using` are nothing else but preimplemented
+callbacks). The callback must be a function taking two arguments:
+
+* the lexer itself
+* the match object for the last matched rule
+
+The callback must then return an iterable of (or simply yield) ``(index,
+tokentype, value)`` tuples, which are then just passed through by
+`get_tokens_unprocessed()`. The ``index`` here is the position of the token in
+the input string, ``tokentype`` is the normal token type (like `Name.Builtin`),
+and ``value`` the associated part of the input string.
+
+You can see an example [-here:
+
+.. sourcecode:: python-] {+here::+}
+
+ from pygments.lexer import RegexLexer
+ from pygments.token import Generic
+
+ class HypotheticLexer(RegexLexer):
+
+ def headline_callback(lexer, match):
+ equal_signs = match.group(1)
+ text = match.group(2)
+ yield match.start(), Generic.Headline, equal_signs + text + equal_signs
+
+ tokens = {
+ 'root': [
+ (r'(=+)(.*?)(\1)', headline_callback)
+ ]
+ }
+
+If the regex for the `headline_callback` matches, the function is called with
+the match object. Note that after the callback is done, processing continues
+normally, that is, after the end of the previous match. The callback has no
+possibility to influence the position.
+
+There are not really any simple examples for lexer callbacks, but you can see
+them in action e.g. in the [-`compiled.py`_ source code-] {+`SMLLexer` class+} in [-the `CLexer` and
+`JavaLexer` classes.-] {+`ml.py`_.+}
+
+.. [-_compiled.py: http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/compiled.py-] {+_ml.py: http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/ml.py+}
+
+
+The ExtendedRegexLexer class
+============================
+
+The `RegexLexer`, even with callbacks, unfortunately isn't powerful enough for
+the funky syntax rules of [-some-] languages [-that will go unnamed,-] such as Ruby.
+
+But fear not; even then you don't have to abandon the regular expression
+[-approach. For-]
+{+approach:+} Pygments has a subclass of `RegexLexer`, the `ExtendedRegexLexer`.
+All features known from RegexLexers are available here too, and the tokens are
+specified in exactly the same way, *except* for one detail:
+
+The `get_tokens_unprocessed()` method holds its internal state data not as local
+variables, but in an instance of the `pygments.lexer.LexerContext` class, and
+that instance is passed to callbacks as a third argument. This means that you
+can modify the lexer state in callbacks.
+
+The `LexerContext` class has the following members:
+
+* `text` -- the input text
+* `pos` -- the current starting position that is used for matching regexes
+* `stack` -- a list containing the state stack
+* `end` -- the maximum position to which regexes are matched, this defaults to
+ the length of `text`
+
+Additionally, the `get_tokens_unprocessed()` method can be given a
+`LexerContext` instead of a string and will then process this context instead of
+creating a new one for the string argument.
+
+Note that because you can set the current position to anything in the callback,
+it won't be automatically be set by the caller after the callback is finished.
+For example, this is how the hypothetical lexer above would be written with the
+[-`ExtendedRegexLexer`:
+
+.. sourcecode:: python-]
+{+`ExtendedRegexLexer`::+}
+
+ from pygments.lexer import ExtendedRegexLexer
+ from pygments.token import Generic
+
+ class ExHypotheticLexer(ExtendedRegexLexer):
+
+ def headline_callback(lexer, match, ctx):
+ equal_signs = match.group(1)
+ text = match.group(2)
+ yield match.start(), Generic.Headline, equal_signs + text + equal_signs
+ ctx.pos = match.end()
+
+ tokens = {
+ 'root': [
+ (r'(=+)(.*?)(\1)', headline_callback)
+ ]
+ }
+
+This might sound confusing (and it can really be). But it is needed, and for an
+example look at the Ruby lexer in [-`agile.py`_.-] {+`ruby.py`_.+}
+
+.. [-_agile.py: https://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/agile.py
+
+
+Filtering-] {+_ruby.py: https://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/ruby.py
+
+
+Handling Lists of Keywords
+==========================
+
+For a relatively short list (hundreds) you can construct an optimized regular
+expression directly using ``words()`` (longer lists, see next section). This
+function handles a few things for you automatically, including escaping
+metacharacters and Python's first-match rather than longest-match in
+alternations. Feel free to put the lists themselves in
+``pygments/lexers/_$lang_builtins.py`` (see examples there), and generated by
+code if possible.
+
+An example of using ``words()`` is something like::
+
+ from pygments.lexer import RegexLexer, words, Name
+
+ class MyLexer(RegexLexer):
+
+ tokens = {
+ 'root': [
+ (words(('else', 'elseif'), suffix=r'\b'), Name.Builtin),
+ (r'\w+', Name),
+ ],
+ }
+
+As you can see, you can add ``prefix`` and ``suffix`` parts to the constructed
+regex.
+
+
+Modifying+} Token Streams
+=======================
+
+Some languages ship a lot of builtin functions (for example PHP). The total
+amount of those functions differs from system to system because not everybody
+has every extension installed. In the case of PHP there are over 3000 builtin
+functions. That's an [-incredible-] {+incredibly+} huge amount of functions, much more than you
+[-can-]
+{+want to+} put into a regular expression.
+
+But because only `Name` tokens can be function names [-it's-] {+this is+} solvable by
+overriding the ``get_tokens_unprocessed()`` method. The following lexer
+subclasses the `PythonLexer` so that it highlights some additional names as
+pseudo [-keywords:
+
+.. sourcecode:: python-] {+keywords::+}
+
+ from [-pygments.lexers.agile-] {+pygments.lexers.python+} import PythonLexer
+ from pygments.token import Name, Keyword
+
+ class MyPythonLexer(PythonLexer):
+ EXTRA_KEYWORDS = [-['foo',-] {+set(('foo',+} 'bar', 'foobar', 'barfoo', 'spam', [-'eggs']-] {+'eggs'))+}
+
+ def get_tokens_unprocessed(self, text):
+ for index, token, value in PythonLexer.get_tokens_unprocessed(self, text):
+ if token is Name and value in self.EXTRA_KEYWORDS:
+ yield index, Keyword.Pseudo, value
+ else:
+ yield index, token, value
+
+The `PhpLexer` and `LuaLexer` use this method to resolve builtin functions.
+
+[-.. note:: Do not confuse this with the :doc:`filter <filters>` system.-]
diff --git a/tests/examplefiles/wdiff_example2.wdiff b/tests/examplefiles/wdiff_example2.wdiff
new file mode 100644
index 00000000..1a746fe5
--- /dev/null
+++ b/tests/examplefiles/wdiff_example2.wdiff
@@ -0,0 +1,758 @@
+.. -*- mode: rst -*-
+
+[-..-]
+
+{+{+..+} highlight:: [-python-] {+python+}+}
+
+====================
+Write your own lexer
+====================
+
+If a lexer for your favorite language is missing in the Pygments package, you
+can easily write your own and extend Pygments.
+
+All you need can be found inside the :mod:`pygments.lexer` module. As you can
+read in the :doc:`API documentation <api>`, a lexer is a class that is
+initialized with some keyword arguments (the lexer options) and that provides a
+:meth:`.get_tokens_unprocessed()` method which is given a string or unicode
+object with the data to [-lex.-] {+[-parse.-] {+lex.+}+}
+
+The :meth:`.get_tokens_unprocessed()` method must return an iterator or iterable
+containing tuples in the form ``(index, token, value)``. Normally you don't
+need to do this since there are {+[-numerous-]+} base lexers [-that-] {+{+that+} do most of the work and [-that-] {+that+}+}
+you can subclass.
+
+
+RegexLexer
+==========
+
+[-The-]
+
+{+[-A very powerful (but quite easy to use)-]
+
+{+The+}+} lexer [-base-] {+{+base+} class used by almost all of Pygments' [-lexers-] {+lexers+}+} is the
+:class:`RegexLexer`. This
+{+[-lexer base-]+} class allows you to define lexing rules in terms of
+*regular expressions* for different *states*.
+
+States are groups of regular expressions that are matched against the input
+string at the *current position*. If one of these expressions matches, a
+corresponding action is performed [-(such as-] {+[-(normally-] {+(such as+}+} yielding a token with a specific
+[-type,-]
+{+[-type),-]
+{+type,+} or changing [-state),-] {+state),+}+} the current position is set to where the last match
+ended and the matching process continues with the first regex of the current
+state.
+
+Lexer states are kept [-on-] {+[-in-] {+on+}+} a {+[-state-]+} stack: each time a new state is entered, the new
+state is pushed onto the stack. The most basic lexers (like the `DiffLexer`)
+just need one state.
+
+Each state is defined as a list of tuples in the form (`regex`, `action`,
+`new_state`) where the last item is optional. In the most basic form, `action`
+is a token type (like `Name.Builtin`). That means: When `regex` matches, emit a
+token with the match text and type `tokentype` and push `new_state` on the state
+stack. If the new state is ``'#pop'``, the topmost state is popped from the
+stack instead. [-To-] {+[-(To-] {+To+}+} pop more than one state, use ``'#pop:2'`` and so [-on.-] {+[-on.)-] {+on.+}+}
+``'#push'`` is a synonym for pushing the current state on the stack.
+
+The following example shows the `DiffLexer` from the builtin lexers. Note that
+it contains some additional attributes `name`, `aliases` and `filenames` which
+aren't required for a lexer. They are used by the builtin lexer lookup
+functions. [-::-]
+
+{+[-.. sourcecode:: python-] {+::+}+}
+
+ from pygments.lexer import RegexLexer
+ from pygments.token import *
+
+ class DiffLexer(RegexLexer):
+ name = 'Diff'
+ aliases = ['diff']
+ filenames = ['*.diff']
+
+ tokens = {
+ 'root': [
+ (r' .*\n', Text),
+ (r'\+.*\n', Generic.Inserted),
+ (r'-.*\n', Generic.Deleted),
+ (r'@.*\n', Generic.Subheading),
+ (r'Index.*\n', Generic.Heading),
+ (r'=.*\n', Generic.Heading),
+ (r'.*\n', Text),
+ ]
+ }
+
+As you can see this lexer only uses one state. When the lexer starts scanning
+the text, it first checks if the current character is a space. If this is true
+it scans everything until newline and returns the {+[-parsed-]+} data as [-a-] {+{+a+}+} `Text` [-token-] {+[-token.-] {+token+} (which
+is the "no special highlighting" [-token).-] {+token).+}+}
+
+If this rule doesn't match, it checks if the current char is a plus sign. And
+so on.
+
+If no rule matches at the current position, the current char is emitted as an
+`Error` token that indicates a [-lexing-] {+[-parsing-] {+lexing+}+} error, and the position is increased by
+[-one.-]
+{+[-1.-]
+{+one.+}+}
+
+
+Adding and testing a new lexer
+==============================
+
+To make [-Pygments-] {+[-pygments-] {+Pygments+}+} aware of your new lexer, you have to perform the following
+steps:
+
+First, change to the current directory containing the [-Pygments-] {+[-pygments-] {+Pygments+}+} source code:
+
+.. [-code-block::-] {+[-sourcecode::-] {+code-block::+}+} console
+
+ $ cd .../pygments-main
+
+[-Select-]
+
+{+{+Select+} a matching module under ``pygments/lexers``, or create a new module for
+your lexer [-class.-] {+class.+}+}
+
+Next, make sure the lexer is known from outside of the module. All modules in
+the ``pygments.lexers`` specify ``__all__``. For example, [-``esoteric.py`` sets::-] {+[-``other.py`` sets:
+
+.. sourcecode:: python-] {+``esoteric.py`` sets::+}+}
+
+ __all__ = ['BrainfuckLexer', 'BefungeLexer', ...]
+
+Simply add the name of your lexer class to this list.
+
+Finally the lexer can be made [-publicly-] {+[-publically-] {+publicly+}+} known by rebuilding the lexer mapping:
+
+.. [-code-block::-] {+[-sourcecode::-] {+code-block::+}+} console
+
+ $ make mapfiles
+
+To test the new lexer, store an example file with the proper extension in
+``tests/examplefiles``. For example, to test your ``DiffLexer``, add a
+``tests/examplefiles/example.diff`` containing a sample diff output.
+
+Now you can use pygmentize to render your example to HTML:
+
+.. [-code-block::-] {+[-sourcecode::-] {+code-block::+}+} console
+
+ $ ./pygmentize -O full -f html -o /tmp/example.html tests/examplefiles/example.diff
+
+Note that this [-explicitly-] {+[-explicitely-] {+explicitly+}+} calls the ``pygmentize`` in the current directory
+by preceding it with ``./``. This ensures your modifications are used.
+Otherwise a possibly already installed, unmodified version without your new
+lexer would have been called from the system search path (``$PATH``).
+
+To view the result, open ``/tmp/example.html`` in your browser.
+
+Once the example renders as expected, you should run the complete test suite:
+
+.. [-code-block::-] {+[-sourcecode::-] {+code-block::+}+} console
+
+ $ make test
+
+[-It-]
+
+{+{+It+} also tests that your lexer fulfills the lexer API and certain invariants,
+such as that the concatenation of all token text is the same as the input [-text.-] {+text.+}+}
+
+
+Regex Flags
+===========
+
+You can either define regex flags [-locally-] {+{+locally+}+} in the regex (``r'(?x)foo bar'``) or
+[-globally-]
+{+{+globally+}+} by adding a `flags` attribute to your lexer class. If no attribute is
+defined, it defaults to `re.MULTILINE`. For more [-information-] {+[-informations-] {+information+}+} about regular
+expression flags see the [-page about-] {+{+page about+}+} `regular expressions`_ {+[-help page-]+} in the [-Python-] {+[-python-] {+Python+}+}
+documentation.
+
+.. _regular expressions: [-http://docs.python.org/library/re.html#regular-expression-syntax-] {+[-http://docs.python.org/lib/re-syntax.html-] {+http://docs.python.org/library/re.html#regular-expression-syntax+}+}
+
+
+Scanning multiple tokens at once
+================================
+
+[-So-]
+
+{+{+So+} far, the `action` element in the rule tuple of regex, action and state has
+been a single token type. Now we look at the first of several other possible
+[-values.-]
+{+values.+}+}
+
+Here is a more complex lexer that highlights INI files. INI files consist of
+sections, comments and [-``key-] {+[-key-] {+``key+}+} = [-value`` pairs::-] {+[-value pairs:
+
+.. sourcecode:: python-] {+value`` pairs::+}+}
+
+ from pygments.lexer import RegexLexer, bygroups
+ from pygments.token import *
+
+ class IniLexer(RegexLexer):
+ name = 'INI'
+ aliases = ['ini', 'cfg']
+ filenames = ['*.ini', '*.cfg']
+
+ tokens = {
+ 'root': [
+ (r'\s+', Text),
+ (r';.*?$', Comment),
+ (r'\[.*?\]$', Keyword),
+ (r'(.*?)(\s*)(=)(\s*)(.*?)$',
+ bygroups(Name.Attribute, Text, Operator, Text, String))
+ ]
+ }
+
+The lexer first looks for whitespace, comments and section names. [-Later-] {+[-And later-] {+Later+}+} it
+looks for a line that looks like a key, value pair, separated by an ``'='``
+sign, and optional whitespace.
+
+The `bygroups` helper [-yields-] {+[-makes sure that-] {+yields+}+} each [-capturing-] {+{+capturing+}+} group [-in-] {+[-is yielded-] {+in+} the [-regex-] {+regex+}+} with a different
+token type. First the `Name.Attribute` token, then a `Text` token for the
+optional whitespace, after that a `Operator` token for the equals sign. Then a
+`Text` token for the whitespace again. The rest of the line is returned as
+`String`.
+
+Note that for this to work, every part of the match must be inside a capturing
+group (a ``(...)``), and there must not be any nested capturing groups. If you
+nevertheless need a group, use a non-capturing group defined using this syntax:
+[-``(?:some|words|here)``-]
+{+[-``r'(?:some|words|here)'``-]
+{+``(?:some|words|here)``+}+} (note the ``?:`` after the beginning parenthesis).
+
+If you find yourself needing a capturing group inside the regex which shouldn't
+be part of the output but is used in the regular expressions for backreferencing
+(eg: ``r'(<(foo|bar)>)(.*?)(</\2>)'``), you can pass `None` to the bygroups
+function and {+[-it will skip-]+} that group will be skipped in the output.
+
+
+Changing states
+===============
+
+Many lexers need multiple states to work as expected. For example, some
+languages allow multiline comments to be nested. Since this is a recursive
+pattern it's impossible to lex just using regular expressions.
+
+Here is [-a-] {+[-the solution:
+
+.. sourcecode:: python-] {+a+} lexer that recognizes C++ style comments (multi-line with ``/* */``
+and single-line with ``//`` until end of [-line)::-] {+line)::+}+}
+
+ from pygments.lexer import RegexLexer
+ from pygments.token import *
+
+ class [-CppCommentLexer(RegexLexer):-] {+[-ExampleLexer(RegexLexer):-] {+CppCommentLexer(RegexLexer):+}+}
+ name = 'Example Lexer with states'
+
+ tokens = {
+ 'root': [
+ (r'[^/]+', Text),
+ (r'/\*', Comment.Multiline, 'comment'),
+ (r'//.*?$', Comment.Singleline),
+ (r'/', Text)
+ ],
+ 'comment': [
+ (r'[^*/]', Comment.Multiline),
+ (r'/\*', Comment.Multiline, '#push'),
+ (r'\*/', Comment.Multiline, '#pop'),
+ (r'[*/]', Comment.Multiline)
+ ]
+ }
+
+This lexer starts lexing in the ``'root'`` state. It tries to match as much as
+possible until it finds a slash (``'/'``). If the next character after the slash
+is [-an asterisk-] {+[-a star-] {+an asterisk+}+} (``'*'``) the `RegexLexer` sends those two characters to the
+output stream marked as `Comment.Multiline` and continues [-lexing-] {+[-parsing-] {+lexing+}+} with the rules
+defined in the ``'comment'`` state.
+
+If there wasn't [-an asterisk-] {+[-a star-] {+an asterisk+}+} after the slash, the `RegexLexer` checks if it's a
+[-Singleline-]
+{+[-singleline-]
+{+Singleline+}+} comment [-(i.e.-] {+[-(eg:-] {+(i.e.+}+} followed by a second slash). If this also wasn't the
+case it must be a single [-slash,-] {+[-slash-] {+slash,+} which is not a comment [-starter-] {+starter+}+} (the separate
+regex for a single slash must also be given, else the slash would be marked as
+an error token).
+
+Inside the ``'comment'`` state, we do the same thing again. Scan until the
+lexer finds a star or slash. If it's the opening of a multiline comment, push
+the ``'comment'`` state on the stack and continue scanning, again in the
+``'comment'`` state. Else, check if it's the end of the multiline comment. If
+yes, pop one state from the stack.
+
+Note: If you pop from an empty stack you'll get an `IndexError`. (There is an
+easy way to prevent this from happening: don't ``'#pop'`` in the root state).
+
+If the `RegexLexer` encounters a newline that is flagged as an error token, the
+stack is emptied and the lexer continues scanning in the ``'root'`` state. This
+[-can help-]
+{+[-helps-]
+{+can help+}+} producing error-tolerant highlighting for erroneous input, e.g. when a
+single-line string is not closed.
+
+
+Advanced state tricks
+=====================
+
+There are a few more things you can do with states:
+
+- You can push multiple states onto the stack if you give a tuple instead of a
+ simple string as the third item in a rule tuple. For example, if you want to
+ match a comment containing a directive, something [-like:-] {+[-like::-] {+like:+}
+
+ .. code-block:: [-text-] {+text+}+}
+
+ /* <processing directive> rest of comment */
+
+ you can use this [-rule::-] {+[-rule:
+
+ .. sourcecode:: python-] {+rule::+}+}
+
+ tokens = {
+ 'root': [
+ (r'/\* <', Comment, ('comment', 'directive')),
+ ...
+ ],
+ 'directive': [
+ (r'[^>]*', Comment.Directive),
+ (r'>', Comment, '#pop'),
+ ],
+ 'comment': [
+ (r'[^*]+', Comment),
+ (r'\*/', Comment, '#pop'),
+ (r'\*', Comment),
+ ]
+ }
+
+ When this encounters the above sample, first ``'comment'`` and ``'directive'``
+ are pushed onto the stack, then the lexer continues in the directive state
+ until it finds the closing ``>``, then it continues in the comment state until
+ the closing ``*/``. Then, both states are popped from the stack again and
+ lexing continues in the root state.
+
+ .. versionadded:: 0.9
+ The tuple can contain the special ``'#push'`` and ``'#pop'`` (but not
+ ``'#pop:n'``) directives.
+
+
+- You can include the rules of a state in the definition of another. This is
+ done by using `include` from [-`pygments.lexer`::-] {+[-`pygments.lexer`:
+
+ .. sourcecode:: python-] {+`pygments.lexer`::+}+}
+
+ from pygments.lexer import RegexLexer, bygroups, include
+ from pygments.token import *
+
+ class ExampleLexer(RegexLexer):
+ tokens = {
+ 'comments': [
+ (r'/\*.*?\*/', Comment),
+ (r'//.*?\n', Comment),
+ ],
+ 'root': [
+ include('comments'),
+ (r'(function )(\w+)( {)',
+ bygroups(Keyword, Name, Keyword), 'function'),
+ (r'.', Text),
+ ],
+ 'function': [
+ (r'[^}/]+', Text),
+ include('comments'),
+ (r'/', Text),
+ [-(r'\}',-]
+ {+[-(r'}',-]
+ {+(r'\}',+}+} Keyword, '#pop'),
+ ]
+ }
+
+ This is a hypothetical lexer for a language that consist of functions and
+ comments. Because comments can occur at toplevel and in functions, we need
+ rules for comments in both states. As you can see, the `include` helper saves
+ repeating rules that occur more than once (in this example, the state
+ ``'comment'`` will never be entered by the lexer, as it's only there to be
+ included in ``'root'`` and ``'function'``).
+
+- Sometimes, you may want to "combine" a state from existing ones. This is
+ possible with the [-`combined`-] {+[-`combine`-] {+`combined`+}+} helper from `pygments.lexer`.
+
+ If you, instead of a new state, write ``combined('state1', 'state2')`` as the
+ third item of a rule tuple, a new anonymous state will be formed from state1
+ and state2 and if the rule matches, the lexer will enter this state.
+
+ This is not used very often, but can be helpful in some cases, such as the
+ `PythonLexer`'s string literal processing.
+
+- If you want your lexer to start lexing in a different state you can modify the
+ stack by [-overriding-] {+[-overloading-] {+overriding+}+} the `get_tokens_unprocessed()` [-method::-] {+[-method:
+
+ .. sourcecode:: python-] {+method::+}+}
+
+ from pygments.lexer import RegexLexer
+
+ class [-ExampleLexer(RegexLexer):-] {+[-MyLexer(RegexLexer):-] {+ExampleLexer(RegexLexer):+}+}
+ tokens = {...}
+
+ def get_tokens_unprocessed(self, [-text,-] {+[-text):
+ stack = ['root', 'otherstate']-] {+text,+} stack=('root', [-'otherstate')):-] {+'otherstate')):+}+}
+ for item in RegexLexer.get_tokens_unprocessed(text, stack):
+ yield item
+
+ Some lexers like the `PhpLexer` use this to make the leading ``<?php``
+ preprocessor comments optional. Note that you can crash the lexer easily by
+ putting values into the stack that don't exist in the token map. Also
+ removing ``'root'`` from the stack can result in strange errors!
+
+- [-In-] {+[-An-] {+In+} some lexers, a state should be popped if anything is encountered that isn't
+ matched by a rule in the state. You could use [-an-] {+an+}+} empty regex at the end of
+ [-the-] {+[-a-]
+ {+the+}+} state list, [-but-] {+[-combined with ``'#pop'``, can
+ act as-] {+but+} Pygments [-provides-] {+provides+}+} a [-more-] {+[-return point-] {+more+} obvious way of spelling that:
+ ``default('#pop')`` is equivalent to ``('', Text, '#pop')``.
+
+ .. versionadded:: 2.0
+
+
+Subclassing lexers [-derived-] {+derived+}+} from [-RegexLexer-] {+{+RegexLexer+}
+==========================================
+
+.. versionadded:: 1.6
+
+Sometimes multiple languages are very similar, but should still be lexed by
+different lexer classes.
+
+When [-subclassing-] {+subclassing+}+} a [-lexer-] {+{+lexer+} derived from RegexLexer, the ``tokens`` dictionaries
+defined in the parent and child class are merged. For example::
+
+ from pygments.lexer import RegexLexer, inherit
+ from pygments.token import *
+
+ class BaseLexer(RegexLexer):
+ tokens = {
+ 'root': [
+ ('[a-z]+', Name),
+ (r'/\*', Comment, 'comment'),
+ ('"', String, 'string'),
+ ('\s+', Text),
+ ],
+ 'string': [
+ ('[^"]+', String),
+ ('"', String, '#pop'),
+ ],
+ 'comment': [
+ ...
+ ],
+ }
+
+ class DerivedLexer(BaseLexer):
+ tokens = {
+ 'root': [
+ ('[0-9]+', Number),
+ inherit,
+ ],
+ 'string': [
+ (r'[^"\\]+', String),
+ (r'\\.', String.Escape),
+ ('"', String, '#pop'),
+ ],
+ }
+
+The `BaseLexer` defines two states, lexing names and strings. The
+`DerivedLexer` defines its own tokens dictionary, which extends the definitions
+of the base lexer:
+
+* The [-"root"-] {+"root"+}+} state [-has-] {+{+has+} an additional rule and then the special object `inherit`,
+ which tells Pygments to insert the token definitions of the parent class [-at-] {+at+}+}
+ that [-point.-] {+[-doesn't have a clear end marker.-] {+point.+}
+
+* The "string" state is replaced entirely, since there is not `inherit` rule.
+
+* The "comment" state is inherited [-entirely.-] {+entirely.+}+}
+
+
+Using multiple lexers
+=====================
+
+Using multiple lexers for the same input can be tricky. One of the easiest
+combination techniques is shown here: You can replace the [-action-] {+[-token type-] {+action+}+} entry in a rule
+tuple {+[-(the second item)-]+} with a lexer class. The matched text will then be lexed with that lexer,
+and the resulting tokens will be yielded.
+
+For example, look at this stripped-down HTML [-lexer::-] {+[-lexer:
+
+.. sourcecode:: python-] {+lexer::+}+}
+
+ from pygments.lexer import RegexLexer, bygroups, using
+ from pygments.token import *
+ from [-pygments.lexers.javascript-] {+[-pygments.lexers.web-] {+pygments.lexers.javascript+}+} import JavascriptLexer
+
+ class HtmlLexer(RegexLexer):
+ name = 'HTML'
+ aliases = ['html']
+ filenames = ['*.html', '*.htm']
+
+ flags = re.IGNORECASE | re.DOTALL
+ tokens = {
+ 'root': [
+ ('[^<&]+', Text),
+ ('&.*?;', Name.Entity),
+ (r'<\s*script\s*', Name.Tag, ('script-content', 'tag')),
+ (r'<\s*[a-zA-Z0-9:]+', Name.Tag, 'tag'),
+ (r'<\s*/\s*[a-zA-Z0-9:]+\s*>', Name.Tag),
+ ],
+ 'script-content': [
+ (r'(.+?)(<\s*/\s*script\s*>)',
+ bygroups(using(JavascriptLexer), Name.Tag),
+ '#pop'),
+ ]
+ }
+
+Here the content of a ``<script>`` tag is passed to a newly created instance of
+a `JavascriptLexer` and not processed by the `HtmlLexer`. This is done using
+the `using` helper that takes the other lexer class as its parameter.
+
+Note the combination of `bygroups` and `using`. This makes sure that the
+content up to the ``</script>`` end tag is processed by the `JavascriptLexer`,
+while the end tag is yielded as a normal token with the `Name.Tag` type.
+
+{+[-As an additional goodie, if the lexer class is replaced by `this` (imported from
+`pygments.lexer`), the "other" lexer will be the current one (because you cannot
+refer to the current class within the code that runs at class definition time).-]+}
+
+Also note the ``(r'<\s*script\s*', Name.Tag, ('script-content', 'tag'))`` rule.
+Here, two states are pushed onto the state stack, ``'script-content'`` and
+``'tag'``. That means that first ``'tag'`` is processed, which will [-lex-] {+[-parse-] {+lex+}+}
+attributes and the closing ``>``, then the ``'tag'`` state is popped and the
+next state on top of the stack will be ``'script-content'``.
+
+[-Since-]
+
+{+{+Since+} you cannot refer to the class currently being defined, use `this`
+(imported from `pygments.lexer`) to refer to the current lexer class, i.e.
+``using(this)``. This construct may seem unnecessary, but this is often the
+most obvious way of lexing arbitrary syntax between fixed delimiters without
+introducing deeply nested [-states.-] {+states.+}+}
+
+The `using()` helper has a special keyword argument, `state`, which works as
+follows: if given, the lexer to use initially is not in the ``"root"`` state,
+but in the state given by this argument. This [-does-] {+[-*only* works-] {+does+} not [-work-] {+work+}+} with [-advanced-] {+[-a `RegexLexer`.-] {+advanced+}
+`RegexLexer` subclasses such as `ExtendedRegexLexer` (see [-below).-] {+below).+}+}
+
+Any other keywords arguments passed to `using()` are added to the keyword
+arguments used to create the lexer.
+
+
+Delegating Lexer
+================
+
+Another approach for nested lexers is the `DelegatingLexer` which is for example
+used for the template engine lexers. It takes two lexers as arguments on
+initialisation: a `root_lexer` and a `language_lexer`.
+
+The input is processed as follows: First, the whole text is lexed with the
+`language_lexer`. All tokens yielded with [-the special-] {+[-a-] {+the special+}+} type of ``Other`` are
+then concatenated and given to the `root_lexer`. The language tokens of the
+`language_lexer` are then inserted into the `root_lexer`'s token stream at the
+appropriate positions. [-::-]
+
+{+[-.. sourcecode:: python-] {+::+}+}
+
+ from pygments.lexer import DelegatingLexer
+ from pygments.lexers.web import HtmlLexer, PhpLexer
+
+ class HtmlPhpLexer(DelegatingLexer):
+ def __init__(self, **options):
+ super(HtmlPhpLexer, self).__init__(HtmlLexer, PhpLexer, **options)
+
+This procedure ensures that e.g. HTML with template tags in it is highlighted
+correctly even if the template tags are put into HTML tags or attributes.
+
+If you want to change the needle token ``Other`` to something else, you can give
+the lexer another token type as the third [-parameter::-] {+[-parameter:
+
+.. sourcecode:: python-] {+parameter::+}+}
+
+ DelegatingLexer.__init__(MyLexer, OtherLexer, Text, **options)
+
+
+Callbacks
+=========
+
+Sometimes the grammar of a language is so complex that a lexer would be unable
+to [-process-] {+[-parse-] {+process+}+} it just by using regular expressions and stacks.
+
+For this, the `RegexLexer` allows callbacks to be given in rule tuples, instead
+of token types (`bygroups` and `using` are nothing else but preimplemented
+callbacks). The callback must be a function taking two arguments:
+
+* the lexer itself
+* the match object for the last matched rule
+
+The callback must then return an iterable of (or simply yield) ``(index,
+tokentype, value)`` tuples, which are then just passed through by
+`get_tokens_unprocessed()`. The ``index`` here is the position of the token in
+the input string, ``tokentype`` is the normal token type (like `Name.Builtin`),
+and ``value`` the associated part of the input string.
+
+You can see an example [-here::-] {+[-here:
+
+.. sourcecode:: python-] {+here::+}+}
+
+ from pygments.lexer import RegexLexer
+ from pygments.token import Generic
+
+ class HypotheticLexer(RegexLexer):
+
+ def headline_callback(lexer, match):
+ equal_signs = match.group(1)
+ text = match.group(2)
+ yield match.start(), Generic.Headline, equal_signs + text + equal_signs
+
+ tokens = {
+ 'root': [
+ (r'(=+)(.*?)(\1)', headline_callback)
+ ]
+ }
+
+If the regex for the `headline_callback` matches, the function is called with
+the match object. Note that after the callback is done, processing continues
+normally, that is, after the end of the previous match. The callback has no
+possibility to influence the position.
+
+There are not really any simple examples for lexer callbacks, but you can see
+them in action e.g. in the [-`SMLLexer` class in `ml.py`_.-] {+[-`compiled.py`_ source code-] {+`SMLLexer` class+} in [-the `CLexer` and
+`JavaLexer` classes.-] {+`ml.py`_.+}+}
+
+.. [-_ml.py: http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/ml.py-] {+[-_compiled.py: http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/compiled.py-] {+_ml.py: http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/ml.py+}+}
+
+
+The ExtendedRegexLexer class
+============================
+
+The `RegexLexer`, even with callbacks, unfortunately isn't powerful enough for
+the funky syntax rules of {+[-some-]+} languages {+[-that will go unnamed,-]+} such as Ruby.
+
+But fear not; even then you don't have to abandon the regular expression
+[-approach:-]
+{+[-approach. For-]
+{+approach:+}+} Pygments has a subclass of `RegexLexer`, the `ExtendedRegexLexer`.
+All features known from RegexLexers are available here too, and the tokens are
+specified in exactly the same way, *except* for one detail:
+
+The `get_tokens_unprocessed()` method holds its internal state data not as local
+variables, but in an instance of the `pygments.lexer.LexerContext` class, and
+that instance is passed to callbacks as a third argument. This means that you
+can modify the lexer state in callbacks.
+
+The `LexerContext` class has the following members:
+
+* `text` -- the input text
+* `pos` -- the current starting position that is used for matching regexes
+* `stack` -- a list containing the state stack
+* `end` -- the maximum position to which regexes are matched, this defaults to
+ the length of `text`
+
+Additionally, the `get_tokens_unprocessed()` method can be given a
+`LexerContext` instead of a string and will then process this context instead of
+creating a new one for the string argument.
+
+Note that because you can set the current position to anything in the callback,
+it won't be automatically be set by the caller after the callback is finished.
+For example, this is how the hypothetical lexer above would be written with the
+[-`ExtendedRegexLexer`::-]
+{+[-`ExtendedRegexLexer`:
+
+.. sourcecode:: python-]
+{+`ExtendedRegexLexer`::+}+}
+
+ from pygments.lexer import ExtendedRegexLexer
+ from pygments.token import Generic
+
+ class ExHypotheticLexer(ExtendedRegexLexer):
+
+ def headline_callback(lexer, match, ctx):
+ equal_signs = match.group(1)
+ text = match.group(2)
+ yield match.start(), Generic.Headline, equal_signs + text + equal_signs
+ ctx.pos = match.end()
+
+ tokens = {
+ 'root': [
+ (r'(=+)(.*?)(\1)', headline_callback)
+ ]
+ }
+
+This might sound confusing (and it can really be). But it is needed, and for an
+example look at the Ruby lexer in [-`ruby.py`_.-] {+[-`agile.py`_.-] {+`ruby.py`_.+}+}
+
+.. [-_ruby.py:-] {+[-_agile.py: https://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/agile.py
+
+
+Filtering-] {+_ruby.py:+} https://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/ruby.py
+
+
+Handling Lists of Keywords
+==========================
+
+For a relatively short list (hundreds) you can construct an optimized regular
+expression directly using ``words()`` (longer lists, see next section). This
+function handles a few things for you automatically, including escaping
+metacharacters and Python's first-match rather than longest-match in
+alternations. Feel free to put the lists themselves in
+``pygments/lexers/_$lang_builtins.py`` (see examples there), and generated by
+code if possible.
+
+An example of using ``words()`` is something like::
+
+ from pygments.lexer import RegexLexer, words, Name
+
+ class MyLexer(RegexLexer):
+
+ tokens = {
+ 'root': [
+ (words(('else', 'elseif'), suffix=r'\b'), Name.Builtin),
+ (r'\w+', Name),
+ ],
+ }
+
+As you can see, you can add ``prefix`` and ``suffix`` parts to the constructed
+regex.
+
+
+[-Modifying-]
+
+
+{+Modifying+}+} Token Streams
+=======================
+
+Some languages ship a lot of builtin functions (for example PHP). The total
+amount of those functions differs from system to system because not everybody
+has every extension installed. In the case of PHP there are over 3000 builtin
+functions. That's an [-incredibly-] {+[-incredible-] {+incredibly+}+} huge amount of functions, much more than you
+[-want to-]
+{+[-can-]
+{+want to+}+} put into a regular expression.
+
+But because only `Name` tokens can be function names [-this is-] {+[-it's-] {+this is+}+} solvable by
+overriding the ``get_tokens_unprocessed()`` method. The following lexer
+subclasses the `PythonLexer` so that it highlights some additional names as
+pseudo [-keywords::-] {+[-keywords:
+
+.. sourcecode:: python-] {+keywords::+}+}
+
+ from [-pygments.lexers.python-] {+[-pygments.lexers.agile-] {+pygments.lexers.python+}+} import PythonLexer
+ from pygments.token import Name, Keyword
+
+ class MyPythonLexer(PythonLexer):
+ EXTRA_KEYWORDS = [-set(('foo',-] {+[-['foo',-] {+set(('foo',+}+} 'bar', 'foobar', 'barfoo', 'spam', [-'eggs'))-] {+[-'eggs']-] {+'eggs'))+}+}
+
+ def get_tokens_unprocessed(self, text):
+ for index, token, value in PythonLexer.get_tokens_unprocessed(self, text):
+ if token is Name and value in self.EXTRA_KEYWORDS:
+ yield index, Keyword.Pseudo, value
+ else:
+ yield index, token, value
+
+The `PhpLexer` and `LuaLexer` use this method to resolve builtin functions.
+
+{+[-.. note:: Do not confuse this with the :doc:`filter <filters>` system.-]+}
diff --git a/tests/examplefiles/wdiff_example3.wdiff b/tests/examplefiles/wdiff_example3.wdiff
new file mode 100644
index 00000000..0bbd6d65
--- /dev/null
+++ b/tests/examplefiles/wdiff_example3.wdiff
@@ -0,0 +1,10 @@
+This example is unbalanced open-close.
+We can't treat these easily.
+
+{+ added? -]
+[- deleted? +}
+
+suddenly closed -]
+suddenly closed +}
+
+{+ added? [- deleted?