From b9b77fbceb32da2b4973725a4bda356535429424 Mon Sep 17 00:00:00 2001 From: hhsprings Date: Wed, 4 Nov 2015 17:43:59 +0900 Subject: Add the lexer for `wdiff `_ output. (issue `#960 `_) --- pygments/lexers/_mapping.py | 1 + pygments/lexers/diff.py | 57 ++- tests/examplefiles/wdiff_example1.wdiff | 731 ++++++++++++++++++++++++++++++ tests/examplefiles/wdiff_example2.wdiff | 758 ++++++++++++++++++++++++++++++++ tests/examplefiles/wdiff_example3.wdiff | 10 + 5 files changed, 1556 insertions(+), 1 deletion(-) create mode 100644 tests/examplefiles/wdiff_example1.wdiff create mode 100644 tests/examplefiles/wdiff_example2.wdiff create mode 100644 tests/examplefiles/wdiff_example3.wdiff diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py index af7eec36..28c02cff 100644 --- a/pygments/lexers/_mapping.py +++ b/pygments/lexers/_mapping.py @@ -396,6 +396,7 @@ LEXERS = { 'VerilogLexer': ('pygments.lexers.hdl', 'verilog', ('verilog', 'v'), ('*.v',), ('text/x-verilog',)), 'VhdlLexer': ('pygments.lexers.hdl', 'vhdl', ('vhdl',), ('*.vhdl', '*.vhd'), ('text/x-vhdl',)), 'VimLexer': ('pygments.lexers.textedit', 'VimL', ('vim',), ('*.vim', '.vimrc', '.exrc', '.gvimrc', '_vimrc', '_exrc', '_gvimrc', 'vimrc', 'gvimrc'), ('text/x-vim',)), + 'WDiffLexer': ('pygments.lexers.diff', 'WDiff', ('wdiff',), ('*.wdiff',), ()), 'X10Lexer': ('pygments.lexers.x10', 'X10', ('x10', 'xten'), ('*.x10',), ('text/x-x10',)), 'XQueryLexer': ('pygments.lexers.webmisc', 'XQuery', ('xquery', 'xqy', 'xq', 'xql', 'xqm'), ('*.xqy', '*.xquery', '*.xq', '*.xql', '*.xqm'), ('text/xquery', 'application/xquery')), 'XmlDjangoLexer': ('pygments.lexers.templates', 'XML+Django/Jinja', ('xml+django', 'xml+jinja'), (), ('application/xml+django', 'application/xml+jinja')), diff --git a/pygments/lexers/diff.py b/pygments/lexers/diff.py index d3b1589d..9efb100b 100644 --- a/pygments/lexers/diff.py +++ b/pygments/lexers/diff.py @@ -9,11 +9,13 @@ :license: BSD, see LICENSE for details. """ +import re + from pygments.lexer import RegexLexer, include, bygroups from pygments.token import Text, Comment, Operator, Keyword, Name, Generic, \ Literal -__all__ = ['DiffLexer', 'DarcsPatchLexer'] +__all__ = ['DiffLexer', 'DarcsPatchLexer', 'WDiffLexer'] class DiffLexer(RegexLexer): @@ -104,3 +106,56 @@ class DarcsPatchLexer(RegexLexer): (r'[^\n\[]+', Generic.Deleted), ], } + + +class WDiffLexer(RegexLexer): + """ + A `wdiff `_ lexer. + + Note that: + + * It only covers wdiff's normal output (without options such as -l). + * If the files given to wdiff contain "[-", "-]", "{+", or "+}", + especially when they are unbalanced, this lexer will get confused. + + .. versionadded:: 2.1 + """ + + name = 'WDiff' + aliases = ['wdiff',] + filenames = ['*.wdiff',] + mimetypes = [] + + flags = re.MULTILINE | re.DOTALL + + # We can only assume that a "[-" appearing after "[-" but before "-]" is + # `nested`, for instance in wdiff-of-wdiff output. We have no way to + # distinguish such markers of the wdiff output from the original text.
+ + ins_op = r"\{\+" + ins_cl = r"\+\}" + del_op = r"\[\-" + del_cl = r"\-\]" + tokens = { + 'root': [ + (ins_op, Generic.Inserted, 'inserted'), + (del_op, Generic.Deleted, 'deleted'), + (r'.', Text), + ], + 'inserted': [ + (ins_op, Generic.Inserted, '#push'), + (del_op, Generic.Inserted, '#push'), + (del_cl, Generic.Inserted, '#pop'), + + (ins_cl, Generic.Inserted, '#pop'), + (r'.', Generic.Inserted), + ], + 'deleted': [ + (del_op, Generic.Deleted, '#push'), + (ins_op, Generic.Deleted, '#push'), + (ins_cl, Generic.Deleted, '#pop'), + + (del_cl, Generic.Deleted, '#pop'), + (r'.', Generic.Deleted), + ], + } diff --git a/tests/examplefiles/wdiff_example1.wdiff b/tests/examplefiles/wdiff_example1.wdiff new file mode 100644 index 00000000..ca760812 --- /dev/null +++ b/tests/examplefiles/wdiff_example1.wdiff @@ -0,0 +1,731 @@ +.. -*- mode: rst -*- + +{+.. highlight:: python+} + +==================== +Write your own lexer +==================== + +If a lexer for your favorite language is missing in the Pygments package, you +can easily write your own and extend Pygments. + +All you need can be found inside the :mod:`pygments.lexer` module. As you can +read in the :doc:`API documentation `, a lexer is a class that is +initialized with some keyword arguments (the lexer options) and that provides a +:meth:`.get_tokens_unprocessed()` method which is given a string or unicode +object with the data to [-parse.-] {+lex.+} + +The :meth:`.get_tokens_unprocessed()` method must return an iterator or iterable +containing tuples in the form ``(index, token, value)``. Normally you don't +need to do this since there are [-numerous-] base lexers {+that do most of the work and that+} +you can subclass. + + +RegexLexer +========== + +[-A very powerful (but quite easy to use)-] + +{+The+} lexer {+base class used by almost all of Pygments' lexers+} is the +:class:`RegexLexer`. This +[-lexer base-] class allows you to define lexing rules in terms of +*regular expressions* for different *states*. + +States are groups of regular expressions that are matched against the input +string at the *current position*. If one of these expressions matches, a +corresponding action is performed [-(normally-] {+(such as+} yielding a token with a specific +[-type),-] +{+type, or changing state),+} the current position is set to where the last match +ended and the matching process continues with the first regex of the current +state. + +Lexer states are kept [-in-] {+on+} a [-state-] stack: each time a new state is entered, the new +state is pushed onto the stack. The most basic lexers (like the `DiffLexer`) +just need one state. + +Each state is defined as a list of tuples in the form (`regex`, `action`, +`new_state`) where the last item is optional. In the most basic form, `action` +is a token type (like `Name.Builtin`). That means: When `regex` matches, emit a +token with the match text and type `tokentype` and push `new_state` on the state +stack. If the new state is ``'#pop'``, the topmost state is popped from the +stack instead. [-(To-] {+To+} pop more than one state, use ``'#pop:2'`` and so [-on.)-] {+on.+} +``'#push'`` is a synonym for pushing the current state on the stack. + +The following example shows the `DiffLexer` from the builtin lexers. Note that +it contains some additional attributes `name`, `aliases` and `filenames` which +aren't required for a lexer. They are used by the builtin lexer lookup +functions. + +[-.. 
sourcecode:: python-] {+::+} + + from pygments.lexer import RegexLexer + from pygments.token import * + + class DiffLexer(RegexLexer): + name = 'Diff' + aliases = ['diff'] + filenames = ['*.diff'] + + tokens = { + 'root': [ + (r' .*\n', Text), + (r'\+.*\n', Generic.Inserted), + (r'-.*\n', Generic.Deleted), + (r'@.*\n', Generic.Subheading), + (r'Index.*\n', Generic.Heading), + (r'=.*\n', Generic.Heading), + (r'.*\n', Text), + ] + } + +As you can see this lexer only uses one state. When the lexer starts scanning +the text, it first checks if the current character is a space. If this is true +it scans everything until newline and returns the [-parsed-] data as {+a+} `Text` [-token.-] {+token (which +is the "no special highlighting" token).+} + +If this rule doesn't match, it checks if the current char is a plus sign. And +so on. + +If no rule matches at the current position, the current char is emitted as an +`Error` token that indicates a [-parsing-] {+lexing+} error, and the position is increased by +[-1.-] +{+one.+} + + +Adding and testing a new lexer +============================== + +To make [-pygments-] {+Pygments+} aware of your new lexer, you have to perform the following +steps: + +First, change to the current directory containing the [-pygments-] {+Pygments+} source code: + +.. [-sourcecode::-] {+code-block::+} console + + $ cd .../pygments-main + +{+Select a matching module under ``pygments/lexers``, or create a new module for +your lexer class.+} + +Next, make sure the lexer is known from outside of the module. All modules in +the ``pygments.lexers`` specify ``__all__``. For example, [-``other.py`` sets: + +.. sourcecode:: python-] {+``esoteric.py`` sets::+} + + __all__ = ['BrainfuckLexer', 'BefungeLexer', ...] + +Simply add the name of your lexer class to this list. + +Finally the lexer can be made [-publically-] {+publicly+} known by rebuilding the lexer mapping: + +.. [-sourcecode::-] {+code-block::+} console + + $ make mapfiles + +To test the new lexer, store an example file with the proper extension in +``tests/examplefiles``. For example, to test your ``DiffLexer``, add a +``tests/examplefiles/example.diff`` containing a sample diff output. + +Now you can use pygmentize to render your example to HTML: + +.. [-sourcecode::-] {+code-block::+} console + + $ ./pygmentize -O full -f html -o /tmp/example.html tests/examplefiles/example.diff + +Note that this [-explicitely-] {+explicitly+} calls the ``pygmentize`` in the current directory +by preceding it with ``./``. This ensures your modifications are used. +Otherwise a possibly already installed, unmodified version without your new +lexer would have been called from the system search path (``$PATH``). + +To view the result, open ``/tmp/example.html`` in your browser. + +Once the example renders as expected, you should run the complete test suite: + +.. [-sourcecode::-] {+code-block::+} console + + $ make test + +{+It also tests that your lexer fulfills the lexer API and certain invariants, +such as that the concatenation of all token text is the same as the input text.+} + + +Regex Flags +=========== + +You can either define regex flags {+locally+} in the regex (``r'(?x)foo bar'``) or +{+globally+} by adding a `flags` attribute to your lexer class. If no attribute is +defined, it defaults to `re.MULTILINE`. For more [-informations-] {+information+} about regular +expression flags see the {+page about+} `regular expressions`_ [-help page-] in the [-python-] {+Python+} +documentation. + +.. 
_regular expressions: [-http://docs.python.org/lib/re-syntax.html-] {+http://docs.python.org/library/re.html#regular-expression-syntax+} + + +Scanning multiple tokens at once +================================ + +{+So far, the `action` element in the rule tuple of regex, action and state has +been a single token type. Now we look at the first of several other possible +values.+} + +Here is a more complex lexer that highlights INI files. INI files consist of +sections, comments and [-key-] {+``key+} = [-value pairs: + +.. sourcecode:: python-] {+value`` pairs::+} + + from pygments.lexer import RegexLexer, bygroups + from pygments.token import * + + class IniLexer(RegexLexer): + name = 'INI' + aliases = ['ini', 'cfg'] + filenames = ['*.ini', '*.cfg'] + + tokens = { + 'root': [ + (r'\s+', Text), + (r';.*?$', Comment), + (r'\[.*?\]$', Keyword), + (r'(.*?)(\s*)(=)(\s*)(.*?)$', + bygroups(Name.Attribute, Text, Operator, Text, String)) + ] + } + +The lexer first looks for whitespace, comments and section names. [-And later-] {+Later+} it +looks for a line that looks like a key, value pair, separated by an ``'='`` +sign, and optional whitespace. + +The `bygroups` helper [-makes sure that-] {+yields+} each {+capturing+} group [-is yielded-] {+in the regex+} with a different +token type. First the `Name.Attribute` token, then a `Text` token for the +optional whitespace, after that a `Operator` token for the equals sign. Then a +`Text` token for the whitespace again. The rest of the line is returned as +`String`. + +Note that for this to work, every part of the match must be inside a capturing +group (a ``(...)``), and there must not be any nested capturing groups. If you +nevertheless need a group, use a non-capturing group defined using this syntax: +[-``r'(?:some|words|here)'``-] +{+``(?:some|words|here)``+} (note the ``?:`` after the beginning parenthesis). + +If you find yourself needing a capturing group inside the regex which shouldn't +be part of the output but is used in the regular expressions for backreferencing +(eg: ``r'(<(foo|bar)>)(.*?)()'``), you can pass `None` to the bygroups +function and [-it will skip-] that group will be skipped in the output. + + +Changing states +=============== + +Many lexers need multiple states to work as expected. For example, some +languages allow multiline comments to be nested. Since this is a recursive +pattern it's impossible to lex just using regular expressions. + +Here is [-the solution: + +.. sourcecode:: python-] {+a lexer that recognizes C++ style comments (multi-line with ``/* */`` +and single-line with ``//`` until end of line)::+} + + from pygments.lexer import RegexLexer + from pygments.token import * + + class [-ExampleLexer(RegexLexer):-] {+CppCommentLexer(RegexLexer):+} + name = 'Example Lexer with states' + + tokens = { + 'root': [ + (r'[^/]+', Text), + (r'/\*', Comment.Multiline, 'comment'), + (r'//.*?$', Comment.Singleline), + (r'/', Text) + ], + 'comment': [ + (r'[^*/]', Comment.Multiline), + (r'/\*', Comment.Multiline, '#push'), + (r'\*/', Comment.Multiline, '#pop'), + (r'[*/]', Comment.Multiline) + ] + } + +This lexer starts lexing in the ``'root'`` state. It tries to match as much as +possible until it finds a slash (``'/'``). If the next character after the slash +is [-a star-] {+an asterisk+} (``'*'``) the `RegexLexer` sends those two characters to the +output stream marked as `Comment.Multiline` and continues [-parsing-] {+lexing+} with the rules +defined in the ``'comment'`` state. 
+ +If there wasn't [-a star-] {+an asterisk+} after the slash, the `RegexLexer` checks if it's a +[-singleline-] +{+Singleline+} comment [-(eg:-] {+(i.e.+} followed by a second slash). If this also wasn't the +case it must be a single [-slash-] {+slash, which is not a comment starter+} (the separate +regex for a single slash must also be given, else the slash would be marked as +an error token). + +Inside the ``'comment'`` state, we do the same thing again. Scan until the +lexer finds a star or slash. If it's the opening of a multiline comment, push +the ``'comment'`` state on the stack and continue scanning, again in the +``'comment'`` state. Else, check if it's the end of the multiline comment. If +yes, pop one state from the stack. + +Note: If you pop from an empty stack you'll get an `IndexError`. (There is an +easy way to prevent this from happening: don't ``'#pop'`` in the root state). + +If the `RegexLexer` encounters a newline that is flagged as an error token, the +stack is emptied and the lexer continues scanning in the ``'root'`` state. This +[-helps-] +{+can help+} producing error-tolerant highlighting for erroneous input, e.g. when a +single-line string is not closed. + + +Advanced state tricks +===================== + +There are a few more things you can do with states: + +- You can push multiple states onto the stack if you give a tuple instead of a + simple string as the third item in a rule tuple. For example, if you want to + match a comment containing a directive, something [-like::-] {+like: + + .. code-block:: text+} + + /* rest of comment */ + + you can use this [-rule: + + .. sourcecode:: python-] {+rule::+} + + tokens = { + 'root': [ + (r'/\* <', Comment, ('comment', 'directive')), + ... + ], + 'directive': [ + (r'[^>]*', Comment.Directive), + (r'>', Comment, '#pop'), + ], + 'comment': [ + (r'[^*]+', Comment), + (r'\*/', Comment, '#pop'), + (r'\*', Comment), + ] + } + + When this encounters the above sample, first ``'comment'`` and ``'directive'`` + are pushed onto the stack, then the lexer continues in the directive state + until it finds the closing ``>``, then it continues in the comment state until + the closing ``*/``. Then, both states are popped from the stack again and + lexing continues in the root state. + + .. versionadded:: 0.9 + The tuple can contain the special ``'#push'`` and ``'#pop'`` (but not + ``'#pop:n'``) directives. + + +- You can include the rules of a state in the definition of another. This is + done by using `include` from [-`pygments.lexer`: + + .. sourcecode:: python-] {+`pygments.lexer`::+} + + from pygments.lexer import RegexLexer, bygroups, include + from pygments.token import * + + class ExampleLexer(RegexLexer): + tokens = { + 'comments': [ + (r'/\*.*?\*/', Comment), + (r'//.*?\n', Comment), + ], + 'root': [ + include('comments'), + (r'(function )(\w+)( {)', + bygroups(Keyword, Name, Keyword), 'function'), + (r'.', Text), + ], + 'function': [ + (r'[^}/]+', Text), + include('comments'), + (r'/', Text), + [-(r'}',-] + {+(r'\}',+} Keyword, '#pop'), + ] + } + + This is a hypothetical lexer for a language that consist of functions and + comments. Because comments can occur at toplevel and in functions, we need + rules for comments in both states. As you can see, the `include` helper saves + repeating rules that occur more than once (in this example, the state + ``'comment'`` will never be entered by the lexer, as it's only there to be + included in ``'root'`` and ``'function'``). 
+ +- Sometimes, you may want to "combine" a state from existing ones. This is + possible with the [-`combine`-] {+`combined`+} helper from `pygments.lexer`. + + If you, instead of a new state, write ``combined('state1', 'state2')`` as the + third item of a rule tuple, a new anonymous state will be formed from state1 + and state2 and if the rule matches, the lexer will enter this state. + + This is not used very often, but can be helpful in some cases, such as the + `PythonLexer`'s string literal processing. + +- If you want your lexer to start lexing in a different state you can modify the + stack by [-overloading-] {+overriding+} the `get_tokens_unprocessed()` [-method: + + .. sourcecode:: python-] {+method::+} + + from pygments.lexer import RegexLexer + + class [-MyLexer(RegexLexer):-] {+ExampleLexer(RegexLexer):+} + tokens = {...} + + def get_tokens_unprocessed(self, [-text): + stack = ['root', 'otherstate']-] {+text, stack=('root', 'otherstate')):+} + for item in RegexLexer.get_tokens_unprocessed(text, stack): + yield item + + Some lexers like the `PhpLexer` use this to make the leading ``', Name.Tag), + ], + 'script-content': [ + (r'(.+?)(<\s*/\s*script\s*>)', + bygroups(using(JavascriptLexer), Name.Tag), + '#pop'), + ] + } + +Here the content of a ```` end tag is processed by the `JavascriptLexer`, +while the end tag is yielded as a normal token with the `Name.Tag` type. + +[-As an additional goodie, if the lexer class is replaced by `this` (imported from +`pygments.lexer`), the "other" lexer will be the current one (because you cannot +refer to the current class within the code that runs at class definition time).-] + +Also note the ``(r'<\s*script\s*', Name.Tag, ('script-content', 'tag'))`` rule. +Here, two states are pushed onto the state stack, ``'script-content'`` and +``'tag'``. That means that first ``'tag'`` is processed, which will [-parse-] {+lex+} +attributes and the closing ``>``, then the ``'tag'`` state is popped and the +next state on top of the stack will be ``'script-content'``. + +{+Since you cannot refer to the class currently being defined, use `this` +(imported from `pygments.lexer`) to refer to the current lexer class, i.e. +``using(this)``. This construct may seem unnecessary, but this is often the +most obvious way of lexing arbitrary syntax between fixed delimiters without +introducing deeply nested states.+} + +The `using()` helper has a special keyword argument, `state`, which works as +follows: if given, the lexer to use initially is not in the ``"root"`` state, +but in the state given by this argument. This [-*only* works-] {+does not work+} with [-a `RegexLexer`.-] {+advanced +`RegexLexer` subclasses such as `ExtendedRegexLexer` (see below).+} + +Any other keywords arguments passed to `using()` are added to the keyword +arguments used to create the lexer. + + +Delegating Lexer +================ + +Another approach for nested lexers is the `DelegatingLexer` which is for example +used for the template engine lexers. It takes two lexers as arguments on +initialisation: a `root_lexer` and a `language_lexer`. + +The input is processed as follows: First, the whole text is lexed with the +`language_lexer`. All tokens yielded with [-a-] {+the special+} type of ``Other`` are +then concatenated and given to the `root_lexer`. The language tokens of the +`language_lexer` are then inserted into the `root_lexer`'s token stream at the +appropriate positions. + +[-.. 
sourcecode:: python-] {+::+} + + from pygments.lexer import DelegatingLexer + from pygments.lexers.web import HtmlLexer, PhpLexer + + class HtmlPhpLexer(DelegatingLexer): + def __init__(self, **options): + super(HtmlPhpLexer, self).__init__(HtmlLexer, PhpLexer, **options) + +This procedure ensures that e.g. HTML with template tags in it is highlighted +correctly even if the template tags are put into HTML tags or attributes. + +If you want to change the needle token ``Other`` to something else, you can give +the lexer another token type as the third [-parameter: + +.. sourcecode:: python-] {+parameter::+} + + DelegatingLexer.__init__(MyLexer, OtherLexer, Text, **options) + + +Callbacks +========= + +Sometimes the grammar of a language is so complex that a lexer would be unable +to [-parse-] {+process+} it just by using regular expressions and stacks. + +For this, the `RegexLexer` allows callbacks to be given in rule tuples, instead +of token types (`bygroups` and `using` are nothing else but preimplemented +callbacks). The callback must be a function taking two arguments: + +* the lexer itself +* the match object for the last matched rule + +The callback must then return an iterable of (or simply yield) ``(index, +tokentype, value)`` tuples, which are then just passed through by +`get_tokens_unprocessed()`. The ``index`` here is the position of the token in +the input string, ``tokentype`` is the normal token type (like `Name.Builtin`), +and ``value`` the associated part of the input string. + +You can see an example [-here: + +.. sourcecode:: python-] {+here::+} + + from pygments.lexer import RegexLexer + from pygments.token import Generic + + class HypotheticLexer(RegexLexer): + + def headline_callback(lexer, match): + equal_signs = match.group(1) + text = match.group(2) + yield match.start(), Generic.Headline, equal_signs + text + equal_signs + + tokens = { + 'root': [ + (r'(=+)(.*?)(\1)', headline_callback) + ] + } + +If the regex for the `headline_callback` matches, the function is called with +the match object. Note that after the callback is done, processing continues +normally, that is, after the end of the previous match. The callback has no +possibility to influence the position. + +There are not really any simple examples for lexer callbacks, but you can see +them in action e.g. in the [-`compiled.py`_ source code-] {+`SMLLexer` class+} in [-the `CLexer` and +`JavaLexer` classes.-] {+`ml.py`_.+} + +.. [-_compiled.py: http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/compiled.py-] {+_ml.py: http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/ml.py+} + + +The ExtendedRegexLexer class +============================ + +The `RegexLexer`, even with callbacks, unfortunately isn't powerful enough for +the funky syntax rules of [-some-] languages [-that will go unnamed,-] such as Ruby. + +But fear not; even then you don't have to abandon the regular expression +[-approach. For-] +{+approach:+} Pygments has a subclass of `RegexLexer`, the `ExtendedRegexLexer`. +All features known from RegexLexers are available here too, and the tokens are +specified in exactly the same way, *except* for one detail: + +The `get_tokens_unprocessed()` method holds its internal state data not as local +variables, but in an instance of the `pygments.lexer.LexerContext` class, and +that instance is passed to callbacks as a third argument. This means that you +can modify the lexer state in callbacks. 
+ +The `LexerContext` class has the following members: + +* `text` -- the input text +* `pos` -- the current starting position that is used for matching regexes +* `stack` -- a list containing the state stack +* `end` -- the maximum position to which regexes are matched, this defaults to + the length of `text` + +Additionally, the `get_tokens_unprocessed()` method can be given a +`LexerContext` instead of a string and will then process this context instead of +creating a new one for the string argument. + +Note that because you can set the current position to anything in the callback, +it won't be automatically be set by the caller after the callback is finished. +For example, this is how the hypothetical lexer above would be written with the +[-`ExtendedRegexLexer`: + +.. sourcecode:: python-] +{+`ExtendedRegexLexer`::+} + + from pygments.lexer import ExtendedRegexLexer + from pygments.token import Generic + + class ExHypotheticLexer(ExtendedRegexLexer): + + def headline_callback(lexer, match, ctx): + equal_signs = match.group(1) + text = match.group(2) + yield match.start(), Generic.Headline, equal_signs + text + equal_signs + ctx.pos = match.end() + + tokens = { + 'root': [ + (r'(=+)(.*?)(\1)', headline_callback) + ] + } + +This might sound confusing (and it can really be). But it is needed, and for an +example look at the Ruby lexer in [-`agile.py`_.-] {+`ruby.py`_.+} + +.. [-_agile.py: https://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/agile.py + + +Filtering-] {+_ruby.py: https://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/ruby.py + + +Handling Lists of Keywords +========================== + +For a relatively short list (hundreds) you can construct an optimized regular +expression directly using ``words()`` (longer lists, see next section). This +function handles a few things for you automatically, including escaping +metacharacters and Python's first-match rather than longest-match in +alternations. Feel free to put the lists themselves in +``pygments/lexers/_$lang_builtins.py`` (see examples there), and generated by +code if possible. + +An example of using ``words()`` is something like:: + + from pygments.lexer import RegexLexer, words, Name + + class MyLexer(RegexLexer): + + tokens = { + 'root': [ + (words(('else', 'elseif'), suffix=r'\b'), Name.Builtin), + (r'\w+', Name), + ], + } + +As you can see, you can add ``prefix`` and ``suffix`` parts to the constructed +regex. + + +Modifying+} Token Streams +======================= + +Some languages ship a lot of builtin functions (for example PHP). The total +amount of those functions differs from system to system because not everybody +has every extension installed. In the case of PHP there are over 3000 builtin +functions. That's an [-incredible-] {+incredibly+} huge amount of functions, much more than you +[-can-] +{+want to+} put into a regular expression. + +But because only `Name` tokens can be function names [-it's-] {+this is+} solvable by +overriding the ``get_tokens_unprocessed()`` method. The following lexer +subclasses the `PythonLexer` so that it highlights some additional names as +pseudo [-keywords: + +.. 
sourcecode:: python-] {+keywords::+} + + from [-pygments.lexers.agile-] {+pygments.lexers.python+} import PythonLexer + from pygments.token import Name, Keyword + + class MyPythonLexer(PythonLexer): + EXTRA_KEYWORDS = [-['foo',-] {+set(('foo',+} 'bar', 'foobar', 'barfoo', 'spam', [-'eggs']-] {+'eggs'))+} + + def get_tokens_unprocessed(self, text): + for index, token, value in PythonLexer.get_tokens_unprocessed(self, text): + if token is Name and value in self.EXTRA_KEYWORDS: + yield index, Keyword.Pseudo, value + else: + yield index, token, value + +The `PhpLexer` and `LuaLexer` use this method to resolve builtin functions. + +[-.. note:: Do not confuse this with the :doc:`filter ` system.-] diff --git a/tests/examplefiles/wdiff_example2.wdiff b/tests/examplefiles/wdiff_example2.wdiff new file mode 100644 index 00000000..1a746fe5 --- /dev/null +++ b/tests/examplefiles/wdiff_example2.wdiff @@ -0,0 +1,758 @@ +.. -*- mode: rst -*- + +[-..-] + +{+{+..+} highlight:: [-python-] {+python+}+} + +==================== +Write your own lexer +==================== + +If a lexer for your favorite language is missing in the Pygments package, you +can easily write your own and extend Pygments. + +All you need can be found inside the :mod:`pygments.lexer` module. As you can +read in the :doc:`API documentation `, a lexer is a class that is +initialized with some keyword arguments (the lexer options) and that provides a +:meth:`.get_tokens_unprocessed()` method which is given a string or unicode +object with the data to [-lex.-] {+[-parse.-] {+lex.+}+} + +The :meth:`.get_tokens_unprocessed()` method must return an iterator or iterable +containing tuples in the form ``(index, token, value)``. Normally you don't +need to do this since there are {+[-numerous-]+} base lexers [-that-] {+{+that+} do most of the work and [-that-] {+that+}+} +you can subclass. + + +RegexLexer +========== + +[-The-] + +{+[-A very powerful (but quite easy to use)-] + +{+The+}+} lexer [-base-] {+{+base+} class used by almost all of Pygments' [-lexers-] {+lexers+}+} is the +:class:`RegexLexer`. This +{+[-lexer base-]+} class allows you to define lexing rules in terms of +*regular expressions* for different *states*. + +States are groups of regular expressions that are matched against the input +string at the *current position*. If one of these expressions matches, a +corresponding action is performed [-(such as-] {+[-(normally-] {+(such as+}+} yielding a token with a specific +[-type,-] +{+[-type),-] +{+type,+} or changing [-state),-] {+state),+}+} the current position is set to where the last match +ended and the matching process continues with the first regex of the current +state. + +Lexer states are kept [-on-] {+[-in-] {+on+}+} a {+[-state-]+} stack: each time a new state is entered, the new +state is pushed onto the stack. The most basic lexers (like the `DiffLexer`) +just need one state. + +Each state is defined as a list of tuples in the form (`regex`, `action`, +`new_state`) where the last item is optional. In the most basic form, `action` +is a token type (like `Name.Builtin`). That means: When `regex` matches, emit a +token with the match text and type `tokentype` and push `new_state` on the state +stack. If the new state is ``'#pop'``, the topmost state is popped from the +stack instead. [-To-] {+[-(To-] {+To+}+} pop more than one state, use ``'#pop:2'`` and so [-on.-] {+[-on.)-] {+on.+}+} +``'#push'`` is a synonym for pushing the current state on the stack. 
+ +The following example shows the `DiffLexer` from the builtin lexers. Note that +it contains some additional attributes `name`, `aliases` and `filenames` which +aren't required for a lexer. They are used by the builtin lexer lookup +functions. [-::-] + +{+[-.. sourcecode:: python-] {+::+}+} + + from pygments.lexer import RegexLexer + from pygments.token import * + + class DiffLexer(RegexLexer): + name = 'Diff' + aliases = ['diff'] + filenames = ['*.diff'] + + tokens = { + 'root': [ + (r' .*\n', Text), + (r'\+.*\n', Generic.Inserted), + (r'-.*\n', Generic.Deleted), + (r'@.*\n', Generic.Subheading), + (r'Index.*\n', Generic.Heading), + (r'=.*\n', Generic.Heading), + (r'.*\n', Text), + ] + } + +As you can see this lexer only uses one state. When the lexer starts scanning +the text, it first checks if the current character is a space. If this is true +it scans everything until newline and returns the {+[-parsed-]+} data as [-a-] {+{+a+}+} `Text` [-token-] {+[-token.-] {+token+} (which +is the "no special highlighting" [-token).-] {+token).+}+} + +If this rule doesn't match, it checks if the current char is a plus sign. And +so on. + +If no rule matches at the current position, the current char is emitted as an +`Error` token that indicates a [-lexing-] {+[-parsing-] {+lexing+}+} error, and the position is increased by +[-one.-] +{+[-1.-] +{+one.+}+} + + +Adding and testing a new lexer +============================== + +To make [-Pygments-] {+[-pygments-] {+Pygments+}+} aware of your new lexer, you have to perform the following +steps: + +First, change to the current directory containing the [-Pygments-] {+[-pygments-] {+Pygments+}+} source code: + +.. [-code-block::-] {+[-sourcecode::-] {+code-block::+}+} console + + $ cd .../pygments-main + +[-Select-] + +{+{+Select+} a matching module under ``pygments/lexers``, or create a new module for +your lexer [-class.-] {+class.+}+} + +Next, make sure the lexer is known from outside of the module. All modules in +the ``pygments.lexers`` specify ``__all__``. For example, [-``esoteric.py`` sets::-] {+[-``other.py`` sets: + +.. sourcecode:: python-] {+``esoteric.py`` sets::+}+} + + __all__ = ['BrainfuckLexer', 'BefungeLexer', ...] + +Simply add the name of your lexer class to this list. + +Finally the lexer can be made [-publicly-] {+[-publically-] {+publicly+}+} known by rebuilding the lexer mapping: + +.. [-code-block::-] {+[-sourcecode::-] {+code-block::+}+} console + + $ make mapfiles + +To test the new lexer, store an example file with the proper extension in +``tests/examplefiles``. For example, to test your ``DiffLexer``, add a +``tests/examplefiles/example.diff`` containing a sample diff output. + +Now you can use pygmentize to render your example to HTML: + +.. [-code-block::-] {+[-sourcecode::-] {+code-block::+}+} console + + $ ./pygmentize -O full -f html -o /tmp/example.html tests/examplefiles/example.diff + +Note that this [-explicitly-] {+[-explicitely-] {+explicitly+}+} calls the ``pygmentize`` in the current directory +by preceding it with ``./``. This ensures your modifications are used. +Otherwise a possibly already installed, unmodified version without your new +lexer would have been called from the system search path (``$PATH``). + +To view the result, open ``/tmp/example.html`` in your browser. + +Once the example renders as expected, you should run the complete test suite: + +.. 
[-code-block::-] {+[-sourcecode::-] {+code-block::+}+} console + + $ make test + +[-It-] + +{+{+It+} also tests that your lexer fulfills the lexer API and certain invariants, +such as that the concatenation of all token text is the same as the input [-text.-] {+text.+}+} + + +Regex Flags +=========== + +You can either define regex flags [-locally-] {+{+locally+}+} in the regex (``r'(?x)foo bar'``) or +[-globally-] +{+{+globally+}+} by adding a `flags` attribute to your lexer class. If no attribute is +defined, it defaults to `re.MULTILINE`. For more [-information-] {+[-informations-] {+information+}+} about regular +expression flags see the [-page about-] {+{+page about+}+} `regular expressions`_ {+[-help page-]+} in the [-Python-] {+[-python-] {+Python+}+} +documentation. + +.. _regular expressions: [-http://docs.python.org/library/re.html#regular-expression-syntax-] {+[-http://docs.python.org/lib/re-syntax.html-] {+http://docs.python.org/library/re.html#regular-expression-syntax+}+} + + +Scanning multiple tokens at once +================================ + +[-So-] + +{+{+So+} far, the `action` element in the rule tuple of regex, action and state has +been a single token type. Now we look at the first of several other possible +[-values.-] +{+values.+}+} + +Here is a more complex lexer that highlights INI files. INI files consist of +sections, comments and [-``key-] {+[-key-] {+``key+}+} = [-value`` pairs::-] {+[-value pairs: + +.. sourcecode:: python-] {+value`` pairs::+}+} + + from pygments.lexer import RegexLexer, bygroups + from pygments.token import * + + class IniLexer(RegexLexer): + name = 'INI' + aliases = ['ini', 'cfg'] + filenames = ['*.ini', '*.cfg'] + + tokens = { + 'root': [ + (r'\s+', Text), + (r';.*?$', Comment), + (r'\[.*?\]$', Keyword), + (r'(.*?)(\s*)(=)(\s*)(.*?)$', + bygroups(Name.Attribute, Text, Operator, Text, String)) + ] + } + +The lexer first looks for whitespace, comments and section names. [-Later-] {+[-And later-] {+Later+}+} it +looks for a line that looks like a key, value pair, separated by an ``'='`` +sign, and optional whitespace. + +The `bygroups` helper [-yields-] {+[-makes sure that-] {+yields+}+} each [-capturing-] {+{+capturing+}+} group [-in-] {+[-is yielded-] {+in+} the [-regex-] {+regex+}+} with a different +token type. First the `Name.Attribute` token, then a `Text` token for the +optional whitespace, after that a `Operator` token for the equals sign. Then a +`Text` token for the whitespace again. The rest of the line is returned as +`String`. + +Note that for this to work, every part of the match must be inside a capturing +group (a ``(...)``), and there must not be any nested capturing groups. If you +nevertheless need a group, use a non-capturing group defined using this syntax: +[-``(?:some|words|here)``-] +{+[-``r'(?:some|words|here)'``-] +{+``(?:some|words|here)``+}+} (note the ``?:`` after the beginning parenthesis). + +If you find yourself needing a capturing group inside the regex which shouldn't +be part of the output but is used in the regular expressions for backreferencing +(eg: ``r'(<(foo|bar)>)(.*?)()'``), you can pass `None` to the bygroups +function and {+[-it will skip-]+} that group will be skipped in the output. + + +Changing states +=============== + +Many lexers need multiple states to work as expected. For example, some +languages allow multiline comments to be nested. Since this is a recursive +pattern it's impossible to lex just using regular expressions. + +Here is [-a-] {+[-the solution: + +.. 
sourcecode:: python-] {+a+} lexer that recognizes C++ style comments (multi-line with ``/* */`` +and single-line with ``//`` until end of [-line)::-] {+line)::+}+} + + from pygments.lexer import RegexLexer + from pygments.token import * + + class [-CppCommentLexer(RegexLexer):-] {+[-ExampleLexer(RegexLexer):-] {+CppCommentLexer(RegexLexer):+}+} + name = 'Example Lexer with states' + + tokens = { + 'root': [ + (r'[^/]+', Text), + (r'/\*', Comment.Multiline, 'comment'), + (r'//.*?$', Comment.Singleline), + (r'/', Text) + ], + 'comment': [ + (r'[^*/]', Comment.Multiline), + (r'/\*', Comment.Multiline, '#push'), + (r'\*/', Comment.Multiline, '#pop'), + (r'[*/]', Comment.Multiline) + ] + } + +This lexer starts lexing in the ``'root'`` state. It tries to match as much as +possible until it finds a slash (``'/'``). If the next character after the slash +is [-an asterisk-] {+[-a star-] {+an asterisk+}+} (``'*'``) the `RegexLexer` sends those two characters to the +output stream marked as `Comment.Multiline` and continues [-lexing-] {+[-parsing-] {+lexing+}+} with the rules +defined in the ``'comment'`` state. + +If there wasn't [-an asterisk-] {+[-a star-] {+an asterisk+}+} after the slash, the `RegexLexer` checks if it's a +[-Singleline-] +{+[-singleline-] +{+Singleline+}+} comment [-(i.e.-] {+[-(eg:-] {+(i.e.+}+} followed by a second slash). If this also wasn't the +case it must be a single [-slash,-] {+[-slash-] {+slash,+} which is not a comment [-starter-] {+starter+}+} (the separate +regex for a single slash must also be given, else the slash would be marked as +an error token). + +Inside the ``'comment'`` state, we do the same thing again. Scan until the +lexer finds a star or slash. If it's the opening of a multiline comment, push +the ``'comment'`` state on the stack and continue scanning, again in the +``'comment'`` state. Else, check if it's the end of the multiline comment. If +yes, pop one state from the stack. + +Note: If you pop from an empty stack you'll get an `IndexError`. (There is an +easy way to prevent this from happening: don't ``'#pop'`` in the root state). + +If the `RegexLexer` encounters a newline that is flagged as an error token, the +stack is emptied and the lexer continues scanning in the ``'root'`` state. This +[-can help-] +{+[-helps-] +{+can help+}+} producing error-tolerant highlighting for erroneous input, e.g. when a +single-line string is not closed. + + +Advanced state tricks +===================== + +There are a few more things you can do with states: + +- You can push multiple states onto the stack if you give a tuple instead of a + simple string as the third item in a rule tuple. For example, if you want to + match a comment containing a directive, something [-like:-] {+[-like::-] {+like:+} + + .. code-block:: [-text-] {+text+}+} + + /* rest of comment */ + + you can use this [-rule::-] {+[-rule: + + .. sourcecode:: python-] {+rule::+}+} + + tokens = { + 'root': [ + (r'/\* <', Comment, ('comment', 'directive')), + ... + ], + 'directive': [ + (r'[^>]*', Comment.Directive), + (r'>', Comment, '#pop'), + ], + 'comment': [ + (r'[^*]+', Comment), + (r'\*/', Comment, '#pop'), + (r'\*', Comment), + ] + } + + When this encounters the above sample, first ``'comment'`` and ``'directive'`` + are pushed onto the stack, then the lexer continues in the directive state + until it finds the closing ``>``, then it continues in the comment state until + the closing ``*/``. Then, both states are popped from the stack again and + lexing continues in the root state. + + .. 
versionadded:: 0.9 + The tuple can contain the special ``'#push'`` and ``'#pop'`` (but not + ``'#pop:n'``) directives. + + +- You can include the rules of a state in the definition of another. This is + done by using `include` from [-`pygments.lexer`::-] {+[-`pygments.lexer`: + + .. sourcecode:: python-] {+`pygments.lexer`::+}+} + + from pygments.lexer import RegexLexer, bygroups, include + from pygments.token import * + + class ExampleLexer(RegexLexer): + tokens = { + 'comments': [ + (r'/\*.*?\*/', Comment), + (r'//.*?\n', Comment), + ], + 'root': [ + include('comments'), + (r'(function )(\w+)( {)', + bygroups(Keyword, Name, Keyword), 'function'), + (r'.', Text), + ], + 'function': [ + (r'[^}/]+', Text), + include('comments'), + (r'/', Text), + [-(r'\}',-] + {+[-(r'}',-] + {+(r'\}',+}+} Keyword, '#pop'), + ] + } + + This is a hypothetical lexer for a language that consist of functions and + comments. Because comments can occur at toplevel and in functions, we need + rules for comments in both states. As you can see, the `include` helper saves + repeating rules that occur more than once (in this example, the state + ``'comment'`` will never be entered by the lexer, as it's only there to be + included in ``'root'`` and ``'function'``). + +- Sometimes, you may want to "combine" a state from existing ones. This is + possible with the [-`combined`-] {+[-`combine`-] {+`combined`+}+} helper from `pygments.lexer`. + + If you, instead of a new state, write ``combined('state1', 'state2')`` as the + third item of a rule tuple, a new anonymous state will be formed from state1 + and state2 and if the rule matches, the lexer will enter this state. + + This is not used very often, but can be helpful in some cases, such as the + `PythonLexer`'s string literal processing. + +- If you want your lexer to start lexing in a different state you can modify the + stack by [-overriding-] {+[-overloading-] {+overriding+}+} the `get_tokens_unprocessed()` [-method::-] {+[-method: + + .. sourcecode:: python-] {+method::+}+} + + from pygments.lexer import RegexLexer + + class [-ExampleLexer(RegexLexer):-] {+[-MyLexer(RegexLexer):-] {+ExampleLexer(RegexLexer):+}+} + tokens = {...} + + def get_tokens_unprocessed(self, [-text,-] {+[-text): + stack = ['root', 'otherstate']-] {+text,+} stack=('root', [-'otherstate')):-] {+'otherstate')):+}+} + for item in RegexLexer.get_tokens_unprocessed(text, stack): + yield item + + Some lexers like the `PhpLexer` use this to make the leading ``', Name.Tag), + ], + 'script-content': [ + (r'(.+?)(<\s*/\s*script\s*>)', + bygroups(using(JavascriptLexer), Name.Tag), + '#pop'), + ] + } + +Here the content of a ```` end tag is processed by the `JavascriptLexer`, +while the end tag is yielded as a normal token with the `Name.Tag` type. + +{+[-As an additional goodie, if the lexer class is replaced by `this` (imported from +`pygments.lexer`), the "other" lexer will be the current one (because you cannot +refer to the current class within the code that runs at class definition time).-]+} + +Also note the ``(r'<\s*script\s*', Name.Tag, ('script-content', 'tag'))`` rule. +Here, two states are pushed onto the state stack, ``'script-content'`` and +``'tag'``. That means that first ``'tag'`` is processed, which will [-lex-] {+[-parse-] {+lex+}+} +attributes and the closing ``>``, then the ``'tag'`` state is popped and the +next state on top of the stack will be ``'script-content'``. 
+ +[-Since-] + +{+{+Since+} you cannot refer to the class currently being defined, use `this` +(imported from `pygments.lexer`) to refer to the current lexer class, i.e. +``using(this)``. This construct may seem unnecessary, but this is often the +most obvious way of lexing arbitrary syntax between fixed delimiters without +introducing deeply nested [-states.-] {+states.+}+} + +The `using()` helper has a special keyword argument, `state`, which works as +follows: if given, the lexer to use initially is not in the ``"root"`` state, +but in the state given by this argument. This [-does-] {+[-*only* works-] {+does+} not [-work-] {+work+}+} with [-advanced-] {+[-a `RegexLexer`.-] {+advanced+} +`RegexLexer` subclasses such as `ExtendedRegexLexer` (see [-below).-] {+below).+}+} + +Any other keywords arguments passed to `using()` are added to the keyword +arguments used to create the lexer. + + +Delegating Lexer +================ + +Another approach for nested lexers is the `DelegatingLexer` which is for example +used for the template engine lexers. It takes two lexers as arguments on +initialisation: a `root_lexer` and a `language_lexer`. + +The input is processed as follows: First, the whole text is lexed with the +`language_lexer`. All tokens yielded with [-the special-] {+[-a-] {+the special+}+} type of ``Other`` are +then concatenated and given to the `root_lexer`. The language tokens of the +`language_lexer` are then inserted into the `root_lexer`'s token stream at the +appropriate positions. [-::-] + +{+[-.. sourcecode:: python-] {+::+}+} + + from pygments.lexer import DelegatingLexer + from pygments.lexers.web import HtmlLexer, PhpLexer + + class HtmlPhpLexer(DelegatingLexer): + def __init__(self, **options): + super(HtmlPhpLexer, self).__init__(HtmlLexer, PhpLexer, **options) + +This procedure ensures that e.g. HTML with template tags in it is highlighted +correctly even if the template tags are put into HTML tags or attributes. + +If you want to change the needle token ``Other`` to something else, you can give +the lexer another token type as the third [-parameter::-] {+[-parameter: + +.. sourcecode:: python-] {+parameter::+}+} + + DelegatingLexer.__init__(MyLexer, OtherLexer, Text, **options) + + +Callbacks +========= + +Sometimes the grammar of a language is so complex that a lexer would be unable +to [-process-] {+[-parse-] {+process+}+} it just by using regular expressions and stacks. + +For this, the `RegexLexer` allows callbacks to be given in rule tuples, instead +of token types (`bygroups` and `using` are nothing else but preimplemented +callbacks). The callback must be a function taking two arguments: + +* the lexer itself +* the match object for the last matched rule + +The callback must then return an iterable of (or simply yield) ``(index, +tokentype, value)`` tuples, which are then just passed through by +`get_tokens_unprocessed()`. The ``index`` here is the position of the token in +the input string, ``tokentype`` is the normal token type (like `Name.Builtin`), +and ``value`` the associated part of the input string. + +You can see an example [-here::-] {+[-here: + +.. 
sourcecode:: python-] {+here::+}+} + + from pygments.lexer import RegexLexer + from pygments.token import Generic + + class HypotheticLexer(RegexLexer): + + def headline_callback(lexer, match): + equal_signs = match.group(1) + text = match.group(2) + yield match.start(), Generic.Headline, equal_signs + text + equal_signs + + tokens = { + 'root': [ + (r'(=+)(.*?)(\1)', headline_callback) + ] + } + +If the regex for the `headline_callback` matches, the function is called with +the match object. Note that after the callback is done, processing continues +normally, that is, after the end of the previous match. The callback has no +possibility to influence the position. + +There are not really any simple examples for lexer callbacks, but you can see +them in action e.g. in the [-`SMLLexer` class in `ml.py`_.-] {+[-`compiled.py`_ source code-] {+`SMLLexer` class+} in [-the `CLexer` and +`JavaLexer` classes.-] {+`ml.py`_.+}+} + +.. [-_ml.py: http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/ml.py-] {+[-_compiled.py: http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/compiled.py-] {+_ml.py: http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/ml.py+}+} + + +The ExtendedRegexLexer class +============================ + +The `RegexLexer`, even with callbacks, unfortunately isn't powerful enough for +the funky syntax rules of {+[-some-]+} languages {+[-that will go unnamed,-]+} such as Ruby. + +But fear not; even then you don't have to abandon the regular expression +[-approach:-] +{+[-approach. For-] +{+approach:+}+} Pygments has a subclass of `RegexLexer`, the `ExtendedRegexLexer`. +All features known from RegexLexers are available here too, and the tokens are +specified in exactly the same way, *except* for one detail: + +The `get_tokens_unprocessed()` method holds its internal state data not as local +variables, but in an instance of the `pygments.lexer.LexerContext` class, and +that instance is passed to callbacks as a third argument. This means that you +can modify the lexer state in callbacks. + +The `LexerContext` class has the following members: + +* `text` -- the input text +* `pos` -- the current starting position that is used for matching regexes +* `stack` -- a list containing the state stack +* `end` -- the maximum position to which regexes are matched, this defaults to + the length of `text` + +Additionally, the `get_tokens_unprocessed()` method can be given a +`LexerContext` instead of a string and will then process this context instead of +creating a new one for the string argument. + +Note that because you can set the current position to anything in the callback, +it won't be automatically be set by the caller after the callback is finished. +For example, this is how the hypothetical lexer above would be written with the +[-`ExtendedRegexLexer`::-] +{+[-`ExtendedRegexLexer`: + +.. sourcecode:: python-] +{+`ExtendedRegexLexer`::+}+} + + from pygments.lexer import ExtendedRegexLexer + from pygments.token import Generic + + class ExHypotheticLexer(ExtendedRegexLexer): + + def headline_callback(lexer, match, ctx): + equal_signs = match.group(1) + text = match.group(2) + yield match.start(), Generic.Headline, equal_signs + text + equal_signs + ctx.pos = match.end() + + tokens = { + 'root': [ + (r'(=+)(.*?)(\1)', headline_callback) + ] + } + +This might sound confusing (and it can really be). But it is needed, and for an +example look at the Ruby lexer in [-`ruby.py`_.-] {+[-`agile.py`_.-] {+`ruby.py`_.+}+} + +.. 
[-_ruby.py:-] {+[-_agile.py: https://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/agile.py + + +Filtering-] {+_ruby.py:+} https://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/ruby.py + + +Handling Lists of Keywords +========================== + +For a relatively short list (hundreds) you can construct an optimized regular +expression directly using ``words()`` (longer lists, see next section). This +function handles a few things for you automatically, including escaping +metacharacters and Python's first-match rather than longest-match in +alternations. Feel free to put the lists themselves in +``pygments/lexers/_$lang_builtins.py`` (see examples there), and generated by +code if possible. + +An example of using ``words()`` is something like:: + + from pygments.lexer import RegexLexer, words, Name + + class MyLexer(RegexLexer): + + tokens = { + 'root': [ + (words(('else', 'elseif'), suffix=r'\b'), Name.Builtin), + (r'\w+', Name), + ], + } + +As you can see, you can add ``prefix`` and ``suffix`` parts to the constructed +regex. + + +[-Modifying-] + + +{+Modifying+}+} Token Streams +======================= + +Some languages ship a lot of builtin functions (for example PHP). The total +amount of those functions differs from system to system because not everybody +has every extension installed. In the case of PHP there are over 3000 builtin +functions. That's an [-incredibly-] {+[-incredible-] {+incredibly+}+} huge amount of functions, much more than you +[-want to-] +{+[-can-] +{+want to+}+} put into a regular expression. + +But because only `Name` tokens can be function names [-this is-] {+[-it's-] {+this is+}+} solvable by +overriding the ``get_tokens_unprocessed()`` method. The following lexer +subclasses the `PythonLexer` so that it highlights some additional names as +pseudo [-keywords::-] {+[-keywords: + +.. sourcecode:: python-] {+keywords::+}+} + + from [-pygments.lexers.python-] {+[-pygments.lexers.agile-] {+pygments.lexers.python+}+} import PythonLexer + from pygments.token import Name, Keyword + + class MyPythonLexer(PythonLexer): + EXTRA_KEYWORDS = [-set(('foo',-] {+[-['foo',-] {+set(('foo',+}+} 'bar', 'foobar', 'barfoo', 'spam', [-'eggs'))-] {+[-'eggs']-] {+'eggs'))+}+} + + def get_tokens_unprocessed(self, text): + for index, token, value in PythonLexer.get_tokens_unprocessed(self, text): + if token is Name and value in self.EXTRA_KEYWORDS: + yield index, Keyword.Pseudo, value + else: + yield index, token, value + +The `PhpLexer` and `LuaLexer` use this method to resolve builtin functions. + +{+[-.. note:: Do not confuse this with the :doc:`filter ` system.-]+} diff --git a/tests/examplefiles/wdiff_example3.wdiff b/tests/examplefiles/wdiff_example3.wdiff new file mode 100644 index 00000000..89303a75 --- /dev/null +++ b/tests/examplefiles/wdiff_example3.wdiff @@ -0,0 +1,10 @@ +This example is unbalanced open-close. +We can't treat these easily. + +{+ added? -] +[- deleted? +} + +suddenly closed -] +suddenly closed +} + +[+ added? [- deleted? -- cgit v1.2.1 From 4d5ef3c7733a6667cc083f0dbab042be556daab9 Mon Sep 17 00:00:00 2001 From: hhsprings Date: Sat, 7 Nov 2015 21:59:57 +0900 Subject: See `#1164 `_. 
Before: 28807 bytes, 118.3000 [ms] / 0.004107 [ms/byte] 30964 bytes, 130.4700 [ms] / 0.004214 [ms/byte] 159 bytes, 1.2000 [ms] / 0.007547 [ms/byte] 28807 bytes, 117.6800 [ms] / 0.004085 [ms/byte] 30964 bytes, 124.3500 [ms] / 0.004016 [ms/byte] 159 bytes, 0.9500 [ms] / 0.005975 [ms/byte] 28807 bytes, 123.9600 [ms] / 0.004303 [ms/byte] 30964 bytes, 124.1700 [ms] / 0.004010 [ms/byte] 159 bytes, 1.3200 [ms] / 0.008302 [ms/byte] After: 28807 bytes, 11.3200 [ms] / 0.000393 [ms/byte] 30964 bytes, 21.6200 [ms] / 0.000698 [ms/byte] 159 bytes, 0.3400 [ms] / 0.002138 [ms/byte] 28807 bytes, 15.8100 [ms] / 0.000549 [ms/byte] 30964 bytes, 21.6800 [ms] / 0.000700 [ms/byte] 159 bytes, 0.4100 [ms] / 0.002579 [ms/byte] 28807 bytes, 11.4300 [ms] / 0.000397 [ms/byte] 30964 bytes, 15.3000 [ms] / 0.000494 [ms/byte] 159 bytes, 0.3900 [ms] / 0.002453 [ms/byte] About x10 faster... --- pygments/lexers/diff.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pygments/lexers/diff.py b/pygments/lexers/diff.py index 9efb100b..2056fbff 100644 --- a/pygments/lexers/diff.py +++ b/pygments/lexers/diff.py @@ -136,10 +136,12 @@ class WDiffLexer(RegexLexer): ins_cl = r"\+\}" del_op = r"\[\-" del_cl = r"\-\]" + normal = r'[^{}[\]+-]+' # for performance tokens = { 'root': [ (ins_op, Generic.Inserted, 'inserted'), (del_op, Generic.Deleted, 'deleted'), + (normal, Text), (r'.', Text), ], 'inserted': [ @@ -148,6 +150,7 @@ class WDiffLexer(RegexLexer): (del_cl, Generic.Inserted, '#pop'), (ins_cl, Generic.Inserted, '#pop'), + (normal, Generic.Inserted), (r'.', Generic.Inserted), ], 'deleted': [ @@ -156,6 +159,7 @@ class WDiffLexer(RegexLexer): (ins_cl, Generic.Deleted, '#pop'), (del_cl, Generic.Deleted, '#pop'), + (normal, Generic.Deleted), (r'.', Generic.Deleted), ], } -- cgit v1.2.1 From 68d2058669f4d733bb5f5beccf0c50bcabc81097 Mon Sep 17 00:00:00 2001 From: Hiroaki Itoh Date: Sun, 8 Nov 2015 09:12:24 +0000 Subject: fix typo. --- tests/examplefiles/wdiff_example3.wdiff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/examplefiles/wdiff_example3.wdiff b/tests/examplefiles/wdiff_example3.wdiff index 89303a75..0bbd6d65 100644 --- a/tests/examplefiles/wdiff_example3.wdiff +++ b/tests/examplefiles/wdiff_example3.wdiff @@ -7,4 +7,4 @@ We can't treat these easily. suddenly closed -] suddenly closed +} -[+ added? [- deleted? +{+ added? [- deleted? -- cgit v1.2.1
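
A minimal usage sketch for the lexer added above, assuming the patches have been applied and the lexer mapping rebuilt with ``make mapfiles``; the sample wdiff string is made up for illustration and is not taken from the test files::

    from pygments import highlight
    from pygments.formatters import HtmlFormatter
    from pygments.lexers import get_lexer_by_name

    # A made-up sample of wdiff's normal output: deletions are wrapped in
    # [- -] and insertions in {+ +}.
    sample = "The quick [-brown-] {+red+} fox jumps over the [-lazy-] {+sleeping+} dog.\n"

    # Resolved through the new entry in pygments/lexers/_mapping.py.
    lexer = get_lexer_by_name('wdiff')

    # The bracketed regions come back as Generic.Deleted / Generic.Inserted
    # tokens, everything else as Text.
    for token_type, value in lexer.get_tokens(sample):
        print(token_type, repr(value))

    # Or render the sample straight to HTML.
    print(highlight(sample, lexer, HtmlFormatter()))

Note that the ``normal`` catch-all rule added in the follow-up commit changes only performance, not this token stream: it lets the lexer consume whole runs of unmarked characters in a single match instead of one character at a time, which is where the roughly tenfold speedup reported in the commit message comes from.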