diff options
author | thatch <devnull@localhost> | 2009-01-04 09:01:06 -0600 |
---|---|---|
committer | thatch <devnull@localhost> | 2009-01-04 09:01:06 -0600 |
commit | 6b17d2b85ec9cb15976f7738cdc03544f0f86e27 (patch) | |
tree | b6763872b9fa331b405dedc7d276f2504bfc985c | |
parent | 64ef7976772355f0ab4af68c4a572483dcbc15ad (diff) | |
parent | 5ab91c0bb87e55c356a0342020afe6e0f599e5b0 (diff) | |
download | pygments-6b17d2b85ec9cb15976f7738cdc03544f0f86e27.tar.gz |
Merge with pygments-main
-rw-r--r-- | AUTHORS | 1 | ||||
-rw-r--r-- | CHANGES | 7 | ||||
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | pygments/lexers/_mapping.py | 17 | ||||
-rw-r--r-- | pygments/lexers/compiled.py | 3 | ||||
-rw-r--r-- | pygments/lexers/other.py | 2 | ||||
-rw-r--r-- | pygments/lexers/parsers.py | 670 | ||||
-rw-r--r-- | pygments/lexers/templates.py | 8 | ||||
-rw-r--r-- | tests/examplefiles/ANTLRv3.g | 608 | ||||
-rw-r--r-- | tests/examplefiles/ragel-cpp_rlscan | 280 | ||||
-rw-r--r-- | tests/examplefiles/ragel-cpp_snippet | 2 |
11 files changed, 1595 insertions, 5 deletions
@@ -32,6 +32,7 @@ Other contributors, listed alphabetically, are: * Kirk McDonald -- D lexer * Lukas Meuser -- BBCode formatter, Lua lexer * Paulo Moura -- Logtalk lexer +* Ana Nelson -- Ragel, ANTLR lexers * Ronny Pfannschmidt -- BBCode lexer * Benjamin Peterson -- Test suite refactoring * Andre Roberge -- Tango style @@ -7,10 +7,11 @@ Version 1.1 - Lexers added: - * GLSL (#369) + * Antlr/Ragel, thanks to Ana Nelson + * (Ba)sh shell * Erlang shell - * (Ba)sh shell (#349) - * Prolog (#373) + * GLSL + * Prolog - Fix a bug lexing extended Ruby strings. @@ -18,7 +18,7 @@ export PYTHONPATH = $(shell echo "$$PYTHONPATH"):$(shell python -c 'import os; p all: clean-pyc check test check: - @$(PYTHON) scripts/detect_missing_analyse_text.py + @$(PYTHON) scripts/detect_missing_analyse_text.py || true @$(PYTHON) scripts/check_sources.py -i apidocs -i pygments/lexers/_mapping.py \ -i docs/build -i pygments/formatters/_mapping.py -i pygments/unistring.py \ -i pygments/lexers/_vimbuiltins.py diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py index 2a678764..1b4b8c37 100644 --- a/pygments/lexers/_mapping.py +++ b/pygments/lexers/_mapping.py @@ -16,6 +16,15 @@ LEXERS = { 'ActionScript3Lexer': ('pygments.lexers.web', 'ActionScript 3', ('as3', 'actionscript3'), ('*.as',), ('application/x-actionscript', 'text/x-actionscript', 'text/actionscript')), 'ActionScriptLexer': ('pygments.lexers.web', 'ActionScript', ('as', 'actionscript'), ('*.as',), ('application/x-actionscript', 'text/x-actionscript', 'text/actionscript')), + 'AntlrActionScriptLexer': ('pygments.lexers.parsers', 'ANTLR With ActionScript Target', ('antlr-as', 'antlr-actionscript'), ('*.G', '*.g'), ()), + 'AntlrCSharpLexer': ('pygments.lexers.parsers', 'ANTLR With C# Target', ('antlr-csharp', 'antlr-c#'), ('*.G', '*.g'), ()), + 'AntlrCppLexer': ('pygments.lexers.parsers', 'ANTLR With CPP Target', ('antlr-cpp',), ('*.G', '*.g'), ()), + 'AntlrJavaLexer': ('pygments.lexers.parsers', 'ANTLR With Java Target', 
('antlr-java',), ('*.G', '*.g'), ()), + 'AntlrLexer': ('pygments.lexers.parsers', 'ANTLR', ('antlr',), (), ()), + 'AntlrObjectiveCLexer': ('pygments.lexers.parsers', 'ANTLR With ObjectiveC Target', ('antlr-objc',), ('*.G', '*.g'), ()), + 'AntlrPerlLexer': ('pygments.lexers.parsers', 'ANTLR With Perl Target', ('antlr-perl',), ('*.G', '*.g'), ()), + 'AntlrPythonLexer': ('pygments.lexers.parsers', 'ANTLR With Python Target', ('antlr-python',), ('*.G', '*.g'), ()), + 'AntlrRubyLexer': ('pygments.lexers.parsers', 'ANTLR With Ruby Target', ('antlr-ruby', 'antlr-rb'), ('*.G', '*.g'), ()), 'ApacheConfLexer': ('pygments.lexers.text', 'ApacheConf', ('apacheconf', 'aconf', 'apache'), ('.htaccess', 'apache.conf', 'apache2.conf'), ('text/x-apacheconf',)), 'AppleScriptLexer': ('pygments.lexers.other', 'AppleScript', ('applescript',), ('*.applescript',), ()), 'BBCodeLexer': ('pygments.lexers.text', 'BBCode', ('bbcode',), (), ('text/x-bbcode',)), @@ -118,6 +127,14 @@ LEXERS = { 'PythonConsoleLexer': ('pygments.lexers.agile', 'Python console session', ('pycon',), (), ('text/x-python-doctest',)), 'PythonLexer': ('pygments.lexers.agile', 'Python', ('python', 'py'), ('*.py', '*.pyw', '*.sc', 'SConstruct', 'SConscript'), ('text/x-python', 'application/x-python')), 'PythonTracebackLexer': ('pygments.lexers.agile', 'Python Traceback', ('pytb',), ('*.pytb',), ('text/x-python-traceback',)), + 'RagelCLexer': ('pygments.lexers.parsers', 'Ragel in C Host', ('ragel-c',), ('*.rl',), ()), + 'RagelCppLexer': ('pygments.lexers.parsers', 'Ragel in CPP Host', ('ragel-cpp',), ('*.rl',), ()), + 'RagelDLexer': ('pygments.lexers.parsers', 'Ragel in D Host', ('ragel-d',), ('*.rl',), ()), + 'RagelEmbeddedLexer': ('pygments.lexers.parsers', 'Embedded Ragel', ('ragel-em',), ('*.rl',), ()), + 'RagelJavaLexer': ('pygments.lexers.parsers', 'Ragel in Java Host', ('ragel-java',), ('*.rl',), ()), + 'RagelLexer': ('pygments.lexers.parsers', 'Ragel', ('ragel',), (), ()), + 'RagelObjectiveCLexer': 
('pygments.lexers.parsers', 'Ragel in Objective C Host', ('ragel-objc',), ('*.rl',), ()), + 'RagelRubyLexer': ('pygments.lexers.parsers', 'Ragel in Ruby Host', ('ragel-ruby', 'ragel-rb'), ('*.rl',), ()), 'RawTokenLexer': ('pygments.lexers.special', 'Raw token data', ('raw',), (), ('application/x-pygments-tokens',)), 'RedcodeLexer': ('pygments.lexers.other', 'Redcode', ('redcode',), ('*.cw',), ()), 'RhtmlLexer': ('pygments.lexers.templates', 'RHTML', ('rhtml', 'html+erb', 'html+ruby'), ('*.rhtml',), ('text/html+ruby',)), diff --git a/pygments/lexers/compiled.py b/pygments/lexers/compiled.py index 3624b270..643ceaad 100644 --- a/pygments/lexers/compiled.py +++ b/pygments/lexers/compiled.py @@ -1340,6 +1340,9 @@ class GLShaderLexer(RegexLexer): } class PrologLexer(RegexLexer): + """ + Lexer for Prolog files. + """ name = 'Prolog' aliases = ['prolog'] filenames = ['*.prolog', '*.pro', '*.pl'] diff --git a/pygments/lexers/other.py b/pygments/lexers/other.py index 83913570..dd187130 100644 --- a/pygments/lexers/other.py +++ b/pygments/lexers/other.py @@ -347,7 +347,7 @@ class BashLexer(RegexLexer): r'export|false|fc|fg|getopts|hash|help|history|jobs|kill|let|' r'local|logout|popd|printf|pushd|pwd|read|readonly|set|shift|' r'shopt|source|suspend|test|time|times|trap|true|type|typeset|' - r'ulimit|umask|unalias|unset|wait)\s*\b', + r'ulimit|umask|unalias|unset|wait)\s*\b(?!\.)', Name.Builtin), (r'#.*\n', Comment), (r'\\[\w\W]', String.Escape), diff --git a/pygments/lexers/parsers.py b/pygments/lexers/parsers.py new file mode 100644 index 00000000..00f1ee3e --- /dev/null +++ b/pygments/lexers/parsers.py @@ -0,0 +1,670 @@ +# -*- coding: utf-8 -*- +""" + pygments.lexers.parsers + ~~~~~~~~~~~~~~~~~~~~~~~ + + Lexers for parser generators. + + :copyright: 2008-2009 by Ana Nelson <ana@ananelson.com>, Tim Hatch. + + :license: BSD, see LICENSE for more details. 
+""" + +import re + +from pygments.lexer import RegexLexer, DelegatingLexer, \ + include, bygroups, using, this +from pygments.token import Error, Punctuation, Generic, Other, \ + Text, Comment, Operator, Keyword, Name, String, Number, Whitespace +from pygments.lexers.compiled import JavaLexer, CLexer, CppLexer, \ + ObjectiveCLexer, DLexer +from pygments.lexers.dotnet import CSharpLexer +from pygments.lexers.agile import RubyLexer, PythonLexer, PerlLexer +from pygments.lexers.web import ActionScriptLexer +# Use TextLexer during development to just focus on one part of a delegating +# lexer. +from pygments.lexers.special import TextLexer + +__all__ = ['RagelLexer', 'RagelEmbeddedLexer', 'RagelCLexer', 'RagelDLexer', + 'RagelCppLexer', 'RagelObjectiveCLexer', 'RagelRubyLexer', + 'RagelJavaLexer', 'AntlrLexer', 'AntlrPythonLexer', + 'AntlrPerlLexer', 'AntlrRubyLexer', 'AntlrCppLexer', + #'AntlrCLexer', + 'AntlrCSharpLexer', 'AntlrObjectiveCLexer', + 'AntlrJavaLexer', "AntlrActionScriptLexer"] + +class RagelLexer(RegexLexer): + """ + A pure `Ragel <http://www.complang.org/ragel/>`_ lexer. Use this for + fragments of Ragel. For ``.rl`` files, use RagelEmbeddedLexer instead + (or one of the language-specific subclasses). 
+ + *New in Pygments 1.1* + """ + + name = 'Ragel' + aliases = ['ragel'] + filenames = [] + + tokens = { + 'whitespace': [ + (r'\s+', Whitespace) + ], + 'comments': [ + (r'\#.*$', Comment), + ], + 'keywords': [ + (r'(access|action|alphtype)\b', Keyword), + (r'(getkey|write|machine|include)\b', Keyword), + (r'(any|ascii|extend|alpha|digit|alnum|lower|upper)\b', Keyword), + (r'(xdigit|cntrl|graph|print|punct|space|zlen|empty)\b', Keyword) + ], + 'numbers': [ + (r'0x[0-9A-Fa-f]+', Number.Hex), + (r'[+-]?[0-9]+', Number.Integer), + ], + 'literals': [ + (r'"(\\\\|\\"|[^"])*"', String), # double quote string + (r"'(\\\\|\\'|[^'])*'", String), # single quote string + (r'\[(\\\\|\\\]|[^\]])*\]', String), # square bracket literals + (r'/(?!\*)(\\\\|\\/|[^/])*/', String.Regex), # regular expressions + ], + 'identifiers': [ + (r'[a-zA-Z_][a-zA-Z_0-9]*', Name.Variable), + ], + 'operators': [ + (r',', Operator), # Join + (r'\||&|-|--', Operator), # Union, Intersection and Subtraction + (r'\.|<:|:>|:>>', Operator), # Concatention + (r':', Operator), # Label + (r'->', Operator), # Epsilon Transition + (r'(>|\$|%|<|@|<>)(/|eof\b)', Operator), # EOF Actions + (r'(>|\$|%|<|@|<>)(!|err\b)', Operator), # Global Error Actions + (r'(>|\$|%|<|@|<>)(\^|lerr\b)', Operator), # Local Error Actions + (r'(>|\$|%|<|@|<>)(~|to\b)', Operator), # To-State Actions + (r'(>|\$|%|<|@|<>)(\*|from\b)', Operator), # From-State Actions + (r'>|@|\$|%', Operator), # Transition Actions and Priorities + (r'\*|\?|\+|{[0-9]*,[0-9]*}', Operator), # Repetition + (r'!|\^', Operator), # Negation + (r'\(|\)', Operator), # Grouping + ], + 'root': [ + include('literals'), + include('whitespace'), + include('comments'), + include('keywords'), + include('numbers'), + include('identifiers'), + include('operators'), + (r'{', Punctuation, 'host'), + (r'=', Operator), + (r';', Punctuation), + ], + 'host': [ + (r'(' + r'|'.join(( # keep host code in largest possible chunks + r'[^{}\'"/#]+', # exclude unsafe characters + 
r'[^\\][\\][{}]', # allow escaped { or } + + # strings and comments may safely contain unsafe characters + r'"(\\\\|\\"|[^"])*"', # double quote string + r"'(\\\\|\\'|[^'])*'", # single quote string + r'//.*$\n?', # single line comment + r'/\*(.|\n)*?\*/', # multi-line javadoc-style comment + r'\#.*$\n?', # ruby comment + + # regular expression: There's no reason for it to start + # with a * and this stops confusion with comments. + r'/(?!\*)(\\\\|\\/|[^/])*/', + + # / is safe now that we've handled regex and javadoc comments + r'/', + )) + r')+', Other), + + (r'{', Punctuation, '#push'), + (r'}', Punctuation, '#pop'), + ], + } + +class RagelEmbeddedLexer(RegexLexer): + """ + A lexer for `Ragel`_ embedded in a host language file. + + This will only highlight Ragel statements. If you want host language + highlighting then call the language-specific Ragel lexer. + + *New in Pygments 1.1* + """ + + name = 'Embedded Ragel' + aliases = ['ragel-em'] + filenames = ['*.rl'] + + tokens = { + 'root': [ + (r'(' + r'|'.join(( # keep host code in largest possible chunks + r'[^%\'"/#]+', # exclude unsafe characters + r'%(?=[^%]|$)', # a single % sign is okay, just not 2 of them + + # strings and comments may safely contain unsafe characters + r'"(\\\\|\\"|[^"])*"', # double quote string + r"'(\\\\|\\'|[^'])*'", # single quote string + r'/\*(.|\n)*?\*/', # multi-line javadoc-style comment + r'//.*$\n?', # single line comment + r'\#.*$\n?', # ruby/ragel comment + r'/(?!\*)(\\\\|\\/|[^/])*/', # regular expression + + # / is safe now that we've handled regex and javadoc comments + r'/', + )) + r')+', Other), + + # Single Line FSM. + # Please don't put a quoted newline in a single line FSM. + # That's just mean. It will break this. + (r'(%%)(?![{%])(.*)($|;)(\n?)', bygroups(Punctuation, + using(RagelLexer), + Punctuation, Text)), + + # Multi Line FSM. 
+ (r'(%%%%|%%){', Punctuation, 'multi-line-fsm'), + ], + 'multi-line-fsm': [ + (r'(' + r'|'.join(( # keep ragel code in largest possible chunks. + r'(' + r'|'.join(( + r'[^}\'"\[/#]', # exclude unsafe characters + r'}(?=[^%]|$)', # } is okay as long as it's not followed by % + r'}%(?=[^%]|$)', # ...well, one %'s okay, just not two... + r'[^\\][\\][{}]', # ...and } is okay if it's escaped + + # allow / if it's preceded with one of these symbols + # (ragel EOF actions) + r'(>|\$|%|<|@|<>)/', + + # specifically allow regex followed immediately by * + # so it doesn't get mistaken for a comment + r'/(?!\*)(\\\\|\\/|[^/])*/\*', + + # allow / as long as it's not followed by another / or by a * + r'/(?=[^/\*]|$)', + + # We want to match as many of these as we can in one block. + # Not sure if we need the + sign here, + # does it help performance? + )) + r')+', + + # strings and comments may safely contain unsafe characters + r'"(\\\\|\\"|[^"])*"', # double quote string + r"'(\\\\|\\'|[^'])*'", # single quote string + r"\[(\\\\|\\\]|[^\]])*\]", # square bracket literal + r'/\*(.|\n)*?\*/', # multi-line javadoc-style comment + r'//.*$\n?', # single line comment + r'\#.*$\n?', # ruby/ragel comment + )) + r')+', using(RagelLexer)), + + (r'}%%', Punctuation, '#pop'), + ] + } + + def analyse_text(text): + return '@LANG: indep' in text or 0.1 + +class RagelRubyLexer(DelegatingLexer): + """ + A lexer for `Ragel`_ in a Ruby host file. + + *New in Pygments 1.1* + """ + + name = 'Ragel in Ruby Host' + aliases = ['ragel-ruby', 'ragel-rb'] + filenames = ['*.rl'] + + def __init__(self, **options): + super(RagelRubyLexer, self).__init__(RubyLexer, RagelEmbeddedLexer, + **options) + + def analyse_text(text): + return '@LANG: ruby' in text + +class RagelCLexer(DelegatingLexer): + """ + A lexer for `Ragel`_ in a C host file. 
+ + *New in Pygments 1.1* + """ + + name = 'Ragel in C Host' + aliases = ['ragel-c'] + filenames = ['*.rl'] + + def __init__(self, **options): + super(RagelCLexer, self).__init__(CLexer, RagelEmbeddedLexer, + **options) + + def analyse_text(text): + return '@LANG: c' in text + +class RagelDLexer(DelegatingLexer): + """ + A lexer for `Ragel`_ in a D host file. + + *New in Pygments 1.1* + """ + + name = 'Ragel in D Host' + aliases = ['ragel-d'] + filenames = ['*.rl'] + + def __init__(self, **options): + super(RagelDLexer, self).__init__(DLexer, RagelEmbeddedLexer, **options) + + def analyse_text(text): + return '@LANG: d' in text + +class RagelCppLexer(DelegatingLexer): + """ + A lexer for `Ragel`_ in a CPP host file. + + *New in Pygments 1.1* + """ + + name = 'Ragel in CPP Host' + aliases = ['ragel-cpp'] + filenames = ['*.rl'] + + def __init__(self, **options): + super(RagelCppLexer, self).__init__(CppLexer, RagelEmbeddedLexer, **options) + + def analyse_text(text): + return '@LANG: c++' in text + +class RagelObjectiveCLexer(DelegatingLexer): + """ + A lexer for `Ragel`_ in an Objective C host file. + + *New in Pygments 1.1* + """ + + name = 'Ragel in Objective C Host' + aliases = ['ragel-objc'] + filenames = ['*.rl'] + + def __init__(self, **options): + super(RagelObjectiveCLexer, self).__init__(ObjectiveCLexer, + RagelEmbeddedLexer, + **options) + + def analyse_text(text): + return '@LANG: objc' in text + +class RagelJavaLexer(DelegatingLexer): + """ + A lexer for `Ragel`_ in a Java host file. + + *New in Pygments 1.1* + """ + + name = 'Ragel in Java Host' + aliases = ['ragel-java'] + filenames = ['*.rl'] + + def __init__(self, **options): + super(RagelJavaLexer, self).__init__(JavaLexer, RagelEmbeddedLexer, + **options) + + def analyse_text(text): + return '@LANG: java' in text + +class AntlrLexer(RegexLexer): + """ + Generic ANTLR Lexer. + Should not be called directly, instead + use DelegatingLexer for your target language. 
+ + *New in Pygments 1.1* + """ + + name = 'ANTLR' + aliases = ['antlr'] + filenames = [] + + _id = r'[A-Za-z][A-Za-z_0-9]*' + _TOKEN_REF = r'[A-Z][A-Za-z_0-9]*' + _RULE_REF = r'[a-z][A-Za-z_0-9]*' + _STRING_LITERAL = r'\'(?:\\\\|\\\'|[^\']*)\'' + _INT = r'[0-9]+' + + tokens = { + 'whitespace': [ + (r'\s+', Whitespace), + ], + 'comments': [ + (r'//.*$', Comment), + (r'/\*(.|\n)*?\*/', Comment), + ], + 'root': [ + include('whitespace'), + include('comments'), + + (r'(lexer|parser|tree)?(\s*)(grammar\b)(\s*)(' + _id + ')(;)', + bygroups(Keyword, Whitespace, Keyword, Whitespace, Name.Class, + Punctuation)), + # optionsSpec + (r'options\b', Keyword, 'options'), + # tokensSpec + (r'tokens\b', Keyword, 'tokens'), + # attrScope + (r'(scope)(\s*)(' + _id + ')(\s*)({)', + bygroups(Keyword, Whitespace, Name.Variable, Whitespace, + Punctuation), 'action'), + # exception + (r'(catch|finally)\b', Keyword, 'exception'), + # action + (r'(@' + _id + ')(\s*)(::)?(\s*)(' + _id + ')(\s*)({)', + bygroups(Name.Label, Whitespace, Punctuation, Whitespace, + Name.Label, Whitespace, Punctuation), 'action'), + # rule + (r'((?:protected|private|public|fragment)\b)?(\s*)(' + _id + ')(!)?', \ + bygroups(Keyword, Whitespace, Name.Label, Punctuation), + ('rule-alts', 'rule-prelims')), + ], + 'exception': [ + (r'\n', Whitespace, '#pop'), + (r'\s', Whitespace), + include('comments'), + + (r'\[', Punctuation, 'nested-arg-action'), + (r'\{', Punctuation, 'action'), + ], + 'rule-prelims': [ + include('whitespace'), + include('comments'), + + (r'returns\b', Keyword), + (r'\[', Punctuation, 'nested-arg-action'), + (r'\{', Punctuation, 'action'), + # throwsSpec + (r'(throws)(\s+)(' + _id + ')', + bygroups(Keyword, Whitespace, Name.Label)), + (r'(?:(,)(\s*)(' + _id + '))+', + bygroups(Punctuation, Whitespace, Name.Label)), # Additional throws + # optionsSpec + (r'options\b', Keyword, 'options'), + # ruleScopeSpec - scope followed by target language code or name of action + # TODO finish implementing 
other possibilities for scope + # L173 ANTLRv3.g from ANTLR book + (r'(scope)(\s+)({)', bygroups(Keyword, Whitespace, Punctuation), + 'action'), + (r'(scope)(\s+)(' + _id + ')(\s*)(;)', + bygroups(Keyword, Whitespace, Name.Label, Whitespace, Punctuation)), + # ruleAction + (r'(@' + _id + ')(\s*)({)', + bygroups(Name.Label, Whitespace, Punctuation), 'action'), + # finished prelims, go to rule alts! + (r':', Punctuation, '#pop') + ], + 'rule-alts': [ + include('whitespace'), + include('comments'), + + # These might need to go in a separate 'block' state triggered by ( + (r'options\b', Keyword, 'options'), + (r':', Punctuation), + + # literals + (r"'(\\\\|\\'|[^'])*'", String), + (r'"(\\\\|\\"|[^"])*"', String), + (r'<<([^>]|>[^>])>>', String), + # identifiers + # Tokens start with capital letter. + (r'\$?[A-Z_][A-Za-z_0-9]*', Name.Constant), + # Rules start with small letter. + (r'\$?[a-z_][A-Za-z_0-9]*', Name.Variable), + # operators + (r'(\+|\||->|=>|=|\(|\)|\.\.|\.|\?|\*|\^|!|\#|~)', Operator), + (r',', Punctuation), + (r'\[', Punctuation, 'nested-arg-action'), + (r'\{', Punctuation, 'action'), + (r';', Punctuation, '#pop') + ], + 'tokens': [ + include('whitespace'), + include('comments'), + (r'{', Punctuation), + (r'(' + _TOKEN_REF + r')(\s*)(=)?(\s*)(' + _STRING_LITERAL + ')?(\s*)(;)', + bygroups(Name.Label, Whitespace, Punctuation, Whitespace, + String, Whitespace, Punctuation)), + (r'}', Punctuation, '#pop'), + ], + 'options': [ + include('whitespace'), + include('comments'), + (r'{', Punctuation), + (r'(' + _id + r')(\s*)(=)(\s*)(' + + '|'.join((_id, _STRING_LITERAL, _INT, '\*'))+ ')(\s*)(;)', + bygroups(Name.Variable, Whitespace, Punctuation, Whitespace, + Text, Whitespace, Punctuation)), + (r'}', Punctuation, '#pop'), + ], + 'action': [ + (r'(' + r'|'.join(( # keep host code in largest possible chunks + r'[^\${}\'"/\\]+', # exclude unsafe characters + + # strings and comments may safely contain unsafe characters + r'"(\\\\|\\"|[^"])*"', # double quote 
string + r"'(\\\\|\\'|[^'])*'", # single quote string + r'//.*$\n?', # single line comment + r'/\*(.|\n)*?\*/', # multi-line javadoc-style comment + + # regular expression: There's no reason for it to start + # with a * and this stops confusion with comments. + r'/(?!\*)(\\\\|\\/|[^/])*/', + + # backslashes are okay, as long as we are not backslashing a % + r'\\(?!%)', + + # Now that we've handled regex and javadoc comments + # it's safe to let / through. + r'/', + )) + r')+', Other), + (r'(\\)(%)', bygroups(Punctuation, Other)), + (r'(\$[a-zA-Z]+)(\.?)(text|value)?', + bygroups(Name.Variable, Punctuation, Name.Property)), + (r'{', Punctuation, '#push'), + (r'}', Punctuation, '#pop'), + ], + 'nested-arg-action': [ + (r'(' + r'|'.join(( # keep host code in largest possible chunks. + r'[^\$\[\]\'"/]+', # exclude unsafe characters + + # strings and comments may safely contain unsafe characters + r'"(\\\\|\\"|[^"])*"', # double quote string + r"'(\\\\|\\'|[^'])*'", # single quote string + r'//.*$\n?', # single line comment + r'/\*(.|\n)*?\*/', # multi-line javadoc-style comment + + # regular expression: There's no reason for it to start + # with a * and this stops confusion with comments. + r'/(?!\*)(\\\\|\\/|[^/])*/', + + # Now that we've handled regex and javadoc comments + # it's safe to let / through. + r'/', + )) + r')+', Other), + + + (r'\[', Punctuation, '#push'), + (r'\]', Punctuation, '#pop'), + (r'(\$[a-zA-Z]+)(\.?)(text|value)?', + bygroups(Name.Variable, Punctuation, Name.Property)), + (r'(\\\\|\\\]|\\\[|[^\[\]])+', Other), + ] + } + +# http://www.antlr.org/wiki/display/ANTLR3/Code+Generation+Targets + +# TH: I'm not aware of any language features of C++ that will cause +# incorrect lexing of C files. Antlr doesn't appear to make a distinction, +# so just assume they're C++. No idea how to make Objective C work in the +# future. 
+ +#class AntlrCLexer(DelegatingLexer): +# """ +# ANTLR with C Target +# +# *New in Pygments 1.1* +# """ +# +# name = 'ANTLR With C Target' +# aliases = ['antlr-c'] +# filenames = ['*.G', '*.g'] +# +# def __init__(self, **options): +# super(AntlrCLexer, self).__init__(CLexer, AntlrLexer, **options) +# +# def analyse_text(text): +# return re.match(r'^\s*language\s*=\s*C\s*;', text) + +class AntlrCppLexer(DelegatingLexer): + """ + ANTLR with CPP Target + + *New in Pygments 1.1* + """ + + name = 'ANTLR With CPP Target' + aliases = ['antlr-cpp'] + filenames = ['*.G', '*.g'] + + def __init__(self, **options): + super(AntlrCppLexer, self).__init__(CppLexer, AntlrLexer, **options) + + def analyse_text(text): + return re.match(r'^\s*language\s*=\s*C\s*;', text, re.M) + +class AntlrObjectiveCLexer(DelegatingLexer): + """ + ANTLR with ObjectiveC Target + + *New in Pygments 1.1* + """ + + name = 'ANTLR With ObjectiveC Target' + aliases = ['antlr-objc'] + filenames = ['*.G', '*.g'] + + def __init__(self, **options): + super(AntlrObjectiveCLexer, self).__init__(ObjectiveCLexer, + AntlrLexer, **options) + + def analyse_text(text): + return re.match(r'^\s*language\s*=\s*C\s*;', text) + +class AntlrCSharpLexer(DelegatingLexer): + """ + ANTLR with C# Target + + *New in Pygments 1.1* + """ + + name = 'ANTLR With C# Target' + aliases = ['antlr-csharp', 'antlr-c#'] + filenames = ['*.G', '*.g'] + + def __init__(self, **options): + super(AntlrCSharpLexer, self).__init__(CSharpLexer, AntlrLexer, + **options) + + def analyse_text(text): + return re.match(r'^\s*language\s*=\s*CSharp2\s*;', text, re.M) + +class AntlrPythonLexer(DelegatingLexer): + """ + ANTLR with Python Target + + *New in Pygments 1.1* + """ + + name = 'ANTLR With Python Target' + aliases = ['antlr-python'] + filenames = ['*.G', '*.g'] + + def __init__(self, **options): + super(AntlrPythonLexer, self).__init__(PythonLexer, AntlrLexer, + **options) + + def analyse_text(text): + return 
re.match(r'^\s*language\s*=\s*Python\s*;', text, re.M) + + +class AntlrJavaLexer(DelegatingLexer): + """ + ANTLR with Java Target + + *New in Pygments 1.1* + """ + + name = 'ANTLR With Java Target' + aliases = ['antlr-java'] + filenames = ['*.G', '*.g'] + + def __init__(self, **options): + super(AntlrJavaLexer, self).__init__(JavaLexer, AntlrLexer, + **options) + + def analyse_text(text): + return 0.5 # Antlr is Java if not specified + + +class AntlrRubyLexer(DelegatingLexer): + """ + ANTLR with Ruby Target + + *New in Pygments 1.1* + """ + + name = 'ANTLR With Ruby Target' + aliases = ['antlr-ruby', 'antlr-rb'] + filenames = ['*.G', '*.g'] + + def __init__(self, **options): + super(AntlrRubyLexer, self).__init__(RubyLexer, AntlrLexer, + **options) + + def analyse_text(text): + return re.match(r'^\s*language\s*=\s*Ruby\s*;', text, re.M) + +class AntlrPerlLexer(DelegatingLexer): + """ + ANTLR with Perl Target + + *New in Pygments 1.1* + """ + + name = 'ANTLR With Perl Target' + aliases = ['antlr-perl'] + filenames = ['*.G', '*.g'] + + def __init__(self, **options): + super(AntlrPerlLexer, self).__init__(PerlLexer, AntlrLexer, + **options) + + def analyse_text(text): + return re.match(r'^\s*language\s*=\s*Perl5\s*;', text, re.M) + +class AntlrActionScriptLexer(DelegatingLexer): + """ + ANTLR with ActionScript Target + + *New in Pygments 1.1* + """ + + name = 'ANTLR With ActionScript Target' + aliases = ['antlr-as', 'antlr-actionscript'] + filenames = ['*.G', '*.g'] + + def __init__(self, **options): + super(AntlrActionScriptLexer, self).__init__(ActionScriptLexer, + AntlrLexer, **options) + + def analyse_text(text): + return re.match(r'^\s*language\s*=\s*ActionScript\s*;', text, re.M) diff --git a/pygments/lexers/templates.py b/pygments/lexers/templates.py index fe062a18..f34a69a8 100644 --- a/pygments/lexers/templates.py +++ b/pygments/lexers/templates.py @@ -426,11 +426,19 @@ class MakoLexer(RegexLexer): (r'''(?sx) (.+?) 
# anything, followed by: (?: +<<<<<<< local + (?<=\n)(?=%|\#\#) |# an eval or comment line + (?=\#\*) | # multiline comment + (?=</?%) | # a python block + # call start or end + (?=\$\{) | # a substitution +======= (?<=\n)(?=%|\#\#) | # an eval or comment line (?=\#\*) | # multiline comment (?=</?%) | # a python block # call start or end (?=\$\{) | # a substitution +>>>>>>> other (?<=\n)(?=\s*%) | # - don't consume (\\\n) | # an escaped newline diff --git a/tests/examplefiles/ANTLRv3.g b/tests/examplefiles/ANTLRv3.g new file mode 100644 index 00000000..fbe6d654 --- /dev/null +++ b/tests/examplefiles/ANTLRv3.g @@ -0,0 +1,608 @@ +/* + [The "BSD licence"] + Copyright (c) 2005-2007 Terence Parr + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** ANTLR v3 grammar written in ANTLR v3 with AST construction */ +grammar ANTLRv3; + +options { + output=AST; + ASTLabelType=CommonTree; +} + +tokens { + DOC_COMMENT; + PARSER; + LEXER; + RULE; + BLOCK; + OPTIONAL; + CLOSURE; + POSITIVE_CLOSURE; + SYNPRED; + RANGE; + CHAR_RANGE; + EPSILON; + ALT; + EOR; + EOB; + EOA; // end of alt + ID; + ARG; + ARGLIST; + RET; + LEXER_GRAMMAR; + PARSER_GRAMMAR; + TREE_GRAMMAR; + COMBINED_GRAMMAR; + INITACTION; + LABEL; // $x used in rewrite rules + TEMPLATE; + SCOPE='scope'; + SEMPRED; + GATED_SEMPRED; // {p}? => + SYN_SEMPRED; // (...) => it's a manually-specified synpred converted to sempred + BACKTRACK_SEMPRED; // auto backtracking mode syn pred converted to sempred + FRAGMENT='fragment'; + TREE_BEGIN='^('; + ROOT='^'; + BANG='!'; + RANGE='..'; + REWRITE='->'; +} + +@members { + int gtype; +} + +grammarDef + : DOC_COMMENT? + ( 'lexer' {gtype=LEXER_GRAMMAR;} // pure lexer + | 'parser' {gtype=PARSER_GRAMMAR;} // pure parser + | 'tree' {gtype=TREE_GRAMMAR;} // a tree parser + | {gtype=COMBINED_GRAMMAR;} // merged parser/lexer + ) + g='grammar' id ';' optionsSpec? tokensSpec? attrScope* action* + rule+ + EOF + -> ^( {adaptor.create(gtype,$g)} + id DOC_COMMENT? optionsSpec? tokensSpec? 
attrScope* action* rule+ + ) + ; + +tokensSpec + : TOKENS tokenSpec+ '}' -> ^(TOKENS tokenSpec+) + ; + +tokenSpec + : TOKEN_REF + ( '=' (lit=STRING_LITERAL|lit=CHAR_LITERAL) -> ^('=' TOKEN_REF $lit) + | -> TOKEN_REF + ) + ';' + ; + +attrScope + : 'scope' id ACTION -> ^('scope' id ACTION) + ; + +/** Match stuff like @parser::members {int i;} */ +action + : '@' (actionScopeName '::')? id ACTION -> ^('@' actionScopeName? id ACTION) + ; + +/** Sometimes the scope names will collide with keywords; allow them as + * ids for action scopes. + */ +actionScopeName + : id + | l='lexer' -> ID[$l] + | p='parser' -> ID[$p] + ; + +optionsSpec + : OPTIONS (option ';')+ '}' -> ^(OPTIONS option+) + ; + +option + : id '=' optionValue -> ^('=' id optionValue) + ; + +optionValue + : id + | STRING_LITERAL + | CHAR_LITERAL + | INT + | s='*' -> STRING_LITERAL[$s] // used for k=* + ; + +rule +scope { + String name; +} + : DOC_COMMENT? + ( modifier=('protected'|'public'|'private'|'fragment') )? + id {$rule::name = $id.text;} + '!'? + ( arg=ARG_ACTION )? + ( 'returns' rt=ARG_ACTION )? + throwsSpec? optionsSpec? ruleScopeSpec? ruleAction* + ':' altList ';' + exceptionGroup? + -> ^( RULE id {modifier!=null?adaptor.create(modifier):null} ^(ARG $arg)? ^(RET $rt)? + optionsSpec? ruleScopeSpec? ruleAction* + altList + exceptionGroup? + EOR["EOR"] + ) + ; + +/** Match stuff like @init {int i;} */ +ruleAction + : '@' id ACTION -> ^('@' id ACTION) + ; + +throwsSpec + : 'throws' id ( ',' id )* -> ^('throws' id+) + ; + +ruleScopeSpec + : 'scope' ACTION -> ^('scope' ACTION) + | 'scope' id (',' id)* ';' -> ^('scope' id+) + | 'scope' ACTION + 'scope' id (',' id)* ';' + -> ^('scope' ACTION id+ ) + ; + +block + : lp='(' + ( (opts=optionsSpec)? ':' )? + a1=alternative rewrite ( '|' a2=alternative rewrite )* + rp=')' + -> ^( BLOCK[$lp,"BLOCK"] optionsSpec? alternative+ EOB[$rp,"EOB"] ) + ; + +altList +@init { + // must create root manually as it's used by invoked rules in real antlr tool. 
+ // leave here to demonstrate use of {...} in rewrite rule + // it's really BLOCK[firstToken,"BLOCK"]; set line/col to previous ( or : token. + CommonTree blkRoot = (CommonTree)adaptor.create(BLOCK,input.LT(-1),"BLOCK"); +} + : a1=alternative rewrite ( '|' a2=alternative rewrite )* + -> ^( {blkRoot} (alternative rewrite?)+ EOB["EOB"] ) + ; + +alternative +@init { + Token firstToken = input.LT(1); + Token prevToken = input.LT(-1); // either : or | I think +} + : element+ -> ^(ALT[firstToken,"ALT"] element+ EOA["EOA"]) + | -> ^(ALT[prevToken,"ALT"] EPSILON[prevToken,"EPSILON"] EOA["EOA"]) + ; + +exceptionGroup + : ( exceptionHandler )+ ( finallyClause )? + | finallyClause + ; + +exceptionHandler + : 'catch' ARG_ACTION ACTION -> ^('catch' ARG_ACTION ACTION) + ; + +finallyClause + : 'finally' ACTION -> ^('finally' ACTION) + ; + +element + : elementNoOptionSpec + ; + +elementNoOptionSpec + : id (labelOp='='|labelOp='+=') atom + ( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK["BLOCK"] ^(ALT["ALT"] ^($labelOp id atom) EOA["EOA"]) EOB["EOB"])) + | -> ^($labelOp id atom) + ) + | id (labelOp='='|labelOp='+=') block + ( ebnfSuffix -> ^( ebnfSuffix ^(BLOCK["BLOCK"] ^(ALT["ALT"] ^($labelOp id block) EOA["EOA"]) EOB["EOB"])) + | -> ^($labelOp id block) + ) + | atom + ( ebnfSuffix -> ^(BLOCK["BLOCK"] ^(ALT["ALT"] atom EOA["EOA"]) EOB["EOB"]) + | -> atom + ) + | ebnf + | ACTION + | SEMPRED ( '=>' -> GATED_SEMPRED | -> SEMPRED ) + | treeSpec + ; + +atom: range ( (op='^'|op='!') -> ^($op range) | -> range ) + | terminal + | notSet ( (op='^'|op='!') -> ^($op notSet) | -> notSet ) + | RULE_REF ( arg=ARG_ACTION )? ( (op='^'|op='!') )? + -> {$arg!=null&&op!=null}? ^($op RULE_REF $arg) + -> {$arg!=null}? ^(RULE_REF $arg) + -> {$op!=null}? 
^($op RULE_REF) + -> RULE_REF + ; + +notSet + : '~' + ( notTerminal -> ^('~' notTerminal) + | block -> ^('~' block) + ) + ; + +treeSpec + : '^(' element ( element )+ ')' -> ^(TREE_BEGIN element+) + ; + +/** Matches ENBF blocks (and token sets via block rule) */ +ebnf +@init { + Token firstToken = input.LT(1); +} +@after { + $ebnf.tree.getToken().setLine(firstToken.getLine()); + $ebnf.tree.getToken().setCharPositionInLine(firstToken.getCharPositionInLine()); +} + : block {Token op=input.LT(1);} + ( '?' -> ^(OPTIONAL[op] block) + | '*' -> ^(CLOSURE[op] block) + | '+' -> ^(POSITIVE_CLOSURE[op] block) + | '^' -> ^('^' block) + | '!' -> ^('!' block) + | '=>' // syntactic predicate + -> {gtype==COMBINED_GRAMMAR && + Character.isUpperCase($rule::name.charAt(0))}? + // if lexer rule in combined, leave as pred for lexer + ^(SYNPRED["=>"] block) + // in real antlr tool, text for SYN_SEMPRED is predname + -> SYN_SEMPRED + | -> block + ) + ; + +range! + : c1=CHAR_LITERAL RANGE c2=CHAR_LITERAL -> ^(CHAR_RANGE[$c1,".."] $c1 $c2) + ; + +terminal + : ( CHAR_LITERAL -> CHAR_LITERAL + // Args are only valid for lexer rules + | TOKEN_REF + ( ARG_ACTION -> ^(TOKEN_REF ARG_ACTION) + | -> TOKEN_REF + ) + | STRING_LITERAL -> STRING_LITERAL + | '.' -> '.' + ) + ( '^' -> ^('^' $terminal) + | '!' -> ^('!' $terminal) + )? + ; + +notTerminal + : CHAR_LITERAL + | TOKEN_REF + | STRING_LITERAL + ; + +ebnfSuffix +@init { + Token op = input.LT(1); +} + : '?' 
-> OPTIONAL[op] + | '*' -> CLOSURE[op] + | '+' -> POSITIVE_CLOSURE[op] + ; + + + +// R E W R I T E S Y N T A X + +rewrite +@init { + Token firstToken = input.LT(1); +} + : (rew+='->' preds+=SEMPRED predicated+=rewrite_alternative)* + rew2='->' last=rewrite_alternative + -> ^($rew $preds $predicated)* ^($rew2 $last) + | + ; + +rewrite_alternative + : rewrite_template + | rewrite_tree_alternative + | /* empty rewrite */ -> ^(ALT["ALT"] EPSILON["EPSILON"] EOA["EOA"]) + ; + +rewrite_template_block + : lp='(' rewrite_template ')' -> ^(BLOCK[$lp,"BLOCK"] rewrite_template EOB[$lp,"EOB"]) + ; + +rewrite_tree_block + : lp='(' rewrite_tree_alternative ')' + -> ^(BLOCK[$lp,"BLOCK"] rewrite_tree_alternative EOB[$lp,"EOB"]) + ; + +rewrite_tree_alternative + : rewrite_tree_element+ -> ^(ALT["ALT"] rewrite_tree_element+ EOA["EOA"]) + ; + +rewrite_tree_element + : rewrite_tree_atom + | rewrite_tree_atom ebnfSuffix + -> ^( ebnfSuffix ^(BLOCK["BLOCK"] ^(ALT["ALT"] rewrite_tree_atom EOA["EOA"]) EOB["EOB"])) + | rewrite_tree + ( ebnfSuffix + -> ^(BLOCK["BLOCK"] ^(ALT["ALT"] rewrite_tree EOA["EOA"]) EOB["EOB"]) + | -> rewrite_tree + ) + | rewrite_tree_ebnf + ; + +rewrite_tree_atom + : CHAR_LITERAL + | TOKEN_REF ARG_ACTION? -> ^(TOKEN_REF ARG_ACTION?) // for imaginary nodes + | RULE_REF + | STRING_LITERAL + | d='$' id -> LABEL[$d,$id.text] // reference to a label in a rewrite rule + | ACTION + ; + +rewrite_tree_ebnf +@init { + Token firstToken = input.LT(1); +} +@after { + $rewrite_tree_ebnf.tree.getToken().setLine(firstToken.getLine()); + $rewrite_tree_ebnf.tree.getToken().setCharPositionInLine(firstToken.getCharPositionInLine()); +} + : rewrite_tree_block ebnfSuffix -> ^(ebnfSuffix rewrite_tree_block) + ; + +rewrite_tree + : '^(' rewrite_tree_atom rewrite_tree_element* ')' + -> ^(TREE_BEGIN rewrite_tree_atom rewrite_tree_element* ) + ; + +/** Build a tree for a template rewrite: + ^(TEMPLATE (ID|ACTION) ^(ARGLIST ^(ARG ID ACTION) ...) 
) + where ARGLIST is always there even if no args exist. + ID can be "template" keyword. If first child is ACTION then it's + an indirect template ref + + -> foo(a={...}, b={...}) + -> ({string-e})(a={...}, b={...}) // e evaluates to template name + -> {%{$ID.text}} // create literal template from string (done in ActionTranslator) + -> {st-expr} // st-expr evaluates to ST + */ +rewrite_template + : // -> template(a={...},...) "..." inline template + {input.LT(1).getText().equals("template")}? + id lp='(' rewrite_template_args ')' + st=( DOUBLE_QUOTE_STRING_LITERAL | DOUBLE_ANGLE_STRING_LITERAL ) + -> ^(TEMPLATE[$lp,"TEMPLATE"] id rewrite_template_args $st) + + | // -> foo(a={...}, ...) + rewrite_template_ref + + | // -> ({expr})(a={...}, ...) + rewrite_indirect_template_head + + | // -> {...} + ACTION + ; + +/** -> foo(a={...}, ...) */ +rewrite_template_ref + : id lp='(' rewrite_template_args ')' + -> ^(TEMPLATE[$lp,"TEMPLATE"] id rewrite_template_args) + ; + +/** -> ({expr})(a={...}, ...) */ +rewrite_indirect_template_head + : lp='(' ACTION ')' '(' rewrite_template_args ')' + -> ^(TEMPLATE[$lp,"TEMPLATE"] ACTION rewrite_template_args) + ; + +rewrite_template_args + : rewrite_template_arg (',' rewrite_template_arg)* + -> ^(ARGLIST rewrite_template_arg+) + | -> ARGLIST + ; + +rewrite_template_arg + : id '=' ACTION -> ^(ARG[$id.start] id ACTION) + ; + +id : TOKEN_REF -> ID[$TOKEN_REF] + | RULE_REF -> ID[$RULE_REF] + ; + +// L E X I C A L R U L E S + +SL_COMMENT + : '//' + ( ' $ANTLR ' SRC // src directive + | ~('\r'|'\n')* + ) + '\r'? 
'\n' + {$channel=HIDDEN;} + ; + +ML_COMMENT + : '/*' {if (input.LA(1)=='*') $type=DOC_COMMENT; else $channel=HIDDEN;} .* '*/' + ; + +CHAR_LITERAL + : '\'' LITERAL_CHAR '\'' + ; + +STRING_LITERAL + : '\'' LITERAL_CHAR LITERAL_CHAR* '\'' + ; + +fragment +LITERAL_CHAR + : ESC + | ~('\''|'\\') + ; + +DOUBLE_QUOTE_STRING_LITERAL + : '"' LITERAL_CHAR* '"' + ; + +DOUBLE_ANGLE_STRING_LITERAL + : '<<' .* '>>' + ; + +fragment +ESC : '\\' + ( 'n' + | 'r' + | 't' + | 'b' + | 'f' + | '"' + | '\'' + | '\\' + | '>' + | 'u' XDIGIT XDIGIT XDIGIT XDIGIT + | . // unknown, leave as it is + ) + ; + +fragment +XDIGIT : + '0' .. '9' + | 'a' .. 'f' + | 'A' .. 'F' + ; + +INT : '0'..'9'+ + ; + +ARG_ACTION + : NESTED_ARG_ACTION + ; + +fragment +NESTED_ARG_ACTION : + '[' + ( options {greedy=false; k=1;} + : NESTED_ARG_ACTION + | ACTION_STRING_LITERAL + | ACTION_CHAR_LITERAL + | . + )* + ']' + {setText(getText().substring(1, getText().length()-1));} + ; + +ACTION + : NESTED_ACTION ( '?' {$type = SEMPRED;} )? + ; + +fragment +NESTED_ACTION : + '{' + ( options {greedy=false; k=3;} + : NESTED_ACTION + | SL_COMMENT + | ML_COMMENT + | ACTION_STRING_LITERAL + | ACTION_CHAR_LITERAL + | . + )* + '}' + {$channel = DEFAULT_TOKEN_CHANNEL;} + ; + +fragment +ACTION_CHAR_LITERAL + : '\'' (ACTION_ESC|~('\\'|'\'')) '\'' + ; + +fragment +ACTION_STRING_LITERAL + : '"' (ACTION_ESC|~('\\'|'"'))+ '"' + ; + +fragment +ACTION_ESC + : '\\\'' + | '\\"' + | '\\' ~('\''|'"') + ; + +TOKEN_REF + : 'A'..'Z' ('a'..'z'|'A'..'Z'|'_'|'0'..'9')* + ; + +RULE_REF + : 'a'..'z' ('a'..'z'|'A'..'Z'|'_'|'0'..'9')* + ; + +/** Match the start of an options section. Don't allow normal + * action processing on the {...} as it's not a action. 
+ */ +OPTIONS + : 'options' WS_LOOP '{' {$channel=DEFAULT_TOKEN_CHANNEL;} // WS_LOOP sets channel + ; + +TOKENS + : 'tokens' WS_LOOP '{' {$channel=DEFAULT_TOKEN_CHANNEL;} + ; + +/** Reset the file and line information; useful when the grammar + * has been generated so that errors are shown relative to the + * original file like the old C preprocessor used to do. + */ +fragment +SRC : 'src' ' ' file=ACTION_STRING_LITERAL ' ' line=INT {$channel=HIDDEN;} + ; + +WS : ( ' ' + | '\t' + | '\r'? '\n' + )+ + {$channel=HIDDEN;} + ; + +fragment +WS_LOOP + : ( WS + | SL_COMMENT + | ML_COMMENT + )* + {$channel=HIDDEN;} + ; + diff --git a/tests/examplefiles/ragel-cpp_rlscan b/tests/examplefiles/ragel-cpp_rlscan new file mode 100644 index 00000000..4b146329 --- /dev/null +++ b/tests/examplefiles/ragel-cpp_rlscan @@ -0,0 +1,280 @@ +/* + * Lexes Ragel input files. + * + * @LANG: c++ + * + * Test works with split code gen. + */ + +#include <iostream> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +using namespace std; + +void escapeXML( const char *data ) +{ + while ( *data != 0 ) { + switch ( *data ) { + case '<': cout << "&lt;"; break; + case '>': cout << "&gt;"; break; + case '&': cout << "&amp;"; break; + default: cout << *data; break; + } + data += 1; + } +} + +void escapeXML( char c ) +{ + switch ( c ) { + case '<': cout << "&lt;"; break; + case '>': cout << "&gt;"; break; + case '&': cout << "&amp;"; break; + default: cout << c; break; + } +} + +void escapeXML( const char *data, int len ) +{ + for ( const char *end = data + len; data != end; data++ ) { + switch ( *data ) { + case '<': cout << "&lt;"; break; + case '>': cout << "&gt;"; break; + case '&': cout << "&amp;"; break; + default: cout << *data; break; + } + } +} + +inline void write( const char *data ) +{ + cout << data; +} + +inline void write( char c ) +{ + cout << c; +} + +inline void write( const char *data, int len ) +{ + cout.write( data, len ); +} + + +%%{ + machine RagelScan; + + word = [a-zA-Z_][a-zA-Z_0-9]*; + integer = 
[0-9]+; + hex = '0x' [0-9a-fA-F] [0-9a-fA-F]*; + + default = ^0; + EOF = 0; + + # Handles comments in outside code and inline blocks. + c_comment := + ( default* :>> '*/' ) + ${ escapeXML( fc ); } + @{ fret; }; + + action emit { + escapeXML( ts, te-ts ); + } + + # + # Inline action code + # + + ilscan := |* + + "'" ( [^'\\] | /\\./ )* "'" => emit; + '"' ( [^"\\] | /\\./ )* '"' => emit; + '/*' { + write( "/*" ); + fcall c_comment; + }; + '//' [^\n]* '\n' => emit; + + '{' { + write( '{' ); + inline_depth += 1; + }; + + '}' { + write( '}' ); + /* If dropping down to the last } then return + * to ragel code. */ + if ( --inline_depth == 0 ) { + write( "</inline>\n" ); + fgoto rlscan; + } + }; + + default => { escapeXML( *ts ); }; + *|; + + # + # Ragel Tokens + # + + rlscan := |* + '}%%' { + if ( !single_line ) { + write( "</section>\n" ); + fgoto main; + } + }; + + '\n' { + if ( single_line ) { + write( "</section>\n" ); + fgoto main; + } + }; + + # Word + word { + write( "<word>" ); + write( ts, te-ts ); + write( "</word>\n" ); + }; + + # Decimal integer. + integer { + write( "<int>" ); + write( ts, te-ts ); + write( "</int>\n" ); + }; + + # Hexidecimal integer. + hex { + write( "<hex>" ); + write( ts, te-ts ); + write( "</hex>\n" ); + }; + + # Consume comments. + '#' [^\n]* '\n'; + + # Single literal string. + "'" ( [^'\\] | /\\./ )* "'" { + write( "<single_lit>" ); + escapeXML( ts, te-ts ); + write( "</single_lit>\n" ); + }; + + # Double literal string. + '"' ( [^"\\] | /\\./ )* '"' { + write( "<double_lit>" ); + escapeXML( ts, te-ts ); + write( "</double_lit>\n" ); + }; + + # Or literal. + '[' ( [^\]\\] | /\\./ )* ']' { + write( "<or_lit>" ); + escapeXML( ts, te-ts ); + write( "</or_lit>\n" ); + }; + + # Regex Literal. 
+ '/' ( [^/\\] | /\\./ ) * '/' { + write( "<re_lit>" ); + escapeXML( ts, te-ts ); + write( "</re_lit>\n" ); + }; + + # Open an inline block + '{' { + inline_depth = 1; + write( "<inline>{" ); + fgoto ilscan; + }; + + punct { + write( "<symbol>" ); + escapeXML( fc ); + write( "</symbol>\n" ); + }; + + default; + *|; + + # + # Outside code. + # + + main := |* + + "'" ( [^'\\] | /\\./ )* "'" => emit; + '"' ( [^"\\] | /\\./ )* '"' => emit; + + '/*' { + escapeXML( ts, te-ts ); + fcall c_comment; + }; + + '//' [^\n]* '\n' => emit; + + '%%{' { + write( "<section>\n" ); + single_line = false; + fgoto rlscan; + }; + + '%%' { + write( "<section>\n" ); + single_line = true; + fgoto rlscan; + }; + + default { + escapeXML( *ts ); + }; + + # EOF. + EOF; + *|; +}%% + +%% write data nofinal; + +void test( const char *data ) +{ + std::ios::sync_with_stdio(false); + + int cs, act; + const char *ts, *te; + int stack[1], top; + + bool single_line = false; + int inline_depth = 0; + + %% write init; + + /* Read in a block. */ + const char *p = data; + const char *pe = data + strlen( data ); + const char *eof = pe; + %% write exec; + + if ( cs == RagelScan_error ) { + /* Machine failed before finding a token. */ + cerr << "PARSE ERROR" << endl; + exit(1); + } +} + +#define BUFSIZE 2048 + +int main() +{ + std::ios::sync_with_stdio(false); + + test("hi %%{ /'}%%'/ { /*{*/ {} } + '\\'' }%%there\n"); + + return 0; +} diff --git a/tests/examplefiles/ragel-cpp_snippet b/tests/examplefiles/ragel-cpp_snippet new file mode 100644 index 00000000..203ae28b --- /dev/null +++ b/tests/examplefiles/ragel-cpp_snippet @@ -0,0 +1,2 @@ + %% write init; + /* Read in a block. */ |