Adding new lexer for Opa (http://opalang.org).

author: Adam Koprowski <Adam.Koprowski@gmail.com> 2011-10-27 18:27:39 +0200
committer: Adam Koprowski <Adam.Koprowski@gmail.com> 2011-10-27 18:27:39 +0200
commit: ef8092fc119ca109430312a6ac1b4b6ec5d72af0 (patch)
tree: 56836485440fda4a2f08c8b5313e660210099987
parent: 8ae412d0e5262f91179a100a4fb5409186b1db6d (diff)
download: pygments-ef8092fc119ca109430312a6ac1b4b6ec5d72af0.tar.gz
2 files changed, 329 insertions, 2 deletions
diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py
index 4772a9a0..641078af 100644
--- a/pygments/lexers/_mapping.py
+++ b/pygments/lexers/_mapping.py
@@ -163,6 +163,7 @@ LEXERS = {
     'OcamlLexer': ('pygments.lexers.functional', 'OCaml', ('ocaml',), ('*.ml', '*.mli', '*.mll', '*.mly'), ('text/x-ocaml',)),
     'OctaveLexer': ('pygments.lexers.math', 'Octave', ('octave',), ('*.m',), ('text/octave',)),
     'OocLexer': ('pygments.lexers.compiled', 'Ooc', ('ooc',), ('*.ooc',), ('text/x-ooc',)),
+    'OpaLexer': ('pygments.lexers.functional', 'Opa', ('opa',), ('*.opa',), ('text/x-opa',)),
     'PerlLexer': ('pygments.lexers.agile', 'Perl', ('perl', 'pl'), ('*.pl', '*.pm'), ('text/x-perl', 'application/x-perl')),
     'PhpLexer': ('pygments.lexers.web', 'PHP', ('php', 'php3', 'php4', 'php5'), ('*.php', '*.php[345]'), ('text/x-php',)),
     'PlPgsqlLexer': ('pygments.lexers.postgres', 'PL/pgSQL', ('plpgsql',), (), ('text/x-plpgsql',)),
diff --git a/pygments/lexers/functional.py b/pygments/lexers/functional.py
index a8102cc8..acd0e705 100644
--- a/pygments/lexers/functional.py
+++ b/pygments/lexers/functional.py
@@ -14,11 +14,11 @@ import re
 from pygments.lexer import Lexer, RegexLexer, bygroups, include, do_insertions
 from pygments.token import Text, Comment, Operator, Keyword, Name, \
      String, Number, Punctuation, Literal, Generic, Error
-
+import pygments.formatters
 
 __all__ = ['SchemeLexer', 'CommonLispLexer', 'HaskellLexer',
            'LiterateHaskellLexer', 'SMLLexer', 'OcamlLexer', 'ErlangLexer',
-           'ErlangShellLexer']
+           'ErlangShellLexer', 'OpaLexer']
 
 
 class SchemeLexer(RegexLexer):
@@ -1079,3 +1079,329 @@ class ErlangShellLexer(Lexer):
                                       erlexer.get_tokens_unprocessed(curcode)):
                 yield item
 
+class OpaLexer(RegexLexer):
+    """
+      Lexer for the Opa language (http://opalang.org)
+    """
+
+    name = 'Opa'
+    aliases = ['opa']
+    filenames = ['*.opa']
+    mimetypes = ['text/x-opa']
+
+    # most of these aren't strictly keywords
+    # but if you color only real keywords, you might just
+    # as well not color anything
+    keywords = [
+        'and','as',
+        'begin',
+        'css',
+        'database','db','do',
+        'else','end','external',
+        'forall',
+        'if','import',
+        'match',
+        'package','parser',
+        'rec',
+        'server',
+        'then','type',
+        'val',
+        'with',
+        'xml_parser'
+    ]
+
+    # matches both stuff and `stuff`
+    ident_re = r'(([a-zA-Z_]\w*)|(`[^`]*`))'
+
+    op_re = r'[.=\-<>,@~%/+?*&^!]'
+    punc_re = r'[()\[\],;|]' # '{' and '}' are treated elsewhere
+                               # because they are also used for inserts
+
+    tokens = {
+        # copied from the caml lexer, should be adapted
+        'escape-sequence': [
+            (r'\\[\\\"\'ntr}]', String.Escape),
+            (r'\\[0-9]{3}', String.Escape),
+            (r'\\x[0-9a-fA-F]{2}', String.Escape),
+        ],
+
+        # factorizing these rules, because they are inserted many times
+        'comments': [
+            (r'/\*', Comment, 'nested-comment'),
+            (r'//.*?$', Comment),
+        ],
+        'comments-and-spaces': [
+            pygments.lexer.include('comments'),
+            (r'\s+', Text),
+        ],
+
+        'root': [
+            pygments.lexer.include('comments-and-spaces'),
+            # keywords
+            (r'\b(%s)\b' % '|'.join(keywords), Keyword),
+            # directives
+            # we could parse the actual set of directives instead of anything
+            # starting with @, but this is troublesome
+            # because it needs to be adjusted all the time
+            # and assuming we parse only sources that compile, it is useless
+            (r'@'+ident_re+r'\b', Name.Builtin.Pseudo),
+
+            # number literals
+            (r'-?.[\d]+([eE][+\-]?\d+)', Number.Float),
+            (r'-?\d+.\d*([eE][+\-]?\d+)', Number.Float),
+            (r'-?\d+[eE][+\-]?\d+', Number.Float),
+            (r'0[xX][\da-fA-F]+', Number.Hex),
+            (r'0[oO][0-7]+', Number.Oct),
+            (r'0[bB][01]+', Number.Binary),
+            (r'\d+', Number.Integer),
+            # color literals
+            (r'#[\da-fA-F]{3,6}', Number.Integer),
+
+            # string literals
+            (r'"', String.Double, 'string'),
+            # char literal, should be checked because this is the regexp from the caml lexer
+            (r"'(?:(\\[\\\"'ntbr ])|(\\[0-9]{3})|(\\x[0-9a-fA-F]{2})|.)'",
+             String.Char),
+
+            # this is meant to deal with embedded exprs in strings
+            # every time we find a '}' we pop a state so that if we were
+            # inside a string, we are back in the string state
+            # as a consequence, we must also push a state every time we find a '{'
+            # or else we will have errors when parsing {} for instance
+            (r'{', Operator, '#push'),
+            (r'}', Operator, '#pop'),
+
+            # html literals
+            # this is a much more strict that the actual parser,
+            # since a<b would not be parsed as html
+            # but then again, the parser is way too lax, and we can't hope
+            # to have something as tolerant
+            (r'<(?=[a-zA-Z>])', String.Single, 'html-open-tag'),
+
+            # db path
+            # matching the '[_]' in '/a[_]' because it is a part
+            # of the syntax of the db path definition
+            # unfortunately, i don't know how to match the ']' in
+            # /a[1], so this is somewhat inconsistent
+            (r'[@?!]?(/\w+)+(\[_\])?', Name.Variable),
+            # putting the same color on <- as on db path, since
+            # it can be used only to mean Db.write
+            (r'<-(?!'+op_re+r')', Name.Variable),
+
+            # 'modules'
+            # although modules are not distinguished by their names as in caml
+            # the standard library seems to follow the convention that modules
+            # only area capitalized
+            (r'\b([A-Z]\w*)(?=\.)', Name.Namespace),
+
+            # operators
+            # = has a special role because this is the only
+            # way to syntactic distinguish binding constructions
+            # unfortunately, this colors the equal in {x=2} too
+            (r'=(?!'+op_re+r')', Keyword),
+            (r'(%s)+' % op_re, Operator),
+            (r'(%s)+' % punc_re, Operator),
+
+            # coercions
+            (r':', Operator, 'type'),
+            # type variables
+            # we need this rule because we don't parse specially type definitions
+            # so in "type t('a) = ...", "'a" is parsed by 'root'
+            ("'"+ident_re, Keyword.Type),
+
+            # id literal, #something, or #{expr}
+            (r'#'+ident_re, String.Single),
+            (r'#(?={)', String.Single),
+
+            # identifiers
+            # this avoids to color '2' in 'a2' as an integer
+            (ident_re, Text),
+
+            # default, not sure if that is needed or not
+            # (r'.', Text),
+        ],
+
+        # it is quite painful to have to parse types to know where they end
+        # this is the general rule for a type
+        # a type is either:
+        # * -> ty
+        # * type-with-slash
+        # * type-with-slash -> ty
+        # * type-with-slash (, type-with-slash)+ -> ty
+        #
+        # the code is pretty funky in here, but this code would roughly translate
+        # in caml to:
+        # let rec type stream =
+        # match stream with
+        # | [< "->";  stream >] -> type stream
+        # | [< "";  stream >] ->
+        #   type_with_slash stream
+        #   type_lhs_1 stream;
+        # and type_1 stream = ...
+        'type': [
+            pygments.lexer.include('comments-and-spaces'),
+            (r'->', Keyword.Type),
+            (r'', Keyword.Type, ('#pop', 'type-lhs-1', 'type-with-slash')),
+        ],
+
+        # parses all the atomic or closed constructions in the syntax of type expressions
+        # record types, tuple types, type constructors, basic type and type variables
+        'type-1': [
+            pygments.lexer.include('comments-and-spaces'),
+            (r'\(', Keyword.Type, ('#pop', 'type-tuple')),
+            (r'~?{', Keyword.Type, ('#pop', 'type-record')),
+            (ident_re+r'\(', Keyword.Type, ('#pop', 'type-tuple')),
+            (ident_re, Keyword.Type, '#pop'),
+            ("'"+ident_re, Keyword.Type),
+            # this case is not in the syntax but sometimes
+            # we think we are parsing types when in fact we are parsing
+            # some css, so we just pop the states until we get back into
+            # the root state
+            (r'', Keyword.Type, '#pop'),
+        ],
+
+        # type-with-slash is either:
+        # * type-1
+        # * type-1 (/ type-1)+
+        'type-with-slash': [
+            pygments.lexer.include('comments-and-spaces'),
+            (r'', Keyword.Type, ('#pop', 'slash-type-1', 'type-1')),
+        ],
+        'slash-type-1': [
+            pygments.lexer.include('comments-and-spaces'),
+            ('/', Keyword.Type, ('#pop', 'type-1')),
+            # same remark as above
+            (r'', Keyword.Type, '#pop'),
+        ],
+
+        # we go in this state after having parsed a type-with-slash
+        # while trying to parse a type
+        # and at this point we must determine if we are parsing an arrow
+        # type (in which case we must continue parsing) or not (in which
+        # case we stop)
+        'type-lhs-1': [
+            pygments.lexer.include('comments-and-spaces'),
+            (r'->', Keyword.Type, ('#pop', 'type')),
+            (r'(?=,)', Keyword.Type, ('#pop', 'type-arrow')),
+            (r'', Keyword.Type, '#pop'),
+        ],
+        'type-arrow': [
+            pygments.lexer.include('comments-and-spaces'),
+            # the look ahead here allows to parse f(x : int, y : float -> truc) correctly
+            (r',(?=[^:]*?->)', Keyword.Type, 'type-with-slash'),
+            (r'->', Keyword.Type, ('#pop', 'type')),
+            # same remark as above
+            (r'', Keyword.Type, '#pop'),
+        ],
+
+        # no need to do precise parsing for tuples and records
+        # because they are closed constructions, so we can simply
+        # find the closing delimiter
+        # note that this function would be not work if the source
+        # contained identifiers like `{)` (although it could be patched
+        # to support it)
+        'type-tuple': [
+            pygments.lexer.include('comments-and-spaces'),
+            (r'[^\(\)/*]+', Keyword.Type),
+            (r'[/*]', Keyword.Type),
+            (r'\(', Keyword.Type, '#push'),
+            (r'\)', Keyword.Type, '#pop'),
+        ],
+        'type-record': [
+            pygments.lexer.include('comments-and-spaces'),
+            (r'[^{}/*]+', Keyword.Type),
+            (r'[/*]', Keyword.Type),
+            (r'{', Keyword.Type, '#push'),
+            (r'}', Keyword.Type, '#pop'),
+        ],
+
+#        'type-tuple': [
+#            pygments.lexer.include('comments-and-spaces'),
+#            (r'\)', Keyword.Type, '#pop'),
+#            (r'', Keyword.Type, ('#pop', 'type-tuple-1', 'type-1')),
+#        ],
+#        'type-tuple-1': [
+#            pygments.lexer.include('comments-and-spaces'),
+#            (r',?\s*\)', Keyword.Type, '#pop'), # ,) is a valid end of tuple, in (1,)
+#            (r',', Keyword.Type, 'type-1'),
+#        ],
+#        'type-record':[
+#            pygments.lexer.include('comments-and-spaces'),
+#            (r'}', Keyword.Type, '#pop'),
+#            (r'~?(?:\w+|`[^`]*`)', Keyword.Type, 'type-record-field-expr'),
+#        ],
+#        'type-record-field-expr': [
+#
+#        ],
+
+        'nested-comment': [
+            (r'[^/*]+', Comment),
+            (r'/\*', Comment, '#push'),
+            (r'\*/', Comment, '#pop'),
+            (r'[/*]', Comment),
+        ],
+
+        # the coy pasting between string and single-string
+        # is kinda sad. Is there a way to avoid that??
+        'string': [
+            (r'[^\\"{]+', String.Double),
+            (r'"', String.Double, '#pop'),
+            (r'{', Operator, 'root'),
+            pygments.lexer.include('escape-sequence'),
+        ],
+        'single-string': [
+            (r'[^\\\'{]+', String.Double),
+            (r'\'', String.Double, '#pop'),
+            (r'{', Operator, 'root'),
+            pygments.lexer.include('escape-sequence'),
+        ],
+
+        # all the html stuff
+        # can't really reuse some existing html parser
+        # because we must be able to parse embedded expressions
+
+        # we are in this state after someone parsed the '<' that
+        # started the html literal
+        'html-open-tag': [
+            (r'[\w\-:]+', String.Single, ('#pop', 'html-attr')),
+            (r'>', String.Single, ('#pop', 'html-content')),
+        ],
+
+        # we are in this state after someone parsed the '</' that
+        # started the end of the closing tag
+        'html-end-tag': [
+            # this is a star, because </> is allowed
+            (r'[\w\-:]*>', String.Single, '#pop'),
+        ],
+
+        # we are in this state after having parsed '<ident(:ident)?'
+        # we thus parse a possibly empty list of attributes
+        'html-attr': [
+            (r'\s+', Text),
+            (r'[\w\-:]+=', String.Single, 'html-attr-value'),
+            (r'/>', String.Single, '#pop'),
+            (r'>', String.Single, ('#pop', 'html-content')),
+        ],
+
+        'html-attr-value': [
+            (r"'", String.Single, ('#pop', 'single-string')),
+            (r'"', String.Single, ('#pop', 'string')),
+            (r'#'+ident_re, String.Single, '#pop'),
+            (r'#(?={)', String.Single, ('#pop', 'root')),
+            (r'{', Operator, ('#pop', 'root')), # this is a tail call!
+        ],
+
+        # we should probably deal with '\' escapes here
+        'html-content': [
+            (r'<!--', Comment, 'html-comment'),
+            (r'</', String.Single, ('#pop', 'html-end-tag')),
+            (r'<', String.Single, 'html-open-tag'),
+            (r'{', Operator, 'root'),
+            (r'.|\s+', String.Single),
+        ],
+
+        'html-comment': [
+            (r'-->', Comment, '#pop'),
+            (r'[^\-]+|-', Comment),
+        ],
+    }
author	Adam Koprowski <Adam.Koprowski@gmail.com>	2011-10-27 18:27:39 +0200
committer	Adam Koprowski <Adam.Koprowski@gmail.com>	2011-10-27 18:27:39 +0200
commit	ef8092fc119ca109430312a6ac1b4b6ec5d72af0 (patch)
tree	56836485440fda4a2f08c8b5313e660210099987
parent	8ae412d0e5262f91179a100a4fb5409186b1db6d (diff)
download	pygments-ef8092fc119ca109430312a6ac1b4b6ec5d72af0.tar.gz