From 63b7f3c535b3d8a63a612862025fb04967b20155 Mon Sep 17 00:00:00 2001 From: Lucas Werkmeister Date: Sat, 20 Jul 2019 22:22:50 +0200 Subject: Add lexer for ShExC ShExC [1] is one syntax for the ShEx (shape expressions) language [2] to describe the structure of RDF graphs (the other two syntaxes are based on JSON-LD and RDF and don't need special lexers). It is syntactically similar to SPARQL, which is why a lot of the productions of ShExCLexer are copied from SparqlLexer, but at the same time has enough differences that I feel it's better to simply copy the productions rather than trying to share them between the two lexers (compare e.g. PN_LOCAL_ESCAPE_CHARS or IRIREF). The example file purports to be a brief schema for Pygments lexers, which I put together from scratch to avoid licensing issues with existing example schemas; it should not be taken too seriously. [1]: https://shex.io/shex-semantics/#shexc [2]: https://shexspec.github.io/primer/ --- pygments/lexers/_mapping.py | 1 + pygments/lexers/rdf.py | 148 +++++++++++++++++++++++++++++++++++++++- tests/examplefiles/example.shex | 20 ++++++ 3 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 tests/examplefiles/example.shex diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py index adb1ec83..595e100f 100644 --- a/pygments/lexers/_mapping.py +++ b/pygments/lexers/_mapping.py @@ -390,6 +390,7 @@ LEXERS = { 'SchemeLexer': ('pygments.lexers.lisp', 'Scheme', ('scheme', 'scm'), ('*.scm', '*.ss'), ('text/x-scheme', 'application/x-scheme')), 'ScilabLexer': ('pygments.lexers.matlab', 'Scilab', ('scilab',), ('*.sci', '*.sce', '*.tst'), ('text/scilab',)), 'ScssLexer': ('pygments.lexers.css', 'SCSS', ('scss',), ('*.scss',), ('text/x-scss',)), + 'ShExCLexer': ('pygments.lexers.rdf', 'ShExC', ('shexc', 'shex'), ('*.shex',), ('text/shex',)), 'ShenLexer': ('pygments.lexers.lisp', 'Shen', ('shen',), ('*.shen',), ('text/x-shen', 'application/x-shen')), 'SilverLexer': 
('pygments.lexers.verification', 'Silver', ('silver',), ('*.sil', '*.vpr'), ()),
     'SlashLexer': ('pygments.lexers.slash', 'Slash', ('slash',), ('*.sl',), ()),
diff --git a/pygments/lexers/rdf.py b/pygments/lexers/rdf.py
index 8682387e..f172708d 100644
--- a/pygments/lexers/rdf.py
+++ b/pygments/lexers/rdf.py
@@ -15,7 +15,7 @@ from pygments.lexer import RegexLexer, bygroups, default
 from pygments.token import Keyword, Punctuation, String, Number, Operator, Generic, \
     Whitespace, Name, Literal, Comment, Text
 
-__all__ = ['SparqlLexer', 'TurtleLexer']
+__all__ = ['SparqlLexer', 'TurtleLexer', 'ShExCLexer']
 
 
 class SparqlLexer(RegexLexer):
@@ -275,3 +275,149 @@ class TurtleLexer(RegexLexer):
         for t in ('@base ', 'BASE ', '@prefix ', 'PREFIX '):
             if re.search(r'^\s*%s' % t, text):
                 return 0.80
+
+
+class ShExCLexer(RegexLexer):
+    """
+    Lexer for `ShExC <https://shex.io/shex-semantics/#shexc>`_ shape expressions language syntax.
+    """
+    name = 'ShExC'
+    aliases = ['shexc', 'shex']
+    filenames = ['*.shex']
+    mimetypes = ['text/shex']
+
+    # character group definitions (bare class contents, without the enclosing [...]) ::
+
+    PN_CHARS_BASE_GRP = (u'a-zA-Z'
+                         u'\u00c0-\u00d6'
+                         u'\u00d8-\u00f6'
+                         u'\u00f8-\u02ff'
+                         u'\u0370-\u037d'
+                         u'\u037f-\u1fff'
+                         u'\u200c-\u200d'
+                         u'\u2070-\u218f'
+                         u'\u2c00-\u2fef'
+                         u'\u3001-\ud7ff'
+                         u'\uf900-\ufdcf'
+                         u'\ufdf0-\ufffd')
+
+    PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')
+
+    PN_CHARS_GRP = (PN_CHARS_U_GRP +
+                    r'\-' +
+                    r'0-9' +
+                    u'\u00b7' +
+                    u'\u0300-\u036f' +
+                    u'\u203f-\u2040')
+
+    HEX_GRP = '0-9A-Fa-f'
+
+    PN_LOCAL_ESC_CHARS_GRP = r"_~.\-!$&'()*+,;=/?#@%"
+
+    # terminal productions (regex fragments assembled from the groups above) ::
+
+    PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'
+
+    PN_CHARS_U = '[' + PN_CHARS_U_GRP + ']'
+
+    PN_CHARS = '[' + PN_CHARS_GRP + ']'
+
+    HEX = '[' + HEX_GRP + ']'
+
+    PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'
+
+    UCHAR_NO_BACKSLASH = '(?:u' + HEX + '{4}|U' + HEX + '{8})'
+
+    UCHAR = r'\\' + UCHAR_NO_BACKSLASH
+
+    IRIREF = r'<(?:[^\x00-\x20<>"{}|^`\\]|' + UCHAR + ')*>'
+
+    BLANK_NODE_LABEL = '_:[0-9' + PN_CHARS_U_GRP + '](?:[' + PN_CHARS_GRP + \
+                       '.]*' + PN_CHARS + ')?'
+
+    PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'
+
+    PERCENT = '%' + HEX + HEX
+
+    PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS
+
+    PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'
+
+    PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
+                '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
+                PN_CHARS_GRP + ':]|' + PLX + '))?')
+
+    EXPONENT = r'[eE][+-]?\d+'
+
+    # Lexer token definitions ::
+
+    tokens = {
+        'root': [
+            (r'\s+', Text),
+            # keywords (case-insensitive; the (?i) flag must lead the pattern) ::
+            (r'(?i)(base|prefix|start|external|'
+             r'literal|iri|bnode|nonliteral|length|minlength|maxlength|'
+             r'mininclusive|minexclusive|maxinclusive|maxexclusive|'
+             r'totaldigits|fractiondigits|'
+             r'closed|extra)\b', Keyword),
+            (r'(a)\b', Keyword),
+            # IRIs ::
+            ('(' + IRIREF + ')', Name.Label),
+            # blank nodes ::
+            ('(' + BLANK_NODE_LABEL + ')', Name.Label),
+            # prefixed names (prefix and local part are both optional) ::
+            (r'(' + PN_PREFIX + r')?(:)(' + PN_LOCAL + r')?',
+             bygroups(Name.Namespace, Punctuation, Name.Tag)),
+            # boolean literals ::
+            (r'(true|false)', Keyword.Constant),
+            # double literals ::
+            (r'[+\-]?(\d+\.\d*' + EXPONENT + r'|\.?\d+' + EXPONENT + r')', Number.Float),
+            # decimal literals ::
+            (r'[+\-]?(\d+\.\d*|\.\d+)', Number.Float),
+            # integer literals ::
+            (r'[+\-]?\d+', Number.Integer),
+            # operators ::
+            (r'[@|$&=*+?^\-~]', Operator),
+            # operator keywords (case-insensitive) ::
+            (r'(?i)(and|or|not)\b', Operator.Word),
+            # punctuation characters ::
+            (r'[(){}.;,:^\[\]]', Punctuation),
+            # line comments ::
+            (r'#[^\n]*', Comment),
+            # strings (each opener pushes a state; 'end-of-string' later pops both) ::
+            (r'"""', String, 'triple-double-quoted-string'),
+            (r'"', String, 'single-double-quoted-string'),
+            (r"'''", String, 'triple-single-quoted-string'),
+            (r"'", String, 'single-single-quoted-string'),
+        ],
+        'triple-double-quoted-string': [
+            (r'"""', String, 'end-of-string'),
+            (r'(?:[^\\"]|"(?!""))+', String),  # stop before the closing """
+            (r'\\', String, 'string-escape'),
+        ],
+        'single-double-quoted-string': [
+            (r'"', String, 'end-of-string'),
+            (r'[^"\\\n]+', String),
+            (r'\\', String, 'string-escape'),
+        ],
+        'triple-single-quoted-string': [
+            (r"'''", String, 'end-of-string'),
+            (r"(?:[^\\']|'(?!''))+", String),  # stop before the closing '''
+            (r'\\', String, 'string-escape'),
+        ],
+        'single-single-quoted-string': [
+            (r"'", String, 'end-of-string'),
+            (r"[^'\\\n]+", String),
+            (r'\\', String, 'string-escape'),
+        ],
+        'string-escape': [
+            (UCHAR_NO_BACKSLASH, String.Escape, '#pop'),
+            (r'.', String.Escape, '#pop'),
+        ],
+        'end-of-string': [
+            (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
+             bygroups(Operator, Name.Function), '#pop:2'),
+            (r'\^\^', Operator, '#pop:2'),
+            default('#pop:2'),
+        ],
+    }
diff --git a/tests/examplefiles/example.shex b/tests/examplefiles/example.shex
new file mode 100644
index 00000000..8fab2c85
--- /dev/null
+++ b/tests/examplefiles/example.shex
@@ -0,0 +1,20 @@
+PREFIX rdf:
+PREFIX rdfs:
+PREFIX schema:
+PREFIX skos:
+PREFIX xsd:
+PREFIX ex:
+
+ex:Lexer {
+  rdfs:label xsd:string;
+  skos:altLabel xsd:string*;
+  ex:filenames xsd:string+;
+  ex:mimetypes xsd:string+;
+  ex:priority xsd:decimal MinInclusive 0.0 MaxExclusive 1.0; # seems to be the de facto range of currently defined priorities
+  ex:lexes @ex:Language*;
+}
+
+ex:Language {
+  schema:description rdf:langString*;
+  schema:url IRI?;
+}
-- 
cgit v1.2.1