From 63b7f3c535b3d8a63a612862025fb04967b20155 Mon Sep 17 00:00:00 2001
From: Lucas Werkmeister <mail@lucaswerkmeister.de>
Date: Sat, 20 Jul 2019 22:22:50 +0200
Subject: Add lexer for ShExC

ShExC [1] is one syntax for the ShEx (shape expressions) language [2] to
describe the structure of RDF graphs (the other two syntaxes are based on
JSON-LD and RDF and don't need special lexers). It is syntactically similar to
SPARQL, which is why a lot of the productions of ShExCLexer are copied from
SparqlLexer, but at the same time has enough differences that I feel it's
better to simply copy the productions rather than trying to share them between
the two lexers (compare e.g. PN_LOCAL_ESCAPE_CHARS or IRIREF).

The example file purports to be a brief schema for Pygments lexers, which I put
together from scratch to avoid licensing issues with existing example schemas;
it should not be taken too seriously.

[1]: https://shex.io/shex-semantics/#shexc
[2]: https://shexspec.github.io/primer/
---
 pygments/lexers/_mapping.py     |   1 +
 pygments/lexers/rdf.py          | 148 +++++++++++++++++++++++++++++++++++++++-
 tests/examplefiles/example.shex |  20 ++++++
 3 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 tests/examplefiles/example.shex

diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py
index adb1ec83..595e100f 100644
--- a/pygments/lexers/_mapping.py
+++ b/pygments/lexers/_mapping.py
@@ -390,6 +390,7 @@ LEXERS = {
     'SchemeLexer': ('pygments.lexers.lisp', 'Scheme', ('scheme', 'scm'), ('*.scm', '*.ss'), ('text/x-scheme', 'application/x-scheme')),
     'ScilabLexer': ('pygments.lexers.matlab', 'Scilab', ('scilab',), ('*.sci', '*.sce', '*.tst'), ('text/scilab',)),
     'ScssLexer': ('pygments.lexers.css', 'SCSS', ('scss',), ('*.scss',), ('text/x-scss',)),
+    'ShExCLexer': ('pygments.lexers.rdf', 'ShExC', ('shexc', 'shex'), ('*.shex',), ('text/shex',)),
     'ShenLexer': ('pygments.lexers.lisp', 'Shen', ('shen',), ('*.shen',), ('text/x-shen', 'application/x-shen')),
     'SilverLexer': ('pygments.lexers.verification', 'Silver', ('silver',), ('*.sil', '*.vpr'), ()),
     'SlashLexer': ('pygments.lexers.slash', 'Slash', ('slash',), ('*.sl',), ()),
diff --git a/pygments/lexers/rdf.py b/pygments/lexers/rdf.py
index 8682387e..f172708d 100644
--- a/pygments/lexers/rdf.py
+++ b/pygments/lexers/rdf.py
@@ -15,7 +15,7 @@ from pygments.lexer import RegexLexer, bygroups, default
 from pygments.token import Keyword, Punctuation, String, Number, Operator, Generic, \
     Whitespace, Name, Literal, Comment, Text
 
-__all__ = ['SparqlLexer', 'TurtleLexer']
+__all__ = ['SparqlLexer', 'TurtleLexer', 'ShExCLexer']
 
 
 class SparqlLexer(RegexLexer):
@@ -275,3 +275,149 @@ class TurtleLexer(RegexLexer):
         for t in ('@base ', 'BASE ', '@prefix ', 'PREFIX '):
             if re.search(r'^\s*%s' % t, text):
                 return 0.80
+
+
+class ShExCLexer(RegexLexer):
+    """
+    Lexer for `ShExC <https://shex.io/shex-semantics/#shexc>`_ shape expressions language syntax.
+    """
+    name = 'ShExC'
+    aliases = ['shexc', 'shex']
+    filenames = ['*.shex']
+    mimetypes = ['text/shex']
+
+    # character group definitions (bare character-class contents, without the brackets) ::
+
+    PN_CHARS_BASE_GRP = (u'a-zA-Z'
+                         u'\u00c0-\u00d6'
+                         u'\u00d8-\u00f6'
+                         u'\u00f8-\u02ff'
+                         u'\u0370-\u037d'
+                         u'\u037f-\u1fff'
+                         u'\u200c-\u200d'
+                         u'\u2070-\u218f'
+                         u'\u2c00-\u2fef'
+                         u'\u3001-\ud7ff'
+                         u'\uf900-\ufdcf'
+                         u'\ufdf0-\ufffd')
+
+    PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')
+
+    PN_CHARS_GRP = (PN_CHARS_U_GRP +
+                    r'\-' +
+                    r'0-9' +
+                    u'\u00b7' +
+                    u'\u0300-\u036f' +
+                    u'\u203f-\u2040')
+
+    HEX_GRP = '0-9A-Fa-f'
+
+    PN_LOCAL_ESC_CHARS_GRP = r"_~.\-!$&'()*+,;=/?#@%"
+
+    # terminal productions (regex fragments assembled from the groups above) ::
+
+    PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'
+
+    PN_CHARS_U = '[' + PN_CHARS_U_GRP + ']'
+
+    PN_CHARS = '[' + PN_CHARS_GRP + ']'
+
+    HEX = '[' + HEX_GRP + ']'
+
+    PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'
+
+    UCHAR_NO_BACKSLASH = '(?:u' + HEX + '{4}|U' + HEX + '{8})'
+
+    UCHAR = r'\\' + UCHAR_NO_BACKSLASH
+
+    IRIREF = r'<(?:[^\x00-\x20<>"{}|^`\\]|' + UCHAR + ')*>'
+
+    BLANK_NODE_LABEL = '_:[0-9' + PN_CHARS_U_GRP + '](?:[' + PN_CHARS_GRP + \
+                       '.]*' + PN_CHARS + ')?'
+
+    PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'
+
+    PERCENT = '%' + HEX + HEX
+
+    PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS
+
+    PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'  # bare alternation: only embed inside a group
+
+    PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
+                '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
+                PN_CHARS_GRP + ':]|' + PLX + '))?')
+
+    EXPONENT = r'[eE][+-]?\d+'
+
+    # Lexer token definitions (rules are tried in order; the first match wins) ::
+
+    tokens = {
+        'root': [
+            (r'\s+', Text),
+            # keywords (case-insensitive; the inline flag must lead the pattern) ::
+            (r'(?i)(base|prefix|start|external|'
+             r'literal|iri|bnode|nonliteral|length|minlength|maxlength|'
+             r'mininclusive|minexclusive|maxinclusive|maxexclusive|'
+             r'totaldigits|fractiondigits|'
+             r'closed|extra)\b', Keyword),
+            (r'(a)\b', Keyword),
+            # IRIs ::
+            ('(' + IRIREF + ')', Name.Label),
+            # blank nodes ::
+            ('(' + BLANK_NODE_LABEL + ')', Name.Label),
+            # prefixed names (prefix and local part are both optional) ::
+            (r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + r')?',
+             bygroups(Name.Namespace, Punctuation, Name.Tag)),
+            # boolean literals ::
+            (r'(true|false)', Keyword.Constant),
+            # double literals ::
+            (r'[+\-]?(\d+\.\d*' + EXPONENT + r'|\.?\d+' + EXPONENT + ')', Number.Float),
+            # decimal literals ::
+            (r'[+\-]?(\d+\.\d*|\.\d+)', Number.Float),
+            # integer literals ::
+            (r'[+\-]?\d+', Number.Integer),
+            # operators ::
+            (r'[@|$&=*+?^\-~]', Operator),
+            # operator keywords (case-insensitive) ::
+            (r'(?i)(and|or|not)\b', Operator.Word),
+            # punctuation characters ::
+            (r'[(){}.;,:^\[\]]', Punctuation),
+            # line comments ::
+            (r'#[^\n]*', Comment),
+            # strings (longest delimiter first so """ is not read as an empty "" string) ::
+            (r'"""', String, 'triple-double-quoted-string'),
+            (r'"', String, 'single-double-quoted-string'),
+            (r"'''", String, 'triple-single-quoted-string'),
+            (r"'", String, 'single-single-quoted-string'),
+        ],
+        'triple-double-quoted-string': [
+            (r'"""', String, 'end-of-string'),
+            (r'(?:[^"\\]|"(?!""))+', String),  # lone quotes are content; stop before closing """
+            (r'\\', String.Escape, 'string-escape'),
+        ],
+        'single-double-quoted-string': [
+            (r'"', String, 'end-of-string'),
+            (r'[^"\\\n]+', String),
+            (r'\\', String.Escape, 'string-escape'),
+        ],
+        'triple-single-quoted-string': [
+            (r"'''", String, 'end-of-string'),
+            (r"(?:[^'\\]|'(?!''))+", String),  # lone quotes are content; stop before closing '''
+            (r'\\', String.Escape, 'string-escape'),
+        ],
+        'single-single-quoted-string': [
+            (r"'", String, 'end-of-string'),
+            (r"[^'\\\n]+", String),
+            (r'\\', String.Escape, 'string-escape'),
+        ],
+        'string-escape': [
+            (UCHAR_NO_BACKSLASH, String.Escape, '#pop'),
+            (r'.', String.Escape, '#pop'),
+        ],
+        'end-of-string': [  # entered on the closing quote; '#pop:2' also leaves the string state
+            (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
+             bygroups(Operator, Name.Function), '#pop:2'),
+            (r'\^\^', Operator, '#pop:2'),
+            default('#pop:2'),
+        ],
+    }
diff --git a/tests/examplefiles/example.shex b/tests/examplefiles/example.shex
new file mode 100644
index 00000000..8fab2c85
--- /dev/null
+++ b/tests/examplefiles/example.shex
@@ -0,0 +1,20 @@
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX schema: <http://schema.org/>
+PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX ex: <http://pygments.example/#>
+
+ex:Lexer {
+  rdfs:label xsd:string;
+  skos:altLabel xsd:string*;
+  ex:filenames xsd:string+;
+  ex:mimetypes xsd:string+;
+  ex:priority xsd:decimal MinInclusive 0.0 MaxExclusive 1.0; # seems to be the de facto range of currently defined priorities
+  ex:lexes @ex:Language*;
+}
+
+ex:Language {
+  schema:description rdf:langString*;
+  schema:url IRI?;
+}
-- 
cgit v1.2.1