summary | refs | log | tree | commit | diff
path: root/pygments/lexers/rdf.py
diff options
context:
space:
mode:
author: Georg Brandl <georg@python.org> 2016-02-02 10:30:58 +0100
committer: Georg Brandl <georg@python.org> 2016-02-02 10:30:58 +0100
commit: 28c93d6d5bd08e158ca8529e520560796059b510 (patch)
tree: 3242ecd439e3fa314452df58e899969efa4b3db2 /pygments/lexers/rdf.py
parent: a3134c1f01023eba264bf5eb04764df41890a24e (diff)
download: pygments-28c93d6d5bd08e158ca8529e520560796059b510.tar.gz
Backout 10520a0c4913 - this breaks Jython.
If you want to handle non-BMP characters correctly with Python 2.x, you should use a build with wide unicode characters. Closes #1205.
Diffstat (limited to 'pygments/lexers/rdf.py')
-rw-r--r-- pygments/lexers/rdf.py 71
1 file changed, 42 insertions, 29 deletions
diff --git a/pygments/lexers/rdf.py b/pygments/lexers/rdf.py
index cb634ee0..103b4ad0 100644
--- a/pygments/lexers/rdf.py
+++ b/pygments/lexers/rdf.py
@@ -29,43 +29,56 @@ class SparqlLexer(RegexLexer):
filenames = ['*.rq', '*.sparql']
mimetypes = ['application/sparql-query']
+ # character group definitions ::
+
+ PN_CHARS_BASE_GRP = (u'a-zA-Z'
+ u'\u00c0-\u00d6'
+ u'\u00d8-\u00f6'
+ u'\u00f8-\u02ff'
+ u'\u0370-\u037d'
+ u'\u037f-\u1fff'
+ u'\u200c-\u200d'
+ u'\u2070-\u218f'
+ u'\u2c00-\u2fef'
+ u'\u3001-\ud7ff'
+ u'\uf900-\ufdcf'
+ u'\ufdf0-\ufffd'
+ u'\U00010000-\U000effff')
+
+ PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')
+
+ PN_CHARS_GRP = (PN_CHARS_U_GRP +
+ r'\-' +
+ r'0-9' +
+ u'\u00b7' +
+ u'\u0300-\u036f' +
+ u'\u203f-\u2040')
+
+ HEX_GRP = '0-9A-Fa-f'
+
+ PN_LOCAL_ESC_CHARS_GRP = r' _~.\-!$&""()*+,;=/?#@%'
+
# terminal productions ::
- PN_CHARS_BASE = (u'(?:[a-zA-Z'
- u'\u00c0-\u00d6'
- u'\u00d8-\u00f6'
- u'\u00f8-\u02ff'
- u'\u0370-\u037d'
- u'\u037f-\u1fff'
- u'\u200c-\u200d'
- u'\u2070-\u218f'
- u'\u2c00-\u2fef'
- u'\u3001-\ud7ff'
- u'\uf900-\ufdcf'
- u'\ufdf0-\ufffd]|'
- u'[^\u0000-\uffff]|'
- u'[\ud800-\udbff][\udc00-\udfff])')
+ PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'
- PN_CHARS_U = '(?:' + PN_CHARS_BASE + '|_)'
+ PN_CHARS_U = '[' + PN_CHARS_U_GRP + ']'
- PN_CHARS = ('(?:' + PN_CHARS_U + r'|[\-0-9' +
- u'\u00b7' +
- u'\u0300-\u036f' +
- u'\u203f-\u2040])')
+ PN_CHARS = '[' + PN_CHARS_GRP + ']'
- HEX = '[0-9A-Fa-f]'
+ HEX = '[' + HEX_GRP + ']'
- PN_LOCAL_ESC_CHARS = r'[ _~.\-!$&""()*+,;=/?#@%]'
+ PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'
IRIREF = r'<(?:[^<>"{}|^`\\\x00-\x20])*>'
- BLANK_NODE_LABEL = '_:(?:' + PN_CHARS_U + '|[0-9])(?:(?:' + PN_CHARS + '|\.)*' + \
- PN_CHARS + ')?'
+ BLANK_NODE_LABEL = '_:[0-9' + PN_CHARS_U_GRP + '](?:[' + PN_CHARS_GRP + \
+ '.]*' + PN_CHARS + ')?'
- PN_PREFIX = PN_CHARS_BASE + '(?:(?:' + PN_CHARS + '|\.)*' + PN_CHARS + ')?'
+ PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'
- VARNAME = '(?:' + PN_CHARS_U + '|[0-9])(?:' + PN_CHARS_U + \
- u'|[0-9\u00b7\u0300-\u036f\u203f-\u2040])*'
+ VARNAME = u'[0-9' + PN_CHARS_U_GRP + '][' + PN_CHARS_U_GRP + \
+ u'0-9\u00b7\u0300-\u036f\u203f-\u2040]*'
PERCENT = '%' + HEX + HEX
@@ -73,9 +86,9 @@ class SparqlLexer(RegexLexer):
PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'
- PN_LOCAL = ('(?:(?:' + PN_CHARS_U + '|[:0-9])|' + PLX + ')' +
- '(?:(?:(?:' + PN_CHARS + '|[.:])|' + PLX + ')*(?:(?:' +
- PN_CHARS + '|:)|' + PLX + '))?')
+ PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
+ '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
+ PN_CHARS_GRP + ':]|' + PLX + '))?')
EXPONENT = r'[eE][+-]?\d+'