summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authornicholascar <nicholas.car@surroundaustralia.com>2022-01-09 23:12:04 +1000
committernicholascar <nicholas.car@surroundaustralia.com>2022-01-09 23:12:04 +1000
commit2c5b50993d58a65c800bf337eb3c5c718b3a0410 (patch)
tree8d3dbfe6655fba310c875eba7f8facdafb998212
parentb2fdaf5a1f45c09694dbd8925ab6b6dee84436b4 (diff)
downloadrdflib-2c5b50993d58a65c800bf337eb3c5c718b3a0410.tar.gz
allow hext to participate in RDF format roundtripping
-rw-r--r--rdflib/plugins/parsers/hext.py27
-rw-r--r--rdflib/plugins/serializers/hext.py11
-rw-r--r--test/test_parser_hext.py39
-rw-r--r--test/test_roundtrip.py25
-rw-r--r--test/test_serializer_hext.py74
5 files changed, 160 insertions, 16 deletions
diff --git a/rdflib/plugins/parsers/hext.py b/rdflib/plugins/parsers/hext.py
index 59e045cf..184d77a9 100644
--- a/rdflib/plugins/parsers/hext.py
+++ b/rdflib/plugins/parsers/hext.py
@@ -5,7 +5,7 @@ handle contexts, i.e. multiple graphs.
"""
import json
-from typing import List, Union
+from typing import List, Union, cast
from rdflib.parser import Parser
from rdflib import ConjunctiveGraph, URIRef, Literal, BNode
import warnings
@@ -24,21 +24,30 @@ class HextuplesParser(Parser):
pass
def _load_json_line(self, line: str):
- return [x if x != "" else None for x in json.loads(line)]
+ # this complex handling is because the 'value' component is
+ # allowed to be "" but not None
+ # all other "" values are treated as None
+ ret1 = json.loads(line)
+ ret2 = [x if x != "" else None for x in json.loads(line)]
+ if ret1[2] == "":
+ ret2[2] = ""
+ return ret2
def _parse_hextuple(self, cg: ConjunctiveGraph, tup: List[Union[str, None]]):
# all values check
# subject, predicate, value, datatype cannot be None
# language and graph may be None
if tup[0] is None or tup[1] is None or tup[2] is None or tup[3] is None:
- raise ValueError("subject, predicate, value, datatype cannot be None")
+ raise ValueError(
+ "subject, predicate, value, datatype cannot be None. Given: "
+ f"{tup}")
# 1 - subject
s: Union[URIRef, BNode]
if tup[0].startswith("_"):
s = BNode(value=tup[0].replace("_:", ""))
else:
- s = URIRef(tup[0])
+ s = cast(URIRef, URIRef(tup[0]))
# 2 - predicate
p = URIRef(tup[1])
@@ -46,14 +55,18 @@ class HextuplesParser(Parser):
# 3 - value
o: Union[URIRef, BNode, Literal]
if tup[3] == "globalId":
- o = URIRef(tup[2])
+ o = cast(URIRef, URIRef(tup[2]))
elif tup[3] == "localId":
o = BNode(value=tup[2].replace("_:", ""))
else: # literal
if tup[4] is None:
- o = Literal(tup[2], datatype=URIRef(tup[3]))
+ o = cast(
+ Literal,
+ Literal(tup[2], datatype=URIRef(tup[3])))
else:
- o = Literal(tup[2], lang=tup[4])
+ o = cast(
+ Literal,
+ Literal(tup[2], lang=tup[4]))
# 6 - context
if tup[5] is not None:
diff --git a/rdflib/plugins/serializers/hext.py b/rdflib/plugins/serializers/hext.py
index c86882a2..6a57a263 100644
--- a/rdflib/plugins/serializers/hext.py
+++ b/rdflib/plugins/serializers/hext.py
@@ -3,6 +3,7 @@ HextuplesSerializer RDF graph serializer for RDFLib.
See <https://github.com/ontola/hextuples> for details about the format.
"""
from typing import IO, Optional, Union
+import json
from rdflib.graph import Graph, ConjunctiveGraph
from rdflib.term import Literal, URIRef, Node, BNode
from rdflib.serializer import Serializer
@@ -20,6 +21,7 @@ class HextuplesSerializer(Serializer):
def __init__(self, store: Union[Graph, ConjunctiveGraph]):
self.default_context: Optional[Node]
if isinstance(store, ConjunctiveGraph):
+ self.graph_type = ConjunctiveGraph
self.contexts = list(store.contexts())
if store.default_context:
self.default_context = store.default_context
@@ -27,6 +29,7 @@ class HextuplesSerializer(Serializer):
else:
self.default_context = None
else:
+ self.graph_type = Graph
self.contexts = [store]
self.default_context = None
@@ -101,14 +104,14 @@ class HextuplesSerializer(Serializer):
else:
language = ""
- return '["%s", "%s", "%s", "%s", "%s", "%s"]\n' % (
+ return json.dumps([
self._iri_or_bn(triple[0]),
triple[1],
value,
datatype,
language,
- self._context(context),
- )
+ self._context(context)
+ ]) + "\n"
else: # do not return anything for non-IRIs or BNs, e.g. QuotedGraph, Subjects
return None
@@ -121,7 +124,7 @@ class HextuplesSerializer(Serializer):
return None
def _context(self, context):
- if self.default_context is None:
+ if self.graph_type == Graph:
return ""
if context.identifier == "urn:x-rdflib:default":
return ""
diff --git a/test/test_parser_hext.py b/test/test_parser_hext.py
index 27d00838..fdf41911 100644
--- a/test/test_parser_hext.py
+++ b/test/test_parser_hext.py
@@ -22,13 +22,30 @@ def test_small_string():
assert len(d) == 10
+def test_small_string_cg():
+ s = """
+ ["http://example.com/s01", "http://example.com/a", "http://example.com/Type1", "globalId", "", ""]
+ ["http://example.com/s01", "http://example.com/label", "This is a Label", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", ""]
+ ["http://example.com/s01", "http://example.com/comment", "This is a comment", "http://www.w3.org/2001/XMLSchema#string", "", ""]
+ ["http://example.com/s01", "http://example.com/creationDate", "2021-12-01", "http://www.w3.org/2001/XMLSchema#date", "", ""]
+ ["http://example.com/s01", "http://example.com/creationTime", "2021-12-01T12:13:00", "http://www.w3.org/2001/XMLSchema#dateTime", "", ""]
+ ["http://example.com/s01", "http://example.com/age", "42", "http://www.w3.org/2001/XMLSchema#integer", "", ""]
+ ["http://example.com/s01", "http://example.com/trueFalse", "false", ",http://www.w3.org/2001/XMLSchema#boolean", "", ""]
+ ["http://example.com/s01", "http://example.com/op1", "http://example.com/o1", "globalId", "", ""]
+ ["http://example.com/s01", "http://example.com/op1", "http://example.com/o2", "globalId", "", ""]
+ ["http://example.com/s01", "http://example.com/op2", "http://example.com/o3", "globalId", "", ""]
+ """
+ d = ConjunctiveGraph().parse(data=s, format="hext")
+ assert len(d) == 10
+
+
def test_small_file_singlegraph():
d = Dataset().parse(Path(__file__).parent / "test_parser_hext_singlegraph.ndjson", format="hext")
assert len(d) == 10
def test_small_file_multigraph():
- d = ConjunctiveGraph()
+ d = Dataset()
assert len(d) == 0
d.parse(
Path(__file__).parent / "test_parser_hext_multigraph.ndjson",
@@ -47,6 +64,26 @@ def test_small_file_multigraph():
assert total_triples == 18
+def test_small_file_multigraph_cg():
+ d = ConjunctiveGraph()
+ assert len(d) == 0
+ d.parse(
+ Path(__file__).parent / "test_parser_hext_multigraph.ndjson",
+ format="hext",
+ publicID=d.default_context.identifier
+ )
+
+ """There are 22 lines in the file test_parser_hext_multigraph.ndjson. When loaded
+ into a CG, we get only 18 quads since the CG can contextualise
+ the triples and thus deduplicate 4."""
+ total_triples = 0
+ # count all the triples in the ConjunctiveGraph
+ for context in d.contexts():
+ for triple in context.triples((None, None, None)):
+ total_triples += 1
+ assert total_triples == 18
+
+
def test_roundtrip():
# these are some RDF files that HexT can round-trip since the have no
# literals with no datatype declared:
diff --git a/test/test_roundtrip.py b/test/test_roundtrip.py
index 3b083cdd..4a7b7acd 100644
--- a/test/test_roundtrip.py
+++ b/test/test_roundtrip.py
@@ -11,6 +11,7 @@ from _pytest.mark.structures import Mark, MarkDecorator, ParameterSet
import rdflib
import rdflib.compare
from rdflib.util import guess_format
+from rdflib.namespace import XSD
"""
Test round-tripping by all serializers/parser that are registered.
@@ -116,6 +117,14 @@ XFAILS = {
reason="rdflib.compare.isomorphic does not work for quoted graphs.",
raises=AssertionError,
),
+ ("hext", "n3-writer-test-22.n3"): pytest.mark.xfail(
+ reason='HexTuples conflates "" and ""^^xsd:string strings',
+ raises=AssertionError,
+ ),
+ ("hext", "rdf-test-21.n3"): pytest.mark.xfail(
+ reason='HexTuples conflates "" and ""^^xsd:string strings',
+ raises=AssertionError,
+ ),
}
# This is for files which can only be represented properly in one format
@@ -155,6 +164,18 @@ def roundtrip(infmt: str, testfmt: str, source: Path, verbose: bool = False) ->
g2 = rdflib.ConjunctiveGraph()
g2.parse(data=s, format=testfmt)
+ if testfmt == "hext":
+ # HexTuples always sets Literal("abc") -> Literal("abc", datatype=XSD.string)
+ # and this prevents roundtripping since most other formats don't equate "" with
+ # ""^^xsd:string, at least not in these tests
+ #
+ # So we have to scrub the literals' string datatype declarations...
+ for c in g2.contexts():
+ for s, p, o in c.triples((None, None, None)):
+ if type(o) == rdflib.Literal and o.datatype == XSD.string:
+ c.remove((s, p, o))
+ c.add((s, p, rdflib.Literal(str(o))))
+
if verbose:
both, first, second = rdflib.compare.graph_diff(g1, g2)
print("Diff:")
@@ -193,8 +214,8 @@ def get_formats() -> Set[str]:
def make_cases(files: Collection[Tuple[Path, str]]) -> Iterable[ParameterSet]:
formats = get_formats()
for testfmt in formats:
- if testfmt == "hext":
- continue
+ # if testfmt == "hext":
+ # continue
logging.debug("testfmt = %s", testfmt)
for f, infmt in files:
constrained_formats = CONSTRAINED_FORMAT_MAP.get(f.name, None)
diff --git a/test/test_serializer_hext.py b/test/test_serializer_hext.py
index c322a211..7231338f 100644
--- a/test/test_serializer_hext.py
+++ b/test/test_serializer_hext.py
@@ -1,7 +1,7 @@
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent.absolute()))
-from rdflib import Dataset, Graph
+from rdflib import Dataset, Graph, ConjunctiveGraph
import json
@@ -31,7 +31,7 @@ def test_hext_graph():
g.parse(data=turtle_data, format="turtle")
out = g.serialize(format="hext")
- # note: cant' test for BNs in result as they will be different ever time
+ # note: can't test for BNs in result as they will be different every time
testing_lines = [
[False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", ""]'],
[False, '["http://example.com/s1", "http://example.com/p3", "Object 3", "http://www.w3.org/2001/XMLSchema#string", "", ""]'],
@@ -54,6 +54,76 @@ def test_hext_graph():
assert all([x[0] for x in testing_lines])
+def test_hext_cg():
+ """Tests ConjunctiveGraph data"""
+ d = ConjunctiveGraph()
+ trig_data = """
+ PREFIX ex: <http://example.com/>
+ PREFIX owl: <http://www.w3.org/2002/07/owl#>
+ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+ PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+
+ ex:g1 {
+ ex:s1
+ ex:p1 ex:o1 , ex:o2 ;
+ ex:p2 [
+ a owl:Thing ;
+ rdf:value "thingy" ;
+ ] ;
+ ex:p3 "Object 3" , "Object 4 - English"@en ;
+ ex:p4 "2021-12-03"^^xsd:date ;
+ ex:p5 42 ;
+ ex:p6 "42" ;
+ .
+ }
+
+ ex:g2 {
+ ex:s1
+ ex:p1 ex:o1 , ex:o2 ;
+ .
+ ex:s11 ex:p11 ex:o11 , ex:o12 .
+ }
+
+ # default graph triples
+ ex:s1 ex:p1 ex:o1 , ex:o2 .
+ ex:s21 ex:p21 ex:o21 , ex:o22 .
+
+ # other default graph triples
+ {
+ ex:s1 ex:p1 ex:o1 , ex:o2 .
+ }
+ """
+ d.parse(data=trig_data, format="trig", publicID=d.default_context.identifier)
+ out = d.serialize(format="hext")
+ # note: can't test for BNs in result as they will be different every time
+ testing_lines = [
+ [False, '["http://example.com/s21", "http://example.com/p21", "http://example.com/o21", "globalId", "", ""]'],
+ [False, '["http://example.com/s21", "http://example.com/p21", "http://example.com/o22", "globalId", "", ""]'],
+ [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", ""]'],
+ [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", ""]'],
+ [False, '["http://example.com/s11", "http://example.com/p11", "http://example.com/o12", "globalId", "", "http://example.com/g2"]'],
+ [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", "http://example.com/g2"]'],
+ [False, '["http://example.com/s11", "http://example.com/p11", "http://example.com/o11", "globalId", "", "http://example.com/g2"]'],
+ [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", "http://example.com/g2"]'],
+ [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", "http://example.com/g1"]'],
+ [False, '["http://example.com/s1", "http://example.com/p2"'],
+ [False, '"http://www.w3.org/1999/02/22-rdf-syntax-ns#value", "thingy", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'],
+ [False, '"http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/2002/07/owl#Thing", "globalId", "", "http://example.com/g1"]'],
+ [False, '["http://example.com/s1", "http://example.com/p3", "Object 4 - English", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", "http://example.com/g1"]'],
+ [False, '["http://example.com/s1", "http://example.com/p6", "42", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'],
+ [False, '["http://example.com/s1", "http://example.com/p4", "2021-12-03", "http://www.w3.org/2001/XMLSchema#date", "", "http://example.com/g1"]'],
+ [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", "http://example.com/g1"]'],
+ [False, '["http://example.com/s1", "http://example.com/p5", "42", "http://www.w3.org/2001/XMLSchema#integer", "", "http://example.com/g1"]'],
+ [False, '["http://example.com/s1", "http://example.com/p3", "Object 3", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'],
+ ]
+ for line in out.splitlines():
+ for test in testing_lines:
+ if test[1] in line:
+ test[0] = True
+
+ assert all([x[0] for x in testing_lines])
+
+
def test_hext_dataset():
"""Tests context-aware (multigraph) data"""
d = Dataset()