allow hext to participate in RDF format roundtripping

author: nicholascar <nicholas.car@surroundaustralia.com> 2022-01-09 23:12:04 +1000
committer: nicholascar <nicholas.car@surroundaustralia.com> 2022-01-09 23:12:04 +1000
commit: 2c5b50993d58a65c800bf337eb3c5c718b3a0410 (patch)
tree: 8d3dbfe6655fba310c875eba7f8facdafb998212
parent: b2fdaf5a1f45c09694dbd8925ab6b6dee84436b4 (diff)
download: rdflib-2c5b50993d58a65c800bf337eb3c5c718b3a0410.tar.gz
5 files changed, 160 insertions, 16 deletions
diff --git a/rdflib/plugins/parsers/hext.py b/rdflib/plugins/parsers/hext.py
index 59e045cf..184d77a9 100644
--- a/rdflib/plugins/parsers/hext.py
+++ b/rdflib/plugins/parsers/hext.py
@@ -5,7 +5,7 @@ handle contexts, i.e. multiple graphs.
 """
 import json
 
-from typing import List, Union
+from typing import List, Union, cast
 from rdflib.parser import Parser
 from rdflib import ConjunctiveGraph, URIRef, Literal, BNode
 import warnings
@@ -24,21 +24,30 @@ class HextuplesParser(Parser):
         pass
 
     def _load_json_line(self, line: str):
-        return [x if x != "" else None for x in json.loads(line)]
+        # this complex handling is because the 'value' component is
+        # allowed to be "" but not None
+        # all other "" values are treated as None
+        ret1 = json.loads(line)
+        ret2 = [x if x != "" else None for x in json.loads(line)]
+        if ret1[2] == "":
+            ret2[2] = ""
+        return ret2
 
     def _parse_hextuple(self, cg: ConjunctiveGraph, tup: List[Union[str, None]]):
         # all values check
         # subject, predicate, value, datatype cannot be None
         # language and graph may be None
         if tup[0] is None or tup[1] is None or tup[2] is None or tup[3] is None:
-            raise ValueError("subject, predicate, value, datatype cannot be None")
+            raise ValueError(
+                "subject, predicate, value, datatype cannot be None. Given: "
+                f"{tup}")
 
         # 1 - subject
         s: Union[URIRef, BNode]
         if tup[0].startswith("_"):
             s = BNode(value=tup[0].replace("_:", ""))
         else:
-            s = URIRef(tup[0])
+            s = cast(URIRef, URIRef(tup[0]))
 
         # 2 - predicate
         p = URIRef(tup[1])
@@ -46,14 +55,18 @@ class HextuplesParser(Parser):
         # 3 - value
         o: Union[URIRef, BNode, Literal]
         if tup[3] == "globalId":
-            o = URIRef(tup[2])
+            o = cast(URIRef, URIRef(tup[2]))
         elif tup[3] == "localId":
             o = BNode(value=tup[2].replace("_:", ""))
         else:  # literal
             if tup[4] is None:
-                o = Literal(tup[2], datatype=URIRef(tup[3]))
+                o = cast(
+                    Literal,
+                    Literal(tup[2], datatype=URIRef(tup[3])))
             else:
-                o = Literal(tup[2], lang=tup[4])
+                o = cast(
+                    Literal,
+                    Literal(tup[2], lang=tup[4]))
 
         # 6 - context
         if tup[5] is not None:
diff --git a/rdflib/plugins/serializers/hext.py b/rdflib/plugins/serializers/hext.py
index c86882a2..6a57a263 100644
--- a/rdflib/plugins/serializers/hext.py
+++ b/rdflib/plugins/serializers/hext.py
@@ -3,6 +3,7 @@ HextuplesSerializer RDF graph serializer for RDFLib.
 See <https://github.com/ontola/hextuples> for details about the format.
 """
 from typing import IO, Optional, Union
+import json
 from rdflib.graph import Graph, ConjunctiveGraph
 from rdflib.term import Literal, URIRef, Node, BNode
 from rdflib.serializer import Serializer
@@ -20,6 +21,7 @@ class HextuplesSerializer(Serializer):
     def __init__(self, store: Union[Graph, ConjunctiveGraph]):
         self.default_context: Optional[Node]
         if isinstance(store, ConjunctiveGraph):
+            self.graph_type = ConjunctiveGraph
             self.contexts = list(store.contexts())
             if store.default_context:
                 self.default_context = store.default_context
@@ -27,6 +29,7 @@ class HextuplesSerializer(Serializer):
             else:
                 self.default_context = None
         else:
+            self.graph_type = Graph
             self.contexts = [store]
             self.default_context = None
 
@@ -101,14 +104,14 @@ class HextuplesSerializer(Serializer):
             else:
                 language = ""
 
-            return '["%s", "%s", "%s", "%s", "%s", "%s"]\n' % (
+            return json.dumps([
                 self._iri_or_bn(triple[0]),
                 triple[1],
                 value,
                 datatype,
                 language,
-                self._context(context),
-            )
+                self._context(context)
+            ]) + "\n"
         else:  # do not return anything for non-IRIs or BNs, e.g. QuotedGraph, Subjects
             return None
 
@@ -121,7 +124,7 @@ class HextuplesSerializer(Serializer):
             return None
 
     def _context(self, context):
-        if self.default_context is None:
+        if self.graph_type == Graph:
             return ""
         if context.identifier == "urn:x-rdflib:default":
             return ""
diff --git a/test/test_parser_hext.py b/test/test_parser_hext.py
index 27d00838..fdf41911 100644
--- a/test/test_parser_hext.py
+++ b/test/test_parser_hext.py
@@ -22,13 +22,30 @@ def test_small_string():
     assert len(d) == 10
 
 
+def test_small_string_cg():
+    s = """
+        ["http://example.com/s01", "http://example.com/a", "http://example.com/Type1", "globalId", "", ""]
+        ["http://example.com/s01", "http://example.com/label", "This is a Label", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", ""]
+        ["http://example.com/s01", "http://example.com/comment", "This is a comment", "http://www.w3.org/2001/XMLSchema#string", "", ""]
+        ["http://example.com/s01", "http://example.com/creationDate", "2021-12-01", "http://www.w3.org/2001/XMLSchema#date", "", ""]
+        ["http://example.com/s01", "http://example.com/creationTime", "2021-12-01T12:13:00", "http://www.w3.org/2001/XMLSchema#dateTime", "", ""]
+        ["http://example.com/s01", "http://example.com/age", "42", "http://www.w3.org/2001/XMLSchema#integer", "", ""]
+        ["http://example.com/s01", "http://example.com/trueFalse", "false", ",http://www.w3.org/2001/XMLSchema#boolean", "", ""]
+        ["http://example.com/s01", "http://example.com/op1", "http://example.com/o1", "globalId", "", ""]
+        ["http://example.com/s01", "http://example.com/op1", "http://example.com/o2", "globalId", "", ""]
+        ["http://example.com/s01", "http://example.com/op2", "http://example.com/o3", "globalId", "", ""]
+        """
+    d = ConjunctiveGraph().parse(data=s, format="hext")
+    assert len(d) == 10
+
+
 def test_small_file_singlegraph():
     d = Dataset().parse(Path(__file__).parent / "test_parser_hext_singlegraph.ndjson", format="hext")
     assert len(d) == 10
 
 
 def test_small_file_multigraph():
-    d = ConjunctiveGraph()
+    d = Dataset()
     assert len(d) == 0
     d.parse(
         Path(__file__).parent / "test_parser_hext_multigraph.ndjson",
@@ -47,6 +64,26 @@ def test_small_file_multigraph():
     assert total_triples == 18
 
 
+def test_small_file_multigraph_cg():
+    d = ConjunctiveGraph()
+    assert len(d) == 0
+    d.parse(
+        Path(__file__).parent / "test_parser_hext_multigraph.ndjson",
+        format="hext",
+        publicID=d.default_context.identifier
+    )
+
+    """There are 22 lines in the file test_parser_hext_multigraph.ndjson. When loaded
+    into a CG, we get only 18 quads since the CG can contextualise
+    the triples and thus deduplicate 4."""
+    total_triples = 0
+    # count all the triples in the ConjunctiveGraph
+    for context in d.contexts():
+        for triple in context.triples((None, None, None)):
+            total_triples += 1
+    assert total_triples == 18
+
+
 def test_roundtrip():
     # these are some RDF files that HexT can round-trip since the have no
     # literals with no datatype declared:
diff --git a/test/test_roundtrip.py b/test/test_roundtrip.py
index 3b083cdd..4a7b7acd 100644
--- a/test/test_roundtrip.py
+++ b/test/test_roundtrip.py
@@ -11,6 +11,7 @@ from _pytest.mark.structures import Mark, MarkDecorator, ParameterSet
 import rdflib
 import rdflib.compare
 from rdflib.util import guess_format
+from rdflib.namespace import XSD
 
 """
 Test round-tripping by all serializers/parser that are registered.
@@ -116,6 +117,14 @@ XFAILS = {
         reason="rdflib.compare.isomorphic does not work for quoted graphs.",
         raises=AssertionError,
     ),
+    ("hext", "n3-writer-test-22.n3"): pytest.mark.xfail(
+        reason='HexTuples conflates "" and ""^^xsd:string strings',
+        raises=AssertionError,
+    ),
+    ("hext", "rdf-test-21.n3"): pytest.mark.xfail(
+        reason='HexTuples conflates "" and ""^^xsd:string strings',
+        raises=AssertionError,
+    ),
 }
 
 # This is for files which can only be represented properly in one format
@@ -155,6 +164,18 @@ def roundtrip(infmt: str, testfmt: str, source: Path, verbose: bool = False) ->
     g2 = rdflib.ConjunctiveGraph()
     g2.parse(data=s, format=testfmt)
 
+    if testfmt == "hext":
+        # HexTuples always sets Literal("abc") -> Literal("abc", datatype=XSD.string)
+        # and this prevents roundtripping since most other formats don't equate "" with
+        # ""^^xsd:string, at least not in these tests
+        #
+        # So we have to scrub the literals' string datatype declarations...
+        for c in g2.contexts():
+            for s, p, o in c.triples((None, None, None)):
+                if type(o) == rdflib.Literal and o.datatype == XSD.string:
+                    c.remove((s, p, o))
+                    c.add((s, p, rdflib.Literal(str(o))))
+
     if verbose:
         both, first, second = rdflib.compare.graph_diff(g1, g2)
         print("Diff:")
@@ -193,8 +214,8 @@ def get_formats() -> Set[str]:
 def make_cases(files: Collection[Tuple[Path, str]]) -> Iterable[ParameterSet]:
     formats = get_formats()
     for testfmt in formats:
-        if testfmt == "hext":
-            continue
+        # if testfmt == "hext":
+        #     continue
         logging.debug("testfmt = %s", testfmt)
         for f, infmt in files:
             constrained_formats = CONSTRAINED_FORMAT_MAP.get(f.name, None)
diff --git a/test/test_serializer_hext.py b/test/test_serializer_hext.py
index c322a211..7231338f 100644
--- a/test/test_serializer_hext.py
+++ b/test/test_serializer_hext.py
@@ -1,7 +1,7 @@
 import sys
 from pathlib import Path
 sys.path.append(str(Path(__file__).parent.parent.absolute()))
-from rdflib import Dataset, Graph
+from rdflib import Dataset, Graph, ConjunctiveGraph
 import json
 
 
@@ -31,7 +31,7 @@ def test_hext_graph():
 
     g.parse(data=turtle_data, format="turtle")
     out = g.serialize(format="hext")
-    # note: cant' test for BNs in result as they will be different ever time
+    # note: can't test for BNs in result as they will be different every time
     testing_lines = [
         [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", ""]'],
         [False, '["http://example.com/s1", "http://example.com/p3", "Object 3", "http://www.w3.org/2001/XMLSchema#string", "", ""]'],
@@ -54,6 +54,76 @@ def test_hext_graph():
     assert all([x[0] for x in testing_lines])
 
 
+def test_hext_cg():
+    """Tests ConjunctiveGraph data"""
+    d = ConjunctiveGraph()
+    trig_data = """
+            PREFIX ex: <http://example.com/>
+            PREFIX owl: <http://www.w3.org/2002/07/owl#>
+            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+
+            ex:g1 {
+                ex:s1
+                    ex:p1 ex:o1 , ex:o2 ;
+                    ex:p2 [
+                        a owl:Thing ;
+                        rdf:value "thingy" ;
+                    ] ;
+                    ex:p3 "Object 3" , "Object 4 - English"@en ;
+                    ex:p4 "2021-12-03"^^xsd:date ;
+                    ex:p5 42 ;
+                    ex:p6 "42" ;
+                .
+            }
+
+            ex:g2 {
+                ex:s1
+                    ex:p1 ex:o1 , ex:o2 ;
+                .
+                ex:s11 ex:p11 ex:o11 , ex:o12 .
+            }
+
+            # default graph triples
+            ex:s1 ex:p1 ex:o1 , ex:o2 .
+            ex:s21 ex:p21 ex:o21 , ex:o22 .
+
+            # other default graph triples
+            {
+                ex:s1 ex:p1 ex:o1 , ex:o2 .
+            }
+           """
+    d.parse(data=trig_data, format="trig", publicID=d.default_context.identifier)
+    out = d.serialize(format="hext")
+    # note: can't test for BNs in result as they will be different every time
+    testing_lines = [
+        [False, '["http://example.com/s21", "http://example.com/p21", "http://example.com/o21", "globalId", "", ""]'],
+        [False, '["http://example.com/s21", "http://example.com/p21", "http://example.com/o22", "globalId", "", ""]'],
+        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", ""]'],
+        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", ""]'],
+        [False, '["http://example.com/s11", "http://example.com/p11", "http://example.com/o12", "globalId", "", "http://example.com/g2"]'],
+        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", "http://example.com/g2"]'],
+        [False, '["http://example.com/s11", "http://example.com/p11", "http://example.com/o11", "globalId", "", "http://example.com/g2"]'],
+        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", "http://example.com/g2"]'],
+        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", "http://example.com/g1"]'],
+        [False, '["http://example.com/s1", "http://example.com/p2"'],
+        [False, '"http://www.w3.org/1999/02/22-rdf-syntax-ns#value", "thingy", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'],
+        [False, '"http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/2002/07/owl#Thing", "globalId", "", "http://example.com/g1"]'],
+        [False, '["http://example.com/s1", "http://example.com/p3", "Object 4 - English", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", "http://example.com/g1"]'],
+        [False, '["http://example.com/s1", "http://example.com/p6", "42", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'],
+        [False, '["http://example.com/s1", "http://example.com/p4", "2021-12-03", "http://www.w3.org/2001/XMLSchema#date", "", "http://example.com/g1"]'],
+        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", "http://example.com/g1"]'],
+        [False, '["http://example.com/s1", "http://example.com/p5", "42", "http://www.w3.org/2001/XMLSchema#integer", "", "http://example.com/g1"]'],
+        [False, '["http://example.com/s1", "http://example.com/p3", "Object 3", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'],
+    ]
+    for line in out.splitlines():
+        for test in testing_lines:
+            if test[1] in line:
+                test[0] = True
+
+    assert all([x[0] for x in testing_lines])
+
+
 def test_hext_dataset():
     """Tests context-aware (multigraph) data"""
     d = Dataset()
author	nicholascar <nicholas.car@surroundaustralia.com>	2022-01-09 23:12:04 +1000
committer	nicholascar <nicholas.car@surroundaustralia.com>	2022-01-09 23:12:04 +1000
commit	2c5b50993d58a65c800bf337eb3c5c718b3a0410 (patch)
tree	8d3dbfe6655fba310c875eba7f8facdafb998212
parent	b2fdaf5a1f45c09694dbd8925ab6b6dee84436b4 (diff)
download	rdflib-2c5b50993d58a65c800bf337eb3c5c718b3a0410.tar.gz