# test/test_parsers/test_parser_hext.py

from pathlib import Path

from rdflib import ConjunctiveGraph, Dataset, Literal
from rdflib.namespace import XSD


def test_small_string():
    """Parse ten HexT lines from an inline string into a ``Dataset``.

    Each line of the fixture is a six-element JSON array. The test passes
    when the parsed ``Dataset`` reports a length of 10, one per input line.
    """
    s = """
        ["http://example.com/s01", "http://example.com/a", "http://example.com/Type1", "globalId", "", ""]
        ["http://example.com/s01", "http://example.com/label", "This is a Label", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", ""]
        ["http://example.com/s01", "http://example.com/comment", "This is a comment", "http://www.w3.org/2001/XMLSchema#string", "", ""]
        ["http://example.com/s01", "http://example.com/creationDate", "2021-12-01", "http://www.w3.org/2001/XMLSchema#date", "", ""]
        ["http://example.com/s01", "http://example.com/creationTime", "2021-12-01T12:13:00", "http://www.w3.org/2001/XMLSchema#dateTime", "", ""]
        ["http://example.com/s01", "http://example.com/age", "42", "http://www.w3.org/2001/XMLSchema#integer", "", ""]
        ["http://example.com/s01", "http://example.com/trueFalse", "false", "http://www.w3.org/2001/XMLSchema#boolean", "", ""]
        ["http://example.com/s01", "http://example.com/op1", "http://example.com/o1", "globalId", "", ""]
        ["http://example.com/s01", "http://example.com/op1", "http://example.com/o2", "globalId", "", ""]
        ["http://example.com/s01", "http://example.com/op2", "http://example.com/o3", "globalId", "", ""]
        """
    d = Dataset().parse(data=s, format="hext")
    assert len(d) == 10


def test_small_string_cg():
    """Parse ten HexT lines from an inline string into a ``ConjunctiveGraph``.

    Each line of the fixture is a six-element JSON array. The test passes
    when the parsed ``ConjunctiveGraph`` reports a length of 10, one per
    input line.
    """
    s = """
        ["http://example.com/s01", "http://example.com/a", "http://example.com/Type1", "globalId", "", ""]
        ["http://example.com/s01", "http://example.com/label", "This is a Label", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", ""]
        ["http://example.com/s01", "http://example.com/comment", "This is a comment", "http://www.w3.org/2001/XMLSchema#string", "", ""]
        ["http://example.com/s01", "http://example.com/creationDate", "2021-12-01", "http://www.w3.org/2001/XMLSchema#date", "", ""]
        ["http://example.com/s01", "http://example.com/creationTime", "2021-12-01T12:13:00", "http://www.w3.org/2001/XMLSchema#dateTime", "", ""]
        ["http://example.com/s01", "http://example.com/age", "42", "http://www.w3.org/2001/XMLSchema#integer", "", ""]
        ["http://example.com/s01", "http://example.com/trueFalse", "false", "http://www.w3.org/2001/XMLSchema#boolean", "", ""]
        ["http://example.com/s01", "http://example.com/op1", "http://example.com/o1", "globalId", "", ""]
        ["http://example.com/s01", "http://example.com/op1", "http://example.com/o2", "globalId", "", ""]
        ["http://example.com/s01", "http://example.com/op2", "http://example.com/o3", "globalId", "", ""]
        """
    d = ConjunctiveGraph().parse(data=s, format="hext")
    assert len(d) == 10


def test_small_file_singlegraph():
    """Load the single-graph HexT fixture file and expect a length of 10."""
    data_dir = Path(__file__).parent.parent
    fixture = data_dir / "data/test_parser_hext_singlegraph.ndjson"
    dataset = Dataset()
    # ``parse`` hands back the object the assertion is made on.
    parsed = dataset.parse(fixture, format="hext")
    assert len(parsed) == 10


def test_small_file_multigraph():
    """Load the multi-graph HexT fixture into a ``Dataset`` and count triples.

    There are 22 lines in the file test_parser_hext_multigraph.ndjson. When
    loaded into a Dataset, we get only 18 quads since the dataset can
    contextualise the triples and thus deduplicate 4.
    """
    d = Dataset()
    assert len(d) == 0
    d.parse(
        Path(__file__).parent.parent / "data/test_parser_hext_multigraph.ndjson",
        format="hext",
        publicID=d.default_context.identifier,
    )

    # Count every triple in every context of the Dataset.
    total_triples = sum(
        1 for context in d.contexts() for _ in context.triples((None, None, None))
    )
    assert total_triples == 18


def test_small_file_multigraph_cg():
    """Load the multi-graph HexT fixture into a ``ConjunctiveGraph`` and count triples.

    There are 22 lines in the file test_parser_hext_multigraph.ndjson. When
    loaded into a CG, we get only 18 quads since the CG can contextualise
    the triples and thus deduplicate 4.
    """
    d = ConjunctiveGraph()
    assert len(d) == 0
    d.parse(
        Path(__file__).parent.parent / "data/test_parser_hext_multigraph.ndjson",
        format="hext",
        publicID=d.default_context.identifier,
    )

    # Count every triple in every context of the ConjunctiveGraph.
    total_triples = sum(
        1 for context in d.contexts() for _ in context.triples((None, None, None))
    )
    assert total_triples == 18


def _untype_xsd_strings(graph):
    """Replace ``xsd:string``-typed literal objects in *graph* with plain literals.

    For a context-aware graph every context is processed; otherwise the graph
    itself is processed. Each matching triple is removed and re-added with
    ``Literal(str(value))`` as its object, i.e. without a declared datatype.
    """
    targets = list(graph.contexts()) if graph.context_aware else [graph]
    for target in targets:
        # Snapshot the triples first: the loop body removes and adds triples,
        # and mutating a graph while its triple generator is live is unsafe.
        for s, p, o in list(target.triples((None, None, None))):
            if isinstance(o, Literal) and o.datatype == XSD.string:
                target.remove((s, p, o))
                target.add((s, p, Literal(str(o))))


def test_roundtrip():
    """Round-trip the N-Triples files under ``nt/`` through HexT.

    Every file found below the ``nt`` directory next to this module is parsed
    as N-Triples, serialized to HexT, parsed back from HexT, and the result
    must be isomorphic to the graph originally parsed. Files named in
    ``files_to_skip`` and files that fail N-Triples parsing are skipped.
    """
    # these are some RDF files that HexT can round-trip since they have no
    # literals with no datatype declared:
    test_dir = Path(__file__).parent.absolute() / "nt"
    # File name -> reason the file is excluded from the round trip.
    files_to_skip = {
        "paths-04.nt": "subject literal",
        "even_more_literals.nt": "JSON decoding error",
        "literals-02.nt": "JSON decoding error",
        "more_literals.nt": "JSON decoding error",
        "test.nt": "JSON decoding error",
        "literals-05.nt": "JSON decoding error",
        "i18n-01.nt": "JSON decoding error",
        "literals-04.nt": "JSON decoding error",
        "rdflibtest01.nt": "JSON decoding error",
        "rdflibtest05.nt": "JSON decoding error",
    }
    tests = 0
    # Counts only files that could not be parsed as N-Triples; files listed in
    # files_to_skip are reported but not counted here.
    skipped = 0
    print()
    # Sorted so the printed test numbering does not depend on filesystem order.
    for f in sorted(x for x in test_dir.glob("**/*") if x.is_file()):
        tests += 1
        print(f"Test {tests}: {f}")
        if f.name in files_to_skip:
            print(f"Skipping: {files_to_skip[f.name]}")
            continue

        try:
            cg = ConjunctiveGraph().parse(f, format="nt")
        except Exception:
            # Deliberately broad: any failure to read the input as N-Triples
            # just excludes the file from the round trip.
            print("Skipping: could not NT parse")
            skipped += 1
            continue

        cg2 = ConjunctiveGraph()
        cg2.parse(
            data=cg.serialize(format="hext"),
            format="hext",
            publicID=cg2.default_context.identifier,
        )
        # NOTE(review): presumably literals without a datatype come back from
        # the HexT round trip typed as xsd:string; undo that before comparing.
        _untype_xsd_strings(cg2)

        assert cg.isomorphic(cg2)

    print(f"No. tests: {tests}")
    print(f"No. tests skipped: {skipped}")


# When this module is executed directly as a script, run only the multi-graph
# Dataset test.
if __name__ == "__main__":
    test_small_file_multigraph()