diff options
author | Carlos Garnacho <carlosg@gnome.org> | 2022-07-11 20:28:29 +0200 |
---|---|---|
committer | Carlos Garnacho <carlosg@gnome.org> | 2023-01-10 13:02:31 +0100 |
commit | af89f12b1547ff0d7f01286495c545a63aa1bf29 (patch) | |
tree | 909ee2e51538e086259ed4b6aa36962eeefabe07 /src/libtracker-sparql | |
parent | ddf6665bccd51343362124dd0b250988d64cb940 (diff) | |
download | tracker-af89f12b1547ff0d7f01286495c545a63aa1bf29.tar.gz |
libtracker-sparql: Add JSON-LD deserializer implementation
This object parses a JSON-LD document and translates it into an iterable
cursor containing RDF data. Nested objects, graphs, and arrays of objects
are all observed.
Diffstat (limited to 'src/libtracker-sparql')
-rw-r--r-- | src/libtracker-sparql/meson.build | 1 | ||||
-rw-r--r-- | src/libtracker-sparql/tracker-deserializer-json-ld.c | 672 | ||||
-rw-r--r-- | src/libtracker-sparql/tracker-deserializer-json-ld.h | 38 |
3 files changed, 711 insertions, 0 deletions
diff --git a/src/libtracker-sparql/meson.build b/src/libtracker-sparql/meson.build index e5207b071..5ea2079e4 100644 --- a/src/libtracker-sparql/meson.build +++ b/src/libtracker-sparql/meson.build @@ -28,6 +28,7 @@ libtracker_sparql_sources = files( 'tracker-deserializer-rdf.c', 'tracker-deserializer-turtle.c', 'tracker-deserializer-json.c', + 'tracker-deserializer-json-ld.c', 'tracker-deserializer-resource.c', 'tracker-deserializer-xml.c', 'tracker-endpoint.c', diff --git a/src/libtracker-sparql/tracker-deserializer-json-ld.c b/src/libtracker-sparql/tracker-deserializer-json-ld.c new file mode 100644 index 000000000..aa81a4c6e --- /dev/null +++ b/src/libtracker-sparql/tracker-deserializer-json-ld.c @@ -0,0 +1,672 @@ +/* + * Copyright (C) 2022, Red Hat Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + * Author: Carlos Garnacho <carlosg@gnome.org> + */ + +/* Deserialization to cursors for the JSON format defined at: + * https://www.w3.org/TR/json-ld/ + */ + +#include "config.h" + +#include "tracker-deserializer-json-ld.h" + +#include <json-glib/json-glib.h> + +enum { + STATE_INITIAL, + STATE_ROOT_LIST, + STATE_MAYBE_GRAPH, + STATE_OBJECT_LIST, + STATE_PROPERTIES, + STATE_VALUE_LIST, + STATE_VALUE, + STATE_FINAL, +}; + +enum { + STACK_ARRAY, + STACK_OBJECT, +}; + +typedef struct { + guint type; + guint state; + union { + struct { + gint idx; + guint elements; + } array; + struct { + gint idx; + gchar **members; + gchar *id; + gboolean is_graph; + } object; + } data; +} StateStack; + +struct _TrackerDeserializerJsonLD { + TrackerDeserializer parent_instance; + JsonParser *parser; + JsonReader *reader; + GArray *state_stack; + gchar *cur_graph; + gchar *cur_subject; + gchar *cur_predicate; + gchar *cur_object; + TrackerSparqlValueType object_type; + guint state; + gboolean has_row; + guint blank_node_idx; + GError *init_error; +}; + +G_DEFINE_TYPE (TrackerDeserializerJsonLD, + tracker_deserializer_json_ld, + TRACKER_TYPE_DESERIALIZER_RDF) + +static void +tracker_deserializer_json_ld_finalize (GObject *object) +{ + TrackerDeserializerJsonLD *deserializer = + TRACKER_DESERIALIZER_JSON_LD (object); + + g_clear_object (&deserializer->reader); + g_clear_object (&deserializer->parser); + g_array_unref (deserializer->state_stack); + g_clear_pointer (&deserializer->cur_graph, g_free); + g_clear_pointer (&deserializer->cur_subject, g_free); + g_clear_pointer (&deserializer->cur_predicate, g_free); + g_clear_pointer (&deserializer->cur_object, g_free); + + G_OBJECT_CLASS (tracker_deserializer_json_ld_parent_class)->finalize (object); +} + +static void +tracker_deserializer_json_ld_constructed (GObject *object) +{ + TrackerDeserializerJsonLD *deserializer = + TRACKER_DESERIALIZER_JSON_LD (object); + GInputStream *stream; + + G_OBJECT_CLASS (tracker_deserializer_json_ld_parent_class)->constructed (object); + + stream = tracker_deserializer_get_stream (TRACKER_DESERIALIZER (object)); + + if (json_parser_load_from_stream (deserializer->parser, + stream, + NULL, + &deserializer->init_error)) { + JsonNode *root; + + root = json_parser_get_root (deserializer->parser); + deserializer->reader = json_reader_new (root); + } +} + +static void +state_clear (gpointer user_data) +{ + StateStack *elem = user_data; + + if (elem->type == STACK_OBJECT) { + g_strfreev (elem->data.object.members); + g_free (elem->data.object.id); + } +} + +static gboolean +advance_stack (TrackerDeserializerJsonLD *deserializer) +{ + StateStack *elem; + + g_assert (deserializer->state_stack->len > 0); + + elem = &g_array_index (deserializer->state_stack, + StateStack, + deserializer->state_stack->len - 1); + + if (elem->type == STACK_ARRAY) { + if (elem->data.array.idx >= 0) + json_reader_end_element (deserializer->reader); + + elem->data.array.idx++; + + if (elem->data.array.idx >= (gint) elem->data.array.elements) + return FALSE; + + json_reader_read_element (deserializer->reader, + elem->data.array.idx); + return TRUE; + } else if (elem->type == STACK_OBJECT) { + if (elem->data.object.idx >= 0) + json_reader_end_member (deserializer->reader); + + elem->data.object.idx++; + + if (elem->data.object.members[elem->data.array.idx] == NULL) + return FALSE; + + json_reader_read_member (deserializer->reader, + elem->data.object.members[elem->data.array.idx]); + return TRUE; + } + + return FALSE; +} + +static void +push_stack (TrackerDeserializerJsonLD *deserializer, + guint state) +{ + StateStack elem = { 0 }; + const gchar *id = NULL; + + if (json_reader_is_array (deserializer->reader)) { + elem.type = STACK_ARRAY; + elem.data.array.idx = -1; + elem.data.array.elements = + json_reader_count_elements (deserializer->reader); + } else if (json_reader_is_object (deserializer->reader)) { + elem.type = STACK_OBJECT; + elem.data.object.idx = -1; + elem.data.object.members = + json_reader_list_members (deserializer->reader); + + elem.data.object.is_graph = + json_reader_read_member (deserializer->reader, "@graph"); + json_reader_end_member (deserializer->reader); + + if (json_reader_read_member (deserializer->reader, "@id")) + id = json_reader_get_string_value (deserializer->reader); + json_reader_end_member (deserializer->reader); + + if (id) { + TrackerNamespaceManager *namespaces; + + namespaces = tracker_deserializer_get_namespaces (TRACKER_DESERIALIZER (deserializer)); + elem.data.object.id = + tracker_namespace_manager_expand_uri (namespaces, id); + } + } else { + g_assert_not_reached (); + } + + elem.state = state; + g_array_append_val (deserializer->state_stack, elem); + deserializer->state = state; +} + +static void +pop_stack (TrackerDeserializerJsonLD *deserializer) +{ + StateStack *elem; + + g_assert (deserializer->state_stack->len > 0); + + g_array_set_size (deserializer->state_stack, + deserializer->state_stack->len - 1); + + if (deserializer->state_stack->len > 0) { + elem = &g_array_index (deserializer->state_stack, + StateStack, + deserializer->state_stack->len - 1); + deserializer->state = elem->state; + } else { + deserializer->state = STATE_FINAL; + } +} + +static guint +stack_state (TrackerDeserializerJsonLD *deserializer) +{ + StateStack *elem; + + g_assert (deserializer->state_stack->len > 0); + + elem = &g_array_index (deserializer->state_stack, + StateStack, + deserializer->state_stack->len - 1); + + return elem->state; +} + +static const gchar * +current_member (TrackerDeserializerJsonLD *deserializer) +{ + StateStack *elem; + gint i; + + g_assert (deserializer->state_stack->len > 0); + + for (i = (gint) deserializer->state_stack->len - 1; i >= 0; i--) { + elem = &g_array_index (deserializer->state_stack, + StateStack, i); + + if (elem->type == STACK_OBJECT) { + return elem->data.object.idx >= 0 ? + elem->data.object.members[elem->data.object.idx] : + NULL; + } + } + + return NULL; +} + +static const gchar * +current_id (TrackerDeserializerJsonLD *deserializer) +{ + StateStack *elem; + gint i; + + g_assert (deserializer->state_stack->len > 0); + + for (i = (gint) deserializer->state_stack->len - 1; i >= 0; i--) { + elem = &g_array_index (deserializer->state_stack, + StateStack, i); + if (elem->type == STACK_OBJECT && + !elem->data.object.is_graph && + elem->data.object.id) + return elem->data.object.id; + } + + return NULL; +} + +static const gchar * +current_graph (TrackerDeserializerJsonLD *deserializer) +{ + StateStack *elem; + gint i; + + g_assert (deserializer->state_stack->len > 0); + + for (i = (gint) deserializer->state_stack->len - 1; i >= 0; i--) { + elem = &g_array_index (deserializer->state_stack, + StateStack, i); + if (elem->type == STACK_OBJECT && + elem->data.object.is_graph) + return elem->data.object.id; + } + + return NULL; +} + +static gchar * +node_to_string (JsonNode *node, + TrackerNamespaceManager *namespaces, + TrackerSparqlValueType *value_type) +{ + GValue value = G_VALUE_INIT; + GType type; + gchar *str = NULL; + + json_node_get_value (node, &value); + type = json_node_get_value_type (node); + + if (type == G_TYPE_INT64) { + *value_type = TRACKER_SPARQL_VALUE_TYPE_INTEGER; + str = g_strdup_printf ("%" G_GINT64_FORMAT, g_value_get_int64 (&value)); + } else if (type == G_TYPE_STRING) { + *value_type = TRACKER_SPARQL_VALUE_TYPE_STRING; + str = tracker_namespace_manager_expand_uri (namespaces, g_value_get_string (&value)); + } else if (type == G_TYPE_DOUBLE) { + gchar buf[G_ASCII_DTOSTR_BUF_SIZE]; + + g_ascii_dtostr (buf, G_ASCII_DTOSTR_BUF_SIZE, g_value_get_double (&value)); + *value_type = TRACKER_SPARQL_VALUE_TYPE_DOUBLE; + str = g_strdup (buf); + } else if (type == G_TYPE_BOOLEAN) { + *value_type = TRACKER_SPARQL_VALUE_TYPE_BOOLEAN; + str = g_strdup (g_value_get_boolean (&value) ? "true" : "false"); + } else { + *value_type = TRACKER_SPARQL_VALUE_TYPE_UNBOUND; + } + + g_value_unset (&value); + + return str; +} + +static void +load_context (TrackerDeserializerJsonLD *deserializer) +{ + TrackerNamespaceManager *namespaces; + + namespaces = tracker_deserializer_get_namespaces (TRACKER_DESERIALIZER (deserializer)); + + if (json_reader_read_member (deserializer->reader, "@context")) { + gchar **members = json_reader_list_members (deserializer->reader); + guint i; + + for (i = 0; members[i] != NULL; i++) { + if (tracker_namespace_manager_lookup_prefix (namespaces, members[i])) + continue; + + if (json_reader_read_member (deserializer->reader, members[i])) { + const gchar *expanded = json_reader_get_string_value (deserializer->reader); + tracker_namespace_manager_add_prefix (namespaces, members[i], expanded); + } + + json_reader_end_member (deserializer->reader); + } + + g_strfreev (members); + } + + json_reader_end_member (deserializer->reader); +} + +static gboolean +forward_state (TrackerDeserializerJsonLD *deserializer) +{ + TrackerNamespaceManager *namespaces; + const gchar *member; + + namespaces = tracker_deserializer_get_namespaces (TRACKER_DESERIALIZER (deserializer)); + + switch (deserializer->state) { + case STATE_INITIAL: + if (json_reader_is_array (deserializer->reader)) { + push_stack (deserializer, STATE_ROOT_LIST); + } else if (json_reader_is_object (deserializer->reader)) { + push_stack (deserializer, STATE_PROPERTIES); + deserializer->state = STATE_MAYBE_GRAPH; + } + break; + case STATE_ROOT_LIST: + if (!advance_stack (deserializer)) { + pop_stack (deserializer); + break; + } + + push_stack (deserializer, STATE_MAYBE_GRAPH); + break; + case STATE_MAYBE_GRAPH: + load_context (deserializer); + + if (json_reader_read_member (deserializer->reader, "@graph")) { + g_clear_pointer (&deserializer->cur_graph, g_free); + deserializer->cur_graph = g_strdup (current_graph (deserializer)); + push_stack (deserializer, STATE_OBJECT_LIST); + } else { + json_reader_end_member (deserializer->reader); + g_clear_pointer (&deserializer->cur_subject, g_free); + deserializer->cur_subject = g_strdup (current_id (deserializer)); + deserializer->state = STATE_PROPERTIES; + } + break; + case STATE_OBJECT_LIST: + if (!advance_stack (deserializer)) { + /* Pop the graph array, close manually the @graph + * member, and pop the graph object too + */ + pop_stack (deserializer); + json_reader_end_member (deserializer->reader); + pop_stack (deserializer); + break; + } + + push_stack (deserializer, STATE_PROPERTIES); + deserializer->state = STATE_MAYBE_GRAPH; + break; + case STATE_PROPERTIES: + if (!advance_stack (deserializer)) { + pop_stack (deserializer); + + if (deserializer->state == STATE_PROPERTIES || + deserializer->state == STATE_VALUE_LIST) { + gchar *nested_object_id; + + /* The state popped belonged to a nested object, + * switch subject/predicate back to the parent + * object, and finalize the property that defined it. + */ + nested_object_id = g_steal_pointer (&deserializer->cur_subject); + + deserializer->cur_subject = g_strdup (current_id (deserializer)); + g_clear_pointer (&deserializer->cur_predicate, g_free); + deserializer->cur_predicate = + tracker_namespace_manager_expand_uri (namespaces, current_member (deserializer)); + g_clear_pointer (&deserializer->cur_object, g_free); + deserializer->cur_object = nested_object_id; + deserializer->object_type = TRACKER_SPARQL_VALUE_TYPE_STRING; + deserializer->has_row = TRUE; + } + + break; + } + + member = current_member (deserializer); + g_clear_pointer (&deserializer->cur_predicate, g_free); + + if (g_strcmp0 (member, "@type") == 0) + deserializer->cur_predicate = g_strdup (TRACKER_PREFIX_RDF "type"); + else if (member[0] != '@') + deserializer->cur_predicate = tracker_namespace_manager_expand_uri (namespaces, member); + else + break; + + if (json_reader_is_array (deserializer->reader)) { + push_stack (deserializer, STATE_VALUE_LIST); + } else if (json_reader_is_object (deserializer->reader)) { + push_stack (deserializer, STATE_PROPERTIES); + deserializer->state = STATE_MAYBE_GRAPH; + } else { + deserializer->state = STATE_VALUE; + } + break; + case STATE_VALUE_LIST: + if (!advance_stack (deserializer)) { + pop_stack (deserializer); + break; + } + + if (json_reader_is_object (deserializer->reader)) { + push_stack (deserializer, STATE_PROPERTIES); + deserializer->state = STATE_MAYBE_GRAPH; + } else { + deserializer->state = STATE_VALUE; + } + break; + case STATE_VALUE: + g_clear_pointer (&deserializer->cur_object, g_free); + deserializer->cur_object = node_to_string (json_reader_get_value (deserializer->reader), + namespaces, + &deserializer->object_type); + deserializer->has_row = TRUE; + + deserializer->state = stack_state (deserializer); + break; + case STATE_FINAL: + break; + } + + return deserializer->state_stack->len > 0; +} + +static TrackerSparqlValueType +tracker_deserializer_json_ld_get_value_type (TrackerSparqlCursor *cursor, + gint column) +{ + TrackerDeserializerJsonLD *deserializer = + TRACKER_DESERIALIZER_JSON_LD (cursor); + + switch (column) { + case TRACKER_RDF_COL_SUBJECT: + if (!deserializer->cur_subject) + return TRACKER_SPARQL_VALUE_TYPE_UNBOUND; + else if (strncmp (deserializer->cur_object, "_:", 2) == 0) + return TRACKER_SPARQL_VALUE_TYPE_BLANK_NODE; + else + return TRACKER_SPARQL_VALUE_TYPE_URI; + break; + case TRACKER_RDF_COL_PREDICATE: + if (!deserializer->cur_predicate) + return TRACKER_SPARQL_VALUE_TYPE_UNBOUND; + else + return TRACKER_SPARQL_VALUE_TYPE_URI; + break; + case TRACKER_RDF_COL_OBJECT: + if (!deserializer->cur_object) + return TRACKER_SPARQL_VALUE_TYPE_UNBOUND; + else + return deserializer->object_type; + break; + case TRACKER_RDF_COL_GRAPH: + if (!deserializer->cur_graph) + return TRACKER_SPARQL_VALUE_TYPE_UNBOUND; + else + return TRACKER_SPARQL_VALUE_TYPE_URI; + break; + default: + return TRACKER_SPARQL_VALUE_TYPE_UNBOUND; + } +} + +static const gchar * +tracker_deserializer_json_ld_get_string (TrackerSparqlCursor *cursor, + gint column, + glong *length) +{ + TrackerDeserializerJsonLD *deserializer = + TRACKER_DESERIALIZER_JSON_LD (cursor); + const gchar *str = NULL; + + switch (column) { + case TRACKER_RDF_COL_SUBJECT: + str = deserializer->cur_subject; + break; + case TRACKER_RDF_COL_PREDICATE: + str = deserializer->cur_predicate; + break; + case TRACKER_RDF_COL_OBJECT: + str = deserializer->cur_object; + break; + case TRACKER_RDF_COL_GRAPH: + str = deserializer->cur_graph; + break; + default: + break; + } + + return str; +} + +static gboolean +tracker_deserializer_json_ld_next (TrackerSparqlCursor *cursor, + GCancellable *cancellable, + GError **error) +{ + TrackerDeserializerJsonLD *deserializer = + TRACKER_DESERIALIZER_JSON_LD (cursor); + + if (deserializer->init_error) { + GError *init_error; + + init_error = g_steal_pointer (&deserializer->init_error); + g_propagate_error (error, init_error); + return FALSE; + } + + deserializer->has_row = FALSE; + + while (!deserializer->has_row) { + if (g_cancellable_set_error_if_cancelled (cancellable, error)) + return FALSE; + + if (!forward_state (deserializer)) { + const GError *reader_error; + + reader_error = json_reader_get_error (deserializer->reader); + if (error && reader_error) + *error = g_error_copy (reader_error); + return FALSE; + } + } + + if (!deserializer->cur_subject) + deserializer->cur_subject = g_strdup_printf ("_:%d", deserializer->blank_node_idx++); + + return TRUE; +} + +static void +tracker_deserializer_json_ld_rewind (TrackerSparqlCursor *cursor) +{ + TrackerDeserializerJsonLD *deserializer = + TRACKER_DESERIALIZER_JSON_LD (cursor); + + g_array_set_size (deserializer->state_stack, 0); + deserializer->state = STATE_INITIAL; +} + +static void +tracker_deserializer_json_ld_close (TrackerSparqlCursor *cursor) +{ +} + +gboolean +tracker_deserializer_json_ld_get_parser_location (TrackerDeserializer *deserializer, + goffset *line_no, + goffset *column_no) +{ + return FALSE; +} + +static void +tracker_deserializer_json_ld_class_init (TrackerDeserializerJsonLDClass *klass) +{ + GObjectClass *object_class = G_OBJECT_CLASS (klass); + TrackerSparqlCursorClass *cursor_class = + TRACKER_SPARQL_CURSOR_CLASS (klass); + TrackerDeserializerClass *deserializer_class = + TRACKER_DESERIALIZER_CLASS (klass); + + object_class->finalize = tracker_deserializer_json_ld_finalize; + object_class->constructed = tracker_deserializer_json_ld_constructed; + + cursor_class->get_value_type = tracker_deserializer_json_ld_get_value_type; + cursor_class->get_string = tracker_deserializer_json_ld_get_string; + cursor_class->next = tracker_deserializer_json_ld_next; + cursor_class->rewind = tracker_deserializer_json_ld_rewind; + cursor_class->close = tracker_deserializer_json_ld_close; + + deserializer_class->get_parser_location = + tracker_deserializer_json_ld_get_parser_location; +} + +static void +tracker_deserializer_json_ld_init (TrackerDeserializerJsonLD *deserializer) +{ + deserializer->parser = json_parser_new (); + deserializer->state_stack = g_array_new (FALSE, FALSE, sizeof (StateStack)); + g_array_set_clear_func (deserializer->state_stack, state_clear); +} + +TrackerSparqlCursor * +tracker_deserializer_json_ld_new (GInputStream *stream, + TrackerNamespaceManager *namespaces) +{ + return g_object_new (TRACKER_TYPE_DESERIALIZER_JSON_LD, + "stream", stream, + "namespace-manager", namespaces, + NULL); +} diff --git a/src/libtracker-sparql/tracker-deserializer-json-ld.h b/src/libtracker-sparql/tracker-deserializer-json-ld.h new file mode 100644 index 000000000..82e4210c2 --- /dev/null +++ b/src/libtracker-sparql/tracker-deserializer-json-ld.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2022, Red Hat Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + * Author: Carlos Garnacho <carlosg@gnome.org> + */ + +#include "tracker-deserializer-rdf.h" + +#include <gio/gio.h> + +#ifndef __TRACKER_DESERIALIZER_JSON_LD_H__ +#define __TRACKER_DESERIALIZER_JSON_LD_H__ + +#define TRACKER_TYPE_DESERIALIZER_JSON_LD (tracker_deserializer_json_ld_get_type ()) +G_DECLARE_FINAL_TYPE (TrackerDeserializerJsonLD, + tracker_deserializer_json_ld, + TRACKER, DESERIALIZER_JSON_LD, + TrackerDeserializerRdf) + +TrackerSparqlCursor * tracker_deserializer_json_ld_new (GInputStream * stream, + TrackerNamespaceManager * manager); + +#endif /* __TRACKER_DESERIALIZER_JSON_LD_H__ */ |