diff options
Diffstat (limited to 'src/tracker-extract/tracker-extract-epub.c')
-rw-r--r-- | src/tracker-extract/tracker-extract-epub.c | 693 |
1 files changed, 0 insertions, 693 deletions
diff --git a/src/tracker-extract/tracker-extract-epub.c b/src/tracker-extract/tracker-extract-epub.c deleted file mode 100644 index 2e65e10bd..000000000 --- a/src/tracker-extract/tracker-extract-epub.c +++ /dev/null @@ -1,693 +0,0 @@ -/* - * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org> - * Copyright (C) 2008, Nokia <ivan.frade@nokia.com> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - * Boston, MA 02110-1301, USA. - */ - -#include <libtracker-extract/tracker-extract.h> - -#include "tracker-main.h" -#include "tracker-gsf.h" -#include "tracker-read.h" - -#include <unistd.h> - -typedef enum { - OPF_TAG_TYPE_UNKNOWN, - OPF_TAG_TYPE_TITLE, - OPF_TAG_TYPE_CREATED, - - OPF_TAG_TYPE_AUTHOR, - OPF_TAG_TYPE_EDITOR, - OPF_TAG_TYPE_ILLUSTRATOR, - OPF_TAG_TYPE_CONTRIBUTOR, - - OPF_TAG_TYPE_LANGUAGE, - OPF_TAG_TYPE_SUBJECT, - OPF_TAG_TYPE_DESCRIPTION, - OPF_TAG_TYPE_UUID, - OPF_TAG_TYPE_ISBN, - OPF_TAG_TYPE_PUBLISHER, - OPF_TAG_TYPE_RATING /* calibre addition, should it be indexed? how? */ -} OPFTagType; - -typedef struct { - TrackerResource *resource; - gchar *uri; - - OPFTagType element; - GList *pages; - guint in_metadata : 1; - guint in_manifest : 1; - guint has_publisher : 1; - guint has_title : 1; - guint has_content_created : 1; - guint has_language : 1; - guint has_subject : 1; - guint has_description : 1; - guint has_identifier : 1; - gchar *savedstring; -} OPFData; - -typedef struct { - GString *contents; - gsize limit; -} OPFContentData; - -static inline OPFData * -opf_data_new (const char *uri, - TrackerResource *resource) -{ - OPFData *data = g_slice_new0 (OPFData); - - data->uri = g_strdup (uri); - data->resource = g_object_ref (resource); - - return data; -} - -static inline void -opf_data_clear_saved_string (OPFData *data) -{ - if (!data || !data->savedstring) { - return; - } - - g_free (data->savedstring); - data->savedstring = NULL; -} - -static inline void -opf_data_free (OPFData *data) -{ - if (!data) { - return; - } - - g_free (data->savedstring); - - g_list_foreach (data->pages, (GFunc) g_free, NULL); - g_list_free (data->pages); - - g_object_unref (data->resource); - g_free (data->uri); - - g_slice_free (OPFData, data); -} - -/* Methods to parse the container.xml file - * pointing to the real metadata/content - */ -static void -container_xml_start_element_handler (GMarkupParseContext *context, - const gchar *element_name, - const gchar **attribute_names, - const gchar **attribute_values, - gpointer user_data, - GError **error) -{ - gchar **path_out = user_data; - gint i; - - if (g_strcmp0 (element_name, "rootfile") != 0) { - return; - } - - for (i = 0; attribute_names[i] != NULL; i++) { - if (g_strcmp0 (attribute_names[i], "full-path") == 0) { - if (!*path_out) { - *path_out = g_strdup (attribute_values[i]); - } - break; - } - } -} - -/* Methods to parse the OPF document metadata/layout */ -static void -opf_xml_start_element_handler (GMarkupParseContext *context, - const gchar *element_name, - const gchar **attribute_names, - const gchar **attribute_values, - gpointer user_data, - GError **error) -{ - OPFData *data = user_data; - gint i; - gboolean has_role_attr = FALSE; - - if (g_strcmp0 (element_name, "metadata") == 0) { - data->in_metadata = TRUE; - } else if (g_strcmp0 (element_name, "manifest") == 0) { - data->in_manifest = TRUE; - } else if (data->in_metadata) { - /* epub metadata */ - if (g_strcmp0 (element_name, "dc:title") == 0) { - data->element = OPF_TAG_TYPE_TITLE; - } else if (g_strcmp0 (element_name, "dc:creator") == 0) { - for (i = 0; attribute_names[i] != NULL; i++) { - if (g_strcmp0 (attribute_names[i], "opf:file-as") == 0) { - g_debug ("Found creator file-as tag"); - data->savedstring = g_strdup (attribute_values[i]); - } else if (g_strcmp0 (attribute_names[i], "opf:role") == 0) { - has_role_attr = TRUE; - if (g_strcmp0 (attribute_values[i], "aut") == 0) { - data->element = OPF_TAG_TYPE_AUTHOR; - } else if (g_strcmp0 (attribute_values[i], "edt") == 0) { - data->element = OPF_TAG_TYPE_EDITOR; - } else if (g_strcmp0 (attribute_values[i], "ill") == 0) { - data->element = OPF_TAG_TYPE_ILLUSTRATOR; - } else { - data->element = OPF_TAG_TYPE_UNKNOWN; - opf_data_clear_saved_string (data); - g_debug ("Unknown role, skipping"); - } - } - } - if (!has_role_attr) { - data->element = OPF_TAG_TYPE_AUTHOR; - } - } else if (g_strcmp0 (element_name, "dc:date") == 0) { - for (i = 0; attribute_names[i] != NULL; i++) { - if (g_strcmp0 (attribute_names[i], "opf:event") == 0 && - g_strcmp0 (attribute_values[i], "original-publication") == 0) { - data->element = OPF_TAG_TYPE_CREATED; - break; - } - } - } else if (g_strcmp0 (element_name, "dc:publisher") == 0) { - data->element = OPF_TAG_TYPE_PUBLISHER; - } else if (g_strcmp0 (element_name, "dc:description") == 0) { - data->element = OPF_TAG_TYPE_DESCRIPTION; - } else if (g_strcmp0 (element_name, "dc:language") == 0) { - data->element = OPF_TAG_TYPE_LANGUAGE; - } else if (g_strcmp0 (element_name, "dc:identifier") == 0) { - data->element = OPF_TAG_TYPE_UUID; - for (i = 0; attribute_names[i] != NULL; i++) { - if (g_strcmp0 (attribute_names[i], "opf:scheme") == 0) { - if (g_ascii_strncasecmp (attribute_values[i], "isbn", 4) == 0) { - data->element = OPF_TAG_TYPE_ISBN; - } - } - } - /* } else if (g_strcmp0 (element_name, "meta") == 0) { */ - /* for (i = 0; attribute_names[i] != NULL; i++) { */ - /* if (g_strcmp0 (attribute_names[i], "name") == 0) { */ - /* if (g_strcmp0 (attribute_values[i], "calibre:rating") == 0) { */ - /* anybool = TRUE; */ - /* } */ - /* } else if (anybool && g_strcmp0 (attribute_names[i], "content")) { */ - /* data->element = OPF_TAG_TYPE_RATING; */ - /* data->savedstring = g_strdup (attribute_values[i]); */ - /* } */ - /* } */ - /* } else if (g_strcmp0 (element_name, "dc:subject") == 0) { */ - /* data->element = OPF_TAG_TYPE_SUBJECT; */ - } - } else if (data->in_manifest && - g_strcmp0 (element_name, "item") == 0) { - const gchar *rel_path = NULL; - gboolean is_xhtml = FALSE; - - /* Keep list of xhtml documents for plain text extraction */ - for (i = 0; attribute_names[i] != NULL; i++) { - if (g_strcmp0 (attribute_names[i], "href") == 0) { - rel_path = attribute_values[i]; - } else if (g_strcmp0 (attribute_names[i], "media-type") == 0 && - g_strcmp0 (attribute_values[i], "application/xhtml+xml") == 0) { - is_xhtml = TRUE; - } - } - - if (is_xhtml && rel_path) { - data->pages = g_list_append (data->pages, g_strdup (rel_path)); - } - } -} - -static void -opf_xml_end_element_handler (GMarkupParseContext *context, - const gchar *element_name, - gpointer user_data, - GError **error) -{ - OPFData *data = user_data; - - if (g_strcmp0 (element_name, "metadata") == 0) { - data->in_metadata = FALSE; - } else if (g_strcmp0 (element_name, "manifest") == 0) { - data->in_manifest = FALSE; - } else { - data->element = OPF_TAG_TYPE_UNKNOWN; - } -} - -static void -opf_xml_text_handler (GMarkupParseContext *context, - const gchar *text, - gsize text_len, - gpointer user_data, - GError **error) -{ - OPFData *data = user_data; - - switch (data->element) { - case OPF_TAG_TYPE_PUBLISHER: - if (data->has_publisher) { - g_warning ("Avoiding additional publisher (%s) in EPUB '%s'", - text, data->uri); - } else { - TrackerResource *publisher; - - publisher = tracker_resource_new (NULL); - tracker_resource_set_uri (publisher, "rdf:type", "nco:Contact"); - tracker_resource_set_string (publisher, "nco:fullname", text); - - tracker_resource_set_relation (data->resource, "nco:publisher", publisher); - g_object_unref (publisher); - - data->has_publisher = TRUE; - } - break; - case OPF_TAG_TYPE_AUTHOR: - case OPF_TAG_TYPE_EDITOR: - case OPF_TAG_TYPE_ILLUSTRATOR: - case OPF_TAG_TYPE_CONTRIBUTOR: { - TrackerResource *contact, *artist = NULL; - gchar *fname, *gname, *oname; - const gchar *fullname = NULL; - gchar *role_uri = NULL; - const gchar *role_str = NULL; - gint i, j = 0, len; - - fname = NULL; - gname = NULL; - oname = NULL; - - /* parse name. may not work for dissimilar cultures. */ - if (data->savedstring != NULL) { - fullname = data->savedstring; - - /* <family name>, <given name> <other name> */ - g_debug ("Parsing 'opf:file-as' attribute:'%s'", data->savedstring); - len = strlen (data->savedstring); - - for (i = 0; i < len; i++) { - if (data->savedstring[i] == ',') { - fname = g_strndup (data->savedstring, i); - g_debug ("Found family name:'%s'", fname); - - for (; data->savedstring[i] == ',' || data->savedstring[i] == ' '; i++); - j = i; - - break; - } - } - - if (i == len) { - fname = g_strdup (data->savedstring); - g_debug ("Found only one name"); - } else { - for (; i <= len; i++) { - if (i == len || data->savedstring[i] == ' ') { - gname = g_strndup (data->savedstring + j, i - j); - g_debug ("Found given name:'%s'", gname); - - for (; data->savedstring[i] == ',' || data->savedstring[i] == ' '; i++); - - if (i != len) { - oname = g_strdup (data->savedstring + i); - g_debug ("Found other name:'%s'", oname); - } - - break; - } - } - } - } else { - fullname = text; - - /* <given name> <other name> <family name> */ - g_debug ("Parsing name, no 'opf:file-as' found: '%s'", text); - - len = strlen (text); - - for (i = 0; i < len; i++) { - if (text[i] == ' ') { - gname = g_strndup (text, i); - g_debug ("Found given name:'%s'", gname); - j = i + 1; - - break; - } - } - - if (j == 0) { - fname = g_strdup (data->savedstring); - g_debug ("Found only one name:'%s'", fname); - } else { - for (i = len - 1; i >= j - 1; i--) { - if (text[i] == ' ') { - fname = g_strdup (text + i + 1); - g_debug ("Found family name:'%s'", fname); - - if (i > j) { - oname = g_strndup (text + j, i - j); - g_debug ("Found other name:'%s'", oname); - } - - break; - } - } - } - } - - /* Role details */ - role_uri = tracker_sparql_escape_uri_printf ("urn:artist:%s", fullname); - - if (data->element == OPF_TAG_TYPE_AUTHOR) { - role_str = "nco:creator"; - } else if (data->element == OPF_TAG_TYPE_EDITOR && !data->has_publisher) { - /* Should this be nco:contributor ? - * 'Editor' is a bit vague here. - */ - role_str = "nco:publisher"; - } else if (data->element == OPF_TAG_TYPE_ILLUSTRATOR) { - /* There is no illustrator class, using contributor */ - role_str = "nco:contributor"; - } else { - g_assert ("Unknown role"); - } - - if (role_uri) { - artist = tracker_resource_new (role_uri); - tracker_resource_set_uri (artist, "rdf:type", "nmm:Artist"); - tracker_resource_set_string (artist, "nmm:artistName", fullname); - } - - /* Creator contact details */ - contact = tracker_resource_new (NULL); - tracker_resource_set_uri (contact, "rdf:type", "nco:PersonContact"); - tracker_resource_set_string (contact, "nco:fullname", fullname); - - if (fname) { - tracker_resource_set_string (contact, "nco:nameFamily", fname); - g_free (fname); - } - - if (gname) { - tracker_resource_set_string (contact, "nco:nameGiven", gname); - g_free (gname); - } - - if (oname) { - tracker_resource_set_string (contact, "nco:nameAdditional", oname); - g_free (oname); - } - - if (role_uri) { - tracker_resource_set_relation (contact, role_str, artist); - g_free (role_uri); - } - - tracker_resource_set_relation (data->resource, "nco:creator", contact); - g_clear_object (&artist); - g_object_unref (contact); - - break; - } - case OPF_TAG_TYPE_TITLE: - if (data->has_title) { - g_warning ("Avoiding additional title (%s) in EPUB '%s'", - text, data->uri); - } else { - data->has_title = TRUE; - tracker_resource_set_string (data->resource, "nie:title", text); - } - break; - case OPF_TAG_TYPE_CREATED: { - if (data->has_content_created) { - g_warning ("Avoiding additional creation time (%s) in EPUB '%s'", - text, data->uri); - } else { - gchar *date = tracker_date_guess (text); - - if (date) { - data->has_content_created = TRUE; - tracker_resource_set_string (data->resource, "nie:contentCreated", date); - g_free (date); - } else { - g_warning ("Could not parse creation time (%s) in EPUB '%s'", - text, data->uri); - } - } - break; - } - case OPF_TAG_TYPE_LANGUAGE: - if (data->has_language) { - g_warning ("Avoiding additional language (%s) in EPUB '%s'", - text, data->uri); - } else { - data->has_language = TRUE; - tracker_resource_set_string (data->resource, "nie:language", text); - } - break; - case OPF_TAG_TYPE_SUBJECT: - if (data->has_subject) { - g_warning ("Avoiding additional subject (%s) in EPUB '%s'", - text, data->uri); - } else { - data->has_subject = TRUE; - tracker_resource_set_string (data->resource, "nie:subject", text); - } - break; - case OPF_TAG_TYPE_DESCRIPTION: - if (data->has_description) { - g_warning ("Avoiding additional description (%s) in EPUB '%s'", - text, data->uri); - } else { - data->has_description = TRUE; - tracker_resource_set_string (data->resource, "nie:description", text); - } - break; - case OPF_TAG_TYPE_UUID: - case OPF_TAG_TYPE_ISBN: - if (data->has_identifier) { - g_warning ("Avoiding additional identifier (%s) in EPUB '%s'", - text, data->uri); - } else { - data->has_identifier = TRUE; - tracker_resource_set_string (data->resource, "nie:identifier", text); - } - break; - /* case OPF_TAG_TYPE_RATING: */ - case OPF_TAG_TYPE_UNKNOWN: - default: - break; - } - - opf_data_clear_saved_string (data); -} - -/* Methods to extract XHTML text content */ -static void -content_xml_text_handler (GMarkupParseContext *context, - const gchar *text, - gsize text_len, - gpointer user_data, - GError **error) -{ - OPFContentData *content_data = user_data; - gsize written_bytes = 0; - - if (text_len <= 0) { - return; - } - - if (tracker_text_validate_utf8 (text, - MIN (text_len, content_data->limit), - &content_data->contents, - &written_bytes)) { - if (content_data->contents->str[content_data->contents->len - 1] != ' ') { - g_string_append_c (content_data->contents, ' '); - } - } - - content_data->limit -= written_bytes; -} - -static gchar * -extract_opf_path (const gchar *uri) -{ - GMarkupParseContext *context; - gchar *path = NULL; - GError *error = NULL; - GMarkupParser parser = { - container_xml_start_element_handler, - NULL, NULL, NULL, NULL - }; - - /* Create parsing context */ - context = g_markup_parse_context_new (&parser, 0, &path, NULL); - - /* Load the internal container file from the Zip archive, - * and parse it to extract the .opf file to get metadata from - */ - tracker_gsf_parse_xml_in_zip (uri, "META-INF/container.xml", context, &error); - g_markup_parse_context_free (context); - - if (error || !path) { - g_warning ("Could not get EPUB container.xml file: %s\n", - (error) ? error->message : "No error provided"); - g_error_free (error); - return NULL; - } - - return path; -} - -static gchar * -extract_opf_contents (const gchar *uri, - const gchar *content_prefix, - GList *content_files) -{ - OPFContentData content_data = { 0 }; - TrackerConfig *config; - GError *error = NULL; - GList *l; - GMarkupParser xml_parser = { - NULL, NULL, - content_xml_text_handler, - NULL, NULL - }; - - config = tracker_main_get_config (); - - content_data.contents = g_string_new (""); - content_data.limit = (gsize) tracker_config_get_max_bytes (config); - - g_debug ("Extracting up to %" G_GSIZE_FORMAT " bytes of content", content_data.limit); - - for (l = content_files; l; l = l->next) { - GMarkupParseContext *context; - gchar *path; - - context = g_markup_parse_context_new (&xml_parser, 0, &content_data, NULL); - - /* Page file is relative to OPF file location */ - path = g_build_filename (content_prefix, l->data, NULL); - tracker_gsf_parse_xml_in_zip (uri, path, context, &error); - - if (error) { - g_warning ("Error extracting EPUB contents (%s): %s", - path, error->message); - g_clear_error (&error); - } - g_free (path); - - g_markup_parse_context_free (context); - - if (content_data.limit <= 0) { - /* Reached plain text extraction limit */ - break; - } - } - - return g_string_free (content_data.contents, FALSE); -} - -static TrackerResource * -extract_opf (const gchar *uri, - const gchar *opf_path) -{ - TrackerResource *ebook; - GMarkupParseContext *context; - OPFData *data = NULL; - GError *error = NULL; - gchar *dirname, *contents; - GMarkupParser opf_parser = { - opf_xml_start_element_handler, - opf_xml_end_element_handler, - opf_xml_text_handler, - NULL, NULL - }; - - g_debug ("Extracting OPF file contents from EPUB '%s'", uri); - - ebook = tracker_resource_new (NULL); - tracker_resource_add_uri (ebook, "rdf:type", "nfo:EBook"); - - data = opf_data_new (uri, ebook); - - /* Create parsing context */ - context = g_markup_parse_context_new (&opf_parser, 0, data, NULL); - - /* Load the internal container file from the Zip archive, - * and parse it to extract the .opf file to get metadata from - */ - tracker_gsf_parse_xml_in_zip (uri, opf_path, context, &error); - g_markup_parse_context_free (context); - - if (error) { - g_warning ("Could not get EPUB '%s' file: %s\n", opf_path, - (error) ? error->message : "No error provided"); - g_error_free (error); - opf_data_free (data); - g_object_unref (ebook); - return NULL; - } - - dirname = g_path_get_dirname (opf_path); - contents = extract_opf_contents (uri, dirname, data->pages); - g_free (dirname); - - if (contents && *contents) { - tracker_resource_set_string (ebook, "nie:plainTextContent", contents); - } - - opf_data_free (data); - g_free (contents); - - return ebook; -} - -G_MODULE_EXPORT gboolean -tracker_extract_get_metadata (TrackerExtractInfo *info) -{ - TrackerResource *ebook; - gchar *opf_path, *uri; - GFile *file; - - file = tracker_extract_info_get_file (info); - uri = g_file_get_uri (file); - - opf_path = extract_opf_path (uri); - - if (!opf_path) { - g_free (uri); - return FALSE; - } - - ebook = extract_opf (uri, opf_path); - g_free (opf_path); - g_free (uri); - - tracker_extract_info_set_resource (info, ebook); - g_object_unref (ebook); - - return TRUE; -} |