summaryrefslogtreecommitdiff
path: root/src/tracker-extract/tracker-extract-oasis.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/tracker-extract/tracker-extract-oasis.c')
-rw-r--r--src/tracker-extract/tracker-extract-oasis.c533
1 files changed, 0 insertions, 533 deletions
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
deleted file mode 100644
index 704dd2ac5..000000000
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ /dev/null
@@ -1,533 +0,0 @@
-/*
- * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
- * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- * Boston, MA 02110-1301, USA.
- */
-
-#include "config.h"
-
-#include <libtracker-common/tracker-common.h>
-
-#include <libtracker-extract/tracker-extract.h>
-
-#include "tracker-main.h"
-#include "tracker-gsf.h"
-#include "tracker-read.h"
-
-#include <unistd.h>
-
-typedef enum {
- ODT_TAG_TYPE_UNKNOWN,
- ODT_TAG_TYPE_TITLE,
- ODT_TAG_TYPE_SUBJECT,
- ODT_TAG_TYPE_AUTHOR,
- ODT_TAG_TYPE_KEYWORDS,
- ODT_TAG_TYPE_COMMENTS,
- ODT_TAG_TYPE_STATS,
- ODT_TAG_TYPE_CREATED,
- ODT_TAG_TYPE_GENERATOR,
- ODT_TAG_TYPE_WORD_TEXT,
- ODT_TAG_TYPE_SLIDE_TEXT,
- ODT_TAG_TYPE_SPREADSHEET_TEXT,
- ODT_TAG_TYPE_GRAPHICS_TEXT
-} ODTTagType;
-
-typedef enum {
- FILE_TYPE_INVALID,
- FILE_TYPE_ODP,
- FILE_TYPE_ODT,
- FILE_TYPE_ODS,
- FILE_TYPE_ODG
-} ODTFileType;
-
-typedef struct {
- TrackerResource *metadata;
- ODTTagType current;
- const gchar *uri;
- guint has_title : 1;
- guint has_subject : 1;
- guint has_publisher : 1;
- guint has_comment : 1;
- guint has_generator : 1;
- guint has_word_count : 1;
- guint has_page_count : 1;
- guint has_content_created : 1;
-} ODTMetadataParseInfo;
-
-typedef struct {
- ODTTagType current;
- ODTFileType file_type;
- GString *content;
- gulong bytes_pending;
-} ODTContentParseInfo;
-
-GQuark maximum_size_error_quark = 0;
-
-static void xml_start_element_handler_metadata (GMarkupParseContext *context,
- const gchar *element_name,
- const gchar **attribute_names,
- const gchar **attribute_values,
- gpointer user_data,
- GError **error);
-static void xml_end_element_handler_metadata (GMarkupParseContext *context,
- const gchar *element_name,
- gpointer user_data,
- GError **error);
-static void xml_text_handler_metadata (GMarkupParseContext *context,
- const gchar *text,
- gsize text_len,
- gpointer user_data,
- GError **error);
-static void xml_start_element_handler_content (GMarkupParseContext *context,
- const gchar *element_name,
- const gchar **attribute_names,
- const gchar **attribute_values,
- gpointer user_data,
- GError **error);
-static void xml_end_element_handler_content (GMarkupParseContext *context,
- const gchar *element_name,
- gpointer user_data,
- GError **error);
-static void xml_text_handler_content (GMarkupParseContext *context,
- const gchar *text,
- gsize text_len,
- gpointer user_data,
- GError **error);
-static void extract_oasis_content (const gchar *uri,
- gulong total_bytes,
- ODTFileType file_type,
- TrackerResource *metadata);
-
-static void
-extract_oasis_content (const gchar *uri,
- gulong total_bytes,
- ODTFileType file_type,
- TrackerResource *metadata)
-{
- gchar *content = NULL;
- ODTContentParseInfo info;
- GMarkupParseContext *context;
- GError *error = NULL;
- GMarkupParser parser = {
- xml_start_element_handler_content,
- xml_end_element_handler_content,
- xml_text_handler_content,
- NULL,
- NULL
- };
-
- /* If no content requested, return */
- if (total_bytes == 0) {
- return;
- }
-
- /* Create parse info */
- info.current = ODT_TAG_TYPE_UNKNOWN;
- info.file_type = file_type;
- info.content = g_string_new ("");
- info.bytes_pending = total_bytes;
-
- /* Create parsing context */
- context = g_markup_parse_context_new (&parser, 0, &info, NULL);
-
- /* Load the internal XML file from the Zip archive, and parse it
- * using the given context */
- tracker_gsf_parse_xml_in_zip (uri, "content.xml", context, &error);
-
- if (!error || g_error_matches (error, maximum_size_error_quark, 0)) {
- content = g_string_free (info.content, FALSE);
- tracker_resource_set_string (metadata, "nie:plainTextContent", content);
- } else {
- g_warning ("Got error parsing XML file: %s\n", error->message);
- g_string_free (info.content, TRUE);
- }
-
- if (error) {
- g_error_free (error);
- }
-
- g_free (content);
- g_markup_parse_context_free (context);
-}
-
-G_MODULE_EXPORT gboolean
-tracker_extract_get_metadata (TrackerExtractInfo *extract_info)
-{
- TrackerResource *metadata;
- TrackerConfig *config;
- ODTMetadataParseInfo info = { 0 };
- ODTFileType file_type;
- GFile *file;
- gchar *uri;
- const gchar *mime_used;
- GMarkupParseContext *context;
- GMarkupParser parser = {
- xml_start_element_handler_metadata,
- xml_end_element_handler_metadata,
- xml_text_handler_metadata,
- NULL,
- NULL
- };
-
- if (G_UNLIKELY (maximum_size_error_quark == 0)) {
- maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
- }
-
- metadata = tracker_resource_new (NULL);
- mime_used = tracker_extract_info_get_mimetype (extract_info);
-
- file = tracker_extract_info_get_file (extract_info);
- uri = g_file_get_uri (file);
-
- /* Setup conf */
- config = tracker_main_get_config ();
-
- g_debug ("Extracting OASIS metadata and contents from '%s'", uri);
-
- /* First, parse metadata */
-
- tracker_resource_add_uri (metadata, "rdf:type", "nfo:PaginatedTextDocument");
-
- /* Create parse info */
- info.metadata = metadata;
- info.current = ODT_TAG_TYPE_UNKNOWN;
- info.uri = uri;
-
- /* Create parsing context */
- context = g_markup_parse_context_new (&parser, 0, &info, NULL);
-
- /* Load the internal XML file from the Zip archive, and parse it
- * using the given context */
- tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context, NULL);
- g_markup_parse_context_free (context);
-
- if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.text") == 0) {
- file_type = FILE_TYPE_ODT;
- } else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.presentation") == 0) {
- file_type = FILE_TYPE_ODP;
- } else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.spreadsheet") == 0) {
- file_type = FILE_TYPE_ODS;
- } else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.graphics") == 0) {
- file_type = FILE_TYPE_ODG;
- } else {
- g_message ("Mime type was not recognised:'%s'", mime_used);
- file_type = FILE_TYPE_INVALID;
- }
-
- /* Extract content with the given limitations */
- extract_oasis_content (uri,
- tracker_config_get_max_bytes (config),
- file_type,
- metadata);
-
- g_free (uri);
-
- tracker_extract_info_set_resource (extract_info, metadata);
- g_object_unref (metadata);
-
- return TRUE;
-}
-
-static void
-xml_start_element_handler_metadata (GMarkupParseContext *context,
- const gchar *element_name,
- const gchar **attribute_names,
- const gchar **attribute_values,
- gpointer user_data,
- GError **error)
-{
- ODTMetadataParseInfo *data = user_data;
-
- if (g_ascii_strcasecmp (element_name, "dc:title") == 0) {
- data->current = ODT_TAG_TYPE_TITLE;
- } else if (g_ascii_strcasecmp (element_name, "dc:subject") == 0) {
- data->current = ODT_TAG_TYPE_SUBJECT;
- } else if (g_ascii_strcasecmp (element_name, "dc:creator") == 0) {
- data->current = ODT_TAG_TYPE_AUTHOR;
- } else if (g_ascii_strcasecmp (element_name, "meta:keyword") == 0) {
- data->current = ODT_TAG_TYPE_KEYWORDS;
- } else if (g_ascii_strcasecmp (element_name, "dc:description") == 0) {
- data->current = ODT_TAG_TYPE_COMMENTS;
- } else if (g_ascii_strcasecmp (element_name, "meta:document-statistic") == 0) {
- TrackerResource *metadata;
- const gchar **a, **v;
-
- metadata = data->metadata;
-
- for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
- if (g_ascii_strcasecmp (*a, "meta:word-count") == 0) {
- if (data->has_word_count) {
- g_warning ("Avoiding additional word count (%s) in OASIS document '%s'",
- *v, data->uri);
- } else {
- data->has_word_count = TRUE;
- tracker_resource_set_string (metadata, "nfo:wordCount", *v);
- }
- } else if (g_ascii_strcasecmp (*a, "meta:page-count") == 0) {
- if (data->has_page_count) {
- g_warning ("Avoiding additional page count (%s) in OASIS document '%s'",
- *v, data->uri);
- } else {
- data->has_page_count = TRUE;
- tracker_resource_set_string (metadata, "nfo:pageCount", *v);
- }
- }
- }
-
- data->current = ODT_TAG_TYPE_STATS;
- } else if (g_ascii_strcasecmp (element_name, "meta:creation-date") == 0) {
- data->current = ODT_TAG_TYPE_CREATED;
- } else if (g_ascii_strcasecmp (element_name, "meta:generator") == 0) {
- data->current = ODT_TAG_TYPE_GENERATOR;
- } else {
- data->current = -1;
- }
-}
-
-static void
-xml_end_element_handler_metadata (GMarkupParseContext *context,
- const gchar *element_name,
- gpointer user_data,
- GError **error)
-{
- ((ODTMetadataParseInfo*) user_data)->current = -1;
-}
-
-static void
-xml_text_handler_metadata (GMarkupParseContext *context,
- const gchar *text,
- gsize text_len,
- gpointer user_data,
- GError **error)
-{
- ODTMetadataParseInfo *data;
- TrackerResource *metadata;
- gchar *date;
-
- data = user_data;
- metadata = data->metadata;
-
- if (text_len == 0) {
- /* ignore empty values */
- return;
- }
-
- switch (data->current) {
- case ODT_TAG_TYPE_TITLE:
- if (data->has_title) {
- g_warning ("Avoiding additional title (%s) in OASIS document '%s'",
- text, data->uri);
- } else {
- data->has_title = TRUE;
- tracker_resource_set_string (metadata, "nie:title", text);
- }
- break;
-
- case ODT_TAG_TYPE_SUBJECT:
- if (data->has_subject) {
- g_warning ("Avoiding additional subject (%s) in OASIS document '%s'",
- text, data->uri);
- } else {
- data->has_subject = TRUE;
- tracker_resource_set_string (metadata, "nie:subject", text);
- }
- break;
-
- case ODT_TAG_TYPE_AUTHOR:
- if (data->has_publisher) {
- g_warning ("Avoiding additional publisher (%s) in OASIS document '%s'",
- text, data->uri);
- } else {
- TrackerResource *publisher = tracker_extract_new_contact (text);
-
- data->has_publisher = TRUE;
- tracker_resource_set_relation (metadata, "nco:publisher", publisher);
-
- g_object_unref (publisher);
- }
- break;
-
- case ODT_TAG_TYPE_KEYWORDS: {
- gchar *keywords;
- gchar *lasts, *keyw;
-
- keywords = g_strdup (text);
-
- for (keyw = strtok_r (keywords, ",; ", &lasts);
- keyw;
- keyw = strtok_r (NULL, ",; ", &lasts)) {
- tracker_resource_add_string (metadata, "nie:keyword", keyw);
- }
-
- g_free (keywords);
-
- break;
- }
-
- case ODT_TAG_TYPE_COMMENTS:
- if (data->has_comment) {
- g_warning ("Avoiding additional comment (%s) in OASIS document '%s'",
- text, data->uri);
- } else {
- data->has_comment = TRUE;
- tracker_resource_set_string (metadata, "nie:comment", text);
- }
- break;
-
- case ODT_TAG_TYPE_CREATED:
- if (data->has_content_created) {
- g_warning ("Avoiding additional creation time (%s) in OASIS document '%s'",
- text, data->uri);
- } else {
- date = tracker_date_guess (text);
- if (date) {
- data->has_content_created = TRUE;
- tracker_resource_set_string (metadata, "nie:contentCreated", date);
- g_free (date);
- } else {
- g_warning ("Could not parse creation time (%s) in OASIS document '%s'",
- text, data->uri);
- }
- }
- break;
-
- case ODT_TAG_TYPE_GENERATOR:
- if (data->has_generator) {
- g_warning ("Avoiding additional creation time (%s) in OASIS document '%s'",
- text, data->uri);
- } else {
- data->has_generator = TRUE;
- tracker_resource_set_string (metadata, "nie:generator", text);
- }
- break;
-
- default:
- case ODT_TAG_TYPE_STATS:
- break;
- }
-}
-
-static void
-xml_start_element_handler_content (GMarkupParseContext *context,
- const gchar *element_name,
- const gchar **attribute_names,
- const gchar **attribute_values,
- gpointer user_data,
- GError **error)
-{
- ODTContentParseInfo *data = user_data;
-
- switch (data->file_type) {
- case FILE_TYPE_ODT:
- if ((g_ascii_strcasecmp (element_name, "text:p") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:h") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:a") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:span") == 0) ||
- (g_ascii_strcasecmp (element_name, "table:table-cell") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:s") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:tab") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:line-break") == 0)) {
- data->current = ODT_TAG_TYPE_WORD_TEXT;
- } else {
- data->current = -1;
- }
- break;
-
- case FILE_TYPE_ODP:
- data->current = ODT_TAG_TYPE_SLIDE_TEXT;
- break;
-
- case FILE_TYPE_ODS:
- if (g_ascii_strncasecmp (element_name, "text", 4) == 0) {
- data->current = ODT_TAG_TYPE_SPREADSHEET_TEXT;
- } else {
- data->current = -1;
- }
- break;
-
- case FILE_TYPE_ODG:
- if (g_ascii_strncasecmp (element_name, "text", 4) == 0) {
- data->current = ODT_TAG_TYPE_GRAPHICS_TEXT;
- } else {
- data->current = -1;
- }
- break;
-
- case FILE_TYPE_INVALID:
- g_message ("Open Office Document type: %d invalid", data->file_type);
- break;
- }
-}
-
-static void
-xml_end_element_handler_content (GMarkupParseContext *context,
- const gchar *element_name,
- gpointer user_data,
- GError **error)
-{
- ODTContentParseInfo *data = user_data;
-
- /* Don't stop processing if it was a so-called 'empty' tag (e.g. <text:tab/>) */
- if (!((g_ascii_strcasecmp (element_name, "text:s") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:tab") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:line-break") == 0))) {
- data->current = -1;
- }
-
-}
-
-static void
-xml_text_handler_content (GMarkupParseContext *context,
- const gchar *text,
- gsize text_len,
- gpointer user_data,
- GError **error)
-{
- ODTContentParseInfo *data = user_data;
- gsize written_bytes = 0;
-
- switch (data->current) {
- case ODT_TAG_TYPE_WORD_TEXT:
- case ODT_TAG_TYPE_SLIDE_TEXT:
- case ODT_TAG_TYPE_SPREADSHEET_TEXT:
- case ODT_TAG_TYPE_GRAPHICS_TEXT:
- if (data->bytes_pending == 0) {
- g_set_error_literal (error,
- maximum_size_error_quark, 0,
- "Maximum text limit reached");
- break;
- }
-
- /* Look for valid UTF-8 text */
- if (tracker_text_validate_utf8 (text,
- MIN (text_len, data->bytes_pending),
- &data->content,
- &written_bytes)) {
- if (data->content->str[data->content->len - 1] != ' ') {
- /* If some bytes found to be valid, append an extra whitespace
- * as separator */
- g_string_append_c (data->content, ' ');
- }
- }
-
- data->bytes_pending -= written_bytes;
- break;
-
- default:
- break;
- }
-}