tracker-extract: Split msoffice and msoffice-xml extractors

author: Aleksander Morgado <aleksander@lanedo.com> 2010-11-24 11:01:34 +0100
committer: Aleksander Morgado <aleksander@lanedo.com> 2010-11-24 13:23:39 +0100
commit: b98fa7891390ef13f4478319afcf4952b41d7578 (patch)
tree: 3e5b7d48df809a5ce9c9c9c99e70584288ba2b1a
parent: 74eee612cf1f242eb30d2e3c458f2542ca4379d7 (diff)
download: tracker-b98fa7891390ef13f4478319afcf4952b41d7578.tar.gz
3 files changed, 726 insertions, 663 deletions
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index af5adc317..7afe879de 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -52,6 +52,7 @@ endif
 if HAVE_LIBGSF
 modules_LTLIBRARIES +=                                 \
 	libextract-msoffice.la                         \
+	libextract-msoffice-xml.la                     \
 	libextract-oasis.la
 endif
 
@@ -205,6 +206,17 @@ libextract_msoffice_la_LIBADD =                        \
 	$(TRACKER_EXTRACT_LIBS)                        \
 	$(LIBGSF_LIBS)
 
+# MS Office XML
+libextract_msoffice_xml_la_SOURCES = tracker-extract-msoffice-xml.c
+libextract_msoffice_xml_la_CFLAGS = $(LIBGSF_CFLAGS)
+libextract_msoffice_xml_la_LDFLAGS = $(module_flags)
+libextract_msoffice_xml_la_LIBADD =                    \
+	$(top_builddir)/src/libtracker-extract/libtracker-extract-@TRACKER_API_VERSION@.la \
+	$(top_builddir)/src/libtracker-common/libtracker-common.la \
+	$(BUILD_LIBS)                                  \
+	$(TRACKER_EXTRACT_LIBS)                        \
+	$(LIBGSF_LIBS)
+
 # PDF
 libextract_pdf_la_SOURCES = tracker-extract-pdf.cpp
 libextract_pdf_la_CXXFLAGS = $(POPPLER_CFLAGS)
diff --git a/src/tracker-extract/tracker-extract-msoffice-xml.c b/src/tracker-extract/tracker-extract-msoffice-xml.c
new file mode 100644
index 000000000..7e34dd34b
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-msoffice-xml.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (C) 2008-2010 Nokia <ivan.frade@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include <glib.h>
+
+#include <gsf/gsf.h>
+#include <gsf/gsf-doc-meta-data.h>
+#include <gsf/gsf-infile.h>
+#include <gsf/gsf-infile-msole.h>
+#include <gsf/gsf-input-stdio.h>
+#include <gsf/gsf-msole-utils.h>
+#include <gsf/gsf-utils.h>
+#include <gsf/gsf-infile-zip.h>
+
+#include <libtracker-common/tracker-utils.h>
+#include <libtracker-common/tracker-os-dependant.h>
+
+#include <libtracker-extract/tracker-extract.h>
+
+#include "tracker-main.h"
+#include "tracker-gsf.h"
+
+/* Identifies what the XML element currently being parsed stands for.
+ * The start-element callbacks store one of these values in
+ * MsOfficeXMLParserInfo.tag_type and the text callbacks switch on it
+ * to decide what to do with the character data.
+ */
+typedef enum {
+	/* No interesting element is open; character data is ignored */
+	MS_OFFICE_XML_TAG_INVALID,
+	/* Elements of the core/extended properties subdocuments */
+	MS_OFFICE_XML_TAG_TITLE,
+	MS_OFFICE_XML_TAG_SUBJECT,
+	MS_OFFICE_XML_TAG_AUTHOR,
+	MS_OFFICE_XML_TAG_MODIFIED,
+	MS_OFFICE_XML_TAG_COMMENTS,
+	MS_OFFICE_XML_TAG_CREATED,
+	MS_OFFICE_XML_TAG_GENERATOR,
+	MS_OFFICE_XML_TAG_NUM_OF_PAGES,
+	MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS,
+	MS_OFFICE_XML_TAG_NUM_OF_WORDS,
+	MS_OFFICE_XML_TAG_NUM_OF_LINES,
+	MS_OFFICE_XML_TAG_APPLICATION,
+	MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS,
+	/* Text-bearing elements of the content subdocuments */
+	MS_OFFICE_XML_TAG_SLIDE_TEXT,
+	MS_OFFICE_XML_TAG_WORD_TEXT,
+	MS_OFFICE_XML_TAG_XLS_SHARED_TEXT,
+	/* Not element tags: passed to xml_read() to select which set of
+	 * parser callbacks is used for a subdocument */
+	MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA,
+	MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA
+} MsOfficeXMLTagType;
+
+/* Kind of MS Office XML document being extracted. It is derived from
+ * the MIME type of the file in extract_msoffice_xml() and decides which
+ * parts of the package are parsed for text.
+ */
+typedef enum {
+	/* MIME type not recognised */
+	FILE_TYPE_INVALID,
+	/* presentationml.presentation */
+	FILE_TYPE_PPTX,
+	/* presentationml.slideshow */
+	FILE_TYPE_PPSX,
+	/* wordprocessingml.document */
+	FILE_TYPE_DOCX,
+	/* spreadsheetml.sheet */
+	FILE_TYPE_XLSX
+} MsOfficeXMLFileType;
+
+/* State shared between the GMarkup callbacks of this extractor; a
+ * pointer to it is passed as the callbacks' user_data.
+ */
+typedef struct {
+	/* Builder that receives the extracted predicates/objects */
+	TrackerSparqlBuilder *metadata;
+	/* Document kind, selects the element names that are looked for */
+	MsOfficeXMLFileType file_type;
+	/* Meaning of the element currently open, set by the
+	 * start-element callbacks and reset when an element ends */
+	MsOfficeXMLTagType tag_type;
+	/* Set by the DOCX start-element callback for selected styles,
+	 * font sizes and wrapper elements; cleared when a w:p ends */
+	gboolean style_element_present;
+	/* Set when a w:t element carries xml:space="preserve";
+	 * cleared when a w:p ends */
+	gboolean preserve_attribute_present;
+	/* URI of the document being extracted (not owned) */
+	const gchar *uri;
+	/* Plain text collected so far, created lazily by the text callback */
+	GString *content;
+	/* Guards against emitting more than one nie:title */
+	gboolean title_already_set;
+	/* Guards against emitting more than one nie:generator */
+	gboolean generator_already_set;
+	/* Number of text bytes that may still be appended to content */
+	gulong bytes_pending;
+} MsOfficeXMLParserInfo;
+
+/* Error domain used by the text callback when it stops the parser
+ * because the byte budget is used up. Initialised on first use in
+ * extract_msoffice_xml().
+ */
+static GQuark maximum_size_error_quark = 0;
+
+static void extract_msoffice_xml (const gchar          *uri,
+                                  TrackerSparqlBuilder *preupdate,
+                                  TrackerSparqlBuilder *metadata);
+
+/* MIME types handled by this module, each mapped to its extract
+ * function; the list is terminated by a { NULL, NULL } entry.
+ */
+static TrackerExtractData data[] = {
+	/* MS Office 2007 (Office Open XML) formats */
+	{ "application/vnd.openxmlformats-officedocument.presentationml.presentation", extract_msoffice_xml },
+	{ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",    extract_msoffice_xml },
+	{ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",         extract_msoffice_xml },
+	{ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",   extract_msoffice_xml },
+	{ NULL, NULL }
+};
+
+/* Start-element callback used while parsing a text subdocument.
+ *
+ * Depending on the document kind stored in the parser info it records,
+ * in info->tag_type, whether the element that just opened carries text
+ * worth collecting. For DOCX files it additionally raises
+ * info->style_element_present / info->preserve_attribute_present when
+ * selected style, size, wrapper or whitespace markers are seen.
+ *
+ * @user_data is the MsOfficeXMLParserInfo of the running parse;
+ * @context and @error are not used.
+ */
+static void
+xml_start_element_handler_text_data (GMarkupParseContext  *context,
+                                     const gchar          *element_name,
+                                     const gchar         **attribute_names,
+                                     const gchar         **attribute_values,
+                                     gpointer              user_data,
+                                     GError              **error)
+{
+	MsOfficeXMLParserInfo *info = user_data;
+	gint i;
+
+	switch (info->file_type) {
+	case FILE_TYPE_INVALID:
+		g_message ("Microsoft document type:%d invalid", info->file_type);
+		break;
+
+	case FILE_TYPE_PPTX:
+	case FILE_TYPE_PPSX:
+		/* Every element of a slide is treated as a text carrier */
+		info->tag_type = MS_OFFICE_XML_TAG_SLIDE_TEXT;
+		break;
+
+	case FILE_TYPE_XLSX:
+		if (g_ascii_strcasecmp (element_name, "t") == 0) {
+			info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
+		} else if (g_ascii_strcasecmp (element_name, "sheet") == 0) {
+			/* Only sheets that have a name attribute count */
+			for (i = 0; attribute_names[i] != NULL; i++) {
+				if (g_ascii_strcasecmp (attribute_names[i], "name") == 0) {
+					info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
+				}
+			}
+		}
+		break;
+
+	case FILE_TYPE_DOCX:
+		if (g_ascii_strcasecmp (element_name, "w:smartTag") == 0 ||
+		    g_ascii_strcasecmp (element_name, "w:sdtContent") == 0 ||
+		    g_ascii_strcasecmp (element_name, "w:hyperlink") == 0) {
+			info->style_element_present = TRUE;
+		} else if (g_ascii_strcasecmp (element_name, "w:t") == 0) {
+			for (i = 0; attribute_names[i] != NULL; i++) {
+				if (g_ascii_strcasecmp (attribute_names[i], "xml:space") == 0 &&
+				    g_ascii_strncasecmp (attribute_values[i], "preserve", 8) == 0) {
+					info->preserve_attribute_present = TRUE;
+				}
+			}
+
+			/* w:t is the element that actually holds the text */
+			info->tag_type = MS_OFFICE_XML_TAG_WORD_TEXT;
+		} else if (g_ascii_strcasecmp (element_name, "w:pStyle") == 0) {
+			/* Paragraph styles, matched by prefix of w:val */
+			for (i = 0; attribute_names[i] != NULL; i++) {
+				const gchar *value = attribute_values[i];
+
+				if (g_ascii_strcasecmp (attribute_names[i], "w:val") == 0 &&
+				    (g_ascii_strncasecmp (value, "Heading", 7) == 0 ||
+				     g_ascii_strncasecmp (value, "TOC", 3) == 0 ||
+				     g_ascii_strncasecmp (value, "Section", 7) == 0 ||
+				     g_ascii_strncasecmp (value, "Title", 5) == 0 ||
+				     g_ascii_strncasecmp (value, "Subtitle", 8) == 0)) {
+					info->style_element_present = TRUE;
+				}
+			}
+		} else if (g_ascii_strcasecmp (element_name, "w:rStyle") == 0) {
+			/* Run styles, matched by prefix of w:val */
+			for (i = 0; attribute_names[i] != NULL; i++) {
+				const gchar *value = attribute_values[i];
+
+				if (g_ascii_strcasecmp (attribute_names[i], "w:val") == 0 &&
+				    (g_ascii_strncasecmp (value, "SubtleEmphasis", 14) == 0 ||
+				     g_ascii_strncasecmp (value, "SubtleReference", 15) == 0)) {
+					info->style_element_present = TRUE;
+				}
+			}
+		} else if (g_ascii_strcasecmp (element_name, "w:sz") == 0) {
+			/* A w:val of 38 or more is flagged like a style */
+			for (i = 0; attribute_names[i] != NULL; i++) {
+				if (g_ascii_strcasecmp (attribute_names[i], "w:val") == 0 &&
+				    atoi (attribute_values[i]) >= 38) {
+					info->style_element_present = TRUE;
+				}
+			}
+		}
+		break;
+	}
+}
+
+/* End-element callback shared by all parsers of this extractor.
+ *
+ * Closing any element marks the current tag as uninteresting again;
+ * closing a w:p element additionally drops the style/whitespace flags
+ * that were raised inside it. @context and @error are not used.
+ */
+static void
+xml_end_element_handler_document_data (GMarkupParseContext  *context,
+                                       const gchar          *element_name,
+                                       gpointer              user_data,
+                                       GError              **error)
+{
+	MsOfficeXMLParserInfo *info = user_data;
+	gboolean paragraph_closed;
+
+	paragraph_closed = (g_ascii_strcasecmp (element_name, "w:p") == 0);
+
+	if (paragraph_closed) {
+		info->preserve_attribute_present = FALSE;
+		info->style_element_present = FALSE;
+	}
+
+	info->tag_type = MS_OFFICE_XML_TAG_INVALID;
+}
+
+/* Start-element callback used while parsing a properties subdocument.
+ *
+ * Maps the (case-insensitively compared) element name to the tag type
+ * that the text callback acts upon. Unknown elements reset the tag type
+ * to MS_OFFICE_XML_TAG_INVALID; cp:lastModifiedBy is recognised but
+ * leaves the current tag type as it is. Attributes, @context and @error
+ * are not used.
+ */
+static void
+xml_start_element_handler_core_data (GMarkupParseContext  *context,
+                                     const gchar          *element_name,
+                                     const gchar         **attribute_names,
+                                     const gchar         **attribute_values,
+                                     gpointer              user_data,
+                                     GError              **error)
+{
+	static const struct {
+		const gchar        *element;
+		MsOfficeXMLTagType  tag;
+	} known_elements[] = {
+		{ "dc:title",         MS_OFFICE_XML_TAG_TITLE },
+		{ "dc:subject",       MS_OFFICE_XML_TAG_SUBJECT },
+		{ "dc:creator",       MS_OFFICE_XML_TAG_AUTHOR },
+		{ "dc:description",   MS_OFFICE_XML_TAG_COMMENTS },
+		{ "dcterms:created",  MS_OFFICE_XML_TAG_CREATED },
+		{ "meta:generator",   MS_OFFICE_XML_TAG_GENERATOR },
+		{ "dcterms:modified", MS_OFFICE_XML_TAG_MODIFIED },
+		{ "Pages",            MS_OFFICE_XML_TAG_NUM_OF_PAGES },
+		{ "Slides",           MS_OFFICE_XML_TAG_NUM_OF_PAGES },
+		{ "Paragraphs",       MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS },
+		{ "Characters",       MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS },
+		{ "Words",            MS_OFFICE_XML_TAG_NUM_OF_WORDS },
+		{ "Lines",            MS_OFFICE_XML_TAG_NUM_OF_LINES },
+		{ "Application",      MS_OFFICE_XML_TAG_APPLICATION }
+	};
+	MsOfficeXMLParserInfo *info = user_data;
+	guint i;
+
+	/* Do nothing ? (tag type intentionally left unchanged) */
+	if (g_ascii_strcasecmp (element_name, "cp:lastModifiedBy") == 0) {
+		return;
+	}
+
+	for (i = 0; i < G_N_ELEMENTS (known_elements); i++) {
+		if (g_ascii_strcasecmp (element_name, known_elements[i].element) == 0) {
+			info->tag_type = known_elements[i].tag;
+			return;
+		}
+	}
+
+	info->tag_type = MS_OFFICE_XML_TAG_INVALID;
+}
+
+/* Text callback used while parsing a properties subdocument.
+ *
+ * Turns the character data of the element selected by info->tag_type
+ * into a predicate/object pair on info->metadata. nie:title and
+ * nie:generator are written at most once per document; further values
+ * are reported with a warning and dropped.
+ *
+ * Dates are converted with tracker_date_guess(); when no date could be
+ * produced nothing is written, so that a predicate is never left
+ * without its object.
+ *
+ * @text is used as a nul-terminated string; @text_len, @context and
+ * @error are not used.
+ */
+static void
+xml_core_handler_document_data (GMarkupParseContext  *context,
+                                const gchar          *text,
+                                gsize                 text_len,
+                                gpointer              user_data,
+                                GError              **error)
+{
+	MsOfficeXMLParserInfo *info = user_data;
+
+	switch (info->tag_type) {
+	/* Ignore tags that may not happen inside the core subdocument */
+	case MS_OFFICE_XML_TAG_WORD_TEXT:
+	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
+	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
+		break;
+
+	case MS_OFFICE_XML_TAG_TITLE:
+		if (info->title_already_set) {
+			g_warning ("Avoiding additional title (%s) in MsOffice XML document '%s'",
+			           text, info->uri);
+		} else {
+			info->title_already_set = TRUE;
+			tracker_sparql_builder_predicate (info->metadata, "nie:title");
+			tracker_sparql_builder_object_unvalidated (info->metadata, text);
+		}
+		break;
+
+	case MS_OFFICE_XML_TAG_SUBJECT:
+		tracker_sparql_builder_predicate (info->metadata, "nie:subject");
+		tracker_sparql_builder_object_unvalidated (info->metadata, text);
+		break;
+
+	case MS_OFFICE_XML_TAG_AUTHOR:
+		/* The author is stored as an anonymous nco:Contact */
+		tracker_sparql_builder_predicate (info->metadata, "nco:publisher");
+
+		tracker_sparql_builder_object_blank_open (info->metadata);
+		tracker_sparql_builder_predicate (info->metadata, "a");
+		tracker_sparql_builder_object (info->metadata, "nco:Contact");
+
+		tracker_sparql_builder_predicate (info->metadata, "nco:fullname");
+		tracker_sparql_builder_object_unvalidated (info->metadata, text);
+		tracker_sparql_builder_object_blank_close (info->metadata);
+		break;
+
+	case MS_OFFICE_XML_TAG_COMMENTS:
+		tracker_sparql_builder_predicate (info->metadata, "nie:comment");
+		tracker_sparql_builder_object_unvalidated (info->metadata, text);
+		break;
+
+	case MS_OFFICE_XML_TAG_CREATED: {
+		gchar *date;
+
+		date = tracker_date_guess (text);
+		if (date) {
+			tracker_sparql_builder_predicate (info->metadata, "nie:contentCreated");
+			tracker_sparql_builder_object_unvalidated (info->metadata, date);
+			g_free (date);
+		} else {
+			g_warning ("Could not parse creation time (%s) in MsOffice XML document '%s'",
+			           text, info->uri);
+		}
+		break;
+	}
+
+	case MS_OFFICE_XML_TAG_GENERATOR:
+		if (info->generator_already_set) {
+			g_warning ("Avoiding additional generator (%s) in MsOffice XML document '%s'",
+			           text, info->uri);
+		} else {
+			info->generator_already_set = TRUE;
+			tracker_sparql_builder_predicate (info->metadata, "nie:generator");
+			tracker_sparql_builder_object_unvalidated (info->metadata, text);
+		}
+		break;
+
+	case MS_OFFICE_XML_TAG_APPLICATION:
+		/* FIXME: Same code as MS_OFFICE_XML_TAG_GENERATOR should be
+		 * used, but nie:generator has max cardinality of 1
+		 * and this would cause errors.
+		 */
+		break;
+
+	case MS_OFFICE_XML_TAG_MODIFIED: {
+		gchar *date;
+
+		date = tracker_date_guess (text);
+		if (date) {
+			tracker_sparql_builder_predicate (info->metadata, "nie:contentLastModified");
+			tracker_sparql_builder_object_unvalidated (info->metadata, date);
+			g_free (date);
+		} else {
+			g_warning ("Could not parse last modification time (%s) in MsOffice XML document '%s'",
+			           text, info->uri);
+		}
+		break;
+	}
+
+	case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
+		tracker_sparql_builder_predicate (info->metadata, "nfo:pageCount");
+		tracker_sparql_builder_object_unvalidated (info->metadata, text);
+		break;
+
+	case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
+		tracker_sparql_builder_predicate (info->metadata, "nfo:characterCount");
+		tracker_sparql_builder_object_unvalidated (info->metadata, text);
+		break;
+
+	case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
+		tracker_sparql_builder_predicate (info->metadata, "nfo:wordCount");
+		tracker_sparql_builder_object_unvalidated (info->metadata, text);
+		break;
+
+	case MS_OFFICE_XML_TAG_NUM_OF_LINES:
+		tracker_sparql_builder_predicate (info->metadata, "nfo:lineCount");
+		tracker_sparql_builder_object_unvalidated (info->metadata, text);
+		break;
+
+	case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
+		/* TODO: There is no ontology for this. */
+		break;
+
+	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
+	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
+		/* Nothing as we are using it in defining type of data */
+		break;
+
+	case MS_OFFICE_XML_TAG_INVALID:
+		/* We can't log here, otherwise it would print for every
+		 * element we are not interested in */
+		break;
+	}
+}
+
+/* Text callback used while parsing a text subdocument.
+ *
+ * Appends the character data of text-carrying elements, followed by a
+ * single space, to info->content, never handing more than
+ * info->bytes_pending bytes to the UTF-8 validator. Spreadsheet text is
+ * only taken when atoi() of it yields 0. Once the byte budget is used
+ * up an error in the maximum_size_error_quark domain is set, which
+ * makes the parser stop.
+ *
+ * @context is not used.
+ */
+static void
+xml_text_handler_document_data (GMarkupParseContext  *context,
+                                const gchar          *text,
+                                gsize                 text_len,
+                                gpointer              user_data,
+                                GError              **error)
+{
+	MsOfficeXMLParserInfo *info = user_data;
+	gsize n_written = 0;
+	gboolean take_text;
+
+	/* Nothing left of the budget: abort the parse */
+	if (info->bytes_pending == 0) {
+		g_set_error_literal (error,
+		                     maximum_size_error_quark,
+		                     0,
+		                     "Maximum text limit reached");
+		return;
+	}
+
+	/* The buffer is created on the first call, whatever the tag is */
+	if (G_UNLIKELY (info->content == NULL)) {
+		info->content = g_string_new ("");
+	}
+
+	switch (info->tag_type) {
+	case MS_OFFICE_XML_TAG_WORD_TEXT:
+	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
+		take_text = TRUE;
+		break;
+
+	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
+		take_text = (atoi (text) == 0);
+		break;
+
+	default:
+		/* Tags that may not happen inside the text subdocument */
+		take_text = FALSE;
+		break;
+	}
+
+	if (!take_text) {
+		return;
+	}
+
+	tracker_text_validate_utf8 (text,
+	                            MIN (text_len, info->bytes_pending),
+	                            &info->content,
+	                            &n_written);
+	g_string_append_c (info->content, ' ');
+	info->bytes_pending -= n_written;
+}
+
+/* Parses one XML part of the package with the callbacks matching @type.
+ *
+ * @parser_info: state of the running extraction; its content buffer,
+ *               "already set" flags and byte budget are updated with
+ *               whatever the nested parse produced.
+ * @xml_filename: name of the part inside the Zip archive.
+ * @type: MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA or
+ *        MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA; any other value makes the
+ *        call a no-op.
+ *
+ * Returns: always TRUE.
+ */
+static gboolean
+xml_read (MsOfficeXMLParserInfo *parser_info,
+          const gchar           *xml_filename,
+          MsOfficeXMLTagType     type)
+{
+	/* g_markup_parse_context_new() is handed a pointer to the callback
+	 * table, so the table has to stay valid for as long as the context
+	 * is in use: keep the tables in static storage instead of in a
+	 * scope that ends before the parse starts.
+	 */
+	static const GMarkupParser core_parser = {
+		xml_start_element_handler_core_data,
+		xml_end_element_handler_document_data,
+		xml_core_handler_document_data,
+		NULL,
+		NULL
+	};
+	static const GMarkupParser text_parser = {
+		xml_start_element_handler_text_data,
+		xml_end_element_handler_document_data,
+		xml_text_handler_document_data,
+		NULL,
+		NULL
+	};
+	const GMarkupParser *parser;
+	GMarkupParseContext *context;
+	MsOfficeXMLParserInfo info;
+
+	switch (type) {
+	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
+		parser = &core_parser;
+		break;
+
+	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
+		parser = &text_parser;
+		break;
+
+	default:
+		parser = NULL;
+		break;
+	}
+
+	if (!parser) {
+		return TRUE;
+	}
+
+	/* The nested parse works on its own copy so that the per-element
+	 * state (tag type, style flags) of the caller is not disturbed;
+	 * everything that has to survive is copied back below.
+	 */
+	info.metadata = parser_info->metadata;
+	info.file_type = parser_info->file_type;
+	info.tag_type = MS_OFFICE_XML_TAG_INVALID;
+	info.style_element_present = FALSE;
+	info.preserve_attribute_present = FALSE;
+	info.uri = parser_info->uri;
+	info.content = parser_info->content;
+	info.title_already_set = parser_info->title_already_set;
+	info.generator_already_set = parser_info->generator_already_set;
+	info.bytes_pending = parser_info->bytes_pending;
+
+	context = g_markup_parse_context_new (parser, 0, &info, NULL);
+
+	/* Load the internal XML file from the Zip archive, and parse it
+	 * using the given context */
+	tracker_gsf_parse_xml_in_zip (parser_info->uri,
+	                              xml_filename,
+	                              context, NULL);
+	g_markup_parse_context_free (context);
+
+	/* Hand the results back: without this the text buffer created by
+	 * the text callback would be leaked and never reach the metadata,
+	 * and the single-value guards and the byte budget would restart
+	 * for every part.
+	 */
+	parser_info->content = info.content;
+	parser_info->title_already_set = info.title_already_set;
+	parser_info->generator_already_set = info.generator_already_set;
+	parser_info->bytes_pending = info.bytes_pending;
+
+	return TRUE;
+}
+
+/* Start-element callback for the package's [Content_Types].xml.
+ *
+ * For every Override element it looks at the PartName/ContentType pair
+ * and, when the content type is a properties part or a text part of the
+ * document kind being extracted, parses that part with xml_read().
+ * Other elements only reset the tag type. @context and @error are not
+ * used.
+ */
+static void
+xml_start_element_handler_content_types (GMarkupParseContext  *context,
+                                         const gchar          *element_name,
+                                         const gchar         **attribute_names,
+                                         const gchar         **attribute_values,
+                                         gpointer              user_data,
+                                         GError              **error)
+{
+	MsOfficeXMLParserInfo *info;
+	const gchar *part_name;
+	const gchar *content_type;
+	const gchar *member_name;
+	gint i;
+
+	info = user_data;
+
+	if (g_ascii_strcasecmp (element_name, "Override") != 0) {
+		info->tag_type = MS_OFFICE_XML_TAG_INVALID;
+		return;
+	}
+
+	part_name = NULL;
+	content_type = NULL;
+
+	for (i = 0; attribute_names[i]; i++) {
+		if (g_ascii_strcasecmp (attribute_names[i], "PartName") == 0) {
+			part_name = attribute_values[i];
+		} else if (g_ascii_strcasecmp (attribute_names[i], "ContentType") == 0) {
+			content_type = attribute_values[i];
+		}
+	}
+
+	/* Both part_name and content_type MUST be NON-NULL */
+	if (!part_name || !content_type) {
+		g_message ("Invalid file (part_name:%s, content_type:%s)",
+		           part_name ? part_name : "none",
+		           content_type ? content_type : "none");
+		return;
+	}
+
+	/* Part names are written with a leading '/', which is not part of
+	 * the member name inside the archive. Only skip it when it is
+	 * really there, so that an empty or unusual PartName never makes
+	 * us read past the end of the string.
+	 */
+	member_name = (part_name[0] == '/') ? part_name + 1 : part_name;
+
+	if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-package.core-properties+xml") == 0) ||
+	    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.extended-properties+xml") == 0)) {
+		xml_read (info, member_name, MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA);
+		return;
+	}
+
+	switch (info->file_type) {
+	case FILE_TYPE_DOCX:
+		if (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml") == 0) {
+			xml_read (info, member_name, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
+		}
+		break;
+
+	case FILE_TYPE_PPTX:
+	case FILE_TYPE_PPSX:
+		if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.presentationml.slide+xml") == 0) ||
+		    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml") == 0)) {
+			xml_read (info, member_name, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
+		}
+		break;
+
+	case FILE_TYPE_XLSX:
+		if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml") == 0) ||
+		    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml") == 0)) {
+			xml_read (info, member_name, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
+		}
+		break;
+
+	case FILE_TYPE_INVALID:
+		g_message ("Invalid file type:'%d'", info->file_type);
+		break;
+	}
+}
+
+/* Extract function for the MS Office XML MIME types.
+ *
+ * Determines the document kind from the content type of @uri, marks the
+ * resource as nfo:PaginatedTextDocument and then walks the package's
+ * [Content_Types].xml; the parts referenced from there add the
+ * properties to @metadata and collect the text, which is finally
+ * written as nie:plainTextContent. The amount of text is limited by the
+ * configured maximum number of bytes.
+ *
+ * @preupdate is not used.
+ */
+static void
+extract_msoffice_xml (const gchar          *uri,
+                      TrackerSparqlBuilder *preupdate,
+                      TrackerSparqlBuilder *metadata)
+{
+	MsOfficeXMLParserInfo info;
+	MsOfficeXMLFileType file_type;
+	TrackerConfig *config;
+	GFile *file;
+	GFileInfo *file_info;
+	GMarkupParseContext *context = NULL;
+	GError *error = NULL;
+	gulong  total_bytes;
+	GMarkupParser parser = {
+		xml_start_element_handler_content_types,
+		xml_end_element_handler_document_data,
+		NULL,
+		NULL,
+		NULL
+	};
+	const gchar *mime_used;
+
+	if (G_UNLIKELY (maximum_size_error_quark == 0)) {
+		maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
+	}
+
+	file = g_file_new_for_uri (uri);
+
+	if (!file) {
+		g_warning ("Could not create GFile for URI:'%s'",
+		           uri);
+		return;
+	}
+
+	file_info = g_file_query_info (file,
+	                               G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE,
+	                               G_FILE_QUERY_INFO_NONE,
+	                               NULL,
+	                               NULL);
+	g_object_unref (file);
+
+	if (!file_info) {
+		g_warning ("Could not get GFileInfo for URI:'%s'",
+		           uri);
+		return;
+	}
+
+	mime_used = g_file_info_get_content_type (file_info);
+
+	if (mime_used == NULL) {
+		/* No content type available: the string comparisons below
+		 * must not be given NULL */
+		g_message ("Mime type was not recognised:'%s'", "(none)");
+		file_type = FILE_TYPE_INVALID;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.wordprocessingml.document") == 0) {
+		file_type = FILE_TYPE_DOCX;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.presentation") == 0) {
+		file_type = FILE_TYPE_PPTX;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.slideshow") == 0) {
+		file_type = FILE_TYPE_PPSX;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") == 0) {
+		file_type = FILE_TYPE_XLSX;
+	} else {
+		g_message ("Mime type was not recognised:'%s'", mime_used);
+		file_type = FILE_TYPE_INVALID;
+	}
+
+	/* mime_used belongs to file_info and is not used past this point */
+	g_object_unref (file_info);
+
+	/* Setup conf */
+	config = tracker_main_get_config ();
+
+	g_debug ("Extracting MsOffice XML format...");
+
+	tracker_sparql_builder_predicate (metadata, "a");
+	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+	total_bytes = tracker_config_get_max_bytes (config);
+	info.metadata = metadata;
+	info.file_type = file_type;
+	info.tag_type = MS_OFFICE_XML_TAG_INVALID;
+	info.style_element_present = FALSE;
+	info.preserve_attribute_present = FALSE;
+	info.uri = uri;
+	info.content = NULL;
+	info.title_already_set = FALSE;
+	info.generator_already_set = FALSE;
+	info.bytes_pending = total_bytes;
+	context = g_markup_parse_context_new (&parser, 0, &info, NULL);
+
+	/* Load the internal XML file from the Zip archive, and parse it
+	 * using the given context */
+	tracker_gsf_parse_xml_in_zip (uri,
+	                              "[Content_Types].xml",
+	                              context,
+	                              &error);
+
+	if (error) {
+		/* Keep whatever was extracted so far, but do not drop the
+		 * error silently nor leak it */
+		g_debug ("Parsing of MsOffice XML document '%s' stopped: %s",
+		         uri,
+		         error->message ? error->message : "no error given");
+		g_error_free (error);
+	}
+
+	/* If we got any content, add it */
+	if (info.content) {
+		gchar *content;
+
+		content = g_string_free (info.content, FALSE);
+		info.content = NULL;
+
+		if (content) {
+			tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+			tracker_sparql_builder_object_unvalidated (metadata, content);
+			g_free (content);
+		}
+	}
+
+	g_markup_parse_context_free (context);
+}
+
+/* Returns the module's table of supported MIME types and their extract
+ * functions (the static, NULL-terminated data[] array; not to be freed).
+ * NOTE(review): presumably the symbol the extractor looks up when it
+ * loads this module - confirm against the module loader.
+ */
+TrackerExtractData *
+tracker_extract_get_data (void)
+{
+	return data;
+}
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 04caa2470..72a89d219 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -126,74 +126,20 @@ typedef struct {
 	gsize     length;
 } ExcelExtendedStringRecord;
 
-typedef enum {
-	MS_OFFICE_XML_TAG_INVALID,
-	MS_OFFICE_XML_TAG_TITLE,
-	MS_OFFICE_XML_TAG_SUBJECT,
-	MS_OFFICE_XML_TAG_AUTHOR,
-	MS_OFFICE_XML_TAG_MODIFIED,
-	MS_OFFICE_XML_TAG_COMMENTS,
-	MS_OFFICE_XML_TAG_CREATED,
-	MS_OFFICE_XML_TAG_GENERATOR,
-	MS_OFFICE_XML_TAG_NUM_OF_PAGES,
-	MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS,
-	MS_OFFICE_XML_TAG_NUM_OF_WORDS,
-	MS_OFFICE_XML_TAG_NUM_OF_LINES,
-	MS_OFFICE_XML_TAG_APPLICATION,
-	MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS,
-	MS_OFFICE_XML_TAG_SLIDE_TEXT,
-	MS_OFFICE_XML_TAG_WORD_TEXT,
-	MS_OFFICE_XML_TAG_XLS_SHARED_TEXT,
-	MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA,
-	MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA
-} MsOfficeXMLTagType;
-
-typedef enum {
-	FILE_TYPE_INVALID,
-	FILE_TYPE_PPTX,
-	FILE_TYPE_PPSX,
-	FILE_TYPE_DOCX,
-	FILE_TYPE_XLSX
-} MsOfficeXMLFileType;
-
-typedef struct {
-	TrackerSparqlBuilder *metadata;
-	MsOfficeXMLFileType file_type;
-	MsOfficeXMLTagType tag_type;
-	gboolean style_element_present;
-	gboolean preserve_attribute_present;
-	const gchar *uri;
-	GString *content;
-	gboolean title_already_set;
-	gboolean generator_already_set;
-	gulong bytes_pending;
-} MsOfficeXMLParserInfo;
-
 typedef struct {
 	TrackerSparqlBuilder *metadata;
 	const gchar *uri;
 } MetadataInfo;
 
-static GQuark maximum_size_error_quark = 0;
-
 static void extract_msoffice     (const gchar          *uri,
                                   TrackerSparqlBuilder *preupdate,
                                   TrackerSparqlBuilder *metadata);
-static void extract_msoffice_xml (const gchar          *uri,
-                                  TrackerSparqlBuilder *preupdate,
-                                  TrackerSparqlBuilder *metadata);
 
 static TrackerExtractData data[] = {
 	{ "application/msword",            extract_msoffice },
-	/* Powerpoint files */
 	{ "application/vnd.ms-powerpoint", extract_msoffice },
 	{ "application/vnd.ms-excel",	   extract_msoffice },
 	{ "application/vnd.ms-*",          extract_msoffice },
-	/* MSoffice2007*/
-	{ "application/vnd.openxmlformats-officedocument.presentationml.presentation", extract_msoffice_xml },
-	{ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",    extract_msoffice_xml },
-	{ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",         extract_msoffice_xml },
-	{ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",   extract_msoffice_xml },
 	{ NULL, NULL }
 };
 
@@ -1736,615 +1682,6 @@ extract_msoffice (const gchar          *uri,
 	gsf_shutdown ();
 }
 
-static void
-xml_start_element_handler_text_data (GMarkupParseContext  *context,
-                                     const gchar          *element_name,
-                                     const gchar         **attribute_names,
-                                     const gchar         **attribute_values,
-                                     gpointer              user_data,
-                                     GError              **error)
-{
-	MsOfficeXMLParserInfo *info = user_data;
-	const gchar **a;
-	const gchar **v;
-
-	switch (info->file_type) {
-	case FILE_TYPE_DOCX:
-		if (g_ascii_strcasecmp (element_name, "w:pStyle") == 0) {
-			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
-				if (g_ascii_strcasecmp (*a, "w:val") != 0) {
-					continue;
-				}
-
-				if (g_ascii_strncasecmp (*v, "Heading", 7) == 0) {
-					info->style_element_present = TRUE;
-				} else if (g_ascii_strncasecmp (*v, "TOC", 3) == 0) {
-					info->style_element_present = TRUE;
-				} else if (g_ascii_strncasecmp (*v, "Section", 7) == 0) {
-					info->style_element_present = TRUE;
-				} else if (g_ascii_strncasecmp (*v, "Title", 5) == 0) {
-					info->style_element_present = TRUE;
-				} else if (g_ascii_strncasecmp (*v, "Subtitle", 8) == 0) {
-					info->style_element_present = TRUE;
-				}
-			}
-		} else if (g_ascii_strcasecmp (element_name, "w:rStyle") == 0) {
-			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
-				if (g_ascii_strcasecmp (*a, "w:val") != 0) {
-					continue;
-				}
-
-				if (g_ascii_strncasecmp (*v, "SubtleEmphasis", 14) == 0) {
-					info->style_element_present = TRUE;
-				} else if (g_ascii_strncasecmp (*v, "SubtleReference", 15) == 0) {
-					info->style_element_present = TRUE;
-				}
-			}
-		} else if (g_ascii_strcasecmp (element_name, "w:sz") == 0) {
-			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
-				if (g_ascii_strcasecmp (*a, "w:val") != 0) {
-					continue;
-				}
-
-				if (atoi (*v) >= 38) {
-					info->style_element_present = TRUE;
-				}
-			}
-		} else if (g_ascii_strcasecmp (element_name, "w:smartTag") == 0) {
-			info->style_element_present = TRUE;
-		} else if (g_ascii_strcasecmp (element_name, "w:sdtContent") == 0) {
-			info->style_element_present = TRUE;
-		} else if (g_ascii_strcasecmp (element_name, "w:hyperlink") == 0) {
-			info->style_element_present = TRUE;
-		} else if (g_ascii_strcasecmp (element_name, "w:t") == 0) {
-			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
-				if (g_ascii_strcasecmp (*a, "xml:space") != 0) {
-					continue;
-				}
-
-				if (g_ascii_strncasecmp (*v, "preserve", 8) == 0) {
-					info->preserve_attribute_present = TRUE;
-				}
-			}
-
-			info->tag_type = MS_OFFICE_XML_TAG_WORD_TEXT;
-		}
-		break;
-
-	case FILE_TYPE_XLSX:
-		if (g_ascii_strcasecmp (element_name, "sheet") == 0) {
-			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
-				if (g_ascii_strcasecmp (*a, "name") == 0) {
-					info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
-				}
-			}
-
-		} else if (g_ascii_strcasecmp (element_name, "t") == 0) {
-			info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
-		}
-		break;
-
-	case FILE_TYPE_PPTX:
-	case FILE_TYPE_PPSX:
-		info->tag_type = MS_OFFICE_XML_TAG_SLIDE_TEXT;
-		break;
-
-	case FILE_TYPE_INVALID:
-		g_message ("Microsoft document type:%d invalid", info->file_type);
-		break;
-	}
-}
-
-static void
-xml_end_element_handler_document_data (GMarkupParseContext  *context,
-                                       const gchar          *element_name,
-                                       gpointer              user_data,
-                                       GError              **error)
-{
-	MsOfficeXMLParserInfo *info = user_data;
-
-	if (g_ascii_strcasecmp (element_name, "w:p") == 0) {
-		info->style_element_present = FALSE;
-		info->preserve_attribute_present = FALSE;
-	}
-
-	((MsOfficeXMLParserInfo*) user_data)->tag_type = MS_OFFICE_XML_TAG_INVALID;
-}
-
-static void
-xml_start_element_handler_core_data	(GMarkupParseContext  *context,
-                                     const gchar           *element_name,
-                                     const gchar          **attribute_names,
-                                     const gchar          **attribute_values,
-                                     gpointer               user_data,
-                                     GError               **error)
-{
-	MsOfficeXMLParserInfo *info = user_data;
-
-	if (g_ascii_strcasecmp (element_name, "dc:title") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_TITLE;
-	} else if (g_ascii_strcasecmp (element_name, "dc:subject") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_SUBJECT;
-	} else if (g_ascii_strcasecmp (element_name, "dc:creator") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_AUTHOR;
-	} else if (g_ascii_strcasecmp (element_name, "dc:description") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_COMMENTS;
-	} else if (g_ascii_strcasecmp (element_name, "dcterms:created") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_CREATED;
-	} else if (g_ascii_strcasecmp (element_name, "meta:generator") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_GENERATOR;
-	} else if (g_ascii_strcasecmp (element_name, "dcterms:modified") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_MODIFIED;
-	} else if (g_ascii_strcasecmp (element_name, "cp:lastModifiedBy") == 0) {
-		/* Do nothing ? */
-	} else if (g_ascii_strcasecmp (element_name, "Pages") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PAGES;
-	} else if (g_ascii_strcasecmp (element_name, "Slides") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PAGES;
-	} else if (g_ascii_strcasecmp (element_name, "Paragraphs") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS;
-	} else if (g_ascii_strcasecmp (element_name, "Characters") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS;
-	} else if (g_ascii_strcasecmp (element_name, "Words") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_WORDS;
-	} else if (g_ascii_strcasecmp (element_name, "Lines") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_LINES;
-	} else if (g_ascii_strcasecmp (element_name, "Application") == 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_APPLICATION;
-	} else {
-		info->tag_type = MS_OFFICE_XML_TAG_INVALID;
-	}
-}
-
-static void
-xml_core_handler_document_data (GMarkupParseContext  *context,
-                                const gchar          *text,
-                                gsize                 text_len,
-                                gpointer              user_data,
-                                GError              **error)
-{
-	MsOfficeXMLParserInfo *info = user_data;
-
-	switch (info->tag_type) {
-	/* Ignore tags that may not happen inside the core subdocument */
-	case MS_OFFICE_XML_TAG_WORD_TEXT:
-	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
-	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
-		break;
-
-	case MS_OFFICE_XML_TAG_TITLE:
-		if (info->title_already_set) {
-			g_warning ("Avoiding additional title (%s) in MsOffice XML document '%s'",
-			           text, info->uri);
-		} else {
-			info->title_already_set = TRUE;
-			tracker_sparql_builder_predicate (info->metadata, "nie:title");
-			tracker_sparql_builder_object_unvalidated (info->metadata, text);
-		}
-		break;
-
-	case MS_OFFICE_XML_TAG_SUBJECT:
-		tracker_sparql_builder_predicate (info->metadata, "nie:subject");
-		tracker_sparql_builder_object_unvalidated (info->metadata, text);
-		break;
-
-	case MS_OFFICE_XML_TAG_AUTHOR:
-		tracker_sparql_builder_predicate (info->metadata, "nco:publisher");
-
-		tracker_sparql_builder_object_blank_open (info->metadata);
-		tracker_sparql_builder_predicate (info->metadata, "a");
-		tracker_sparql_builder_object (info->metadata, "nco:Contact");
-
-		tracker_sparql_builder_predicate (info->metadata, "nco:fullname");
-		tracker_sparql_builder_object_unvalidated (info->metadata, text);
-		tracker_sparql_builder_object_blank_close (info->metadata);
-		break;
-
-	case MS_OFFICE_XML_TAG_COMMENTS:
-		tracker_sparql_builder_predicate (info->metadata, "nie:comment");
-		tracker_sparql_builder_object_unvalidated (info->metadata, text);
-		break;
-
-	case MS_OFFICE_XML_TAG_CREATED: {
-		gchar *date;
-
-		date = tracker_date_guess (text);
-		tracker_sparql_builder_predicate (info->metadata, "nie:contentCreated");
-		tracker_sparql_builder_object_unvalidated (info->metadata, date);
-		g_free (date);
-		break;
-	}
-
-	case MS_OFFICE_XML_TAG_GENERATOR:
-		if (info->generator_already_set) {
-			g_warning ("Avoiding additional generator (%s) in MsOffice XML document '%s'",
-			           text, info->uri);
-		} else {
-			info->generator_already_set = TRUE;
-			tracker_sparql_builder_predicate (info->metadata, "nie:generator");
-			tracker_sparql_builder_object_unvalidated (info->metadata, text);
-		}
-		break;
-
-	case MS_OFFICE_XML_TAG_APPLICATION:
-		/* FIXME: Same code as MS_OFFICE_XML_TAG_GENERATOR should be
-		 * used, but nie:generator has max cardinality of 1
-		 * and this would cause errors.
-		 */
-		break;
-
-	case MS_OFFICE_XML_TAG_MODIFIED: {
-		gchar *date;
-
-                date = tracker_date_guess (text);
-		tracker_sparql_builder_predicate (info->metadata, "nie:contentLastModified");
-		tracker_sparql_builder_object_unvalidated (info->metadata, date);
-                g_free (date);
-		break;
-	}
-
-	case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
-		tracker_sparql_builder_predicate (info->metadata, "nfo:pageCount");
-		tracker_sparql_builder_object_unvalidated (info->metadata, text);
-		break;
-
-	case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
-		tracker_sparql_builder_predicate (info->metadata, "nfo:characterCount");
-		tracker_sparql_builder_object_unvalidated (info->metadata, text);
-		break;
-
-	case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
-		tracker_sparql_builder_predicate (info->metadata, "nfo:wordCount");
-		tracker_sparql_builder_object_unvalidated (info->metadata, text);
-		break;
-
-	case MS_OFFICE_XML_TAG_NUM_OF_LINES:
-		tracker_sparql_builder_predicate (info->metadata, "nfo:lineCount");
-		tracker_sparql_builder_object_unvalidated (info->metadata, text);
-		break;
-
-	case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
-		/* TODO: There is no ontology for this. */
-		break;
-
-	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
-	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
-		/* Nothing as we are using it in defining type of data */
-		break;
-
-	case MS_OFFICE_XML_TAG_INVALID:
-		/* Here we cant use log otheriwse it will print for other non useful files */
-		break;
-	}
-}
-
-static void
-xml_text_handler_document_data (GMarkupParseContext  *context,
-                                const gchar          *text,
-                                gsize                 text_len,
-                                gpointer              user_data,
-                                GError              **error)
-{
-	MsOfficeXMLParserInfo *info = user_data;
-	gsize written_bytes = 0;
-
-	/* If reached max bytes to extract, just return */
-	if (info->bytes_pending == 0) { 
-		g_set_error_literal (error,
-		                     maximum_size_error_quark,
-                             0,
-		                     "Maximum text limit reached");
-		return;
-	}
-
-	/* Create content string if not already done before */
-	if (G_UNLIKELY (info->content == NULL)) {
-		info->content =	g_string_new ("");
-	}
-
-	switch (info->tag_type) {
-	case MS_OFFICE_XML_TAG_WORD_TEXT:
-		tracker_text_validate_utf8 (text,
-		                            MIN (text_len, info->bytes_pending),
-		                            &info->content,
-		                            &written_bytes);
-		g_string_append_c (info->content, ' ');
-		info->bytes_pending -= written_bytes;
-		break;
-
-	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
-		tracker_text_validate_utf8 (text,
-		                            MIN (text_len, info->bytes_pending),
-		                            &info->content,
-		                            &written_bytes);
-		g_string_append_c (info->content, ' ');
-		info->bytes_pending -= written_bytes;
-		break;
-
-	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
-		if (atoi (text) == 0)  {
-			tracker_text_validate_utf8 (text,
-			                            MIN (text_len, info->bytes_pending),
-			                            &info->content,
-			                            &written_bytes);
-			g_string_append_c (info->content, ' ');
-			info->bytes_pending -= written_bytes;
-		}
-		break;
-
-	/* Ignore tags that may not happen inside the text subdocument */
-	case MS_OFFICE_XML_TAG_TITLE:
-	case MS_OFFICE_XML_TAG_SUBJECT:
-	case MS_OFFICE_XML_TAG_AUTHOR:
-	case MS_OFFICE_XML_TAG_COMMENTS:
-	case MS_OFFICE_XML_TAG_CREATED:
-	case MS_OFFICE_XML_TAG_GENERATOR:
-	case MS_OFFICE_XML_TAG_APPLICATION:
-	case MS_OFFICE_XML_TAG_MODIFIED:
-	case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
-	case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
-	case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
-	case MS_OFFICE_XML_TAG_NUM_OF_LINES:
-	case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
-	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
-	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
-	case MS_OFFICE_XML_TAG_INVALID:
-		break;
-	}
-}
-
-static gboolean
-xml_read (MsOfficeXMLParserInfo *parser_info,
-          const gchar           *xml_filename,
-          MsOfficeXMLTagType     type)
-{
-	GMarkupParseContext *context;
-	MsOfficeXMLParserInfo info;
-	TrackerConfig *config;
-
-	/* Setup conf */
-	config = tracker_main_get_config ();
-
-	/* FIXME: Can we use the original info here? */
-	info.metadata = parser_info->metadata;
-	info.file_type = parser_info->file_type;
-	info.tag_type = MS_OFFICE_XML_TAG_INVALID;
-	info.style_element_present = FALSE;
-	info.preserve_attribute_present = FALSE;
-	info.uri = parser_info->uri;
-	info.content = parser_info->content;
-	info.title_already_set = parser_info->title_already_set;
-	info.bytes_pending = tracker_config_get_max_bytes (config);
-	switch (type) {
-	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA: {
-		GMarkupParser parser = {
-			xml_start_element_handler_core_data,
-			xml_end_element_handler_document_data,
-			xml_core_handler_document_data,
-			NULL,
-			NULL
-		};
-
-		context = g_markup_parse_context_new (&parser,
-		                                      0,
-		                                      &info,
-		                                      NULL);
-		break;
-	}
-
-	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA: {
-		GMarkupParser parser = {
-			xml_start_element_handler_text_data,
-			xml_end_element_handler_document_data,
-			xml_text_handler_document_data,
-			NULL,
-			NULL
-		};
-
-		context = g_markup_parse_context_new (&parser,
-		                                      0,
-		                                      &info,
-		                                      NULL);
-		break;
-	}
-
-	default:
-		context = NULL;
-		break;
-	}
-
-	if (context) {
-		/* Load the internal XML file from the Zip archive, and parse it
-		 * using the given context */
-		tracker_gsf_parse_xml_in_zip (parser_info->uri,
-		                              xml_filename,
-		                              context, NULL);
-		g_markup_parse_context_free (context);
-	}
-
-	return TRUE;
-}
-
-static void
-xml_start_element_handler_content_types (GMarkupParseContext  *context,
-                                         const gchar          *element_name,
-                                         const gchar         **attribute_names,
-                                         const gchar         **attribute_values,
-                                         gpointer              user_data,
-                                         GError              **error)
-{
-	MsOfficeXMLParserInfo *info;
-	const gchar *part_name;
-	const gchar *content_type;
-	gint i;
-
-	info = user_data;
-
-	if (g_ascii_strcasecmp (element_name, "Override") != 0) {
-		info->tag_type = MS_OFFICE_XML_TAG_INVALID;
-		return;
-	}
-
-	part_name = NULL;
-	content_type = NULL;
-
-	for (i = 0; attribute_names[i]; i++) {
-		if (g_ascii_strcasecmp (attribute_names[i], "PartName") == 0) {
-			part_name = attribute_values[i];
-		} else if (g_ascii_strcasecmp (attribute_names[i], "ContentType") == 0) {
-			content_type = attribute_values[i];
-		}
-	}
-
-	/* Both part_name and content_type MUST be NON-NULL */
-	if (!part_name || !content_type) {
-		g_message ("Invalid file (part_name:%s, content_type:%s)",
-		           part_name ? part_name : "none",
-		           content_type ? content_type : "none");
-		return;
-	}
-
-	if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-package.core-properties+xml") == 0) ||
-	    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.extended-properties+xml") == 0)) {
-		xml_read (info, part_name + 1, MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA);
-		return;
-	}
-
-	switch (info->file_type) {
-	case FILE_TYPE_DOCX:
-		if (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml") == 0) {
-			xml_read (info, part_name + 1, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
-		}
-		break;
-
-	case FILE_TYPE_PPTX:
-	case FILE_TYPE_PPSX:
-		if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.presentationml.slide+xml") == 0) ||
-		    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml") == 0)) {
-			xml_read (info, part_name + 1, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
-		}
-		break;
-
-	case FILE_TYPE_XLSX:
-		if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml") == 0) ||
-		    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml") == 0)) {
-			xml_read (info, part_name + 1, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
-		}
-		break;
-
-	case FILE_TYPE_INVALID:
-		g_message ("Invalid file type:'%d'", info->file_type);
-		break;
-	}
-}
-
-static void
-extract_msoffice_xml (const gchar          *uri,
-                      TrackerSparqlBuilder *preupdate,
-                      TrackerSparqlBuilder *metadata)
-{
-	MsOfficeXMLParserInfo info;
-	MsOfficeXMLFileType file_type;
-	TrackerConfig *config;
-	GFile *file;
-	GFileInfo *file_info;
-	GMarkupParseContext *context = NULL;
-	GError *error = NULL;
-	gulong  total_bytes;
-	GMarkupParser parser = {
-		xml_start_element_handler_content_types,
-		xml_end_element_handler_document_data,
-		NULL,
-		NULL,
-		NULL
-	};
-	const gchar *mime_used;
-
-	if (G_UNLIKELY (maximum_size_error_quark == 0)) {
-		maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
-	}
-
-	file = g_file_new_for_uri (uri);
-
-	if (!file) {
-		g_warning ("Could not create GFile for URI:'%s'",
-		           uri);
-		return;
-	}
-
-	file_info = g_file_query_info (file,
-	                               G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE,
-	                               G_FILE_QUERY_INFO_NONE,
-	                               NULL,
-	                               NULL);
-	g_object_unref (file);
-
-	if (!file_info) {
-		g_warning ("Could not get GFileInfo for URI:'%s'",
-		           uri);
-		return;
-	}
-
-	mime_used = g_file_info_get_content_type (file_info);
-
-	if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.wordprocessingml.document") == 0) {
-		file_type = FILE_TYPE_DOCX;
-	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.presentation") == 0) {
-		file_type = FILE_TYPE_PPTX;
-	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.slideshow") == 0) {
-		file_type = FILE_TYPE_PPSX;
-	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") == 0) {
-		file_type = FILE_TYPE_XLSX;
-	} else {
-		g_message ("Mime type was not recognised:'%s'", mime_used);
-		file_type = FILE_TYPE_INVALID;
-	}
-
-	g_object_unref (file_info);
-
-	/* Setup conf */
-	config = tracker_main_get_config ();
-
-	g_debug ("Extracting MsOffice XML format...");
-
-	tracker_sparql_builder_predicate (metadata, "a");
-	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
-	total_bytes = tracker_config_get_max_bytes (config);
-	info.metadata = metadata;
-	info.file_type = file_type;
-	info.tag_type = MS_OFFICE_XML_TAG_INVALID;
-	info.style_element_present = FALSE;
-	info.preserve_attribute_present = FALSE;
-	info.uri = uri;
-	info.content = NULL;
-	info.title_already_set = FALSE;
-	info.bytes_pending = total_bytes;
-	context = g_markup_parse_context_new (&parser, 0, &info, NULL);
-
-	/* Load the internal XML file from the Zip archive, and parse it
-	 * using the given context */
-	tracker_gsf_parse_xml_in_zip (uri,
-	                              "[Content_Types].xml",
-	                              context,
-                                  &error);
-
-	/* If we got any content, add it */
-	if (info.content) {
-		gchar *content;
-
-		content = g_string_free (info.content, FALSE);
-		info.content = NULL;
-
-		if (content) {
-			tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
-			tracker_sparql_builder_object_unvalidated (metadata, content);
-			g_free (content);
-		}
-	}
-
-	g_markup_parse_context_free (context);
-}
-
 TrackerExtractData *
 tracker_extract_get_data (void)
 {
author	Aleksander Morgado <aleksander@lanedo.com>	2010-11-24 11:01:34 +0100
committer	Aleksander Morgado <aleksander@lanedo.com>	2010-11-24 13:23:39 +0100
commit	b98fa7891390ef13f4478319afcf4952b41d7578 (patch)
tree	3e5b7d48df809a5ce9c9c9c99e70584288ba2b1a
parent	74eee612cf1f242eb30d2e3c458f2542ca4379d7 (diff)
download	tracker-b98fa7891390ef13f4478319afcf4952b41d7578.tar.gz