summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAleksander Morgado <aleksander@lanedo.com>2010-11-24 15:51:22 +0100
committerAleksander Morgado <aleksander@lanedo.com>2010-11-24 15:51:22 +0100
commit055802d37ce3771065fe8e24547e92fef3090606 (patch)
tree9b92f005dafa47bccef7b56087c30ce14ec9ec70
parent05b0357b3e9d64888e35e614d29c6eac51310392 (diff)
downloadtracker-055802d37ce3771065fe8e24547e92fef3090606.tar.gz
Fixes GB#634424: MsOffice metadata extractor is not unicode aware
-rw-r--r--src/tracker-extract/tracker-extract-msoffice.c49
1 files changed, 49 insertions, 0 deletions
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 72a89d219..b8a38003e 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -143,6 +143,52 @@ static TrackerExtractData data[] = {
{ NULL, NULL }
};
+/*
+ * TRUE when the four characters form an ASCII octal escape "\ooo".
+ * Valid range from \000 to \377 (0 to 255): the first digit must be
+ * '0'..'3' and the other two '0'..'7' ('8' and '9' are not octal digits).
+ */
+#define octal_ascii_triplet_is_valid(slash, a2, a1, a0) \
+ ((slash) == '\\' && \
+ (a2) >= '0' && (a2) <= '3' && \
+ (a1) >= '0' && (a1) <= '7' && \
+ (a0) >= '0' && (a0) <= '7')
+
+/* Numeric value (0..255) of an already validated octal triplet;
+ * a2 is the most significant digit */
+#define octal_ascii_triplet_to_decimal_int(a2, a1, a0) \
+ (((a0) - '0') + 8 * (((a1) - '0') + 8 * ((a2) - '0')))
+
+/*
+ * So, we may get input strings with UTF-8 characters encoded in OCTAL and
+ * represented in ASCII, like this:
+ * K\303\230BENHAVNS UNIVERSITET
+ * which is equivalent to:
+ * KØBENHAVNS UNIVERSITET
+ *
+ * Rewrites @str in place: every valid "\ooo" escape is replaced by the
+ * single byte it encodes and all other bytes are kept as they are. The
+ * result is never longer than the input and is always NUL-terminated.
+ * A NULL @str is ignored.
+ *
+ * NOTE: "\000" is a valid triplet and decodes to an embedded NUL byte,
+ * which ends the string early for any C-string consumer.
+ */
+static void
+msoffice_string_process_octal_triplets (guchar *str)
+{
+ gsize i = 0; /* index in original string */
+ gsize j = 0; /* index in processed string */
+ gsize length;
+
+ if (str == NULL) {
+ return;
+ }
+
+ /* strlen() takes a plain char pointer; the cast avoids a
+ * pointer-signedness mismatch with guchar */
+ length = strlen ((const gchar *) str);
+
+ /* Changing the string IN PLACE, note that j<=i ALWAYS! */
+ while (i < length) {
+ if (length - i >= 4 &&
+ octal_ascii_triplet_is_valid (str[i], str[i+1], str[i+2], str[i+3])) {
+ /* Found a new octal triplet: 4 input bytes become 1 */
+ str[j] = (guchar) octal_ascii_triplet_to_decimal_int (str[i+1], str[i+2], str[i+3]);
+ i += 4;
+ } else {
+ /* Plain byte: copy it down. While no triplet has been
+ * found yet (i == j) this is a harmless self-assignment */
+ str[j] = str[i];
+ i++;
+ }
+ j++;
+ }
+ /* New end of string */
+ str[j] = '\0';
+}
+
static void
metadata_add_gvalue (TrackerSparqlBuilder *metadata,
const gchar *uri,
@@ -212,6 +258,9 @@ metadata_add_gvalue (TrackerSparqlBuilder *metadata,
}
if (str_val) {
+ /* Process (in place) octal triplets if found */
+ msoffice_string_process_octal_triplets (str_val);
+
if (type && predicate) {
tracker_sparql_builder_predicate (metadata, key);