diff options
author | Aleksander Morgado <aleksander@lanedo.com> | 2010-11-24 15:51:22 +0100 |
---|---|---|
committer | Aleksander Morgado <aleksander@lanedo.com> | 2010-11-24 15:51:22 +0100 |
commit | 055802d37ce3771065fe8e24547e92fef3090606 (patch) | |
tree | 9b92f005dafa47bccef7b56087c30ce14ec9ec70 | |
parent | 05b0357b3e9d64888e35e614d29c6eac51310392 (diff) | |
download | tracker-055802d37ce3771065fe8e24547e92fef3090606.tar.gz |
Fixes GB#634424: MsOffice metadata extractor is not unicode aware
-rw-r--r-- | src/tracker-extract/tracker-extract-msoffice.c | 49 |
1 files changed, 49 insertions, 0 deletions
/*
 * Reconstructed from a flattened patch against
 *   src/tracker-extract/tracker-extract-msoffice.c
 *   (index 72a89d219..b8a38003e, mode 100644).
 *
 * First hunk (@@ -143,6 +143,52 @@): the code below is inserted right
 * after the end of the static `data[]` table (whose last entry is
 * `{ NULL, NULL }`) and right before metadata_add_gvalue().
 */

/*
 * True when the four bytes form a backslash followed by three octal
 * digits encoding a value in the range \000 to \377 (0 to 255).
 *
 * Octal digits are '0'..'7'; the leading digit is capped at '3' so the
 * decoded value always fits in one byte.
 *
 * The checks are chained with && from left to right on purpose: a NUL
 * byte fails whichever check it reaches, so callers may pass
 * p[0], p[1], p[2], p[3] of a NUL-terminated string without first
 * verifying that four bytes are left.
 */
#define octal_ascii_triplet_is_valid(slash, a2, a1, a0) \
	((slash) == '\\' && \
	 (a2) >= '0' && (a2) <= '3' && \
	 (a1) >= '0' && (a1) <= '7' && \
	 (a0) >= '0' && (a0) <= '7')

/* Numeric value of the three octal digits a2 a1 a0 (most significant first). */
#define octal_ascii_triplet_to_decimal_int(a2, a1, a0) \
	(((a0) - '0') + 8 * (((a1) - '0') + 8 * ((a2) - '0')))

/*
 * msoffice_string_process_octal_triplets:
 * @str: NUL-terminated string, modified in place; NULL is ignored.
 *
 * We may get input strings with UTF-8 characters encoded in OCTAL and
 * represented in ASCII, like this:
 *     K\303\230BENHAVNS UNIVERSITET
 * which is equivalent to:
 *     KØBENHAVNS UNIVERSITET
 *
 * Every valid backslash + three-octal-digit sequence is replaced by the
 * single byte it encodes; all other bytes are copied through untouched.
 * The result is never longer than the input, so the rewrite is done in
 * place and the string is re-terminated at its new end.
 *
 * Note that a "\000" sequence decodes to a NUL byte, which ends the
 * string early for anyone reading it as a C string.
 */
static void
msoffice_string_process_octal_triplets (guchar *str)
{
	guchar *src; /* read cursor in the original text */
	guchar *dst; /* write cursor in the processed text, dst <= src ALWAYS */

	if (str == NULL) {
		return;
	}

	src = str;
	dst = str;

	while (*src != '\0') {
		if (octal_ascii_triplet_is_valid (src[0], src[1], src[2], src[3])) {
			/* Found an octal triplet: four input bytes become one */
			*dst++ = (guchar) octal_ascii_triplet_to_decimal_int (src[1], src[2], src[3]);
			src += 4;
		} else {
			/* Ordinary byte: copy it down (a no-op while dst == src) */
			*dst++ = *src++;
		}
	}

	/* New end of string */
	*dst = '\0';
}

/*
 * Second hunk (@@ -212,6 +258,9 @@, inside metadata_add_gvalue()): the
 * patch adds the call to the function above at the top of the
 * `if (str_val)` branch, just before the existing
 * `if (type && predicate)` check that calls
 * tracker_sparql_builder_predicate (metadata, key):
 *
 *     if (str_val) {
 *             // Process (in place) octal triplets if found
 *             msoffice_string_process_octal_triplets (str_val);
 *             ...
 *     }
 */