summary | refs | log | tree | commit | diff
path: root/src/libtracker-common
diff options
context:
space:
mode:
Diffstat (limited to 'src/libtracker-common')
-rw-r--r-- src/libtracker-common/tracker-parser-libicu.c 186
-rw-r--r-- src/libtracker-common/tracker-parser-libunistring.c 65
-rw-r--r-- src/libtracker-common/tracker-parser.h 22
3 files changed, 269 insertions, 4 deletions
diff --git a/src/libtracker-common/tracker-parser-libicu.c b/src/libtracker-common/tracker-parser-libicu.c
index 8c4803206..a3271c003 100644
--- a/src/libtracker-common/tracker-parser-libicu.c
+++ b/src/libtracker-common/tracker-parser-libicu.c
@@ -144,7 +144,7 @@ get_word_info (const UChar *word,
/* The input word in this method MUST be normalized in NFKD form,
* and given in UChars, where str_length is the number of UChars
* (not the number of bytes) */
-gboolean
+static gboolean
tracker_parser_unaccent_nfkd_string (gpointer str,
gsize *str_length)
{
@@ -754,3 +754,187 @@ tracker_parser_next (TrackerParser *parser,
return str;
}
+/**
+ * tracker_parser_tolower:
+ * @input: UTF-16 text to convert; must not be NULL
+ * @len: size of @input in bytes (halved to obtain the UChar count)
+ * @len_out: (out) (optional): receives the size of the result in bytes,
+ *   not counting the terminator
+ *
+ * Lowercases @input with ICU's u_strToLower(), passing a NULL locale.
+ * If ICU reports an error, or the result leaves no room for a terminator,
+ * an unmodified copy of @input is returned instead.
+ *
+ * Returns: a malloc()ed, NUL-terminated UTF-16 buffer, or NULL if @input
+ *   is NULL, @len is too large for ICU's 32-bit lengths, or the
+ *   allocation fails
+ */
+gunichar2 *
+tracker_parser_tolower (const gunichar2 *input,
+                        gsize            len,
+                        gsize           *len_out)
+{
+	UChar *zOutput;
+	int32_t n_input, capacity, n_written;
+	UErrorCode status = U_ZERO_ERROR;
+
+	g_return_val_if_fail (input, NULL);
+
+	if (len_out)
+		*len_out = 0;
+
+	/* ICU takes int32_t lengths; refuse sizes that would overflow them */
+	if (len >= (gsize) (G_MAXINT32 / 2))
+		return NULL;
+
+	n_input = (int32_t) (len / sizeof (UChar));
+	/* Room for twice the input plus a terminator: case mapping may expand */
+	capacity = (int32_t) len + 1;
+
+	zOutput = malloc ((size_t) capacity * sizeof (UChar));
+	if (!zOutput)
+		return NULL;
+
+	n_written = u_strToLower (zOutput, capacity,
+	                          input, n_input,
+	                          NULL, &status);
+
+	if (!U_SUCCESS (status) || n_written >= capacity) {
+		/* Best effort: hand back the input untouched */
+		memcpy (zOutput, input, (size_t) n_input * sizeof (UChar));
+		n_written = n_input;
+	}
+
+	/* Terminate at the logical end of the string, in UChar units */
+	zOutput[n_written] = 0;
+
+	if (len_out)
+		*len_out = (gsize) n_written * sizeof (UChar);
+
+	return zOutput;
+}
+
+/**
+ * tracker_parser_toupper:
+ * @input: UTF-16 text to convert; must not be NULL
+ * @len: size of @input in bytes (halved to obtain the UChar count)
+ * @len_out: (out) (optional): receives the size of the result in bytes,
+ *   not counting the terminator
+ *
+ * Uppercases @input with ICU's u_strToUpper(), passing a NULL locale.
+ * If ICU reports an error, or the result leaves no room for a terminator,
+ * an unmodified copy of @input is returned instead.
+ *
+ * Returns: a malloc()ed, NUL-terminated UTF-16 buffer, or NULL if @input
+ *   is NULL, @len is too large for ICU's 32-bit lengths, or the
+ *   allocation fails
+ */
+gunichar2 *
+tracker_parser_toupper (const gunichar2 *input,
+                        gsize            len,
+                        gsize           *len_out)
+{
+	UChar *zOutput;
+	int32_t n_input, capacity, n_written;
+	UErrorCode status = U_ZERO_ERROR;
+
+	g_return_val_if_fail (input, NULL);
+
+	if (len_out)
+		*len_out = 0;
+
+	/* ICU takes int32_t lengths; refuse sizes that would overflow them */
+	if (len >= (gsize) (G_MAXINT32 / 2))
+		return NULL;
+
+	n_input = (int32_t) (len / sizeof (UChar));
+	/* Room for twice the input plus a terminator: case mapping may expand */
+	capacity = (int32_t) len + 1;
+
+	zOutput = malloc ((size_t) capacity * sizeof (UChar));
+	if (!zOutput)
+		return NULL;
+
+	n_written = u_strToUpper (zOutput, capacity,
+	                          input, n_input,
+	                          NULL, &status);
+
+	if (!U_SUCCESS (status) || n_written >= capacity) {
+		/* Best effort: hand back the input untouched */
+		memcpy (zOutput, input, (size_t) n_input * sizeof (UChar));
+		n_written = n_input;
+	}
+
+	/* Terminate at the logical end of the string, in UChar units */
+	zOutput[n_written] = 0;
+
+	if (len_out)
+		*len_out = (gsize) n_written * sizeof (UChar);
+
+	return zOutput;
+}
+
+/**
+ * tracker_parser_casefold:
+ * @input: UTF-16 text to convert; must not be NULL
+ * @len: size of @input in bytes (halved to obtain the UChar count)
+ * @len_out: (out) (optional): receives the size of the result in bytes,
+ *   not counting the terminator
+ *
+ * Case-folds @input with ICU's u_strFoldCase() and U_FOLD_CASE_DEFAULT.
+ * If ICU reports an error, or the result leaves no room for a terminator,
+ * an unmodified copy of @input is returned instead.
+ *
+ * Returns: a malloc()ed, NUL-terminated UTF-16 buffer, or NULL if @input
+ *   is NULL, @len is too large for ICU's 32-bit lengths, or the
+ *   allocation fails
+ */
+gunichar2 *
+tracker_parser_casefold (const gunichar2 *input,
+                         gsize            len,
+                         gsize           *len_out)
+{
+	UChar *zOutput;
+	int32_t n_input, capacity, n_written;
+	UErrorCode status = U_ZERO_ERROR;
+
+	g_return_val_if_fail (input, NULL);
+
+	if (len_out)
+		*len_out = 0;
+
+	/* ICU takes int32_t lengths; refuse sizes that would overflow them */
+	if (len >= (gsize) (G_MAXINT32 / 2))
+		return NULL;
+
+	n_input = (int32_t) (len / sizeof (UChar));
+	/* Room for twice the input plus a terminator: folding may expand */
+	capacity = (int32_t) len + 1;
+
+	zOutput = malloc ((size_t) capacity * sizeof (UChar));
+	if (!zOutput)
+		return NULL;
+
+	n_written = u_strFoldCase (zOutput, capacity,
+	                           input, n_input,
+	                           U_FOLD_CASE_DEFAULT, &status);
+
+	if (!U_SUCCESS (status) || n_written >= capacity) {
+		/* Best effort: hand back the input untouched */
+		memcpy (zOutput, input, (size_t) n_input * sizeof (UChar));
+		n_written = n_input;
+	}
+
+	/* Terminate at the logical end of the string, in UChar units */
+	zOutput[n_written] = 0;
+
+	if (len_out)
+		*len_out = (gsize) n_written * sizeof (UChar);
+
+	return zOutput;
+}
+
+/*
+ * Normalizes @string (@string_len gunichar2 units) with @normalizer.
+ *
+ * The first attempt uses a buffer of twice the input length plus one. If
+ * ICU reports that this is too small, or the result fills the buffer so
+ * that no terminator fits, the buffer is regrown to the reported length
+ * plus one and the normalization is run again.
+ *
+ * On success returns a g_malloc()ed, NUL-terminated buffer and stores its
+ * length in gunichar2 units (terminator excluded) in @len_out, if given.
+ * On failure returns NULL, stores 0 in @len_out and leaves the ICU error
+ * in @status.
+ */
+static gunichar2 *
+normalize_string (const gunichar2    *string,
+                  gsize               string_len, /* In gunichar2s */
+                  const UNormalizer2 *normalizer,
+                  gsize              *len_out,    /* In gunichar2s */
+                  UErrorCode         *status)
+{
+	int capacity, n_written;
+	gunichar2 *zOutput;
+
+	capacity = (string_len * 2) + 1;
+	zOutput = g_new0 (gunichar2, capacity);
+
+	n_written = unorm2_normalize (normalizer, string, string_len, zOutput, capacity, status);
+
+	if (*status == U_BUFFER_OVERFLOW_ERROR ||
+	    (U_SUCCESS (*status) && n_written >= capacity)) {
+		/* Try again with room for the whole result and its terminator */
+		*status = U_ZERO_ERROR;
+		capacity = n_written + 1;
+		zOutput = g_renew (gunichar2, zOutput, capacity);
+		memset (zOutput, 0, capacity * sizeof (gunichar2));
+		n_written = unorm2_normalize (normalizer, string, string_len, zOutput, capacity, status);
+	}
+
+	if (!U_SUCCESS (*status) || n_written >= capacity) {
+		g_clear_pointer (&zOutput, g_free);
+		n_written = 0;
+	} else {
+		/* capacity > n_written here, so this write stays in bounds */
+		zOutput[n_written] = 0;
+	}
+
+	if (len_out)
+		*len_out = n_written;
+
+	return zOutput;
+}
+
+/**
+ * tracker_parser_normalize:
+ * @input: UTF-16 text to normalize
+ * @mode: one of G_NORMALIZE_NFC, G_NORMALIZE_NFD, G_NORMALIZE_NFKC or
+ *   G_NORMALIZE_NFKD; any other value is a programming error
+ * @len: size of @input in bytes (halved to obtain the gunichar2 count)
+ * @len_out: (out) (optional): receives the length of the result in
+ *   gunichar2 units
+ *
+ * Normalizes @input with the ICU normalizer matching @mode. If ICU fails
+ * at any step, a plain copy of @input is returned instead.
+ *
+ * Returns: a newly allocated buffer holding the result
+ */
+gunichar2 *
+tracker_parser_normalize (const gunichar2 *input,
+                          GNormalizeMode   mode,
+                          gsize            len,
+                          gsize           *len_out)
+{
+	uint16_t *zOutput = NULL;
+	gsize nOutput = 0;
+	const UNormalizer2 *normalizer = NULL;
+	UErrorCode status = U_ZERO_ERROR;
+
+	if (mode == G_NORMALIZE_NFC) {
+		normalizer = unorm2_getNFCInstance (&status);
+	} else if (mode == G_NORMALIZE_NFD) {
+		normalizer = unorm2_getNFDInstance (&status);
+	} else if (mode == G_NORMALIZE_NFKC) {
+		normalizer = unorm2_getNFKCInstance (&status);
+	} else if (mode == G_NORMALIZE_NFKD) {
+		normalizer = unorm2_getNFKDInstance (&status);
+	} else {
+		g_assert_not_reached ();
+		/* Only reached with assertions compiled out: use the fallback
+		 * below rather than an uninitialized normalizer. */
+		status = U_ILLEGAL_ARGUMENT_ERROR;
+	}
+
+	if (U_SUCCESS (status)) {
+		zOutput = normalize_string (input, len / 2,
+		                            normalizer,
+		                            &nOutput, &status);
+	}
+
+	if (!U_SUCCESS (status)) {
+		/* Fall back to the raw input, reporting its length in the same
+		 * unit (gunichar2s) as the success path does. */
+		zOutput = g_memdup2 (input, len);
+		nOutput = len / 2;
+	}
+
+	if (len_out)
+		*len_out = nOutput;
+
+	return zOutput;
+}
+
+/**
+ * tracker_parser_unaccent:
+ * @input: UTF-16 text to process
+ * @len: size of @input in bytes (halved to obtain the gunichar2 count)
+ * @len_out: (out) (optional): receives the length of the result in
+ *   gunichar2 units, as updated by the in-place unaccenting step
+ *
+ * Normalizes @input to NFKD and then runs
+ * tracker_parser_unaccent_nfkd_string() on the result in place. If the
+ * normalization fails, a plain copy of @input is processed instead.
+ *
+ * Returns: a newly allocated buffer holding the result
+ */
+gunichar2 *
+tracker_parser_unaccent (const gunichar2 *input,
+                         gsize            len,
+                         gsize           *len_out)
+{
+	uint16_t *zOutput = NULL;
+	gsize nOutput = 0;
+	const UNormalizer2 *normalizer;
+	UErrorCode status = U_ZERO_ERROR;
+
+	normalizer = unorm2_getNFKDInstance (&status);
+
+	if (U_SUCCESS (status)) {
+		zOutput = normalize_string (input, len / 2,
+		                            normalizer,
+		                            &nOutput, &status);
+	}
+
+	if (!U_SUCCESS (status)) {
+		/* The length must describe the copy, in UChars; it would
+		 * otherwise be unset or 0 at this point. */
+		zOutput = g_memdup2 (input, len);
+		nOutput = len / 2;
+	}
+
+	/* Unaccenting is done in place; skip it when there is no buffer */
+	if (zOutput)
+		tracker_parser_unaccent_nfkd_string (zOutput, &nOutput);
+
+	if (len_out)
+		*len_out = nOutput;
+
+	return zOutput;
+}
diff --git a/src/libtracker-common/tracker-parser-libunistring.c b/src/libtracker-common/tracker-parser-libunistring.c
index d24c5f1cb..04f08a2c6 100644
--- a/src/libtracker-common/tracker-parser-libunistring.c
+++ b/src/libtracker-common/tracker-parser-libunistring.c
@@ -159,7 +159,7 @@ get_word_info (TrackerParser *parser,
/* The input word in this method MUST be normalized in NFKD form,
* and given in UTF-8, where str_length is the byte-length
* (note: there is no trailing NUL character!) */
-gboolean
+static gboolean
tracker_parser_unaccent_nfkd_string (gpointer str,
gsize *str_length)
{
@@ -541,3 +541,66 @@ tracker_parser_next (TrackerParser *parser,
return str;
}
+/* Lowercases @input through libunistring's u16_tolower().
+ * @len is halved before being passed on as the number of 16-bit units;
+ * @len_out is filled in by u16_tolower() itself. */
+gunichar2 *
+tracker_parser_tolower (const gunichar2 *input,
+                        gsize            len,
+                        gsize           *len_out)
+{
+	const gsize n_units = len / 2;
+
+	return u16_tolower (input, n_units, NULL, NULL, NULL, len_out);
+}
+
+/* Uppercases @input through libunistring's u16_toupper().
+ * @len is halved before being passed on as the number of 16-bit units;
+ * @len_out is filled in by u16_toupper() itself. */
+gunichar2 *
+tracker_parser_toupper (const gunichar2 *input,
+                        gsize            len,
+                        gsize           *len_out)
+{
+	const gsize n_units = len / 2;
+
+	return u16_toupper (input, n_units, NULL, NULL, NULL, len_out);
+}
+
+/* Case-folds @input through libunistring's u16_casefold().
+ * @len is halved before being passed on as the number of 16-bit units;
+ * @len_out is filled in by u16_casefold() itself. */
+gunichar2 *
+tracker_parser_casefold (const gunichar2 *input,
+                         gsize            len,
+                         gsize           *len_out)
+{
+	const gsize n_units = len / 2;
+
+	return u16_casefold (input, n_units, NULL, NULL, NULL, len_out);
+}
+
+/* Normalizes @input through libunistring's u16_normalize(), mapping the
+ * GLib @mode onto the matching uninorm form. @len is halved before being
+ * passed on as the number of 16-bit units; @len_out is filled in by
+ * u16_normalize() itself. Any other @mode trips an assertion. */
+gunichar2 *
+tracker_parser_normalize (const gunichar2 *input,
+                          GNormalizeMode   mode,
+                          gsize            len,
+                          gsize           *len_out)
+{
+	uninorm_t form;
+
+	switch (mode) {
+	case G_NORMALIZE_NFC:
+		form = UNINORM_NFC;
+		break;
+	case G_NORMALIZE_NFD:
+		form = UNINORM_NFD;
+		break;
+	case G_NORMALIZE_NFKC:
+		form = UNINORM_NFKC;
+		break;
+	case G_NORMALIZE_NFKD:
+		form = UNINORM_NFKD;
+		break;
+	default:
+		g_assert_not_reached ();
+	}
+
+	return u16_normalize (form, input, len / 2, NULL, len_out);
+}
+
+/* Normalizes @input to NFKD through libunistring's u16_normalize() and
+ * then runs tracker_parser_unaccent_nfkd_string() on the result in place.
+ *
+ * @len is the size of @input in bytes and is halved to obtain the number
+ * of 16-bit units, as the other helpers in this file do. @len_out, if
+ * given, receives the length left by the unaccenting step, or 0 when the
+ * normalization fails and NULL is returned.
+ *
+ * NOTE(review): the comment on tracker_parser_unaccent_nfkd_string() in
+ * this file says it expects UTF-8 and a byte length, while this caller
+ * hands it UTF-16 and a unit count - confirm the helper copes with that.
+ */
+gunichar2 *
+tracker_parser_unaccent (const gunichar2 *input,
+                         gsize            len,
+                         gsize           *len_out)
+{
+	gunichar2 *zOutput;
+	gsize written = 0;
+
+	zOutput = u16_normalize (UNINORM_NFKD, input, len / 2, NULL, &written);
+
+	if (!zOutput) {
+		/* Nothing to unaccent; report an empty result */
+		if (len_out)
+			*len_out = 0;
+		return NULL;
+	}
+
+	/* Unaccenting is done in place */
+	tracker_parser_unaccent_nfkd_string (zOutput, &written);
+
+	if (len_out)
+		*len_out = written;
+
+	return zOutput;
+}
diff --git a/src/libtracker-common/tracker-parser.h b/src/libtracker-common/tracker-parser.h
index 3c8271503..a6a51fd4e 100644
--- a/src/libtracker-common/tracker-parser.h
+++ b/src/libtracker-common/tracker-parser.h
@@ -59,8 +59,26 @@ void tracker_parser_free (TrackerParser *parser);
/* Other helper methods */
-gboolean tracker_parser_unaccent_nfkd_string (gpointer str,
- gsize *str_length);
+/* UTF-16 string helpers. Each takes @input plus its size @len and
+ * returns a newly allocated buffer, storing a length in @len_out.
+ * The implementations visible in this change halve @len to obtain the
+ * number of 16-bit units, so @len is presumably a byte count.
+ * NOTE(review): the unit of @len_out (bytes vs. 16-bit units) and the
+ * allocator behind the returned buffer should be confirmed for each
+ * backend before relying on them.
+ */
+
+/* Returns a lowercased copy of @input. */
+gunichar2 * tracker_parser_tolower (const gunichar2 *input,
+ gsize len,
+ gsize *len_out);
+
+/* Returns an uppercased copy of @input. */
+gunichar2 * tracker_parser_toupper (const gunichar2 *input,
+ gsize len,
+ gsize *len_out);
+
+/* Returns a case-folded copy of @input. */
+gunichar2 * tracker_parser_casefold (const gunichar2 *input,
+ gsize len,
+ gsize *len_out);
+
+/* Returns a copy of @input normalized to the form selected by @mode
+ * (NFC, NFD, NFKC or NFKD). */
+gunichar2 * tracker_parser_normalize (const gunichar2 *input,
+ GNormalizeMode mode,
+ gsize len,
+ gsize *len_out);
+
+/* Returns a copy of @input that was normalized to NFKD and then
+ * unaccented in place. */
+gunichar2 * tracker_parser_unaccent (const gunichar2 *input,
+ gsize len,
+ gsize *len_out);
G_END_DECLS