summaryrefslogtreecommitdiff
path: root/src/libtracker-common
diff options
context:
space:
mode:
authorCarlos Garnacho <carlosg@gnome.org>2023-03-02 11:01:39 +0100
committerCarlos Garnacho <carlosg@gnome.org>2023-03-02 15:44:59 +0100
commit819e2510fbb1fd54dd41d02c99583be94a47e4c3 (patch)
tree0ee95a12b37dd3854a485db84c9298e619b02a69 /src/libtracker-common
parent6d250002586146de5ff1d34eaa29440bc18ff3ae (diff)
downloadtracker-819e2510fbb1fd54dd41d02c99583be94a47e4c3.tar.gz
core: Move unicode treatment SPARQL function features to helpers
Have the helpers defined together with the TrackerParser, so we don't drag details about the unicode library to other parts of the code.
Diffstat (limited to 'src/libtracker-common')
-rw-r--r--src/libtracker-common/tracker-parser-libicu.c186
-rw-r--r--src/libtracker-common/tracker-parser-libunistring.c65
-rw-r--r--src/libtracker-common/tracker-parser.h22
3 files changed, 269 insertions, 4 deletions
diff --git a/src/libtracker-common/tracker-parser-libicu.c b/src/libtracker-common/tracker-parser-libicu.c
index 8c4803206..a3271c003 100644
--- a/src/libtracker-common/tracker-parser-libicu.c
+++ b/src/libtracker-common/tracker-parser-libicu.c
@@ -144,7 +144,7 @@ get_word_info (const UChar *word,
/* The input word in this method MUST be normalized in NFKD form,
* and given in UChars, where str_length is the number of UChars
* (not the number of bytes) */
-gboolean
+static gboolean
tracker_parser_unaccent_nfkd_string (gpointer str,
gsize *str_length)
{
@@ -754,3 +754,187 @@ tracker_parser_next (TrackerParser *parser,
return str;
}
+/* Lowercases a UTF-16 string through ICU's u_strToLower(), using the
+ * default locale (NULL locale argument).
+ *
+ * @input: UTF-16 text, must not be NULL.
+ * @len: size of @input in bytes; it is divided by the UChar size to get
+ *   the number of code units handed to ICU.
+ * @len_out: (optional) receives the length of the result in UTF-16 code
+ *   units, not counting the terminating NUL. This is the same unit the
+ *   libunistring implementation of this helper reports.
+ *
+ * Returns: a NUL-terminated buffer allocated with malloc(), or NULL if
+ * @input is NULL or the allocation failed. If ICU reports an error
+ * (e.g. the result does not fit the buffer), an unmodified copy of
+ * @input is returned instead.
+ */
+gunichar2 *
+tracker_parser_tolower (const gunichar2 *input,
+			gsize            len,
+			gsize           *len_out)
+{
+	UChar *zOutput;
+	gsize nInput, nOutput;
+	int32_t capacity, written;
+	UErrorCode status = U_ZERO_ERROR;
+
+	g_return_val_if_fail (input, NULL);
+
+	nInput = len / sizeof (UChar);
+	/* Case mapping may expand the text, offer ICU len + 1 UChars */
+	capacity = (int32_t) (len + 1);
+
+	/* One slot beyond the capacity given to ICU, so the terminator
+	 * written below always fits.
+	 */
+	zOutput = malloc (((gsize) capacity + 1) * sizeof (UChar));
+	if (!zOutput) {
+		if (len_out)
+			*len_out = 0;
+		return NULL;
+	}
+
+	written = u_strToLower (zOutput, capacity,
+				input, (int32_t) nInput,
+				NULL, &status);
+
+	if (U_SUCCESS (status)) {
+		/* On success ICU returns the real length of the result */
+		nOutput = (gsize) written;
+	} else {
+		/* Fall back to the input as-is; nInput < capacity always */
+		memcpy (zOutput, input, nInput * sizeof (UChar));
+		nOutput = nInput;
+	}
+
+	/* Terminate right after the last valid code unit */
+	zOutput[nOutput] = 0;
+
+	if (len_out)
+		*len_out = nOutput;
+
+	return zOutput;
+}
+
+/* Uppercases a UTF-16 string through ICU's u_strToUpper(), using the
+ * default locale (NULL locale argument).
+ *
+ * @input: UTF-16 text, must not be NULL.
+ * @len: size of @input in bytes; it is divided by the UChar size to get
+ *   the number of code units handed to ICU.
+ * @len_out: (optional) receives the length of the result in UTF-16 code
+ *   units, not counting the terminating NUL. This is the same unit the
+ *   libunistring implementation of this helper reports.
+ *
+ * Returns: a NUL-terminated buffer allocated with malloc(), or NULL if
+ * @input is NULL or the allocation failed. If ICU reports an error
+ * (e.g. the result does not fit the buffer), an unmodified copy of
+ * @input is returned instead.
+ */
+gunichar2 *
+tracker_parser_toupper (const gunichar2 *input,
+			gsize            len,
+			gsize           *len_out)
+{
+	UChar *zOutput;
+	gsize nInput, nOutput;
+	int32_t capacity, written;
+	UErrorCode status = U_ZERO_ERROR;
+
+	g_return_val_if_fail (input, NULL);
+
+	nInput = len / sizeof (UChar);
+	/* Case mapping may expand the text, offer ICU len + 1 UChars */
+	capacity = (int32_t) (len + 1);
+
+	/* One slot beyond the capacity given to ICU, so the terminator
+	 * written below always fits.
+	 */
+	zOutput = malloc (((gsize) capacity + 1) * sizeof (UChar));
+	if (!zOutput) {
+		if (len_out)
+			*len_out = 0;
+		return NULL;
+	}
+
+	written = u_strToUpper (zOutput, capacity,
+				input, (int32_t) nInput,
+				NULL, &status);
+
+	if (U_SUCCESS (status)) {
+		/* On success ICU returns the real length of the result */
+		nOutput = (gsize) written;
+	} else {
+		/* Fall back to the input as-is; nInput < capacity always */
+		memcpy (zOutput, input, nInput * sizeof (UChar));
+		nOutput = nInput;
+	}
+
+	/* Terminate right after the last valid code unit */
+	zOutput[nOutput] = 0;
+
+	if (len_out)
+		*len_out = nOutput;
+
+	return zOutput;
+}
+
+/* Case-folds a UTF-16 string through ICU's u_strFoldCase() with the
+ * U_FOLD_CASE_DEFAULT option.
+ *
+ * @input: UTF-16 text, must not be NULL.
+ * @len: size of @input in bytes; it is divided by the UChar size to get
+ *   the number of code units handed to ICU.
+ * @len_out: (optional) receives the length of the result in UTF-16 code
+ *   units, not counting the terminating NUL. This is the same unit the
+ *   libunistring implementation of this helper reports.
+ *
+ * Returns: a NUL-terminated buffer allocated with malloc(), or NULL if
+ * @input is NULL or the allocation failed. If ICU reports an error
+ * (e.g. the result does not fit the buffer), an unmodified copy of
+ * @input is returned instead.
+ */
+gunichar2 *
+tracker_parser_casefold (const gunichar2 *input,
+			 gsize            len,
+			 gsize           *len_out)
+{
+	UChar *zOutput;
+	gsize nInput, nOutput;
+	int32_t capacity, written;
+	UErrorCode status = U_ZERO_ERROR;
+
+	g_return_val_if_fail (input, NULL);
+
+	nInput = len / sizeof (UChar);
+	/* Case folding may expand the text, offer ICU len + 1 UChars */
+	capacity = (int32_t) (len + 1);
+
+	/* One slot beyond the capacity given to ICU, so the terminator
+	 * written below always fits.
+	 */
+	zOutput = malloc (((gsize) capacity + 1) * sizeof (UChar));
+	if (!zOutput) {
+		if (len_out)
+			*len_out = 0;
+		return NULL;
+	}
+
+	written = u_strFoldCase (zOutput, capacity,
+				 input, (int32_t) nInput,
+				 U_FOLD_CASE_DEFAULT, &status);
+
+	if (U_SUCCESS (status)) {
+		/* On success ICU returns the real length of the result */
+		nOutput = (gsize) written;
+	} else {
+		/* Fall back to the input as-is; nInput < capacity always */
+		memcpy (zOutput, input, nInput * sizeof (UChar));
+		nOutput = nInput;
+	}
+
+	/* Terminate right after the last valid code unit */
+	zOutput[nOutput] = 0;
+
+	if (len_out)
+		*len_out = nOutput;
+
+	return zOutput;
+}
+
+/* Runs @normalizer over @string and returns the result in a newly
+ * allocated (GLib allocator), NUL-terminated buffer.
+ *
+ * @string: UTF-16 input.
+ * @string_len: length of @string, in gunichar2s.
+ * @normalizer: ICU normalizer instance to apply.
+ * @len_out: (optional) receives the result length in gunichar2s, or 0
+ *   on failure.
+ * @status: ICU status, in/out. Callers check it with U_SUCCESS().
+ *
+ * Returns: the normalized string, or NULL if ICU reported an error.
+ */
+static gunichar2 *
+normalize_string (const gunichar2    *string,
+		  gsize               string_len, /* In gunichar2s */
+		  const UNormalizer2 *normalizer,
+		  gsize              *len_out,    /* In gunichar2s */
+		  UErrorCode         *status)
+{
+	int capacity, nOutput;
+	gunichar2 *zOutput;
+
+	/* First guess: twice the input. The buffer holds one zeroed slot
+	 * more than the capacity given to ICU, so the result stays
+	 * NUL-terminated even if it fills the capacity completely.
+	 */
+	capacity = (string_len * 2) + 1;
+	zOutput = g_new0 (gunichar2, capacity + 1);
+
+	nOutput = unorm2_normalize (normalizer, string, string_len, zOutput, capacity, status);
+
+	if (*status == U_BUFFER_OVERFLOW_ERROR) {
+		/* ICU returned the length it needs; retry with exactly that
+		 * plus room for the terminator.
+		 */
+		capacity = nOutput + 1;
+		*status = U_ZERO_ERROR;
+		zOutput = g_renew (gunichar2, zOutput, capacity);
+		memset (zOutput, 0, capacity * sizeof (gunichar2));
+		nOutput = unorm2_normalize (normalizer, string, string_len, zOutput, capacity, status);
+	}
+
+	if (!U_SUCCESS (*status)) {
+		g_clear_pointer (&zOutput, g_free);
+		nOutput = 0;
+	}
+
+	if (len_out)
+		*len_out = nOutput;
+
+	return zOutput;
+}
+
+/* Normalizes a UTF-16 string to the Unicode normalization form selected
+ * by @mode, using the matching ICU UNormalizer2 instance.
+ *
+ * @input: UTF-16 text.
+ * @mode: one of G_NORMALIZE_NFC, _NFD, _NFKC or _NFKD.
+ * @len: size of @input in bytes.
+ * @len_out: (optional) receives the result length in gunichar2s, on
+ *   both the success and the fallback path.
+ *
+ * Returns: a newly allocated buffer (GLib allocator). If ICU fails, a
+ * plain copy of @input is returned instead.
+ */
+gunichar2 *
+tracker_parser_normalize (const gunichar2 *input,
+			  GNormalizeMode   mode,
+			  gsize            len,
+			  gsize           *len_out)
+{
+	gunichar2 *zOutput = NULL;
+	gsize nOutput = 0;
+	const UNormalizer2 *normalizer;
+	UErrorCode status = U_ZERO_ERROR;
+
+	if (mode == G_NORMALIZE_NFC)
+		normalizer = unorm2_getNFCInstance (&status);
+	else if (mode == G_NORMALIZE_NFD)
+		normalizer = unorm2_getNFDInstance (&status);
+	else if (mode == G_NORMALIZE_NFKC)
+		normalizer = unorm2_getNFKCInstance (&status);
+	else if (mode == G_NORMALIZE_NFKD)
+		normalizer = unorm2_getNFKDInstance (&status);
+	else
+		g_assert_not_reached ();
+
+	if (U_SUCCESS (status)) {
+		/* normalize_string() works in gunichar2s, @len is in bytes */
+		zOutput = normalize_string (input, len / sizeof (gunichar2),
+					    normalizer,
+					    &nOutput, &status);
+	}
+
+	if (!U_SUCCESS (status)) {
+		/* normalize_string() already released its buffer on error,
+		 * hand back a copy of the input. Report its length in
+		 * gunichar2s, the same unit as the success path.
+		 */
+		zOutput = g_memdup2 (input, len);
+		nOutput = len / sizeof (gunichar2);
+	}
+
+	if (len_out)
+		*len_out = nOutput;
+
+	return zOutput;
+}
+
+/* Strips accents from a UTF-16 string: the text is NFKD-normalized with
+ * ICU and then passed to tracker_parser_unaccent_nfkd_string(), which
+ * works in place.
+ *
+ * @input: UTF-16 text.
+ * @len: size of @input in bytes.
+ * @len_out: (optional) receives the length left in the length variable
+ *   by the in-place helper, which is documented to count UChars.
+ *
+ * Returns: a newly allocated buffer (GLib allocator). If normalization
+ * fails, the unaccenting runs over a copy of @input.
+ */
+gunichar2 *
+tracker_parser_unaccent (const gunichar2 *input,
+			 gsize            len,
+			 gsize           *len_out)
+{
+	gunichar2 *zOutput = NULL;
+	gsize nOutput = 0;
+	const UNormalizer2 *normalizer;
+	UErrorCode status = U_ZERO_ERROR;
+
+	normalizer = unorm2_getNFKDInstance (&status);
+
+	if (U_SUCCESS (status)) {
+		/* normalize_string() works in gunichar2s, @len is in bytes */
+		zOutput = normalize_string (input, len / sizeof (gunichar2),
+					    normalizer,
+					    &nOutput, &status);
+	}
+
+	if (!U_SUCCESS (status)) {
+		/* Fall back to a copy of the input, and tell the helper
+		 * below how long it is: the length is otherwise unset (or
+		 * zeroed by normalize_string()) on this path.
+		 * NOTE(review): the spare zeroed slot is there in case the
+		 * in-place helper terminates the string; its body is not
+		 * visible from here.
+		 */
+		nOutput = len / sizeof (gunichar2);
+		zOutput = g_new0 (gunichar2, nOutput + 1);
+		if (input && nOutput > 0)
+			memcpy (zOutput, input, nOutput * sizeof (gunichar2));
+	}
+
+	/* Unaccenting is done in place */
+	tracker_parser_unaccent_nfkd_string (zOutput, &nOutput);
+
+	if (len_out)
+		*len_out = nOutput;
+
+	return zOutput;
+}
diff --git a/src/libtracker-common/tracker-parser-libunistring.c b/src/libtracker-common/tracker-parser-libunistring.c
index d24c5f1cb..04f08a2c6 100644
--- a/src/libtracker-common/tracker-parser-libunistring.c
+++ b/src/libtracker-common/tracker-parser-libunistring.c
@@ -159,7 +159,7 @@ get_word_info (TrackerParser *parser,
/* The input word in this method MUST be normalized in NFKD form,
* and given in UTF-8, where str_length is the byte-length
* (note: there is no trailing NUL character!) */
-gboolean
+static gboolean
tracker_parser_unaccent_nfkd_string (gpointer str,
gsize *str_length)
{
@@ -541,3 +541,66 @@ tracker_parser_next (TrackerParser *parser,
return str;
}
+/* Lowercases a UTF-16 string with libunistring's u16_tolower().
+ * @len is halved before being passed as the element count. The
+ * language, normalization form and result buffer arguments are all
+ * NULL, and u16_tolower() stores the result length in @len_out.
+ */
+gunichar2 *
+tracker_parser_tolower (const gunichar2 *input,
+			gsize            len,
+			gsize           *len_out)
+{
+	gsize n_units = len / 2;
+	gunichar2 *lowered;
+
+	lowered = u16_tolower (input, n_units, NULL, NULL, NULL, len_out);
+
+	return lowered;
+}
+
+/* Uppercases a UTF-16 string with libunistring's u16_toupper().
+ * @len is halved before being passed as the element count. The
+ * language, normalization form and result buffer arguments are all
+ * NULL, and u16_toupper() stores the result length in @len_out.
+ */
+gunichar2 *
+tracker_parser_toupper (const gunichar2 *input,
+			gsize            len,
+			gsize           *len_out)
+{
+	gsize n_units = len / 2;
+	gunichar2 *raised;
+
+	raised = u16_toupper (input, n_units, NULL, NULL, NULL, len_out);
+
+	return raised;
+}
+
+/* Case-folds a UTF-16 string with libunistring's u16_casefold().
+ * @len is halved before being passed as the element count. The
+ * language, normalization form and result buffer arguments are all
+ * NULL, and u16_casefold() stores the result length in @len_out.
+ */
+gunichar2 *
+tracker_parser_casefold (const gunichar2 *input,
+			 gsize            len,
+			 gsize           *len_out)
+{
+	gsize n_units = len / 2;
+	gunichar2 *folded;
+
+	folded = u16_casefold (input, n_units, NULL, NULL, NULL, len_out);
+
+	return folded;
+}
+
+/* Normalizes a UTF-16 string with libunistring's u16_normalize().
+ * @mode is translated to the matching UNINORM_* form; any other value
+ * trips g_assert_not_reached(). @len is halved before being passed as
+ * the element count, and u16_normalize() stores the result length in
+ * @len_out.
+ */
+gunichar2 *
+tracker_parser_normalize (const gunichar2 *input,
+			  GNormalizeMode   mode,
+			  gsize            len,
+			  gsize           *len_out)
+{
+	uninorm_t nf;
+
+	switch (mode) {
+	case G_NORMALIZE_NFC:
+		nf = UNINORM_NFC;
+		break;
+	case G_NORMALIZE_NFD:
+		nf = UNINORM_NFD;
+		break;
+	case G_NORMALIZE_NFKC:
+		nf = UNINORM_NFKC;
+		break;
+	case G_NORMALIZE_NFKD:
+		nf = UNINORM_NFKD;
+		break;
+	default:
+		g_assert_not_reached ();
+	}
+
+	return u16_normalize (nf, input, len / 2, NULL, len_out);
+}
+
+/* Strips accents from a UTF-16 string: the text is NFKD-normalized with
+ * libunistring and then passed to tracker_parser_unaccent_nfkd_string(),
+ * which works in place.
+ *
+ * @input: UTF-16 text.
+ * @len: size of @input in bytes, as in the other helpers of this file.
+ * @len_out: receives the length left in the length variable by the
+ *   in-place helper, or 0 if normalization failed.
+ *
+ * Returns: the buffer produced by u16_normalize(), or NULL if it failed.
+ */
+gunichar2 *
+tracker_parser_unaccent (const gunichar2 *input,
+			 gsize            len,
+			 gsize           *len_out)
+{
+	gunichar2 *zOutput;
+	gsize written = 0;
+
+	/* u16_normalize() counts UTF-16 units, @len is in bytes. Passing
+	 * @len unchanged would make it read past the end of @input.
+	 */
+	zOutput = u16_normalize (UNINORM_NFKD, input, len / 2, NULL, &written);
+
+	if (!zOutput) {
+		/* Normalization failed, there is nothing to unaccent */
+		*len_out = 0;
+		return NULL;
+	}
+
+	/* Unaccenting is done in place.
+	 * NOTE(review): the comment above the helper describes its input
+	 * as UTF-8 with a byte length, while this buffer is UTF-16 with a
+	 * unit count. Confirm that the helper handles that.
+	 */
+	tracker_parser_unaccent_nfkd_string (zOutput, &written);
+
+	*len_out = written;
+
+	return zOutput;
+}
diff --git a/src/libtracker-common/tracker-parser.h b/src/libtracker-common/tracker-parser.h
index 3c8271503..a6a51fd4e 100644
--- a/src/libtracker-common/tracker-parser.h
+++ b/src/libtracker-common/tracker-parser.h
@@ -59,8 +59,26 @@ void tracker_parser_free (TrackerParser *parser);
/* Other helper methods */
-gboolean tracker_parser_unaccent_nfkd_string (gpointer str,
- gsize *str_length);
+/* Unicode string helpers, implemented once per Unicode backend
+ * (tracker-parser-libicu.c and tracker-parser-libunistring.c).
+ * They take UTF-16 text in @input with its size in @len, and return a
+ * newly allocated UTF-16 string, storing a length in @len_out.
+ * The implementations treat @len as a byte count: it is halved before
+ * being used as a number of UTF-16 code units.
+ */
+
+/* Returns a lowercased copy of @input. */
+gunichar2 * tracker_parser_tolower (const gunichar2 *input,
+ gsize len,
+ gsize *len_out);
+
+/* Returns an uppercased copy of @input. */
+gunichar2 * tracker_parser_toupper (const gunichar2 *input,
+ gsize len,
+ gsize *len_out);
+
+/* Returns a case-folded copy of @input. */
+gunichar2 * tracker_parser_casefold (const gunichar2 *input,
+ gsize len,
+ gsize *len_out);
+
+/* Returns a copy of @input in the normalization form given by @mode
+ * (G_NORMALIZE_NFC, _NFD, _NFKC or _NFKD; other values are asserted
+ * against).
+ */
+gunichar2 * tracker_parser_normalize (const gunichar2 *input,
+ GNormalizeMode mode,
+ gsize len,
+ gsize *len_out);
+
+/* Returns an NFKD-normalized copy of @input that was then run through
+ * the backend's in-place unaccenting routine.
+ */
+gunichar2 * tracker_parser_unaccent (const gunichar2 *input,
+ gsize len,
+ gsize *len_out);
G_END_DECLS