/* * Copyright (C) 2006, Jamie McCracken * Copyright (C) 2008,2009,2010 Nokia * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301 USA */ #include "config.h" #include #include /* libunistring versions prior to 9.1.2 need this hack */ #define _UNUSED_PARAMETER_ #include #include #include #include #include "tracker-language.h" #include "tracker-parser.h" #include "tracker-parser-utils.h" /* Type of words detected */ typedef enum { TRACKER_PARSER_WORD_TYPE_ASCII, TRACKER_PARSER_WORD_TYPE_OTHER_UNAC, TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC, } TrackerParserWordType; /* If string lenth less than this value, allocating from the stack */ #define MAX_STACK_STR_SIZE 8192 /* Max possible length of a UTF-8 encoded string (just a safety limit) */ #define WORD_BUFFER_LENGTH 512 struct TrackerParser { const gchar *txt; gint txt_size; TrackerLanguage *language; guint max_word_length; gboolean enable_stemmer; gboolean enable_unaccent; gboolean ignore_stop_words; gboolean ignore_reserved_words; gboolean ignore_numbers; gboolean enable_forced_wordbreaks; /* Private members */ gchar *word; gint word_length; guint word_position; /* Cursor, as index of the input array of bytes */ gsize cursor; /* libunistring flags array */ gchar *word_break_flags; /* general category of the start character in words */ uc_general_category_t allowed_start; }; static gboolean get_word_info (TrackerParser *parser, gsize *p_word_length, gboolean *p_is_allowed_word_start, TrackerParserWordType *p_word_type) { ucs4_t first_unichar; gint first_unichar_len; gboolean ascii_only; /* Defaults */ *p_is_allowed_word_start = TRUE; /* Get first character of the word as UCS4 */ first_unichar_len = u8_strmbtouc (&first_unichar, (const guchar *) &(parser->txt[parser->cursor])); if (first_unichar_len <= 0) { /* This should only happen if NIL was passed to u8_strmbtouc, * so better just force stop here */ return FALSE; } else { /* If first character has length 1, it's ASCII-7 */ ascii_only = first_unichar_len == 1 ? TRUE : FALSE; } /* Consider word starts with a forced wordbreak */ if (parser->enable_forced_wordbreaks && IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) { *p_word_length = first_unichar_len; } else { gsize i; /* Find next word break, and in the same loop checking if only ASCII * characters */ i = parser->cursor + first_unichar_len; while (1) { /* Text bounds reached? */ if (i >= (gsize) parser->txt_size) break; /* Proper unicode word break detected? */ if (parser->word_break_flags[i]) break; /* Forced word break detected? */ if (parser->enable_forced_wordbreaks && IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i])) break; if (ascii_only && !IS_ASCII_UCS4 ((guint32)parser->txt[i])) { ascii_only = FALSE; } i++; } /* Word end is the first byte after the word, which is either the * start of next word or the end of the string */ *p_word_length = i - parser->cursor; } /* We only want the words where the first character * in the word is either a letter, a number or a symbol. * This is needed because the word break algorithm also * considers word breaks after for example commas or other * punctuation marks. * Note that looking at the first character in the string * should be compatible with all Unicode normalization * methods. */ if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) && !uc_is_general_category (first_unichar, parser->allowed_start)) { *p_is_allowed_word_start = FALSE; return TRUE; } /* Decide word type */ if (ascii_only) { *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII; } else if (IS_CJK_UCS4 (first_unichar)) { *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC; } else { *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC; } return TRUE; } /* The input word in this method MUST be normalized in NFKD form, * and given in UTF-8, where str_length is the byte-length * (note: there is no trailing NUL character!) */ static gboolean tracker_parser_unaccent_nfkd_string (gpointer str, gsize *str_length) { gchar *word; gsize word_length; gsize i; gsize j; g_return_val_if_fail (str != NULL, FALSE); g_return_val_if_fail (str_length != NULL, FALSE); word = (gchar *)str; word_length = *str_length; i = 0; j = 0; while (i < word_length) { ucs4_t unichar; gint utf8_len; /* Get next character of the word as UCS4 */ utf8_len = u8_strmbtouc (&unichar, (const guchar *) &word[i]); /* Invalid UTF-8 character or end of original string. */ if (utf8_len <= 0) { break; } /* If the given unichar is a combining diacritical mark, * just update the original index, not the output one */ if (IS_CDM_UCS4 ((guint32) unichar)) { i += utf8_len; continue; } /* If already found a previous combining * diacritical mark, indexes are different so * need to copy characters. As output and input * buffers may overlap, need to use memmove * instead of memcpy */ if (i != j) { memmove (&word[j], &word[i], utf8_len); } /* Update both indexes */ i += utf8_len; j += utf8_len; } /* Set new output length */ *str_length = j; return TRUE; } static gchar * process_word_utf8 (TrackerParser *parser, const gchar *word, gint length, TrackerParserWordType type, gboolean *stop_word) { gchar word_buffer [WORD_BUFFER_LENGTH]; gchar *normalized = NULL; gchar *stemmed = NULL; size_t new_word_length; g_return_val_if_fail (parser != NULL, NULL); g_return_val_if_fail (word != NULL, NULL); /* If length is set as -1, the input word MUST be NIL-terminated. * Otherwise, this restriction is not needed as the length to process * is given as input argument */ if (length < 0) { length = strlen (word); } /* Log original word */ tracker_parser_message_hex ("ORIGINAL word", word, length); /* Normalization and case-folding ONLY for non-ASCII */ if (type != TRACKER_PARSER_WORD_TYPE_ASCII) { /* Leave space for last NIL */ new_word_length = WORD_BUFFER_LENGTH - 1; /* Casefold and NFKD normalization in output. * NOTE: if the output buffer is not big enough, u8_casefold will * return a newly-allocated buffer. */ normalized = (gchar*) u8_casefold ((const uint8_t *)word, length, uc_locale_language (), UNINORM_NFKD, (guchar *) word_buffer, &new_word_length); /* Case folding + Normalization failed, ignore this word */ g_return_val_if_fail (normalized != NULL, NULL); /* If output buffer is not the same as the one passed to * u8_casefold, we know it was newly-allocated, so need * to resize it in 1 byte to add last NIL */ if (normalized != word_buffer) { normalized = g_realloc (normalized, new_word_length + 1); } /* Log after Normalization */ tracker_parser_message_hex (" After Casefolding and NFKD normalization", normalized, new_word_length); } else { /* For ASCII-only, just tolower() each character */ gsize i; normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer; for (i = 0; i < (gsize) length; i++) { normalized[i] = g_ascii_tolower (word[i]); } new_word_length = length; /* Log after tolower */ tracker_parser_message_hex (" After Lowercasing", normalized, new_word_length); } /* UNAC stripping needed? (for non-CJK and non-ASCII) */ if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC && tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) { /* Log after UNAC stripping */ tracker_parser_message_hex (" After UNAC stripping", normalized, new_word_length); } /* Set output NIL */ normalized[new_word_length] = '\0'; /* Check if stop word */ if (parser->ignore_stop_words) { *stop_word = tracker_language_is_stop_word (parser->language, normalized); } /* Stemming needed? */ if (parser->enable_stemmer) { stemmed = tracker_language_stem_word (parser->language, normalized, new_word_length); /* Log after stemming */ tracker_parser_message_hex (" After stemming", stemmed, strlen (stemmed)); } /* If stemmed wanted and succeeded, free previous and return it */ if (stemmed) { if (normalized != word_buffer) { g_free (normalized); } return stemmed; } /* It may be the case that no stripping and no stemming was needed, and * that the output buffer in stack was enough for case-folding and * normalization. In this case, need to strdup() the string to return it */ return normalized == word_buffer ? g_strdup (word_buffer) : normalized; } static gboolean parser_next (TrackerParser *parser, gint *byte_offset_start, gint *byte_offset_end, gboolean *stop_word) { gsize word_length = 0; gchar *processed_word = NULL; *byte_offset_start = 0; *byte_offset_end = 0; g_return_val_if_fail (parser, FALSE); /* Loop to look for next valid word */ while (!processed_word && parser->cursor < (gsize) parser->txt_size) { TrackerParserWordType type; gsize truncated_length; gboolean is_allowed; /* Get word info */ if (!get_word_info (parser, &word_length, &is_allowed, &type)) { /* Quit loop just in case */ parser->cursor = parser->txt_size; break; } /* Ignore the word if not an allowed word start */ if (!is_allowed) { /* Ignore this word and keep on looping */ parser->cursor += word_length; continue; } /* Ignore the word if longer than the maximum allowed */ if (word_length >= parser->max_word_length) { /* Ignore this word and keep on looping */ parser->cursor += word_length; continue; } /* check if word is reserved and ignore it if so */ if (parser->ignore_reserved_words && tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor], word_length)) { /* Ignore this word and keep on looping */ parser->cursor += word_length; continue; } /* compute truncated word length if needed (to avoid extremely * long words)*/ truncated_length = (word_length < WORD_BUFFER_LENGTH ? word_length : WORD_BUFFER_LENGTH - 1); /* Process the word here. If it fails, we can still go * to the next one. Returns newly allocated string * always */ processed_word = process_word_utf8 (parser, &(parser->txt[parser->cursor]), truncated_length, type, stop_word); if (!processed_word) { /* Ignore this word and keep on looping */ parser->cursor += word_length; continue; } } /* If we got a word here, set output */ if (processed_word) { /* Set outputs */ *byte_offset_start = parser->cursor; *byte_offset_end = parser->cursor + word_length; /* Update cursor */ parser->cursor += word_length; parser->word_length = strlen (processed_word); parser->word = processed_word; return TRUE; } /* No more words... */ return FALSE; } TrackerParser * tracker_parser_new (void) { TrackerParser *parser; parser = g_new0 (TrackerParser, 1); parser->language = tracker_language_new (NULL); return parser; } void tracker_parser_free (TrackerParser *parser) { g_return_if_fail (parser != NULL); if (parser->language) { g_object_unref (parser->language); } g_free (parser->word_break_flags); g_free (parser->word); g_free (parser); } void tracker_parser_reset (TrackerParser *parser, const gchar *txt, gint txt_size, guint max_word_length, gboolean enable_stemmer, gboolean enable_unaccent, gboolean ignore_stop_words, gboolean ignore_reserved_words, gboolean ignore_numbers) { g_return_if_fail (parser != NULL); g_return_if_fail (txt != NULL); parser->max_word_length = max_word_length; parser->enable_stemmer = enable_stemmer; parser->enable_unaccent = enable_unaccent; parser->ignore_stop_words = ignore_stop_words; parser->ignore_reserved_words = ignore_reserved_words; parser->ignore_numbers = ignore_numbers; /* Note: We're forcing some unicode characters to behave * as wordbreakers: e.g, the '.' The main reason for this * is to enable FTS searches matching file extension. */ parser->enable_forced_wordbreaks = TRUE; parser->txt_size = txt_size; parser->txt = txt; g_free (parser->word); parser->word = NULL; parser->word_position = 0; parser->cursor = 0; g_free (parser->word_break_flags); /* Create array of flags, same size as original text. */ parser->word_break_flags = g_malloc (txt_size); /* Get wordbreak flags in the whole string */ u8_wordbreaks ((const uint8_t *)txt, (size_t) txt_size, (char *)parser->word_break_flags); /* Prepare a custom category which is a combination of the * desired ones */ parser->allowed_start = UC_LETTER; if (!parser->ignore_numbers) { parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER); } } const gchar * tracker_parser_next (TrackerParser *parser, gint *position, gint *byte_offset_start, gint *byte_offset_end, gboolean *stop_word, gint *word_length) { const gchar *str; gint byte_start = 0, byte_end = 0; str = NULL; g_free (parser->word); parser->word = NULL; *stop_word = FALSE; if (parser_next (parser, &byte_start, &byte_end, stop_word)) { str = parser->word; } if (!*stop_word) { parser->word_position++; } *word_length = parser->word_length; *position = parser->word_position; *byte_offset_start = byte_start; *byte_offset_end = byte_end; return str; } gpointer tracker_collation_init (void) { /* Nothing to do */ return NULL; } void tracker_collation_shutdown (gpointer collator) { /* Nothing to do */ } gint tracker_collation_utf8 (gpointer collator, gint len1, gconstpointer str1, gint len2, gconstpointer str2) { gint result; guchar *aux1; guchar *aux2; /* Note: str1 and str2 are NOT NUL-terminated */ aux1 = (len1 < MAX_STACK_STR_SIZE) ? g_alloca (len1+1) : g_malloc (len1+1); aux2 = (len2 < MAX_STACK_STR_SIZE) ? g_alloca (len2+1) : g_malloc (len2+1); memcpy (aux1, str1, len1); aux1[len1] = '\0'; memcpy (aux2, str2, len2); aux2[len2] = '\0'; result = u8_strcoll (aux1, aux2); if (len1 >= MAX_STACK_STR_SIZE) g_free (aux1); if (len2 >= MAX_STACK_STR_SIZE) g_free (aux2); return result; } gunichar2 * tracker_parser_tolower (const gunichar2 *input, gsize len, gsize *len_out) { return u16_tolower (input, len / 2, NULL, NULL, NULL, len_out); } gunichar2 * tracker_parser_toupper (const gunichar2 *input, gsize len, gsize *len_out) { return u16_toupper (input, len / 2, NULL, NULL, NULL, len_out); } gunichar2 * tracker_parser_casefold (const gunichar2 *input, gsize len, gsize *len_out) { return u16_casefold (input, len / 2, NULL, NULL, NULL, len_out); } gunichar2 * tracker_parser_normalize (const gunichar2 *input, GNormalizeMode mode, gsize len, gsize *len_out) { uninorm_t nf; if (mode == G_NORMALIZE_NFC) nf = UNINORM_NFC; else if (mode == G_NORMALIZE_NFD) nf = UNINORM_NFD; else if (mode == G_NORMALIZE_NFKC) nf = UNINORM_NFKC; else if (mode == G_NORMALIZE_NFKD) nf = UNINORM_NFKD; else g_assert_not_reached (); return u16_normalize (nf, input, len / 2, NULL, len_out); } gunichar2 * tracker_parser_unaccent (const gunichar2 *input, gsize len, gsize *len_out) { gunichar2 *zOutput; gsize written = 0; zOutput = u16_normalize (UNINORM_NFKD, input, len, NULL, &written); /* Unaccenting is done in place */ tracker_parser_unaccent_nfkd_string (zOutput, &written); *len_out = written; return zOutput; }