/*
 * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
 * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301  USA
 */

#include "config.h"

#include <stdio.h>
#include <string.h>
#include <locale.h>

#include <unicode/utypes.h>
#include <unicode/ucnv.h>
#include <unicode/ubrk.h>
#include <unicode/ustring.h>
#include <unicode/uchar.h>
#include <unicode/unorm.h>
#include <unicode/ucol.h>

#include "tracker-language.h"
#include "tracker-debug.h"
#include "tracker-parser.h"
#include "tracker-parser-utils.h"

/* Type of words detected by get_word_info(); it selects how
 * process_word_uchar() transforms the word. */
typedef enum {
	/* Every character in the word is ASCII: the word is only lowercased */
	TRACKER_PARSER_WORD_TYPE_ASCII,
	/* At least one non-ASCII character, not starting with CJK: the word
	 * is casefolded, NFKD-normalized and, if enabled, unaccented */
	TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
	/* First character is CJK: the word is casefolded and NFKD-normalized
	 * but never unaccented */
	TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
} TrackerParserWordType;

/* Alias for the ICU collator type */
typedef UCollator TrackerCollator;

/* Capacity, in UChars, of the fixed-size buffers used while processing a
 * single word (just a safety limit) */
#define WORD_BUFFER_LENGTH 512

/* State of one parser instance. The same instance is re-armed for each new
 * text with tracker_parser_reset() and then iterated with
 * tracker_parser_next(). */
struct TrackerParser {
	/* Input text as given to tracker_parser_reset(); the pointer is stored,
	 * not copied, and is not freed by the parser */
	const gchar           *txt;
	/* Size of txt, in bytes */
	gint                   txt_size;

	/* Language object used for stop-word lookup and stemming */
	TrackerLanguage       *language;
	/* Words whose original UTF-8 length is >= this value are skipped */
	guint                  max_word_length;
	gboolean               enable_stemmer;
	gboolean               enable_unaccent;
	gboolean               ignore_stop_words;
	gboolean               ignore_reserved_words;
	gboolean               ignore_numbers;
	gboolean               enable_forced_wordbreaks;

	/* Private members */
	/* Last word returned (UTF-8, owned by the parser) and its byte length */
	gchar                 *word;
	gint                   word_length;
	/* Counter reported as the word position by tracker_parser_next() */
	guint                  word_position;

	/* Text as UChars */
	UChar                 *utxt;
	/* Number of UChars in utxt */
	gsize                  utxt_size;
	/* Original offset of each UChar in the input txt string */
	gint32                *offsets;

	/* The word-break iterator */
	UBreakIterator        *bi;

	/* Cursor, as an index (in UChars) into the utxt array */
	gsize                  cursor;
};


/* Inspects a candidate word and reports whether it may be indexed and how
 * it has to be processed.
 *
 * word / word_length: the candidate, as UChars (length in UChars).
 * ignore_numbers: when TRUE, a leading number does not start a word.
 * p_is_allowed_word_start: set to TRUE when the first character is a
 *   letter, an underscore or (unless ignored) a number.
 * p_word_type: set only when the word start is allowed.
 *
 * Returns FALSE if no first character could be read, TRUE otherwise.
 */
static gboolean
get_word_info (const UChar           *word,
               gsize                  word_length,
               gboolean               ignore_numbers,
               gboolean              *p_is_allowed_word_start,
               TrackerParserWordType *p_word_type)
{
	UCharIterator chars;
	UChar32 c;
	gboolean allowed;

	/* Read the first code point of the word */
	uiter_setString (&chars, word, word_length);
	c = uiter_current32 (&chars);
	if (c == U_SENTINEL)
		return FALSE;

	/* The word-break algorithm also reports "words" made of punctuation
	 * and the like, so only accept those whose first character is a
	 * letter, a number or an underscore. Looking at the first character
	 * only is compatible with every Unicode normalization form. */
	switch (u_charType (c)) {
	case U_UPPERCASE_LETTER:
	case U_LOWERCASE_LETTER:
	case U_TITLECASE_LETTER:
	case U_MODIFIER_LETTER:
	case U_OTHER_LETTER:
		allowed = TRUE;
		break;
	case U_DECIMAL_DIGIT_NUMBER:
	case U_LETTER_NUMBER:
	case U_OTHER_NUMBER:
		allowed = !ignore_numbers;
		break;
	default:
		allowed = FALSE;
		break;
	}

	if (!allowed && IS_UNDERSCORE_UCS4 ((guint32) c))
		allowed = TRUE;

	*p_is_allowed_word_start = allowed;
	if (!allowed)
		return TRUE;

	/* Words starting with a CJK character are never unaccented */
	if (IS_CJK_UCS4 ((guint32) c)) {
		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
		return TRUE;
	}

	/* Any non-ASCII character makes the word a candidate for unaccenting */
	for (; c != U_SENTINEL; c = uiter_next32 (&chars)) {
		if (!IS_ASCII_UCS4 ((guint32) c)) {
			*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
			return TRUE;
		}
	}

	*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
	return TRUE;
}

/* Removes combining diacritical marks from a string, in place.
 *
 * The input word MUST be normalized in NFKD form and given in UChars.
 *
 * str: buffer of UChars, modified in place.
 * str_length: in: number of UChars (not bytes) in str;
 *             out: number of UChars left after stripping.
 *
 * The function never writes outside the first *str_length UChars of the
 * buffer: a terminating 0 is stored right after the result only when at
 * least one UChar was removed (so that slot is known to exist).
 *
 * Returns FALSE if any argument is NULL, TRUE otherwise.
 */
static gboolean
tracker_parser_unaccent_nfkd_string (gpointer  str,
                                     gsize    *str_length)
{
	UChar *word;
	gsize word_length;
	gsize i;
	gsize j;

	g_return_val_if_fail (str != NULL, FALSE);
	g_return_val_if_fail (str_length != NULL, FALSE);

	word = (UChar *)str;
	word_length = *str_length;

	i = 0;
	j = 0;
	while (i < word_length) {
		UChar32 unichar;
		gint utf16_len; /* given in UChars */
		gsize aux_i;

		/* Get next character of the word as UCS4 */
		aux_i = i;
		U16_NEXT (word, aux_i, word_length, unichar);
		utf16_len = aux_i - i;

		/* Invalid UTF-16 character or end of original string. */
		if (utf16_len <= 0) {
			break;
		}

		/* If the given unichar is a combining diacritical mark,
		 * just update the original index, not the output one */
		if (IS_CDM_UCS4 ((guint32) unichar)) {
			i += utf16_len;
			continue;
		}

		/* If already found a previous combining
		 * diacritical mark, indexes are different so
		 * need to copy characters. As output and input
		 * buffers may overlap, need to use memmove
		 * instead of memcpy */
		if (i != j) {
			memmove (&word[j], &word[i], sizeof (UChar) * utf16_len);
		}

		/* Update both indexes */
		i += utf16_len;
		j += utf16_len;
	}

	/* Force proper string end, but only inside the caller-provided
	 * length: when nothing was stripped (j == word_length) the slot at
	 * word[j] is not known to belong to the buffer, and the string is
	 * unchanged anyway. */
	if (j < word_length) {
		word[j] = (UChar) 0;
	}

	/* Set new output length */
	*str_length = j;

	return TRUE;
}

/* Converts a UChar string into a newly allocated UTF-8 string.
 *
 * word / uchar_len: input string and its length in UChars.
 * utf8_len: set to the length in bytes of the returned string.
 *
 * Returns the UTF-8 string (free with g_free()), or NULL if the
 * converter cannot be opened or the conversion fails; utf8_len is left
 * untouched in that case.
 */
static gchar *
convert_UChar_to_utf8 (const UChar *word,
                       gsize        uchar_len,
                       gsize       *utf8_len)
{
	UErrorCode status = U_ZERO_ERROR;
	UConverter *to_utf8;
	gchar *out;
	gsize capacity;
	gsize written;

	g_return_val_if_fail (word, NULL);
	g_return_val_if_fail (utf8_len, NULL);

	/* Open the UTF-8 converter */
	to_utf8 = ucnv_open ("UTF-8", &status);
	if (!to_utf8) {
		g_warning ("Cannot open UTF-8 converter: '%s'",
		           U_FAILURE (status) ? u_errorName (status) : "none");
		return NULL;
	}

	/* A UChar may expand to several bytes in UTF-8, so reserve twice
	 * the UTF-16 byte size plus one byte for the terminator. */
	capacity = 2 * uchar_len * sizeof (UChar) + 1;
	out = g_malloc (capacity);

	/* Convert from UChar to UTF-8 (NIL-terminated) */
	written = ucnv_fromUChars (to_utf8,
	                           out,
	                           capacity,
	                           word,
	                           uchar_len,
	                           &status);
	ucnv_close (to_utf8);

	if (U_FAILURE (status)) {
		g_warning ("Cannot convert from UChar to UTF-8: '%s'",
		           u_errorName (status));
		g_free (out);
		return NULL;
	}

	*utf8_len = written;

	return out;
}

/* Turns one raw word (in UChars) into its indexable UTF-8 form.
 *
 * Steps: lowercase (ASCII words) or casefold + NFKD-normalize (other
 * words), optional unaccenting, conversion to UTF-8, optional stop-word
 * check and optional stemming.
 *
 * parser: provides the enable_unaccent / ignore_stop_words /
 *   enable_stemmer flags and the language object.
 * word / length: the word and its length in UChars.
 * type: word type as reported by get_word_info().
 * stop_word: written only when parser->ignore_stop_words is set and the
 *   word could be converted to UTF-8.
 *
 * Returns a newly allocated UTF-8 string, or NULL if any ICU step fails
 * (for instance when the word does not fit the WORD_BUFFER_LENGTH
 * buffers).
 */
static gchar *
process_word_uchar (TrackerParser         *parser,
                    const UChar           *word,
                    gint                   length,
                    TrackerParserWordType  type,
                    gboolean              *stop_word)
{
	UErrorCode error = U_ZERO_ERROR;
	UChar normalized_buffer[WORD_BUFFER_LENGTH];
	gchar *utf8_str = NULL;
	gsize new_word_length;

	/* Log original word */
	tracker_parser_message_hex ("ORIGINAL word",
	                            (guint8 *)word,
	                            length * sizeof (UChar));


	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
		UChar casefolded_buffer [WORD_BUFFER_LENGTH];
		const UNormalizer2 *normalizer;

		/* Casefold... */
		new_word_length = u_strFoldCase (casefolded_buffer,
		                                 WORD_BUFFER_LENGTH,
		                                 word,
		                                 length,
		                                 U_FOLD_CASE_DEFAULT,
		                                 &error);
		if (U_FAILURE (error)) {
			g_warning ("Error casefolding: '%s'",
			           u_errorName (error));
			return NULL;
		}
		if (new_word_length > WORD_BUFFER_LENGTH)
			new_word_length = WORD_BUFFER_LENGTH;

		/* Log after casefolding */
		tracker_parser_message_hex (" After Casefolding",
		                            (guint8 *)casefolded_buffer,
		                            new_word_length * sizeof (UChar));

		/* NFKD normalization... */
		normalizer = unorm2_getNFKDInstance (&error);

		if (U_SUCCESS (error)) {
			new_word_length = unorm2_normalize (normalizer,
			                                    casefolded_buffer,
			                                    new_word_length,
			                                    normalized_buffer,
			                                    WORD_BUFFER_LENGTH,
			                                    &error);
		}

		if (U_FAILURE (error)) {
			g_warning ("Error normalizing: '%s'",
			           u_errorName (error));
			return NULL;
		}

		if (new_word_length > WORD_BUFFER_LENGTH)
			new_word_length = WORD_BUFFER_LENGTH;

		/* Log after normalization */
		tracker_parser_message_hex (" After Normalization",
		                            (guint8 *) normalized_buffer,
		                            new_word_length * sizeof (UChar));
	} else {
		/* For ASCII-only, just tolower() each character */
		new_word_length = u_strToLower (normalized_buffer,
		                                WORD_BUFFER_LENGTH,
		                                word,
		                                length,
		                                NULL,
		                                &error);
		if (U_FAILURE (error)) {
			g_warning ("Error lowercasing: '%s'",
			           u_errorName (error));
			return NULL;
		}

		/* Log after lowercasing */
		tracker_parser_message_hex (" After lowercase",
		                            (guint8 *) normalized_buffer,
		                            new_word_length * sizeof (UChar));
	}

	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
	if (parser->enable_unaccent &&
	    type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
	    tracker_parser_unaccent_nfkd_string (normalized_buffer, &new_word_length)) {
		/* Log after unaccenting */
		tracker_parser_message_hex ("  After UNAC",
		                            (guint8 *) normalized_buffer,
		                            new_word_length * sizeof (UChar));
	}

	/* Finally, convert to UTF-8 */
	utf8_str = convert_UChar_to_utf8 (normalized_buffer,
	                                  new_word_length,
	                                  &new_word_length);
	if (!utf8_str) {
		/* The conversion already logged a warning. Without a UTF-8
		 * string there is nothing to look up or stem, and
		 * new_word_length still holds a UChar count. */
		return NULL;
	}

	/* Log after UTF-8 conversion */
	tracker_parser_message_hex ("   After UTF8 conversion",
	                            utf8_str,
	                            new_word_length);

	/* Check if stop word */
	if (parser->ignore_stop_words) {
		*stop_word = tracker_language_is_stop_word (parser->language,
		                                            utf8_str);
	}

	/* Stemming needed? */
	if (parser->enable_stemmer) {
		gchar *stemmed;

		/* Input for stemmer ALWAYS in UTF-8, as well as output */
		stemmed = tracker_language_stem_word (parser->language,
		                                      utf8_str,
		                                      new_word_length);

		/* If stemming succeeded, free previous and return it */
		if (stemmed) {
			/* Log after stemming (only once stemmed is known
			 * to be non-NULL, as strlen() needs a string) */
			tracker_parser_message_hex ("    After stemming",
			                            stemmed, strlen (stemmed));

			g_free (utf8_str);
			return stemmed;
		}
	}

	return utf8_str;
}

/* Looks for a forced word break inside the word [current, *next).
 *
 * buffer: the whole text, as UChars.
 * current: index (in UChars) of the first UChar of the word.
 * next: in: index (in UChars) one past the word as reported by the
 *       word-break iterator; out: moved back to the forced break
 *       position when one is found.
 *
 * A word that starts with a forced wordbreak character is cut right
 * after that character, so the character becomes a word of its own.
 *
 * All offsets are tracked in UChars: a supplementary character takes two
 * UChars, so counting code points would give a wrong offset.
 *
 * Returns TRUE if *next was changed, FALSE otherwise.
 */
static gboolean
parser_check_forced_wordbreaks (const UChar *buffer,
                                gsize        current,
                                gsize       *next)
{
	const UChar *word = &buffer[current];
	gsize unicode_word_length = *next - current;
	gsize word_length = 0;

	/* Iterate over the string looking for forced word breaks */
	while (word_length < unicode_word_length) {
		UChar32 unichar;
		gsize after = word_length;

		/* Decode one code point; 'after' ends up one or two UChars
		 * past word_length */
		U16_NEXT (word, after, unicode_word_length, unichar);

		if (IS_FORCED_WORDBREAK_UCS4 ((guint32) unichar)) {
			/* Support word starting with a forced wordbreak */
			if (word_length == 0) {
				word_length = after;
			}
			break;
		}

		word_length = after;
	}

	if (word_length != unicode_word_length) {
		*next = current + word_length;
		return TRUE;
	}
	return FALSE;
}

/* Advances the parser to the next acceptable word.
 *
 * Candidate words come from the ICU word-break iterator, optionally cut
 * shorter at forced wordbreak characters. A candidate is skipped when its
 * UTF-8 length reaches max_word_length, when its first character is not an
 * allowed word start, when it is a reserved word (if enabled), or when
 * process_word_uchar() fails on it.
 *
 * parser: the parser; its cursor, word and word_length are updated.
 * byte_offset_start / byte_offset_end: set to the byte range of the word in
 *   the original UTF-8 text; both are 0 when no word is found.
 * stop_word: forwarded to process_word_uchar().
 *
 * Returns TRUE if a word was found (it is then stored in parser->word),
 * FALSE when there are no more words.
 */
static gboolean
parser_next (TrackerParser *parser,
             gint          *byte_offset_start,
             gint          *byte_offset_end,
             gboolean      *stop_word)
{
	gsize word_length_uchar = 0;
	gsize word_length_utf8 = 0;
	gchar *processed_word = NULL;
	gsize current_word_offset_utf8 = 0;

	*byte_offset_start = 0;
	*byte_offset_end = 0;

	g_return_val_if_fail (parser, FALSE);

	/* Loop to look for next valid word */
	while (!processed_word &&
	       parser->cursor < parser->utxt_size) {
		TrackerParserWordType type;
		gboolean is_allowed;
		gsize next_word_offset_uchar;
		gsize next_word_offset_utf8;
		gsize truncated_length;

		/* Set current word offset in the original UTF-8 string */
		current_word_offset_utf8 = parser->offsets[parser->cursor];

		/* Find next word break. */
		next_word_offset_uchar = ubrk_next (parser->bi);

		/* Check if any forced wordbreaks here... */
		if (parser->enable_forced_wordbreaks) {
			/* Returns TRUE if next word offset changed */
			if (parser_check_forced_wordbreaks (parser->utxt,
			                                    parser->cursor,
			                                    &next_word_offset_uchar)) {
				/* We need to reset the iterator so that next word
				 * actually returns the same result */
				ubrk_previous (parser->bi);
			}
		}

		if (next_word_offset_uchar >= parser->utxt_size) {
			/* Last word support... */
			next_word_offset_uchar = parser->utxt_size;
			next_word_offset_utf8 = parser->txt_size;
		} else {
			next_word_offset_utf8 = parser->offsets[next_word_offset_uchar];
		}

		/* Word end is the first byte after the word, which is either the
		 *  start of next word or the end of the string */
		word_length_uchar = next_word_offset_uchar - parser->cursor;
		word_length_utf8 = next_word_offset_utf8 - current_word_offset_utf8;

		/* g_debug ("word_length_uchar: %" G_GSIZE_FORMAT, word_length_uchar); */
		/* g_debug ("next_word_offset_uchar: %" G_GSIZE_FORMAT, next_word_offset_uchar); */
		/* g_debug ("current_word_offset_uchar: %" G_GSIZE_FORMAT, parser->cursor); */
		/* g_debug ("word_length_utf8: %" G_GSIZE_FORMAT, word_length_utf8); */
		/* g_debug ("next_word_offset_utf8: %" G_GSIZE_FORMAT, next_word_offset_utf8); */
		/* g_debug ("current_word_offset_utf8: %" G_GSIZE_FORMAT, current_word_offset_utf8); */

		/* Ignore the word if its UTF-8 length reaches the maximum allowed */
		if (word_length_utf8 >= parser->max_word_length) {
			/* Ignore this word and keep on looping */
			parser->cursor = next_word_offset_uchar;
			continue;
		}

		/* Get word info... */
		if (!get_word_info (&parser->utxt[parser->cursor],
		                    word_length_uchar,
		                    parser->ignore_numbers,
		                    &is_allowed,
		                    &type)) {
			/* Quit loop just in case */
			parser->cursor = parser->utxt_size;
			break;
		}

		/* Ignore the word if not an allowed word start */
		if (!is_allowed) {
			/* Ignore this word and keep on looping */
			parser->cursor = next_word_offset_uchar;
			continue;
		}

		/* check if word is reserved (looking at ORIGINAL UTF-8 buffer here!) */
		if (parser->ignore_reserved_words &&
		    tracker_parser_is_reserved_word_utf8 (&parser->txt[current_word_offset_utf8],
		                                          word_length_utf8)) {
			/* Ignore this word and keep on looping */
			parser->cursor = next_word_offset_uchar;
			continue;
		}

		/* compute truncated word length (in UChars) if needed (to
		 * avoid extremely long words).
		 * NOTE(review): the cap is 2 * WORD_BUFFER_LENGTH UChars while
		 * process_word_uchar() works on WORD_BUFFER_LENGTH-sized
		 * buffers - confirm this limit is the intended one. */
		truncated_length = (word_length_uchar < 2 * WORD_BUFFER_LENGTH ?
		                    word_length_uchar :
		                    2 * WORD_BUFFER_LENGTH);

		/* Process the word here. If it fails, we can still go
		 *  to the next one. Returns newly allocated UTF-8
		 *  string on success.
		 * Enable UNAC stripping only if no ASCII and no CJK
		 * Note we are passing UChar encoded string here!
		 */
		processed_word = process_word_uchar (parser,
		                                     &(parser->utxt[parser->cursor]),
		                                     truncated_length,
		                                     type,
		                                     stop_word);
		if (!processed_word) {
			/* Ignore this word and keep on looping */
			parser->cursor = next_word_offset_uchar;
			continue;
		}
	}

	/* If we got a word here, set output */
	if (processed_word) {
		/* Set outputs */
		*byte_offset_start = current_word_offset_utf8;
		*byte_offset_end = current_word_offset_utf8 + word_length_utf8;

		/* Update cursor: move past the (untruncated) word */
		parser->cursor += word_length_uchar;

		/* The parser takes ownership of the processed word */
		parser->word_length = strlen (processed_word);
		parser->word = processed_word;

		return TRUE;
	}

	/* No more words... */
	return FALSE;
}

/* Creates a new, zero-initialized parser holding a language object
 * created with tracker_language_new (NULL). Release it with
 * tracker_parser_free(). */
TrackerParser *
tracker_parser_new (void)
{
	TrackerParser *self = g_new0 (TrackerParser, 1);

	self->language = tracker_language_new (NULL);

	return self;
}

/* Releases a parser and everything it owns: the language object, the
 * word-break iterator, the UChar text, the offsets table and the last
 * returned word. The original input text is not owned and not freed. */
void
tracker_parser_free (TrackerParser *parser)
{
	g_return_if_fail (parser != NULL);

	/* Plain heap buffers first; g_free() accepts NULL */
	g_free (parser->word);
	g_free (parser->offsets);
	g_free (parser->utxt);

	/* These two need a non-NULL check before being released */
	if (parser->bi != NULL)
		ubrk_close (parser->bi);

	if (parser->language != NULL)
		g_object_unref (parser->language);

	g_free (parser);
}

/* Re-arms the parser for a new text and a new set of options.
 *
 * parser: the parser to reset.
 * txt / txt_size: UTF-8 text and its size in bytes. The pointer is stored,
 *   not copied, and is read again while iterating, so it must stay valid
 *   until the parser is reset again or freed.
 * max_word_length, enable_stemmer, enable_unaccent, ignore_stop_words,
 * ignore_reserved_words, ignore_numbers: stored as parser options.
 *
 * Any state from a previous text (last word, UChar text, offsets, break
 * iterator, cursor, word position) is dropped first. The text is then
 * converted to UChars, recording the original byte offset of each UChar,
 * and a word-break iterator is opened on it. If the text is empty or any
 * ICU step fails, the parser is left with no text, so iterating it simply
 * yields no words.
 */
void
tracker_parser_reset (TrackerParser *parser,
                      const gchar   *txt,
                      gint           txt_size,
                      guint          max_word_length,
                      gboolean       enable_stemmer,
                      gboolean       enable_unaccent,
                      gboolean       ignore_stop_words,
                      gboolean       ignore_reserved_words,
                      gboolean       ignore_numbers)
{
	UErrorCode error = U_ZERO_ERROR;
	UConverter *converter;
	UChar *last_uchar;
	const gchar *last_utf8;

	g_return_if_fail (parser != NULL);
	g_return_if_fail (txt != NULL);

	parser->max_word_length = max_word_length;
	parser->enable_stemmer = enable_stemmer;
	parser->enable_unaccent = enable_unaccent;
	parser->ignore_stop_words = ignore_stop_words;
	parser->ignore_reserved_words = ignore_reserved_words;
	parser->ignore_numbers = ignore_numbers;

	/* Note: We're forcing some unicode characters to behave
	 * as wordbreakers: e.g, the '.' The main reason for this
	 * is to enable FTS searches matching file extension. */
	parser->enable_forced_wordbreaks = TRUE;

	parser->txt_size = txt_size;
	parser->txt = txt;

	g_free (parser->word);
	parser->word = NULL;

	if (parser->bi) {
		ubrk_close (parser->bi);
		parser->bi = NULL;
	}
	g_free (parser->utxt);
	parser->utxt = NULL;
	g_free (parser->offsets);
	parser->offsets = NULL;
	/* Must be cleared together with the buffers: the early returns
	 * below would otherwise leave the size of the previous text
	 * behind, and iterating would index the NULL buffers. */
	parser->utxt_size = 0;

	parser->word_position = 0;

	parser->cursor = 0;

	if (parser->txt_size == 0)
		return;

	/* Open converter UTF-8 to UChar */
	converter = ucnv_open ("UTF-8", &error);
	if (!converter) {
		g_warning ("Cannot open UTF-8 converter: '%s'",
		           U_FAILURE (error) ? u_errorName (error) : "none");
		return;
	}

	/* Allocate UChars and offsets buffers */
	parser->utxt_size = txt_size + 1;
	parser->utxt = g_malloc (parser->utxt_size * sizeof (UChar));
	parser->offsets = g_malloc (parser->utxt_size * sizeof (gint32));

	/* last_uchar and last_utf8 will be also an output parameter! */
	last_uchar = parser->utxt;
	last_utf8 = parser->txt;

	/* Convert to UChars storing offsets */
	ucnv_toUnicode (converter,
	                &last_uchar,
	                &parser->utxt[txt_size],
	                &last_utf8,
	                &parser->txt[txt_size],
	                parser->offsets,
	                FALSE,
	                &error);
	if (U_SUCCESS (error)) {
		/* Proper UChar array size is now given by 'last_uchar' */
		parser->utxt_size = last_uchar - parser->utxt;

		/* Open word-break iterator */
		parser->bi = ubrk_open(UBRK_WORD,
		                       setlocale (LC_CTYPE, NULL),
		                       parser->utxt,
		                       parser->utxt_size,
		                       &error);
		if (U_SUCCESS (error)) {
			/* Find FIRST word in the UChar array */
			parser->cursor = ubrk_first (parser->bi);
		}
	}

	/* If any error happened, reset buffers */
	if (U_FAILURE (error)) {
		g_warning ("Error initializing libicu support: '%s'",
		           u_errorName (error));
		/* Reset buffers */
		g_free (parser->utxt);
		parser->utxt = NULL;
		g_free (parser->offsets);
		parser->offsets = NULL;
		parser->utxt_size = 0;
		if (parser->bi) {
			ubrk_close (parser->bi);
			parser->bi = NULL;
		}
	}

	/* Close converter */
	ucnv_close (converter);
}

/* Returns the next word of the text set with tracker_parser_reset().
 *
 * parser: the parser.
 * position: set to the parser's word position counter. The counter is
 *   advanced on every call that does not flag a stop word.
 * byte_offset_start / byte_offset_end: set to the byte range of the word
 *   in the original text (both 0 when no word is found).
 * stop_word: set to FALSE, then possibly to TRUE by the stop-word check.
 * word_length: set to the length in bytes of the returned word, or 0 when
 *   no word is found.
 *
 * Returns the processed word, owned by the parser and valid until the
 * next call on this parser, or NULL when there are no more words. All
 * output pointers must be non-NULL.
 */
const gchar *
tracker_parser_next (TrackerParser *parser,
                     gint          *position,
                     gint          *byte_offset_start,
                     gint          *byte_offset_end,
                     gboolean      *stop_word,
                     gint          *word_length)
{
	const gchar  *str;
	gint byte_start = 0, byte_end = 0;

	g_return_val_if_fail (parser != NULL, NULL);

	str = NULL;

	/* Drop the previous word together with its length, so that a stale
	 * length is not reported once the text is exhausted */
	g_free (parser->word);
	parser->word = NULL;
	parser->word_length = 0;

	*stop_word = FALSE;

	if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
		str = parser->word;
	}

	if (!*stop_word) {
		parser->word_position++;
	}

	*word_length = parser->word_length;
	*position = parser->word_position;
	*byte_offset_start = byte_start;
	*byte_offset_end = byte_end;

	return str;
}

/* Creates an ICU collator for the current LC_COLLATE locale, falling back
 * to the "root" collator if that fails.
 *
 * Returns the collator (release it with tracker_collation_shutdown()), or
 * NULL if neither collator could be created.
 */
gpointer
tracker_collation_init (void)
{
	UErrorCode status = U_ZERO_ERROR;
	const gchar *locale = setlocale (LC_COLLATE, NULL);
	UCollator *collator = ucol_open (locale, &status);

	if (collator)
		return collator;

	g_warning ("[ICU collation] Collator for locale '%s' cannot be created: %s",
	           locale, u_errorName (status));

	/* Try to get UCA collator then... */
	status = U_ZERO_ERROR;
	collator = ucol_open ("root", &status);
	if (!collator)
		g_critical ("[ICU collation] UCA Collator cannot be created: %s",
		            u_errorName (status));

	return collator;
}

/* Closes a collator created by tracker_collation_init(); NULL is ignored. */
void
tracker_collation_shutdown (gpointer collator)
{
	if (collator == NULL)
		return;

	ucol_close ((UCollator *) collator);
}

/* Compares two UTF-8 strings with the given ICU collator.
 *
 * collator: collator created by tracker_collation_init(); must not be NULL.
 * len1 / str1, len2 / str2: the strings and their lengths, passed as-is to
 *   uiter_setUTF8().
 *
 * Returns 1 if str1 sorts after str2, -1 if it sorts before (or if
 * collator is NULL), 0 otherwise. A collation error is logged as critical.
 */
gint
tracker_collation_utf8 (gpointer      collator,
                        gint          len1,
                        gconstpointer str1,
                        gint          len2,
                        gconstpointer str2)
{
	UErrorCode status = U_ZERO_ERROR;
	UCharIterator iter1;
	UCharIterator iter2;
	UCollationResult result;

	/* Collator must be created before trying to collate */
	g_return_val_if_fail (collator, -1);

	/* Setup iterators */
	uiter_setUTF8 (&iter1, str1, len1);
	uiter_setUTF8 (&iter2, str2, len2);

	result = ucol_strcollIter ((UCollator *)collator,
	                           &iter1,
	                           &iter2,
	                           &status);

	/* Only real errors are reported: ICU warning codes are also
	 * different from U_ZERO_ERROR but still mean success */
	if (U_FAILURE (status))
		g_critical ("Error collating: %s", u_errorName (status));

	if (result == UCOL_GREATER)
		return 1;
	if (result == UCOL_LESS)
		return -1;
	return 0;
}

/* Returns a lowercased copy of a UTF-16 string.
 *
 * input: the string, as UTF-16 code units.
 * len: size of input in BYTES (len / 2 code units are converted).
 * len_out: set to the byte size reserved for the conversion
 *   (len * 2 + 2) on success, or to len when the conversion failed and
 *   the input was copied verbatim instead; set to 0 if allocation fails.
 *   NOTE(review): this is not the length of the result - confirm what
 *   callers expect before relying on it.
 *
 * Returns a NUL-terminated buffer allocated with malloc(), or NULL if
 * input is NULL or the allocation fails.
 */
gunichar2 *
tracker_parser_tolower (const gunichar2 *input,
			gsize            len,
			gsize           *len_out)
{
	UChar *zOutput;
	int nOutput;
	UErrorCode status = U_ZERO_ERROR;

	g_return_val_if_fail (input, NULL);

	nOutput = len * 2 + 2;

	/* One spare UChar that ICU is not told about, so the result stays
	 * NUL-terminated even if the conversion fills the whole capacity */
	zOutput = malloc (nOutput + sizeof (UChar));
	if (!zOutput) {
		*len_out = 0;
		return NULL;
	}
	zOutput[nOutput / 2] = 0;

	u_strToLower (zOutput, nOutput / 2,
		      input, len / 2,
		      NULL, &status);

	if (!U_SUCCESS (status)) {
		/* Fall back to a plain copy. len is in bytes, so the
		 * terminator goes right after the len / 2 copied UChars. */
		memcpy (zOutput, input, len);
		zOutput[len / 2] = '\0';
		nOutput = len;
	}

	*len_out = nOutput;

	return zOutput;
}

/* Returns an uppercased copy of a UTF-16 string.
 *
 * input: the string, as UTF-16 code units.
 * len: size of input in BYTES (len / 2 code units are converted).
 * len_out: set to the byte size reserved for the conversion
 *   (len * 2 + 2) on success, or to len when the conversion failed and
 *   the input was copied verbatim instead; set to 0 if allocation fails.
 *   NOTE(review): this is not the length of the result - confirm what
 *   callers expect before relying on it.
 *
 * Returns a NUL-terminated buffer allocated with malloc(), or NULL if the
 * allocation fails.
 */
gunichar2 *
tracker_parser_toupper (const gunichar2 *input,
			gsize            len,
			gsize           *len_out)
{
	UChar *zOutput;
	int nOutput;
	UErrorCode status = U_ZERO_ERROR;

	nOutput = len * 2 + 2;

	/* One spare UChar that ICU is not told about, so the result stays
	 * NUL-terminated even if the conversion fills the whole capacity */
	zOutput = malloc (nOutput + sizeof (UChar));
	if (!zOutput) {
		*len_out = 0;
		return NULL;
	}
	zOutput[nOutput / 2] = 0;

	u_strToUpper (zOutput, nOutput / 2,
		      input, len / 2,
		      NULL, &status);

	if (!U_SUCCESS (status)) {
		/* Fall back to a plain copy. len is in bytes, so the
		 * terminator goes right after the len / 2 copied UChars. */
		memcpy (zOutput, input, len);
		zOutput[len / 2] = '\0';
		nOutput = len;
	}

	*len_out = nOutput;

	return zOutput;
}

/* Returns a case-folded copy of a UTF-16 string (default ICU folding).
 *
 * input: the string, as UTF-16 code units.
 * len: size of input in BYTES (len / 2 code units are converted).
 * len_out: set to the byte size reserved for the conversion
 *   (len * 2 + 2) on success, or to len when the conversion failed and
 *   the input was copied verbatim instead; set to 0 if allocation fails.
 *   NOTE(review): this is not the length of the result - confirm what
 *   callers expect before relying on it.
 *
 * Returns a NUL-terminated buffer allocated with malloc(), or NULL if the
 * allocation fails.
 */
gunichar2 *
tracker_parser_casefold (const gunichar2 *input,
			 gsize            len,
			 gsize           *len_out)
{
	UChar *zOutput;
	int nOutput;
	UErrorCode status = U_ZERO_ERROR;

	nOutput = len * 2 + 2;

	/* One spare UChar that ICU is not told about, so the result stays
	 * NUL-terminated even if the conversion fills the whole capacity */
	zOutput = malloc (nOutput + sizeof (UChar));
	if (!zOutput) {
		*len_out = 0;
		return NULL;
	}
	zOutput[nOutput / 2] = 0;

	u_strFoldCase (zOutput, nOutput / 2,
		       input, len / 2,
		       U_FOLD_CASE_DEFAULT, &status);

	if (!U_SUCCESS (status)){
		/* Fall back to a plain copy. len is in bytes, so the
		 * terminator goes right after the len / 2 copied UChars. */
		memcpy (zOutput, input, len);
		zOutput[len / 2] = '\0';
		nOutput = len;
	}

	*len_out = nOutput;

	return zOutput;
}

/* Normalizes a UTF-16 string with the given ICU normalizer.
 *
 * string / string_len: input and its length in gunichar2s.
 * normalizer: ICU normalizer instance to apply.
 * len_out: if not NULL, set to the result length in gunichar2s (0 on
 *   failure).
 * status: ICU status, in/out; a failure code is left there on error.
 *
 * The result buffer always holds one zeroed gunichar2 more than the
 * capacity handed to ICU, so the returned string is NUL-terminated even
 * when the normalized text fills that capacity exactly.
 *
 * Returns a newly allocated string (free with g_free()), or NULL on
 * failure.
 */
static gunichar2 *
normalize_string (const gunichar2    *string,
                  gsize               string_len, /* In gunichar2s */
                  const UNormalizer2 *normalizer,
                  gsize              *len_out,    /* In gunichar2s */
                  UErrorCode         *status)
{
	int nOutput;
	gunichar2 *zOutput;

	/* First guess: twice the input length */
	nOutput = (string_len * 2) + 1;
	zOutput = g_new0 (gunichar2, nOutput + 1);

	nOutput = unorm2_normalize (normalizer, string, string_len, zOutput, nOutput, status);

	if (*status == U_BUFFER_OVERFLOW_ERROR) {
		/* Try again after allocating enough space for the
		 * normalization; nOutput now holds the required length */
		*status = U_ZERO_ERROR;
		zOutput = g_renew (gunichar2, zOutput, nOutput + 1);
		memset (zOutput, 0, (nOutput + 1) * sizeof (gunichar2));
		nOutput = unorm2_normalize (normalizer, string, string_len, zOutput, nOutput, status);
	}

	if (!U_SUCCESS (*status)) {
		g_clear_pointer (&zOutput, g_free);
		nOutput = 0;
	}

	if (len_out)
		*len_out = nOutput;

	return zOutput;
}

/* Returns a normalized copy of a UTF-16 string.
 *
 * input: the string, as UTF-16 code units.
 * mode: one of G_NORMALIZE_NFC, G_NORMALIZE_NFD, G_NORMALIZE_NFKC or
 *   G_NORMALIZE_NFKD.
 * len: size of input in BYTES.
 * len_out: set to the length of the result in gunichar2s.
 *
 * If the normalization cannot be performed, a verbatim NUL-terminated
 * copy of the input is returned instead, with its length reported in the
 * same unit (gunichar2s) as on success.
 *
 * Returns a newly allocated string.
 */
gunichar2 *
tracker_parser_normalize (const gunichar2 *input,
                          GNormalizeMode   mode,
			  gsize            len,
			  gsize           *len_out)
{
	uint16_t *zOutput = NULL;
	gsize nOutput = 0;
	const UNormalizer2 *normalizer = NULL;
	UErrorCode status = U_ZERO_ERROR;

	switch (mode) {
	case G_NORMALIZE_NFC:
		normalizer = unorm2_getNFCInstance (&status);
		break;
	case G_NORMALIZE_NFD:
		normalizer = unorm2_getNFDInstance (&status);
		break;
	case G_NORMALIZE_NFKC:
		normalizer = unorm2_getNFKCInstance (&status);
		break;
	case G_NORMALIZE_NFKD:
		normalizer = unorm2_getNFKDInstance (&status);
		break;
	default:
		g_assert_not_reached ();
	}

	if (normalizer && U_SUCCESS (status)) {
		zOutput = normalize_string (input, len / 2,
					    normalizer,
					    &nOutput, &status);
	}

	if (!zOutput) {
		/* Normalization not possible: hand back the input as is,
		 * with room for a terminator and the length in gunichar2s */
		zOutput = g_malloc0 (len + sizeof (gunichar2));
		if (input && len > 0)
			memcpy (zOutput, input, len);
		nOutput = len / 2;
	}

	*len_out = nOutput;

	return zOutput;
}

/* Returns a copy of a UTF-16 string with combining diacritical marks
 * removed.
 *
 * The input is first NFKD-normalized, then stripped in place by
 * tracker_parser_unaccent_nfkd_string().
 *
 * input: the string, as UTF-16 code units.
 * len: size of input in BYTES.
 * len_out: set to the length of the result in gunichar2s.
 *
 * If the normalization cannot be performed, the stripping is applied to a
 * verbatim copy of the input instead.
 *
 * Returns a newly allocated string.
 */
gunichar2 *
tracker_parser_unaccent (const gunichar2 *input,
			 gsize            len,
			 gsize           *len_out)
{
	uint16_t *zOutput = NULL;
	gsize nOutput = 0;
	const UNormalizer2 *normalizer;
	UErrorCode status = U_ZERO_ERROR;

	normalizer = unorm2_getNFKDInstance (&status);

	if (U_SUCCESS (status)) {
		zOutput = normalize_string (input, len / 2,
					    normalizer,
					    &nOutput, &status);
	}

	if (!zOutput) {
		/* Normalization not possible: work on a copy of the input.
		 * The length must be set here (in gunichar2s), since the
		 * stripping below relies on it, and the copy gets room for
		 * a terminator. */
		zOutput = g_malloc0 (len + sizeof (gunichar2));
		if (input && len > 0)
			memcpy (zOutput, input, len);
		nOutput = len / 2;
	}

	/* Unaccenting is done in place */
	tracker_parser_unaccent_nfkd_string (zOutput, &nOutput);

	*len_out = nOutput;

	return zOutput;
}