/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 * Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
 *
 * This library is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library. If not, see <http://www.gnu.org/licenses/>.
 *
 * Authors: Michael Zucchi <notzed@ximian.com>
 */

#include "evolution-data-server-config.h"

#include <string.h>
#include <sys/types.h>

#include "camel-utf8.h"

/**
 * camel_utf8_putc:
 * @ptr: (inout): pointer to write the character to
 * @c: a Unicode character to write
 *
 * Encodes the 32 bit Unicode character @c as UTF-8 and stores the
 * resulting octets at the position @ptr refers to. Between 1 and 4 octets
 * are written, so the destination needs room for up to 4 octets. On
 * return, @ptr refers to the position just past the last octet written.
 **/
void
camel_utf8_putc (guchar **ptr,
                 guint32 c)
{
	guchar *out = *ptr;
	guint32 lead_marker;
	guint32 n_trailing;

	/* The magnitude of the character selects the length marker of the
	 * lead octet and the number of continuation octets that follow. */
	if (c <= 0x7f) {
		lead_marker = 0x00;
		n_trailing = 0;
	} else if (c <= 0x7ff) {
		lead_marker = 0xc0;
		n_trailing = 1;
	} else if (c <= 0xffff) {
		lead_marker = 0xe0;
		n_trailing = 2;
	} else {
		/* see unicode standard 3.0, S 3.8, max 4 octets */
		lead_marker = 0xf0;
		n_trailing = 3;
	}

	/* The lead octet carries the most significant bits ... */
	*out++ = lead_marker | (c >> (6 * n_trailing));

	/* ... and every continuation octet the next 6 bits, high to low. */
	while (n_trailing > 0) {
		n_trailing--;
		*out++ = 0x80 | ((c >> (6 * n_trailing)) & 0x3f);
	}

	*ptr = out;
}

/**
 * camel_utf8_getc:
 * @ptr: (inout): a pointer to read the character from
 *
 * Get a Unicode character from a UTF-8 stream.  @ptr will be advanced
 * to the next character position.  Invalid utf8 characters will be
 * silently skipped. The @ptr should point to a NUL terminated array.
 *
 * Returns: The next Unicode character. The @ptr will be advanced to
 *    the next character always.
 **/
guint32
camel_utf8_getc (const guchar **ptr)
{
	register guchar *p = (guchar *) * ptr;
	register guchar c, r;
	register guint32 v, m;

again:
	r = *p++;
loop:
	if (r < 0x80) {
		*ptr = p;
		v = r;
	} else if (r < 0xf8) { /* valid start char? (max 4 octets) */
		v = r;
		m = 0x7f80;	/* used to mask out the length bits */
		do {
			c = *p++;
			if ((c & 0xc0) != 0x80) {
				r = c;
				goto loop;
			}
			v = (v << 6) | (c & 0x3f);
			r <<= 1;
			m <<= 5;
		} while (r & 0x40);

		*ptr = p;

		v &= ~m;
	} else {
		goto again;
	}

	return v;
}

/**
 * camel_utf8_getc_limit:
 * @ptr: (inout): a pointer to read the character from
 * @end: upper limit for the read, must not be %NULL
 *
 * Get the next UTF-8 gchar at @ptr, and return it, advancing @ptr to
 * the next character. If @end is reached before a full UTF-8
 * character can be read, then the invalid Unicode gchar 0xffff is
 * returned as a sentinel (Unicode 3.1, section 2.7), and @ptr is not
 * advanced.
 *
 * Returns: The next UTF-8 char, or 0xffff.
 **/
guint32
camel_utf8_getc_limit (const guchar **ptr,
                       const guchar *end)
{
	register guchar *p = (guchar *) * ptr;
	register guchar c, r;
	register guint32 v = 0xffff, m;

again:
	while (p < end) {
		r = *p++;
loop:
		if (r < 0x80) {
			*ptr = p;
			return r;
		} else if (r < 0xf8) { /* valid start char? (max 4 octets) */
			v = r;
			m = 0x7f80;	/* used to mask out the length bits */
			do {
				if (p >= end)
					return 0xffff;

				c = *p++;
				if ((c & 0xc0) != 0x80) {
					r = c;
					goto loop;
				}
				v = (v << 6) | (c & 0x3f);
				r <<= 1;
				m <<= 5;
			} while (r & 0x40);

			*ptr = p;

			v &= ~m;
			return v;
		} else {
			goto again;
		}
	}

	return 0xffff;
}

/* The 64 symbols used for the base64 runs of the modified UTF-7 encoding,
 * indexed by their 6 bit value. The last two symbols are '+' and ','. */
static const gchar utf7_alphabet[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";

/* Reverse lookup for utf7_alphabet: indexed by octet value, it holds the
 * 6 bit value of that symbol, or 0xff for octets which are not part of
 * the alphabet. */
static const guchar utf7_rank[256] = {
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x3e,0x3f,0xff,0xff,0xff,
	0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,
	0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0xff,0xff,0xff,0xff,0xff,
	0xff,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,
	0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
};

/**
 * camel_utf7_utf8:
 * @ptr: a UTF-7 string to convert
 *
 * Convert a modified UTF-7 string to UTF-8.  If the UTF-7 string
 * contains 8 bit characters, they are treated as iso-8859-1.
 *
 * The IMAP rules [rfc2060] are used in the UTF-7 encoding.
 *
 * An '&' which is followed neither by '-' nor by a base64 symbol is
 * invalid; it is converted to "&-" and the offending octet is dropped.
 * This includes an '&' at the very end of the string.
 *
 * Returns: (transfer full): The converted string. Free it with g_free(),
 *    when no longer needed. The result is whatever g_utf16_to_utf8()
 *    yields for the decoded UTF-16 text, which is %NULL when that text
 *    cannot be converted.
 **/
gchar *
camel_utf7_utf8 (const gchar *ptr)
{
	const guchar *p = (const guchar *) ptr;
	guint c;
	guint32 v = 0, x;
	gint i = 0;
	gint state = 0;
	gchar *ret;
	gunichar2 *utf16, *pos;
	gsize block_size;

	g_return_val_if_fail (ptr != NULL, NULL);

	/* One UTF-16 unit per input octet (terminating NUL included) is
	 * always enough: an '&' emits nothing by itself, so even the invalid
	 * case below, which emits two units, stays within this budget. */
	block_size = sizeof (gunichar2) * (1 + strlen (ptr));
	utf16 = g_slice_alloc (block_size);
	pos = utf16;

	/* state 0: literal text, state 1: just after '&',
	 * state 2: inside a base64 run. The terminating NUL is run through
	 * the state machine as well. */
	do {
		c = *p++;
		switch (state) {
		case 0:
			if (c == '&') {
				state = 1;
			} else {
				*pos = c;
				pos++;
			}
			break;
		case 1:
			if (c == '-') {
				/* "&-" is the escape for a literal '&' */
				*pos = '&';
				pos++;
				state = 0;
			} else if (utf7_rank[c] != 0xff) {
				v = utf7_rank[c];
				i = 6;
				state = 2;
			} else {
				/* invalid; note that when c is the terminating
				 * NUL, no terminator gets stored here */
				*pos = '&';
				pos++;
				*pos = '-';
				pos++;
				state = 0;
			}
			break;
		case 2:
			if (c == '-') {
				state = 0;
			} else if (utf7_rank[c] != 0xff) {
				/* collect 6 bits per symbol and emit a UTF-16
				 * unit whenever 16 bits are available */
				v = (v << 6) | utf7_rank[c];
				i+=6;
				if (i >= 16) {
					x = (v >> (i - 16)) & 0xffff;
					*pos = x;
					pos++;
					i-=16;
				}
			} else {
				*pos = c;
				pos++;
				state = 0;
			}
			break;
		}
	} while (c);

	/* The buffer is not necessarily NUL-terminated (see the invalid case
	 * above), thus limit the conversion to the units really written
	 * instead of letting it search for a terminator past the buffer. */
	ret = g_utf16_to_utf8 (utf16, pos - utf16, NULL, NULL, NULL);

	g_slice_free1 (block_size, utf16);

	return ret;
}

/* Finishes a base64 run in @out: the @i bits still pending in the low end
 * of @v, if any, are left-aligned into one last symbol, then the closing
 * '-' is appended. */
static void utf7_closeb64 (GString *out, guint32 v, guint32 i)
{
	if (i != 0) {
		guint32 last_symbol = (v << (6 - i)) & 0x3f;

		g_string_append_c (out, utf7_alphabet[last_symbol]);
	}

	g_string_append_c (out, '-');
}

/**
 * camel_utf8_utf7:
 * @ptr: a UTF-8 string to convert
 *
 * Convert a UTF-8 string to a modified UTF-7 format.
 *
 * The IMAP rules [rfc2060] are used in the UTF-7 encoding.
 *
 * Returns: (transfer full): The converted string. Free it with g_free(),
 *    when no longer needed.
 **/
gchar *
camel_utf8_utf7 (const gchar *ptr)
{
	gunichar2 *utf16, *up;
	const guchar *cp = (guchar *) ptr;
	guint c;
	guint32 x, v = 0;
	gint state = 0;
	GString *out;
	gint i = 0;
	gchar *ret;

	g_return_val_if_fail (ptr != NULL, NULL);

	utf16 = g_utf8_to_utf16 (ptr, -1, NULL, NULL, NULL);
	up = utf16;

	out = g_string_new ("");

	while ((c = utf16 ? *up : camel_utf8_getc (&cp))) {
		if (utf16)
			up++;

		if (c >= 0x20 && c <= 0x7e) {
			if (state == 1) {
				utf7_closeb64 (out, v, i);
				state = 0;
				i = 0;
			}
			if (c == '&')
				g_string_append (out, "&-");
			else
				g_string_append_c (out, c);
		} else {
			if (state == 0) {
				g_string_append_c (out, '&');
				state = 1;
				v = 0;
			}
			v = (v << 16) | c;
			i += 16;
			while (i >= 6) {
				x = (v >> (i - 6)) & 0x3f;
				g_string_append_c (out, utf7_alphabet[x]);
				i -= 6;
			}
		}
	}

	if (state == 1)
		utf7_closeb64 (out, v, i);

	ret = g_strdup (out->str);
	g_string_free (out, TRUE);
	g_free (utf16);

	return ret;
}

/**
 * camel_utf8_ucs2:
 * @ptr: a UTF-8 string to convert
 *
 * Convert a UTF-8 string into a ucs2 one. The ucs string will be in
 * network byte order, and terminated with a 16-bit %NULL.
 *
 * Returns: (transfer full): The converted string. Free it with g_free(),
 *    when no longer needed.
 **/
gchar *
camel_utf8_ucs2 (const gchar *ptr)
{
	GByteArray *work = g_byte_array_new ();
	guint32 c;
	gchar *out;
	const guchar *uptr = (const guchar *) ptr;

	/* what if c is > 0xffff ? */

	while ((c = camel_utf8_getc (&uptr))) {
		guint16 s = g_htons (c);

		g_byte_array_append (work, (guchar *) &s, 2);
	}

	g_byte_array_append (work, (guchar *) "\000\000", 2);
	out = g_malloc (work->len);
	memcpy (out, work->data, work->len);
	g_byte_array_free (work, TRUE);

	return out;
}

/**
 * camel_ucs2_utf8:
 * @ptr: a ucs2 string to convert
 *
 * Convert a ucs2 string into a UTF-8 one. The ucs2 string is treated
 * as network byte ordered, and terminated with a 16-bit %NULL.
 *
 * Returns: (transfer full): The converted string. Free it with g_free(),
 *    when no longer needed.
 **/
gchar *
camel_ucs2_utf8 (const gchar *ptr)
{
	guint16 *ucs = (guint16 *) ptr;
	guint32 c;
	GString *work = g_string_new ("");
	gchar *out;

	while ((c = *ucs++))
		g_string_append_unichar (work, g_ntohs (c));

	out = g_strdup (work->str);
	g_string_free (work, TRUE);

	return out;
}

/**
 * camel_utf8_make_valid:
 * @text: a text to make valid
 *
 * Ensures the returned text will be valid UTF-8 string, with incorrect letters
 * changed to question marks.
 *
 * Returns: (transfer full): Valid UTF-8 string, with replaced incorrect letters.
 *    Free it with g_free(), when no longer needed.
 *
 * Since: 2.26
 **/
gchar *
camel_utf8_make_valid (const gchar *text)
{
	return camel_utf8_make_valid_len (text, -1);
}

/**
 * camel_utf8_make_valid_len:
 * @text: a text to make valid
 * @text_len: length of the @text, or -1 if NUL-terminated
 *
 * Ensures the returned text will be valid UTF-8 string, with each
 * incorrect byte replaced by U+FFFD (REPLACEMENT CHARACTER). At most
 * @text_len bytes of the @text are read; a zero @text_len results
 * in an empty string.
 *
 * Returns: (transfer full): Valid UTF-8 string, with replaced incorrect letters,
 *    or %NULL when @text is %NULL. Free it with g_free(), when no longer needed.
 *
 * Since: 3.34
 **/
gchar *
camel_utf8_make_valid_len (const gchar *text,
			   gssize text_len)
{
	/* almost identical copy of glib's _g_utf8_make_valid() */
	GString *string;
	const gchar *remainder, *invalid;
	gssize remaining_bytes, valid_bytes;

	if (!text)
		return NULL;

	if (text_len < 0)
		text_len = strlen (text);

	/* Honor an explicit zero length instead of duplicating the whole
	 * @text, which might not even be NUL-terminated. */
	if (text_len == 0 || !*text)
		return g_strdup ("");

	string = NULL;
	remainder = text;
	remaining_bytes = text_len;

	while (remaining_bytes != 0) {
		if (g_utf8_validate (remainder, remaining_bytes, &invalid))
			break;

		valid_bytes = invalid - remainder;

		/* allocated lazily, a valid @text needs no GString at all */
		if (!string)
			string = g_string_sized_new (remaining_bytes);

		g_string_append_len (string, remainder, valid_bytes);
		/* append U+FFFD REPLACEMENT CHARACTER */
		g_string_append (string, "\357\277\275");

		/* skip the single offending byte and validate the rest */
		remaining_bytes -= valid_bytes + 1;
		remainder = invalid + 1;
	}

	if (!string)
		return g_strndup (text, text_len);

	if (remaining_bytes > 0)
		g_string_append_len (string, remainder, remaining_bytes);

	g_warn_if_fail (g_utf8_validate (string->str, -1, NULL));

	return g_string_free (string, FALSE);
}