Import parser construction utility library

svn path=/trunk/libparserutils/; revision=4111
author: John Mark Bell <jmb@netsurf-browser.org> 2008-05-01 16:34:46 +0000
committer: John Mark Bell <jmb@netsurf-browser.org> 2008-05-01 16:34:46 +0000
commit: 2777a04ed2ba4fd36138b991d66a32a283361f7e (patch)
tree: b0c3730533c36ca41402b6d0c5b98413f0a57bee /src/charset
download: libparserutils-2777a04ed2ba4fd36138b991d66a32a283361f7e.tar.gz
15 files changed, 3424 insertions, 0 deletions
diff --git a/src/charset/Makefile b/src/charset/Makefile
new file mode 100644
index 0000000..fc34d7c
--- /dev/null
+++ b/src/charset/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack ($(sp) gains one ".x" per level)
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Manipulate include paths: add this directory to the header search path
+override CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources in this directory
+SRCS_$(d) := aliases.c charset.c codec.c
+
+# Append to sources for component, prefixing each name with this directory
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (a Makefile in each immediate subdirectory)
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack ($(basename) strips the last ".x")
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/aliases.c b/src/charset/aliases.c
new file mode 100644
index 0000000..1e7e6ea
--- /dev/null
+++ b/src/charset/aliases.c
@@ -0,0 +1,410 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+struct alias {	/* One alias name, chained within a bucket of alias_tab */
+	struct alias *next;	/**< Next alias in the same hash chain */
+	parserutils_charset_aliases_canon *canon;	/**< Canonical form named */
+	uint16_t name_len;	/**< Length of name, excluding terminator */
+	char name[1];	/**< NUL-terminated name; storage is over-allocated */
+};
+
+#define HASH_SIZE (43)	/* Number of buckets in each of the tables below */
+static parserutils_charset_aliases_canon *canon_tab[HASH_SIZE];	/* canons */
+static struct alias *alias_tab[HASH_SIZE];	/* aliases, hashed by own name */
+
+static parserutils_error parserutils_charset_create_alias(const char *alias,
+		parserutils_charset_aliases_canon *c, 
+		parserutils_alloc alloc, void *pw);
+static parserutils_charset_aliases_canon *parserutils_charset_create_canon(
+		const char *canon, uint16_t mibenum, 
+		parserutils_alloc alloc, void *pw);
+static uint32_t parserutils_charset_hash_val(const char *alias, size_t len);
+
+/**
+ * Create alias data from Aliases file
+ *
+ * Lines hold a canonical name, its MIB enum and any aliases, separated by
+ * whitespace; lines starting with '#' and malformed lines are skipped.
+ *
+ * \param filename  The path to the Aliases file
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success, appropriate error otherwise.
+ */
+parserutils_error parserutils_charset_aliases_create(const char *filename,
+		parserutils_alloc alloc, void *pw)
+{
+	char buf[300];
+	FILE *fp;
+
+	if (filename == NULL || alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	fp = fopen(filename, "r");
+	if (fp == NULL)
+		return PARSERUTILS_FILENOTFOUND;
+
+	while (fgets(buf, sizeof buf, fp)) {
+		char *p, *aliases = 0, *mib, *end;
+		parserutils_charset_aliases_canon *cf;
+
+		if (buf[0] == 0 || buf[0] == '#')
+			continue; /* skip blank lines or comments */
+
+		/* Lose the newline, if present: the file's last line (or an
+		 * over-long one) may lack it, and must not be truncated. */
+		buf[strcspn(buf, "\n")] = 0;
+		end = buf + strlen(buf);
+
+		/* find end of canonical form (ctype wants unsigned char) */
+		for (p = buf; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		*p++ = '\0'; /* terminate canonical form */
+
+		/* skip whitespace */
+		for (; *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		mib = p;
+
+		/* find end of mibenum */
+		for (; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p < end)
+			*p++ = '\0'; /* terminate mibenum */
+
+		cf = parserutils_charset_create_canon(buf, atoi(mib), alloc, pw);
+		if (cf == NULL)
+			continue;
+
+		/* skip whitespace */
+		for (; p < end && *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		aliases = p;
+
+		while (p < end) {
+			/* find end of alias */
+			for (; *p && !isspace((unsigned char) *p) &&
+					!iscntrl((unsigned char) *p); p++)
+				; /* do nothing */
+			if (p > end)
+				break; /* stop if we've gone past the end */
+			*p++ = '\0'; /* terminate current alias */
+
+			if (parserutils_charset_create_alias(aliases, cf,
+					alloc, pw) != PARSERUTILS_OK)
+				break;
+
+			/* terminating may have advanced us past the end */
+			if (p >= end)
+				break;
+
+			/* skip whitespace */
+			for (; *p && isspace((unsigned char) *p); p++)
+				; /* do nothing */
+			if (p >= end)
+				break; /* gone past end => stop */
+			aliases = p; /* update pointer to current alias */
+		}
+	}
+
+	fclose(fp);
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Release every canonical form and alias held in the hash tables
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data
+ */
+void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw)
+{
+	int bucket;
+
+	for (bucket = 0; bucket < HASH_SIZE; bucket++) {
+		/* Unlink each entry from its chain, then release it */
+		while (canon_tab[bucket] != NULL) {
+			parserutils_charset_aliases_canon *dead =
+					canon_tab[bucket];
+			canon_tab[bucket] = dead->next;
+			alloc(dead, 0, pw);
+		}
+
+		while (alias_tab[bucket] != NULL) {
+			struct alias *dead = alias_tab[bucket];
+			alias_tab[bucket] = dead->next;
+			alloc(dead, 0, pw);
+		}
+	}
+}
+
+/**
+ * Look up the MIB enum value registered for an encoding name
+ *
+ * \param alias  The encoding name (canonical or alias) to look up
+ * \param len    The length of the name
+ * \return The MIB enum value, or 0 if the name is NULL or not found
+ */
+uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
+{
+	parserutils_charset_aliases_canon *canon = NULL;
+
+	/* A NULL name never matches anything */
+	if (alias != NULL)
+		canon = parserutils_charset_alias_canonicalise(alias, len);
+
+	if (canon != NULL)
+		return canon->mib_enum;
+
+	return 0;
+}
+
+/**
+ * Find the canonical encoding name for a MIB enum (scans every hash chain)
+ *
+ * \param mibenum The MIB enum value
+ * \return Pointer to canonical name, or NULL if not found
+ */
+const char *parserutils_charset_mibenum_to_name(uint16_t mibenum)
+{
+	int bucket;
+	for (bucket = 0; bucket < HASH_SIZE; bucket++) {
+		parserutils_charset_aliases_canon *entry = canon_tab[bucket];
+		while (entry != NULL && entry->mib_enum != mibenum)
+			entry = entry->next;
+		if (entry != NULL)
+			return entry->name;
+	}
+	return NULL;
+}
+
+/**
+ * Detect if a charset, identified by its MIB enum, is a Unicode encoding
+ *
+ * \param mibenum  The MIB enum to consider
+ * \return true if a Unicode variant, false otherwise
+ */
+bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum)
+{
+	static uint16_t ucs4;
+	static uint16_t ucs2;
+	static uint16_t utf8;
+	static uint16_t utf16;
+	static uint16_t utf16be;
+	static uint16_t utf16le;
+	static uint16_t utf32;
+	static uint16_t utf32be;
+	static uint16_t utf32le;
+	/* Resolve the names once; retried on each call while "UCS-4" is 0 */
+	if (ucs4 == 0) {
+		ucs4 = parserutils_charset_mibenum_from_name("UCS-4", 
+				SLEN("UCS-4"));
+		ucs2 = parserutils_charset_mibenum_from_name("UCS-2", 
+				SLEN("UCS-2"));
+		utf8 = parserutils_charset_mibenum_from_name("UTF-8", 
+				SLEN("UTF-8"));
+		utf16 = parserutils_charset_mibenum_from_name("UTF-16", 
+				SLEN("UTF-16"));
+		utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE"));
+		utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE"));
+		utf32 = parserutils_charset_mibenum_from_name("UTF-32", 
+				SLEN("UTF-32"));
+		utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"));
+		utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE"));
+	}
+	/* NOTE: a name that was not found holds 0, which a mibenum of 0 equals */
+	return (mibenum == ucs4 || mibenum == ucs2 || mibenum == utf8 ||
+			mibenum == utf16 || mibenum == utf16be || 
+			mibenum == utf16le || mibenum == utf32 ||
+			mibenum == utf32be || mibenum == utf32le);
+}
+
+/**
+ * Map an encoding name onto its canonical form
+ *
+ * \param alias  The name to resolve (a canonical name matches itself)
+ * \param len    The length of the name
+ * \return Pointer to canonical form or NULL if not found
+ */
+parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise(
+		const char *alias, size_t len)
+{
+	parserutils_charset_aliases_canon *canon;
+	struct alias *entry;
+	uint32_t bucket;
+
+	if (alias == NULL)
+		return NULL;
+
+	/* Both tables are indexed by the same hash of the name */
+	bucket = parserutils_charset_hash_val(alias, len);
+
+	/* Canonical names are tried before aliases */
+	for (canon = canon_tab[bucket]; canon != NULL; canon = canon->next) {
+		if (canon->name_len == len &&
+				strncasecmp(canon->name, alias, len) == 0)
+			return canon;
+	}
+
+	for (entry = alias_tab[bucket]; entry != NULL; entry = entry->next) {
+		if (entry->name_len == len &&
+				strncasecmp(entry->name, alias, len) == 0)
+			return entry->canon;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * Register an alias for a canonical form
+ *
+ * \param alias  The alias name
+ * \param c      The canonical form
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_create_alias(const char *alias,
+		parserutils_charset_aliases_canon *c,
+		parserutils_alloc alloc, void *pw)
+{
+	struct alias *entry;
+	uint32_t bucket;
+	size_t len;
+
+	if (alias == NULL || c == NULL || alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Structure, name and terminator (name[1] makes this generous) */
+	len = strlen(alias);
+	entry = alloc(NULL, sizeof(struct alias) + len + 1, pw);
+	if (entry == NULL)
+		return PARSERUTILS_NOMEM;
+	entry->canon = c;
+	entry->name_len = len;
+	memcpy(entry->name, alias, len + 1);
+
+	/* Push onto the front of the chain for this name's hash bucket */
+	bucket = parserutils_charset_hash_val(alias, entry->name_len);
+	entry->next = alias_tab[bucket];
+	alias_tab[bucket] = entry;
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Register a canonical form
+ *
+ * \param canon    The canonical name
+ * \param mibenum  The MIB enum value
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to canonical form or NULL on error
+ */
+parserutils_charset_aliases_canon *parserutils_charset_create_canon(
+		const char *canon, uint16_t mibenum,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_charset_aliases_canon *entry;
+	uint32_t bucket, len;
+
+	if (canon == NULL || alloc == NULL)
+		return NULL;
+
+	/* Structure, name and terminator (name[1] makes this generous) */
+	len = strlen(canon);
+	entry = alloc(NULL,
+			sizeof(parserutils_charset_aliases_canon) + len + 1, pw);
+	if (entry == NULL)
+		return NULL;
+
+	entry->mib_enum = mibenum;
+	entry->name_len = len;
+	memcpy(entry->name, canon, (size_t) len + 1);
+
+	/* Push onto the front of the chain for this name's hash bucket */
+	bucket = parserutils_charset_hash_val(canon, len);
+	entry->next = canon_tab[bucket];
+	canon_tab[bucket] = entry;
+
+	return entry;
+}
+
+/**
+ * Case-insensitive hash of a name, reduced to a hash table index
+ *
+ * \param alias  String to hash (NULL hashes to 0)
+ * \param len    Number of bytes of the string to hash
+ * \return The hashed value, always less than HASH_SIZE
+ */
+uint32_t parserutils_charset_hash_val(const char *alias, size_t len)
+{
+	uint32_t hash = 5381;
+	size_t i;
+
+	if (alias == NULL)
+		return 0;
+
+	for (i = 0; i < len; i++)
+		hash = (hash * 33) ^ (alias[i] & ~0x20); /* case insensitive */
+	return hash % HASH_SIZE;
+}
+
+
+#ifndef NDEBUG
+/**
+ * Dump all alias data to stdout, followed by an estimate of its size in bytes
+ */
+void parserutils_charset_aliases_dump(void)
+{
+	parserutils_charset_aliases_canon *c;
+	struct alias *a;
+	int i;
+	size_t size = 0;
+
+	for (i = 0; i != HASH_SIZE; i++) {
+		for (c = canon_tab[i]; c; c = c->next) {
+			printf("%d %s\n", i, c->name);
+			size += offsetof(parserutils_charset_aliases_canon,
+					name) + c->name_len;
+		}
+
+		for (a = alias_tab[i]; a; a = a->next) {
+			printf("%d %s\n", i, a->name);
+			size += offsetof(struct alias, name) + a->name_len;
+		}
+	}
+
+	/* The tables cost their size in bytes, not their bucket count */
+	size += sizeof(canon_tab) + sizeof(alias_tab);
+
+	printf("%u\n", (unsigned int) size);
+}
+#endif
diff --git a/src/charset/aliases.h b/src/charset/aliases.h
new file mode 100644
index 0000000..9abd2c8
--- /dev/null
+++ b/src/charset/aliases.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_aliases_h_
+#define parserutils_charset_aliases_h_
+
+#include <inttypes.h>
+
+#include <parserutils/charset/mibenum.h>
+
+typedef struct parserutils_charset_aliases_canon {
+	struct parserutils_charset_aliases_canon *next;	/**< Next in hash chain */
+	uint16_t mib_enum;	/**< MIB enum value of this charset */
+	uint16_t name_len;	/**< Length of name, excluding terminator */
+	char name[1];	/**< NUL-terminated name; storage is over-allocated */
+} parserutils_charset_aliases_canon;
+
+/* Load encoding aliases from file */
+parserutils_error parserutils_charset_aliases_create(const char *filename,
+		parserutils_alloc alloc, void *pw);
+/* Destroy encoding aliases */
+void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw);
+
+/* Canonicalise an alias name */
+parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise(
+		const char *alias, size_t len);
+
+#ifndef NDEBUG
+void parserutils_charset_aliases_dump(void);
+#endif
+
+#endif
diff --git a/src/charset/charset.c b/src/charset/charset.c
new file mode 100644
index 0000000..3ef1a71
--- /dev/null
+++ b/src/charset/charset.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include "charset/aliases.h"
+#include "charset/charset.h"
+
+/**
+ * Initialise the Charset library for use.
+ *
+ * This _must_ be called before using any libparserutils charset functions;
+ * it loads the encoding alias data from the given file.
+ *
+ * \param aliases_file  Pointer to name of file containing encoding alias data
+ * \param alloc         Pointer to (de)allocation function
+ * \param pw            Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success, applicable error otherwise.
+ */
+parserutils_error parserutils_charset_initialise(const char *aliases_file,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_error status = PARSERUTILS_BADPARM;
+
+	/* Loading the alias data is the only set-up performed, so its
+	 * outcome is reported to the caller unchanged */
+	if (aliases_file != NULL && alloc != NULL)
+		status = parserutils_charset_aliases_create(aliases_file,
+				alloc, pw);
+
+	return status;
+}
+
+/**
+ * Clean up after the Charset library, releasing all alias data
+ *
+ * \param alloc  Pointer to (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success, applicable error otherwise.
+ */
+parserutils_error parserutils_charset_finalise(parserutils_alloc alloc,
+		void *pw)
+{
+	/* Without an allocation function nothing can be released */
+	if (alloc != NULL) {
+		parserutils_charset_aliases_destroy(alloc, pw);
+		return PARSERUTILS_OK;
+	}
+	return PARSERUTILS_BADPARM;
+}
+
+
diff --git a/src/charset/charset.h b/src/charset/charset.h
new file mode 100644
index 0000000..4b07577
--- /dev/null
+++ b/src/charset/charset.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_charset_h_
+#define parserutils_charset_charset_h_
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+
+/* Initialise the Charset library for use */
+parserutils_error parserutils_charset_initialise(const char *aliases_file,
+		parserutils_alloc alloc, void *pw);
+
+/* Clean up after Charset */
+parserutils_error parserutils_charset_finalise(parserutils_alloc alloc, 
+		void *pw);
+
+#endif
+
diff --git a/src/charset/codec.c b/src/charset/codec.c
new file mode 100644
index 0000000..5c3fb3a
--- /dev/null
+++ b/src/charset/codec.c
@@ -0,0 +1,185 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "charset/codecs/codec_impl.h"
+
+#ifdef WITH_ICONV_CODEC
+extern parserutils_charset_handler iconv_codec_handler;
+#endif
+
+extern parserutils_charset_handler charset_utf8_codec_handler;
+extern parserutils_charset_handler charset_utf16_codec_handler;
+
+static parserutils_charset_handler *handler_table[] = { /* searched in order */
+	&charset_utf8_codec_handler,
+	&charset_utf16_codec_handler,
+#ifdef WITH_ICONV_CODEC
+	&iconv_codec_handler,	/* only present when WITH_ICONV_CODEC is set */
+#endif
+	NULL,	/* sentinel: ends the handler search in codec_create */
+};
+
+/**
+ * Create a charset codec
+ *
+ * The name is canonicalised and then offered to each registered handler in
+ * turn; the first handler to accept it creates the codec.
+ *
+ * \param charset  Target charset
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec instance, or NULL on failure
+ */
+parserutils_charset_codec *parserutils_charset_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	const parserutils_charset_aliases_canon *canon;
+	parserutils_charset_codec *codec = NULL;
+	size_t i;
+
+	if (charset == NULL || alloc == NULL)
+		return NULL;
+
+	/* Handlers are only ever shown canonical names */
+	canon = parserutils_charset_alias_canonicalise(charset,
+			strlen(charset));
+	if (canon == NULL)
+		return NULL;
+
+	/* Stop at the first handler class claiming the charset */
+	for (i = 0; handler_table[i] != NULL; i++) {
+		if (handler_table[i]->handles_charset(canon->name)) {
+			codec = handler_table[i]->create(canon->name,
+					alloc, pw);
+			break;
+		}
+	}
+
+	/* Either nothing handles it, or instantiation failed */
+	if (codec == NULL)
+		return NULL;
+
+	/* Fill in the state common to all codecs */
+	codec->mibenum = canon->mib_enum;
+
+	codec->errormode = PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+	codec->alloc = alloc;
+	codec->alloc_pw = pw;
+
+	return codec;
+}
+
+/**
+ * Destroy a charset codec
+ *
+ * \param codec  The codec to destroy (NULL is ignored)
+ */
+void parserutils_charset_codec_destroy(parserutils_charset_codec *codec)
+{
+	if (codec != NULL) {
+		/* Handler-specific teardown first, then the codec itself,
+		 * released through the allocator stored in it */
+		codec->handler.destroy(codec);
+		codec->alloc(codec, 0, codec->alloc_pw);
+	}
+}
+
+/**
+ * Configure a charset codec
+ *
+ * \param codec   The codec to configure
+ * \param type    The codec option type to configure
+ * \param params  Option-specific parameters
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM if codec or
+ *         params is NULL
+ */
+parserutils_error parserutils_charset_codec_setopt(
+		parserutils_charset_codec *codec,
+		parserutils_charset_codec_opttype type,
+		parserutils_charset_codec_optparams *params)
+{
+	if (codec == NULL || params == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Error mode is the only option handled here; any other type is
+	 * ignored and still reported as success */
+	if (type == PARSERUTILS_CHARSET_CODEC_ERROR_MODE)
+		codec->errormode = params->error_mode.mode;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Encode a chunk of UCS4 data into a codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success, appropriate error otherwise.
+ *         source, sourcelen, dest and destlen are updated on exit.
+ */
+parserutils_error parserutils_charset_codec_encode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL || source == NULL || sourcelen == NULL ||
+			dest == NULL || destlen == NULL)
+		return PARSERUTILS_BADPARM;
+	if (*source == NULL || *dest == NULL)
+		return PARSERUTILS_BADPARM;
+
+	return codec->handler.encode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Decode a chunk of data in a codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success, appropriate error otherwise.
+ *         source, sourcelen, dest and destlen are updated on exit.
+ *
+ * Call this with a source length of 0 to flush any buffers.
+ */
+parserutils_error parserutils_charset_codec_decode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL || source == NULL || sourcelen == NULL ||
+			dest == NULL || destlen == NULL)
+		return PARSERUTILS_BADPARM;
+	if (*source == NULL || *dest == NULL)
+		return PARSERUTILS_BADPARM;
+
+	return codec->handler.decode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Clear a charset codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_codec_reset(
+		parserutils_charset_codec *codec)
+{
+	if (codec != NULL)
+		return codec->handler.reset(codec);
+
+	/* No codec, so nothing to reset */
+	return PARSERUTILS_BADPARM;
+}
+
diff --git a/src/charset/codecs/Makefile b/src/charset/codecs/Makefile
new file mode 100644
index 0000000..6d3b78e
--- /dev/null
+++ b/src/charset/codecs/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack ($(sp) gains one ".x" per level)
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Sources in this directory
+SRCS_$(d) := codec_iconv.c codec_utf8.c codec_utf16.c
+
+# Append to sources for component, prefixing each name with this directory
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (a Makefile in each immediate subdirectory)
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack ($(basename) strips the last ".x")
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/codecs/codec_iconv.c b/src/charset/codecs/codec_iconv.c
new file mode 100644
index 0000000..bbe8bc4
--- /dev/null
+++ b/src/charset/codecs/codec_iconv.c
@@ -0,0 +1,683 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/* This codec is hideously slow. Only use it as a last resort */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* We put this here rather than at the top as GCC complains 
+ * about the source file being empty otherwise. */
+#ifdef WITH_ICONV_CODEC
+
+#include <iconv.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "utils/utils.h"
+
+/**
+ * Iconv-based charset codec: converts between a charset and UCS-4
+ */
+typedef struct iconv_codec {
+	parserutils_charset_codec base;	/**< Base class (cast to/from this) */
+
+	iconv_t read_cd;		/**< Iconv handle for reading (=> UCS-4) */
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences */
+	size_t inval_len;		/**< Number of bytes in inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 */
+	size_t read_len;		/**< Number of characters in
+					 * read_buf */
+
+	iconv_t write_cd;		/**< Iconv handle for writing (UCS-4 =>) */
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 */
+	size_t write_len;		/**< Number of characters in
+					 * write_buf */
+} iconv_codec;
+
+
+static bool iconv_codec_handles_charset(const char *charset);
+static parserutils_charset_codec *iconv_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw);
+static void iconv_codec_destroy (parserutils_charset_codec *codec);
+static parserutils_error iconv_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error iconv_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error iconv_codec_reset(parserutils_charset_codec *codec);
+static parserutils_error iconv_codec_output_decoded_char(
+		iconv_codec *c, uint32_t ucs4, uint8_t **dest,
+		size_t *destlen);
+static parserutils_error iconv_codec_read_char(iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error iconv_codec_write_char(iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool iconv_codec_handles_charset(const char *charset)
+{
+	iconv_t probe;
+
+	/* The charset is handled if iconv can convert it to UCS-4 */
+	probe = iconv_open("UCS-4", charset);
+	if (probe == (iconv_t) -1)
+		return false;
+
+	/* The descriptor was only needed for the yes/no answer */
+	iconv_close(probe);
+
+	return true;
+}
+
+/**
+ * Create an iconv-based codec
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+parserutils_charset_codec *iconv_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	iconv_codec *c = alloc(NULL, sizeof(iconv_codec), pw);
+
+	if (c == NULL)
+		return NULL;
+
+	/* One descriptor per direction: charset => UCS-4 and back again */
+	c->read_cd = iconv_open("UCS-4", charset);
+	if (c->read_cd == (iconv_t) -1)
+		goto no_read_cd;
+
+	c->write_cd = iconv_open(charset, "UCS-4");
+	if (c->write_cd == (iconv_t) -1)
+		goto no_write_cd;
+
+	/* All three staging buffers start out empty */
+	c->inval_len = 0;
+	c->inval_buf[0] = '\0';
+	c->read_len = 0;
+	c->read_buf[0] = 0;
+	c->write_len = 0;
+	c->write_buf[0] = 0;
+
+	/* Finally, populate vtable */
+	c->base.handler.destroy = iconv_codec_destroy;
+	c->base.handler.encode = iconv_codec_encode;
+	c->base.handler.decode = iconv_codec_decode;
+	c->base.handler.reset = iconv_codec_reset;
+
+	return (parserutils_charset_codec *) c;
+
+no_write_cd:
+	iconv_close(c->read_cd);
+no_read_cd:
+	alloc(c, 0, pw);
+	return NULL;
+}
+
+/**
+ * Destroy an iconv-based codec
+ *
+ * \param codec  The codec to destroy
+ */
+void iconv_codec_destroy (parserutils_charset_codec *codec)
+{
+	iconv_codec *self = (iconv_codec *) codec;
+
+	/* Only the iconv descriptors are released here; the codec
+	 * structure itself is not freed by this function */
+	iconv_close(self->read_cd);
+	iconv_close(self->write_cd);
+}
+
+/**
+ * Encode a chunk of UCS4 data into an iconv-based codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                             codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * ::sourcelen will be reduced appropriately on exit; input is consumed in
+ * whole 4-byte characters, so up to 3 trailing bytes may be left unread.
+ *
+ * ::dest will point immediately _after_ the last character written and
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error iconv_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	iconv_codec *c = (iconv_codec *) codec;
+	uint32_t ucs4;
+	const uint32_t *towrite;
+	size_t towritelen;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			error = iconv_codec_write_char(c, pwrite[0],
+					dest, destlen);
+			if (error != PARSERUTILS_OK) {
+				/* Drop the invalid character, if present,
+				 * so it is not reprocessed, then move what
+				 * is still outstanding to the start of the
+				 * buffer, where the next call looks for it */
+				if (error == PARSERUTILS_INVALID) {
+					pwrite++;
+					c->write_len--;
+				}
+				for (ucs4 = 0; ucs4 < c->write_len; ucs4++)
+					c->write_buf[ucs4] = pwrite[ucs4];
+
+				return error;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call; a partial trailing
+	 * character (fewer than 4 bytes) is left unread, not overrun */
+	while (*sourcelen >= 4) {
+		towrite = (const uint32_t *) (const void *) *source;
+		towritelen = 1;
+
+		/* Output current character(s) */
+		while (towritelen > 0) {
+			error = iconv_codec_write_char(c, towrite[0],
+					dest, destlen);
+
+			if (error != PARSERUTILS_OK) {
+				ucs4 = (error == PARSERUTILS_INVALID) ? 1 : 0;
+
+				if (towritelen - ucs4 >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen - ucs4;
+
+				/* Copy pending chars to save area, for
+				 * processing next call; skipping invalid
+				 * character, if present, so it's not
+				 * reprocessed. */
+				for (; ucs4 < towritelen; ucs4++) {
+					c->write_buf[ucs4] = towrite[ucs4];
+				}
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return error;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of data in an iconv-based codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if an illegal input sequence is found and
+ *                            the codec's error handling mode is STRICT.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error iconv_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	iconv_codec *c = (iconv_codec *) codec;
+	parserutils_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from last decode; it is only
+		 * written if the whole backlog fits in dest */
+		uint32_t *pread = c->read_buf;
+
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			*((uint32_t *) (void *) *dest) = pread[0];
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Run out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++) {
+				c->read_buf[i] = pread[i];
+			}
+
+			return PARSERUTILS_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Top up inval_buf from the start of the new chunk
+		 * (keeping one byte spare) and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		l += c->inval_len;
+
+		error = iconv_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			return error;
+		}
+
+
+		/* l is now what read_char left unconsumed; step source
+		 * over the new bytes it used (orig_l - l, floored at 0) */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Handle memory exhaustion case from above */
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	while (*sourcelen > 0) {
+		error = iconv_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != PARSERUTILS_OK) {
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Return an iconv-based codec to its initial conversion state
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK (the only value this implementation returns)
+ */
+parserutils_error iconv_codec_reset(parserutils_charset_codec *codec)
+{
+	iconv_codec *ic = (iconv_codec *) codec;
+
+	/* Discard everything buffered by earlier encode/decode calls */
+	ic->write_len = 0;
+	ic->write_buf[0] = 0;
+	ic->read_len = 0;
+	ic->read_buf[0] = 0;
+	ic->inval_len = 0;
+	ic->inval_buf[0] = '\0';
+
+	/* All-NULL arguments ask iconv to reset the descriptor's state */
+	iconv(ic->write_cd, NULL, NULL, NULL, NULL);
+	iconv(ic->read_cd, NULL, NULL, NULL, NULL);
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Emit one decoded UCS4 character, or stash it when dest is full
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (big endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK          if the character was written to dest,
+ *         PARSERUTILS_NOMEM       if it was saved in read_buf instead.
+ */
+parserutils_error iconv_codec_output_decoded_char(iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (*destlen >= 4) {
+		/* Room for one character: store it and advance */
+		*((uint32_t *) (void *) *dest) = ucs4;
+		*destlen -= 4;
+		*dest += 4;
+
+		return PARSERUTILS_OK;
+	}
+
+	/* Output buffer exhausted: keep the character for the next call */
+	c->read_buf[0] = ucs4;
+	c->read_len = 1;
+
+	return PARSERUTILS_NOMEM;
+}
+
+/**
+ * Read a character from the codec's native charset to UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if the input is an illegal sequence and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error iconv_codec_read_char(iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	size_t iconv_ret;
+	const uint8_t *origsrc = *source;
+	size_t origsrclen = *sourcelen;
+	uint32_t ucs4;
+	uint8_t *pucs4 = (uint8_t *) &ucs4;
+	size_t sucs4 = 4;
+	parserutils_error error;
+
+	/* Use iconv to convert a single character
+	 * Side effect: Updates *source to point at next input
+	 * character and *sourcelen to reflect reduced input length
+	 */
+	iconv_ret = iconv(c->read_cd, (char **) source, sourcelen,
+			(char **) (void *) &pucs4, &sucs4);
+
+	if (iconv_ret != (size_t) -1 ||
+			(*source != origsrc && sucs4 == 0)) {
+		/* Read a character (the 4-byte output was filled) */
+		error = iconv_codec_output_decoded_char(c, ucs4, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			/* output failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (errno == E2BIG) {
+		/* Should never happen */
+		abort();
+	} else if (errno == EINVAL) {
+		/* Incomplete input sequence; it and its NUL must fit */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		memmove(c->inval_buf, (const char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (errno == EILSEQ) {
+		/* Illegal input sequence */
+		bool found = false;
+		const uint8_t *oldsrc = *source;
+		size_t oldsrclen = *sourcelen;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode == 
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			/* restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Ok, this becomes problematic. The iconv API here
+		 * is particularly unhelpful; *source will point at
+		 * the _start_ of the illegal sequence. This means
+		 * that we must find the end of the sequence */
+
+		/* Search for the start of the next valid input
+		 * sequence (or the end of the input stream) */
+		while (*sourcelen > 1) {
+			pucs4 = (uint8_t *) &ucs4;
+			sucs4 = 4;
+
+			(*source)++;
+			(*sourcelen)--;
+
+			oldsrc = *source;
+			oldsrclen = *sourcelen;
+
+			iconv_ret = iconv(c->read_cd,
+					(char **) source, sourcelen,
+					(char **) (void *) &pucs4, &sucs4);
+			if (iconv_ret != (size_t) -1 || errno != EILSEQ) {
+				found = true;
+				break;
+			}
+		}
+
+		if (found) {
+			/* Found start of next valid sequence */
+			*source = oldsrc;
+			*sourcelen = oldsrclen;
+		} else {
+			/* Not found - skip last byte in buffer */
+			(*source)++;
+			(*sourcelen)--;
+
+			if (*sourcelen != 0)
+				abort();
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = iconv_codec_output_decoded_char(c,
+				htonl(0xFFFD), dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			/* output failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		return error;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Write a UCS4 character in a codec's native charset
+ *
+ * \param c        The codec
+ * \param ucs4     The UCS4 character to write (big endian)
+ * \param dest     Pointer to pointer to output buffer (updated on exit)
+ * \param destlen  Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK       on success (or after writing a replacement),
+ *         PARSERUTILS_NOMEM    if output buffer is too small,
+ *         PARSERUTILS_INVALID  if character cannot be represented and the
+ *                         codec's error handling mode is set to STRICT.
+ */
+parserutils_error iconv_codec_write_char(iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	size_t iconv_ret;
+	uint8_t *pucs4 = (uint8_t *) &ucs4;
+	size_t sucs4 = 4;
+	uint8_t *origdest = *dest;	/* to detect "nothing written" */
+
+	iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4,
+			&sucs4, (char **) dest, destlen);
+
+	if (iconv_ret == (size_t) -1 && errno == E2BIG) {
+		/* Output buffer is too small */
+		return PARSERUTILS_NOMEM;
+	} else if (iconv_ret == (size_t) -1 && errno == EILSEQ) {
+		/* Illegal multibyte sequence */
+		/* This should never happen */
+		abort();
+	} else if (iconv_ret == (size_t) -1 && errno == EINVAL) {
+		/* Incomplete input character */
+		/* This should never happen */
+		abort();
+	} else if (*dest == origdest) {
+		/* Nothing was output; handle per error mode */
+		switch (c->base.errormode) {
+		case PARSERUTILS_CHARSET_CODEC_ERROR_STRICT:
+			return PARSERUTILS_INVALID;
+
+		case PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT:
+			/** \todo transliteration (falls through to LOOSE) */
+		case PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE:
+		{
+			pucs4 = (uint8_t *) &ucs4;
+			sucs4 = 4;
+
+			ucs4 = parserutils_charset_mibenum_is_unicode(
+					c->base.mibenum)
+					? htonl(0xFFFD) : htonl(0x3F); /* or '?' */
+
+			iconv_ret = iconv(c->write_cd,
+					(char **) (void *) &pucs4, &sucs4,
+					(char **) dest, destlen);
+
+			if (iconv_ret == (size_t) -1 && errno == E2BIG) {
+				return PARSERUTILS_NOMEM;
+			} else if (iconv_ret == (size_t) -1 &&
+					errno == EILSEQ) {
+				/* Illegal multibyte sequence */
+				/* This should never happen */
+				abort();
+			} else if (iconv_ret == (size_t) -1 &&
+					errno == EINVAL) {
+				/* Incomplete input character */
+				/* This should never happen */
+				abort();
+			}
+		}
+			break;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+const parserutils_charset_handler iconv_codec_handler = {
+	.handles_charset = iconv_codec_handles_charset,	/* charset probe */
+	.create = iconv_codec_create			/* codec factory */
+};
+
+#endif
diff --git a/src/charset/codecs/codec_impl.h b/src/charset/codecs/codec_impl.h
new file mode 100644
index 0000000..9183594
--- /dev/null
+++ b/src/charset/codecs/codec_impl.h
@@ -0,0 +1,48 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_codecs_codecimpl_h_
+#define parserutils_charset_codecs_codecimpl_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include <parserutils/charset/codec.h>
+
+/**
+ * Core charset codec definition; implementations extend this by embedding
+ * it and filling in the handler vtable */
+struct parserutils_charset_codec {
+	uint16_t mibenum;			/**< MIB enum for charset */
+
+	parserutils_charset_codec_errormode errormode;	/**< error mode */
+
+	parserutils_alloc alloc;		/**< allocation function */
+	void *alloc_pw;				/**< private word */
+
+	struct {
+		void (*destroy)(parserutils_charset_codec *codec);
+		parserutils_error (*encode)(parserutils_charset_codec *codec,
+				const uint8_t **source, size_t *sourcelen,
+				uint8_t **dest, size_t *destlen); /* from UCS4 */
+		parserutils_error (*decode)(parserutils_charset_codec *codec,
+				const uint8_t **source, size_t *sourcelen,
+				uint8_t **dest, size_t *destlen); /* to UCS4 */
+		parserutils_error (*reset)(parserutils_charset_codec *codec);
+	} handler; /**< Vtable for handler code */
+};
+
+/**
+ * Codec factory component definition: a charset probe plus a constructor
+ */
+typedef struct parserutils_charset_handler {
+	bool (*handles_charset)(const char *charset);	/**< can handle? */
+	parserutils_charset_codec *(*create)(const char *charset,
+			parserutils_alloc alloc, void *pw);	/**< make codec */
+} parserutils_charset_handler;
+
+#endif
diff --git a/src/charset/codecs/codec_utf16.c b/src/charset/codecs/codec_utf16.c
new file mode 100644
index 0000000..0dd7a07
--- /dev/null
+++ b/src/charset/codecs/codec_utf16.c
@@ -0,0 +1,544 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/utf16.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "utils/utils.h"
+
+/**
+ * UTF-16 charset codec: generic base plus three small staging buffers
+ */
+typedef struct charset_utf16_codec {
+	parserutils_charset_codec base;	/**< Base class */
+
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences */
+	size_t inval_len;		/**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 * (host-endian) */
+	size_t read_len;		/**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 * (host-endian) */
+	size_t write_len;		/**< Character length of write_buf */
+
+} charset_utf16_codec;
+
+static bool charset_utf16_codec_handles_charset(const char *charset);
+static parserutils_charset_codec *charset_utf16_codec_create(
+		const char *charset, parserutils_alloc alloc, void *pw);
+static void charset_utf16_codec_destroy (parserutils_charset_codec *codec);
+static parserutils_error charset_utf16_codec_encode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);	/* UCS4 -> UTF-16 */
+static parserutils_error charset_utf16_codec_decode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);	/* UTF-16 -> UCS4 */
+static parserutils_error charset_utf16_codec_reset(
+		parserutils_charset_codec *codec);	/* clear buffers */
+static inline parserutils_error charset_utf16_codec_read_char(
+		charset_utf16_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);	/* decode one char */
+static inline parserutils_error charset_utf16_codec_output_decoded_char(
+		charset_utf16_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Test whether a charset name resolves to the same MIB enum as "UTF-16"
+ *
+ * \param charset  Charset name to test (passed to strlen, so NUL-terminated)
+ * \return true if this codec can handle the charset, false otherwise
+ */
+bool charset_utf16_codec_handles_charset(const char *charset)
+{
+	return parserutils_charset_mibenum_from_name("UTF-16", SLEN("UTF-16"))
+		== parserutils_charset_mibenum_from_name(charset,
+				strlen(charset));
+}
+
+/**
+ * Allocate a utf16 codec and initialise its buffers and vtable
+ *
+ * \param charset  The charset to read from / write to (unused here)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL if the allocation fails
+ */
+parserutils_charset_codec *charset_utf16_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	charset_utf16_codec *c;
+
+	/* The requested name plays no part in constructing this codec */
+	UNUSED(charset);
+
+	c = alloc(NULL, sizeof(*c), pw);
+	if (c == NULL)
+		return NULL;
+
+	/* Hook up the codec operations */
+	c->base.handler.reset = charset_utf16_codec_reset;
+	c->base.handler.decode = charset_utf16_codec_decode;
+	c->base.handler.encode = charset_utf16_codec_encode;
+	c->base.handler.destroy = charset_utf16_codec_destroy;
+
+	/* Start with all three staging buffers empty */
+	c->write_len = 0;
+	c->write_buf[0] = 0;
+	c->read_len = 0;
+	c->read_buf[0] = 0;
+	c->inval_len = 0;
+	c->inval_buf[0] = '\0';
+
+	return (parserutils_charset_codec *) c;
+}
+
+/**
+ * Destroy a utf16 codec (nothing is released by this function itself)
+ *
+ * \param codec  The codec to destroy (unused)
+ */
+void charset_utf16_codec_destroy (parserutils_charset_codec *codec)
+{
+	UNUSED(codec);
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf16
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data (big-endian UCS4)
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small.
+ *         (PARSERUTILS_INVALID is never returned by this implementation;
+ *         a failed UCS4 to UTF-16 conversion calls abort() instead.)
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf16_codec *c = (charset_utf16_codec *) codec;
+	uint32_t ucs4;
+	uint32_t *towrite;
+	size_t towritelen;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+		uint8_t buf[4];
+		size_t len;
+
+		while (c->write_len > 0) {
+			error = parserutils_charset_utf16_from_ucs4(
+					pwrite[0], buf, &len);
+			if (error != PARSERUTILS_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output buffer space; move the
+				 * unwritten chars to the front of write_buf
+				 * (len is reused as the loop index) */
+				for (len = 0; len < c->write_len; len++)
+					c->write_buf[len] = pwrite[len];
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process this call's input, one 4-byte UCS4 char at a time */
+	while (*sourcelen > 0) {
+		ucs4 = ntohl(*((uint32_t *) (void *) *source));
+		towrite = &ucs4;
+		towritelen = 1;
+
+		/* Output current characters */
+		while (towritelen > 0) {
+			uint8_t buf[4];
+			size_t len;
+
+			error = parserutils_charset_utf16_from_ucs4(
+					towrite[0], buf, &len);
+			if (error != PARSERUTILS_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output space */
+				if (towritelen >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen;
+
+				/* Copy pending chars (host-endian) to save
+				 * area, for processing next call. */
+				for (len = 0; len < towritelen; len++)
+					c->write_buf[len] = towrite[len];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of utf16 data into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if an illegal input sequence is found and
+ *                            the codec's error handling mode is STRICT.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf16_codec *c = (charset_utf16_codec *) codec;
+	parserutils_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from last decode (host-endian) */
+		uint32_t *pread = c->read_buf;
+
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(pread[0]);
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++)
+				c->read_buf[i] = pread[i];
+
+			return PARSERUTILS_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Top up inval_buf from the start of the new chunk
+		 * (keeping one byte spare) and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		l += c->inval_len;
+
+		error = charset_utf16_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			return error;
+		}
+
+		/* Fix up source pointers: skip orig_l - l bytes (min. 0) */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Report memory exhaustion case from above */
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	/* Finally, the "normal" case; process all outstanding characters */
+	while (*sourcelen > 0) {
+		error = charset_utf16_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != PARSERUTILS_OK) {
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Discard all state buffered inside a utf16 codec
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK (the only value this implementation returns)
+ */
+parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec)
+{
+	charset_utf16_codec *utf16 = (charset_utf16_codec *) codec;
+
+	/* Output left pending by earlier encode / decode calls */
+	utf16->write_len = 0;
+	utf16->write_buf[0] = 0;
+	utf16->read_len = 0;
+	utf16->read_buf[0] = 0;
+	/* Partial input sequence awaiting more data */
+	utf16->inval_len = 0;
+	utf16->inval_buf[0] = '\0';
+
+	return PARSERUTILS_OK;
+}
+
+
+/**
+ * Read one UTF-16 character and output it as UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if the input is an illegal sequence and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	parserutils_error error;
+
+	/* Convert a single character */
+	error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen, 
+			&ucs4, &sucs4);
+	if (error == PARSERUTILS_OK) {
+		/* Read a character */
+		error = charset_utf16_codec_output_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == PARSERUTILS_NEEDDATA) {
+		/* Incomplete input sequence; it and its NUL must fit */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		memmove(c->inval_buf, (char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (error == PARSERUTILS_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode == 
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Find next valid UTF-16 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		error = parserutils_charset_utf16_next_paranoid(
+				*source, *sourcelen, 0, &nextchar);
+		if (error != PARSERUTILS_OK) {
+			if (error == PARSERUTILS_NEEDDATA) {
+				/* Need more data to be sure */
+				if (*sourcelen >= INVAL_BUFSIZE)
+					abort();
+
+				memmove(c->inval_buf, (char *) *source,
+						*sourcelen);
+				c->inval_buf[*sourcelen] = '\0';
+				c->inval_len = *sourcelen;
+
+				*source += *sourcelen;
+				*sourcelen = 0;
+
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = charset_utf16_codec_output_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Emit one decoded UCS4 character, or stash it when dest is full
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK          if written to dest (via htonl),
+ *         PARSERUTILS_NOMEM       if saved in read_buf for the next call.
+ */
+parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (*destlen >= 4) {
+		/* Room for one character: convert byte order and store */
+		*((uint32_t *) (void *) *dest) = htonl(ucs4);
+		*destlen -= 4;
+		*dest += 4;
+		return PARSERUTILS_OK;
+	}
+
+	/* Output buffer exhausted: keep the host-endian value for later */
+	c->read_buf[0] = ucs4;
+	c->read_len = 1;
+
+	return PARSERUTILS_NOMEM;
+}
+
+
+const parserutils_charset_handler charset_utf16_codec_handler = {
+	.handles_charset = charset_utf16_codec_handles_charset,
+	.create = charset_utf16_codec_create
+};
diff --git a/src/charset/codecs/codec_utf8.c b/src/charset/codecs/codec_utf8.c
new file mode 100644
index 0000000..838d051
--- /dev/null
+++ b/src/charset/codecs/codec_utf8.c
@@ -0,0 +1,546 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "charset/encodings/utf8impl.h"
+#include "utils/utils.h"
+
+/**
+ * UTF-8 charset codec
+ *
+ * Extends the generic codec with three small fixed-size buffers, each
+ * paired with a length field, that carry state from one encode/decode
+ * call to the next.
+ */
+typedef struct charset_utf8_codec {
+	parserutils_charset_codec base;	/**< Base class */
+
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences */
+	size_t inval_len;		/**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 * (host-endian) */
+	size_t read_len;		/**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 * (host-endian) */
+	size_t write_len;		/**< Character length of write_buf */
+
+} charset_utf8_codec;
+
+/* Forward declarations. Every function is declared static here; the
+ * definitions further down omit the storage class and therefore take
+ * their internal linkage from these declarations. */
+static bool charset_utf8_codec_handles_charset(const char *charset);
+static parserutils_charset_codec *charset_utf8_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw);
+static void charset_utf8_codec_destroy (parserutils_charset_codec *codec);
+static parserutils_error charset_utf8_codec_encode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf8_codec_decode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf8_codec_reset(
+		parserutils_charset_codec *codec);
+/* Helpers used by the decode path */
+static inline parserutils_error charset_utf8_codec_read_char(
+		charset_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static inline parserutils_error charset_utf8_codec_output_decoded_char(
+		charset_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool charset_utf8_codec_handles_charset(const char *charset)
+{
+	/* Compare MIB enum values rather than names: the charset is accepted
+	 * when its name maps to the same value as "UTF-8" does. */
+	if (parserutils_charset_mibenum_from_name(charset, strlen(charset)) !=
+			parserutils_charset_mibenum_from_name("UTF-8",
+				SLEN("UTF-8")))
+		return false;
+
+	return true;
+}
+
+/**
+ * Create a utf8 codec
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+parserutils_charset_codec *charset_utf8_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	charset_utf8_codec *self;
+
+	UNUSED(charset);
+
+	/* Obtain storage for the codec from the client's allocator */
+	self = alloc(NULL, sizeof(*self), pw);
+	if (self == NULL)
+		return NULL;
+
+	/* Hook up the UTF-8 implementations of the codec operations */
+	self->base.handler.reset = charset_utf8_codec_reset;
+	self->base.handler.decode = charset_utf8_codec_decode;
+	self->base.handler.encode = charset_utf8_codec_encode;
+	self->base.handler.destroy = charset_utf8_codec_destroy;
+
+	/* All three state buffers start out empty */
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	/* base is the first member, so this is the codec's own address */
+	return &self->base;
+}
+
+/**
+ * Destroy a utf8 codec
+ *
+ * \param codec  The codec to destroy
+ */
+void charset_utf8_codec_destroy (parserutils_charset_codec *codec)
+{
+	/* Nothing to release: the UTF-8 specific state declared in
+	 * charset_utf8_codec lives in arrays embedded in the structure.
+	 * This function does not free the structure itself. */
+	UNUSED(codec);
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf8
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data (UCS4 characters in
+ *                   network byte order, 4 bytes each)
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small.
+ *
+ * If converting a character fails for any reason other than lack of
+ * output space, abort() is called.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Only whole characters are read: if fewer than 4 bytes of input remain,
+ * they are left unread and ::sourcelen is non-zero on return.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf8_codec *c = (charset_utf8_codec *) codec;
+	uint32_t ucs4;
+	uint32_t *towrite;
+	size_t towritelen;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
+			if (error != PARSERUTILS_OK) {
+				if (error != PARSERUTILS_NOMEM)
+					abort();
+
+				/* Insufficient output buffer space; move
+				 * what is still pending down to the start
+				 * of write_buf for the next call. */
+				for (uint32_t len = 0; 
+						len < c->write_len; len++) {
+					c->write_buf[len] = pwrite[len];
+				}
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call. Requiring a whole
+	 * character keeps the read inside the input and stops ::sourcelen
+	 * wrapping round when the length is not a multiple of 4. */
+	while (*sourcelen >= 4) {
+		uint32_t be;
+
+		/* *source is a byte pointer with no alignment guarantee, so
+		 * fetch the character bytewise rather than through a cast. */
+		memcpy(&be, *source, sizeof(be));
+		ucs4 = ntohl(be);
+		towrite = &ucs4;
+		towritelen = 1;
+
+		/* Output current characters */
+		while (towritelen > 0) {
+			UTF8_FROM_UCS4(towrite[0], dest, destlen, error);
+			if (error != PARSERUTILS_OK) {
+				if (error != PARSERUTILS_NOMEM)
+					abort();
+
+				/* Insufficient output space */
+				if (towritelen >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen;
+
+				/* Copy pending chars to save area, for
+				 * processing next call. */
+				for (uint32_t len = 0; len < towritelen; len++)
+					c->write_buf[len] = towrite[len];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of utf8 data into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the 
+ * last input character read; nothing will be written or buffered for the 
+ * failed character. It is up to the client to fix the cause of the failure 
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf8_codec *c = (charset_utf8_codec *) codec;
+	parserutils_error error;
+
+	/* Stage 1: write out characters that were decoded earlier but are
+	 * still waiting in read_buf. */
+	if (c->read_len > 0) {
+		/* Output left over from last decode */
+		uint32_t *pread = c->read_buf;
+
+		/* A character is only written while there is room for
+		 * everything still pending (read_len * 4 bytes).
+		 * NOTE(review): the store goes through a uint32_t cast, so
+		 * this assumes *dest is suitably aligned. */
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(pread[0]);
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++)
+				c->read_buf[i] = pread[i];
+
+			return PARSERUTILS_NOMEM;
+		}
+	}
+
+	/* Stage 2: resolve an incomplete sequence saved by an earlier call */
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Fill up inval_buf with data from the start of the
+		 * new chunk and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;
+		/* Copy at most what fits while leaving one byte of
+		 * inval_buf spare */
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		/* l now counts old and new bytes together */
+		l += c->inval_len;
+
+		/* Decode from inval_buf rather than from the caller's data;
+		 * on return l is the number of buffered bytes left unread */
+		error = charset_utf8_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			return error;
+		}
+
+		/* And now, fix up source pointers */
+		/* (advance by orig_l - l, clamped at zero: the bytes read
+		 * beyond the ol bytes that were already buffered) */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		/* (the test below is true when none of the buffered bytes
+		 * were read) */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Report memory exhaustion case from above */
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	/* Finally, the "normal" case; process all outstanding characters */
+	while (*sourcelen > 0) {
+		error = charset_utf8_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != PARSERUTILS_OK) {
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Clear a utf8 codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
+{
+	charset_utf8_codec *self = (charset_utf8_codec *) codec;
+
+	/* Mark every state buffer as empty ... */
+	self->write_len = 0;
+	self->read_len = 0;
+	self->inval_len = 0;
+
+	/* ... and zero the first entry of each */
+	self->write_buf[0] = 0;
+	self->read_buf[0] = 0;
+	self->inval_buf[0] = '\0';
+
+	return PARSERUTILS_OK;
+}
+
+
+/**
+ * Save the whole of the remaining input in the codec's inval_buf
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (advanced past the
+ *                   saved bytes on exit)
+ * \param sourcelen  Pointer to length of source buffer (zero on exit)
+ *
+ * Calls abort() if the input plus a terminating NUL does not fit in
+ * inval_buf.
+ */
+static inline void charset_utf8_codec_save_incomplete(charset_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen)
+{
+	/* One byte must stay free for the terminator written below */
+	if (*sourcelen >= INVAL_BUFSIZE)
+		abort();
+
+	/* memmove, as *source may itself point into inval_buf */
+	memmove(c->inval_buf, *source, *sourcelen);
+	c->inval_buf[*sourcelen] = '\0';
+	c->inval_len = *sourcelen;
+
+	*source += *sourcelen;
+	*sourcelen = 0;
+}
+
+/**
+ * Read a single UTF-8 character and output it as UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if the input is an illegal sequence and the
+ *                            codec's error handling mode is set to STRICT,
+ *         any other error reported while examining the input.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * If the input ends part-way through a character, the remaining input is
+ * saved in the codec's incomplete-sequence buffer, ::sourcelen becomes
+ * zero and _OK is returned.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the 
+ * last input character read; nothing will be written or buffered for the 
+ * failed character. It is up to the client to fix the cause of the failure 
+ * and retry the decoding process.
+ *
+ * If the error handling mode is not STRICT, an illegal sequence is
+ * replaced in the output by U+FFFD.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	parserutils_error error;
+
+	/* Convert a single character */
+	{
+		/* The macro is handed plain local variables */
+		const uint8_t *src = *source;
+		size_t srclen = *sourcelen;
+		uint32_t *uptr = &ucs4;
+		size_t *usptr = &sucs4;
+		UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
+	}
+	if (error == PARSERUTILS_OK) {
+		/* Read a character */
+		error = charset_utf8_codec_output_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == PARSERUTILS_NEEDDATA) {
+		/* Incomplete input sequence */
+		charset_utf8_codec_save_incomplete(c, source, sourcelen);
+
+		return PARSERUTILS_OK;
+	} else if (error == PARSERUTILS_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+	
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode == 
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			/* Clear inval buffer */
+			c->inval_buf[0] = '\0';
+			c->inval_len = 0;
+
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Find next valid UTF-8 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		{
+			const uint8_t *src = *source;
+			size_t srclen = *sourcelen;
+			uint32_t off = 0;
+			uint32_t *ncptr = &nextchar;
+
+			UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
+		}
+		if (error != PARSERUTILS_OK) {
+			if (error == PARSERUTILS_NEEDDATA) {
+				/* Need more data to be sure */
+				charset_utf8_codec_save_incomplete(c,
+						source, sourcelen);
+
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* Clear inval buffer */
+		/* NOTE(review): in the NEEDDATA case this discards the bytes
+		 * saved just above, so the rest of the input is represented
+		 * by the single U+FFFD below -- confirm this is intended. */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* output U+FFFD and continue processing. */
+		error = charset_utf8_codec_output_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+
+	/* Any other error: nothing was read, so report it instead of
+	 * claiming success and leaving the caller to retry indefinitely. */
+	return error;
+}
+
+/**
+ * Output a UCS4 character
+ *
+ * The character is written to the output buffer as four bytes in network
+ * (big endian) byte order. If fewer than four bytes of output space remain,
+ * nothing is written; the character is instead saved as the sole entry of
+ * the codec's read_buf, from where the decode function writes it out on a
+ * later call, and PARSERUTILS_NOMEM is returned.
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer (advanced on success)
+ * \param destlen  Pointer to output buffer length (reduced on success)
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small.
+ */
+parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	uint32_t be;
+
+	if (*destlen < 4) {
+		/* Run out of output buffer; remember the character */
+		c->read_len = 1;
+		c->read_buf[0] = ucs4;
+
+		return PARSERUTILS_NOMEM;
+	}
+
+	/* *dest is a byte pointer with no alignment guarantee, so copy the
+	 * big endian value bytewise rather than storing through a cast. */
+	be = htonl(ucs4);
+	memcpy(*dest, &be, sizeof(be));
+	*dest += 4;
+	*destlen -= 4;
+
+	return PARSERUTILS_OK;
+}
+
+
+/* Handler object exported for the UTF-8 codec: the charset test and the
+ * codec creation function defined above. The initialisers are positional,
+ * so they must stay in the member order of parserutils_charset_handler.
+ * NOTE(review): that declaration is not in this file -- confirm the order
+ * against it. */
+const parserutils_charset_handler charset_utf8_codec_handler = {
+	charset_utf8_codec_handles_charset,
+	charset_utf8_codec_create
+};
+
diff --git a/src/charset/encodings/Makefile b/src/charset/encodings/Makefile
new file mode 100644
index 0000000..47d9210
--- /dev/null
+++ b/src/charset/encodings/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+# (each level appends ".x" to $(sp); the parent's $(d) is kept in a
+# variable whose name is built from the new $(sp))
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Sources
+# (file names only; $(d) is prefixed when they are appended below)
+SRCS_$(d) := utf8.c utf16.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+# (every Makefile found one directory level below $(d))
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+# (restore the parent's $(d), then strip the ".x" suffix added above)
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/encodings/utf16.c b/src/charset/encodings/utf16.c
new file mode 100644
index 0000000..95dc64f
--- /dev/null
+++ b/src/charset/encodings/utf16.c
@@ -0,0 +1,239 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-16 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf16.h>
+
+/**
+ * Convert a UTF-16 sequence into a single UCS4 character
+ *
+ * The input is read as 16 bit code units through a uint16_t pointer, i.e.
+ * in host byte order.
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence, in bytes
+ * \param ucs4  Pointer to location to receive UCS4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-16 sequence
+ * \return PARSERUTILS_OK        on success,
+ *         PARSERUTILS_BADPARM   if any pointer is NULL,
+ *         PARSERUTILS_NEEDDATA  if the sequence is truncated,
+ *         PARSERUTILS_INVALID   if the surrogates are not correctly paired.
+ */
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, 
+		size_t len, uint32_t *ucs4, size_t *clen)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || ucs4 == NULL || clen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (len < 2)
+		return PARSERUTILS_NEEDDATA;
+
+	if (*ss < 0xD800 || *ss > 0xDFFF) {
+		/* Not a surrogate: the code unit is the character */
+		*ucs4 = *ss;
+		*clen = 2;
+	} else if (*ss <= 0xDBFF) {
+		/* High surrogate: a low surrogate has to follow */
+		if (len < 4)
+			return PARSERUTILS_NEEDDATA;
+
+		if (ss[1] < 0xDC00 || ss[1] > 0xDFFF)
+			return PARSERUTILS_INVALID;
+
+		/* Each surrogate carries 10 bits of (character - 0x10000) */
+		*ucs4 = 0x10000 + ((((uint32_t) ss[0] & 0x3ff) << 10) |
+				((uint32_t) ss[1] & 0x3ff));
+		*clen = 4;
+	} else {
+		/* Low surrogate with no high surrogate before it */
+		return PARSERUTILS_INVALID;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-16 sequence
+ *
+ * The output is stored as 16 bit code units through a uint16_t pointer,
+ * i.e. in host byte order. Values below 0x10000 are stored unchanged as a
+ * single code unit; values from 0x10000 to 0x10FFFF become a surrogate
+ * pair.
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x10FFFF) (host endian)
+ * \param s     Pointer to 4 byte long output buffer
+ * \param len   Pointer to location to receive length of multibyte sequence
+ * \return PARSERUTILS_OK        on success,
+ *         PARSERUTILS_BADPARM   if either pointer is NULL,
+ *         PARSERUTILS_INVALID   if the character is above 0x10FFFF.
+ */
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len)
+{
+	uint16_t *ss = (uint16_t *) (void *) s;
+	uint32_t l = 0;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+	else if (ucs4 < 0x10000) {
+		*ss = (uint16_t) ucs4;
+		l = 2;
+	} else if (ucs4 < 0x110000) {
+		/* Split (character - 0x10000), a 20 bit value, into two
+		 * 10 bit halves: top half in the high surrogate, bottom
+		 * half in the low surrogate. */
+		const uint32_t v = ucs4 - 0x10000;
+
+		ss[0] = (uint16_t) (0xD800 | (v >> 10));
+		ss[1] = (uint16_t) (0xDC00 | (v & 0x3ff));
+		l = 4;
+	} else {
+		return PARSERUTILS_INVALID;
+	}
+
+	*len = l;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-16 string
+ *
+ * Code units are read through a uint16_t pointer. Any code unit in the
+ * surrogate range is taken to start a character of two code units; the
+ * pairing is not validated. A trailing odd byte is ignored.
+ *
+ * \param s    The string
+ * \param max  Maximum length, in bytes
+ * \param len  Pointer to location to receive length of string
+ * \return PARSERUTILS_OK        on success,
+ *         PARSERUTILS_BADPARM   if either pointer is NULL.
+ */
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+	const size_t units = max / 2;	/* whole code units within max */
+	size_t i = 0;
+	size_t l = 0;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	while (i < units) {
+		if (ss[i] < 0xD800 || 0xDFFF < ss[i])
+			i++;
+		else
+			i += 2;
+
+		l++;
+	}
+
+	*len = l;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-16 character
+ *
+ * \param s    Pointer to start of character
+ * \param len  Pointer to location to receive length
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	uint16_t unit;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Only the first code unit is examined: anything in the surrogate
+	 * range is reported as 4 bytes long, everything else as 2. */
+	unit = *((const uint16_t *) (const void *) s);
+	*len = (0xD800 <= unit && unit <= 0xDFFF) ? 4 : 2;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find previous legal UTF-16 char in string
+ *
+ * Code units are read through a uint16_t pointer; off is expected to be a
+ * multiple of two.
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return PARSERUTILS_OK        on success,
+ *         PARSERUTILS_BADPARM   if either pointer is NULL.
+ */
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	const uint16_t *ss;
+
+	if (s == NULL || prevoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Address the code unit at off, so that ss[-1] is the unit just
+	 * before it and not memory in front of the string. */
+	ss = (const uint16_t *) (const void *) (s + off);
+
+	if (off < 2)
+		*prevoff = 0;
+	else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
+		*prevoff = off - 2;
+	else
+		/* Preceding unit is a low surrogate: step over the pair */
+		*prevoff = (off < 4) ? 0 : off - 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * Code units are read through a uint16_t pointer; off is expected to be a
+ * multiple of two.
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK        on success,
+ *         PARSERUTILS_BADPARM   if a pointer is NULL or off is not below
+ *                               len.
+ */
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	const uint16_t *ss;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Address the code unit at off; ss[1] is then the unit after it */
+	ss = (const uint16_t *) (const void *) (s + off);
+
+	if (len - off < 4)
+		/* No complete code unit follows the current one */
+		*nextoff = len;
+	else if (ss[1] < 0xDC00 || ss[1] > 0xDFFF)
+		/* Following unit does not continue a surrogate pair */
+		*nextoff = off + 2;
+	else
+		/* Following unit is a low surrogate: step over it as well */
+		*nextoff = (len - off < 6) ? len : off + 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * Starting with the code unit after the one at off, code units are
+ * skipped until one is found that is either not a surrogate or is a high
+ * surrogate immediately followed by a low surrogate. Code units are read
+ * through a uint16_t pointer; off is expected to be a multiple of two.
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK        on success,
+ *         PARSERUTILS_BADPARM   if a pointer is NULL or off is not below
+ *                               len,
+ *         PARSERUTILS_NEEDDATA  if the string ends before a legal
+ *                               character is found.
+ */
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	const uint16_t *ss;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Address the code unit at off; ss[1] is then the unit after it */
+	ss = (const uint16_t *) (const void *) (s + off);
+
+	while (1) {
+		if (len - off < 4) {
+			return PARSERUTILS_NEEDDATA;
+		} else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
+			*nextoff = off + 2;
+			break;
+		} else if (ss[1] <= 0xDBFF) {
+			/* High surrogate: legal only if a low one follows */
+			if (len - off < 6)
+				return PARSERUTILS_NEEDDATA;
+
+			if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
+				/* The pair starts at the high surrogate */
+				*nextoff = off + 2;
+				break;
+			}
+		}
+
+		/* Unpaired surrogate: step over it and keep looking. Doing
+		 * this for every unmatched case guarantees progress. */
+		ss++;
+		off += 2;
+	}
+
+	return PARSERUTILS_OK;
+}
+
diff --git a/src/charset/encodings/utf8.c b/src/charset/encodings/utf8.c
new file mode 100644
index 0000000..5b4ba95
--- /dev/null
+++ b/src/charset/encodings/utf8.c
@@ -0,0 +1,175 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf8.h>
+#include "charset/encodings/utf8impl.h"
+
+/** Number of continuation bytes for a given start byte
+ *
+ * Indexed by byte value, 16 entries per row:
+ *   0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2,
+ *   0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5
+ */
+const uint8_t numContinuations[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
+};
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence
+ * \param ucs4  Pointer to location to receive UCS4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-8 sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen)
+{
+	parserutils_error error;
+
+	/* Thin function wrapper: the conversion itself is the UTF8_TO_UCS4
+	 * macro, which stores its outcome in error. */
+	UTF8_TO_UCS4(s, len, ucs4, clen, error);
+
+	return error;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s     Pointer to pointer to output buffer, updated on exit
+ * \param len   Pointer to length, in bytes, of output buffer, updated on exit
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, 
+		uint8_t **s, size_t *len)
+{
+	parserutils_error error;
+
+	/* Thin function wrapper around the UTF8_FROM_UCS4 macro. error is
+	 * not initialised here; the macro receives it as its status output.
+	 * NOTE(review): the macro body is not in this file -- confirm it
+	 * always assigns error. */
+	UTF8_FROM_UCS4(ucs4, s, len, error);
+
+	return error;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s    The string
+ * \param max  Maximum length
+ * \param len  Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	parserutils_error error;
+
+	/* Thin function wrapper around the UTF8_LENGTH macro. error is not
+	 * initialised here; the macro receives it as its status output.
+	 * NOTE(review): the macro body is not in this file -- confirm it
+	 * always assigns error. */
+	UTF8_LENGTH(s, max, len, error);
+
+	return error;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * \param s    Pointer to start of character
+ * \param len  Pointer to location to receive length
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	parserutils_error error;
+
+	/* Thin function wrapper around the UTF8_CHAR_BYTE_LENGTH macro.
+	 * error is not initialised here; the macro receives it as its
+	 * status output.
+	 * NOTE(review): the macro body is not in this file -- confirm it
+	 * always assigns error. */
+	UTF8_CHAR_BYTE_LENGTH(s, len, error);
+
+	return error;
+}
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	parserutils_error error;
+
+	/* Thin function wrapper around the UTF8_PREV macro. error is not
+	 * initialised here; the macro receives it as its status output.
+	 * NOTE(review): the macro body is not in this file -- confirm it
+	 * always assigns error. */
+	UTF8_PREV(s, off, prevoff, error);
+
+	return error;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	parserutils_error error;
+
+	/* Thin function wrapper around the UTF8_NEXT macro. error is not
+	 * initialised here; the macro receives it as its status output.
+	 * NOTE(review): the macro body is not in this file -- confirm it
+	 * always assigns error. */
+	UTF8_NEXT(s, len, off, nextoff, error);
+
+	return error;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s, 
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	parserutils_error error;
+
+	/* Thin function wrapper around the UTF8_NEXT_PARANOID macro. error
+	 * is not initialised here; the macro receives it as its status
+	 * output.
+	 * NOTE(review): the macro body is not in this file -- confirm it
+	 * always assigns error. */
+	UTF8_NEXT_PARANOID(s, len, off, nextoff, error);
+
+	return error;
+}
+
diff --git a/src/charset/encodings/utf8impl.h b/src/charset/encodings/utf8impl.h
new file mode 100644
index 0000000..1ca9de7
--- /dev/null
+++ b/src/charset/encodings/utf8impl.h
@@ -0,0 +1,339 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_encodings_utf8impl_h_
+#define parserutils_charset_encodings_utf8impl_h_
+
+/** \file
+ * UTF-8 manipulation macros (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+/** Number of continuation bytes for a given start byte */
+extern const uint8_t numContinuations[256];
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however (1-6 byte sequences).
+ * Arguments may be evaluated more than once; avoid side effects in them.
+ *
+ * \param s      The sequence to process
+ * \param len    Length of sequence, in bytes
+ * \param ucs4   Pointer to location to receive UCS4 character (host endian)
+ * \param clen   Pointer to location to receive byte length of UTF-8 sequence
+ * \param error  Lvalue to receive error code: PARSERUTILS_BADPARM for a NULL
+ *               pointer, PARSERUTILS_NEEDDATA if \a len is too short,
+ *               PARSERUTILS_INVALID for a malformed, overlong, surrogate or
+ *               U+FFFE/U+FFFF sequence, PARSERUTILS_OK otherwise
+ */
+#define UTF8_TO_UCS4(s, len, ucs4, clen, error)				\
+do {									\
+	uint32_t utf8_c_, utf8_min_;					\
+	uint8_t utf8_n_;						\
+									\
+	(error) = PARSERUTILS_OK;					\
+	if ((s) == NULL || (ucs4) == NULL || (clen) == NULL) {		\
+		(error) = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	if ((len) == 0) {						\
+		(error) = PARSERUTILS_NEEDDATA;				\
+		break;							\
+	}								\
+									\
+	utf8_c_ = (s)[0];						\
+	if (utf8_c_ < 0x80) {						\
+		utf8_n_ = 1;						\
+		utf8_min_ = 0;						\
+	} else if ((utf8_c_ & 0xE0) == 0xC0) {				\
+		utf8_c_ &= 0x1F;					\
+		utf8_n_ = 2;						\
+		utf8_min_ = 0x80;					\
+	} else if ((utf8_c_ & 0xF0) == 0xE0) {				\
+		utf8_c_ &= 0x0F;					\
+		utf8_n_ = 3;						\
+		utf8_min_ = 0x800;					\
+	} else if ((utf8_c_ & 0xF8) == 0xF0) {				\
+		utf8_c_ &= 0x07;					\
+		utf8_n_ = 4;						\
+		utf8_min_ = 0x10000;					\
+	} else if ((utf8_c_ & 0xFC) == 0xF8) {				\
+		utf8_c_ &= 0x03;					\
+		utf8_n_ = 5;						\
+		utf8_min_ = 0x200000;					\
+	} else if ((utf8_c_ & 0xFE) == 0xFC) {				\
+		utf8_c_ &= 0x01;					\
+		utf8_n_ = 6;						\
+		utf8_min_ = 0x4000000;					\
+	} else {							\
+		(error) = PARSERUTILS_INVALID;				\
+		break;							\
+	}								\
+									\
+	if ((len) < utf8_n_) {						\
+		(error) = PARSERUTILS_NEEDDATA;				\
+		break;							\
+	}								\
+									\
+	for (uint8_t utf8_i_ = 1; utf8_i_ < utf8_n_; utf8_i_++) {	\
+		uint32_t utf8_t_ = (s)[utf8_i_];			\
+		if ((utf8_t_ & 0xC0) != 0x80) {				\
+			(error) = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+		utf8_c_ <<= 6;						\
+		utf8_c_ |= utf8_t_ & 0x3F;				\
+	}								\
+									\
+	if ((error) == PARSERUTILS_OK) {				\
+		/* Detect overlong sequences, surrogates and fffe/ffff */ \
+		if (utf8_c_ < utf8_min_ ||				\
+				(utf8_c_ >= 0xD800 && utf8_c_ <= 0xDFFF) || \
+				utf8_c_ == 0xFFFE || utf8_c_ == 0xFFFF) { \
+			(error) = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+		*(ucs4) = utf8_c_;					\
+		*(clen) = utf8_n_;					\
+	}								\
+} while(0)
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * \param ucs4   The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s      Pointer to pointer to output buffer, advanced on success
+ * \param len    Pointer to bytes left in output buffer, reduced on success
+ * \param error  Lvalue to receive error code: PARSERUTILS_OK on success,
+ *               PARSERUTILS_BADPARM for a NULL pointer, PARSERUTILS_INVALID
+ *               if \a ucs4 > 0x7FFFFFFF, PARSERUTILS_NOMEM if out of space
+ */
+#define UTF8_FROM_UCS4(ucs4, s, len, error)				\
+do {									\
+	uint32_t utf8_c_ = (uint32_t) (ucs4);	/* private copy */	\
+	uint8_t *utf8_buf_;						\
+	uint8_t utf8_l_ = 0;						\
+									\
+	(error) = PARSERUTILS_OK;					\
+	if ((s) == NULL || *(s) == NULL || (len) == NULL) {		\
+		(error) = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	if ((ucs4) < 0x80) {						\
+		utf8_l_ = 1;						\
+	} else if ((ucs4) < 0x800) {					\
+		utf8_l_ = 2;						\
+	} else if ((ucs4) < 0x10000) {					\
+		utf8_l_ = 3;						\
+	} else if ((ucs4) < 0x200000) {					\
+		utf8_l_ = 4;						\
+	} else if ((ucs4) < 0x4000000) {				\
+		utf8_l_ = 5;						\
+	} else if ((ucs4) <= 0x7FFFFFFF) {				\
+		utf8_l_ = 6;						\
+	} else {							\
+		(error) = PARSERUTILS_INVALID;				\
+		break;							\
+	}								\
+									\
+	if (utf8_l_ > *(len)) {						\
+		(error) = PARSERUTILS_NOMEM;				\
+		break;							\
+	}								\
+									\
+	utf8_buf_ = *(s);						\
+	if (utf8_l_ == 1) {						\
+		utf8_buf_[0] = (uint8_t) utf8_c_;			\
+	} else {							\
+		for (uint8_t utf8_i_ = utf8_l_; utf8_i_ > 1; utf8_i_--) { \
+			utf8_buf_[utf8_i_ - 1] = 0x80 | (utf8_c_ & 0x3F); \
+			utf8_c_ >>= 6;					\
+		}							\
+		utf8_buf_[0] = ~((1 << (8 - utf8_l_)) - 1) | utf8_c_;	\
+	}								\
+	*(s) += utf8_l_;						\
+	*(len) -= utf8_l_;						\
+} while(0)
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s      The string; advanced while counting, never beyond s + max
+ * \param max    Maximum length, in bytes
+ * \param len    Pointer to location to receive length of string
+ * \param error  Lvalue to receive PARSERUTILS_OK, PARSERUTILS_BADPARM for a
+ *               NULL pointer, or PARSERUTILS_INVALID for a bad start byte
+ */
+#define UTF8_LENGTH(s, max, len, error)					\
+do {									\
+	/* Pointer arithmetic on a NULL s would be undefined */		\
+	const uint8_t *utf8_end_ = (s) != NULL ? (s) + (max) : NULL;	\
+	int utf8_l_ = 0;						\
+	(error) = PARSERUTILS_OK;					\
+	if ((s) == NULL || (len) == NULL) {				\
+		(error) = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	while ((s) < utf8_end_) {					\
+		size_t utf8_n_, utf8_left_ = (size_t) (utf8_end_ - (s)); \
+		if (((s)[0] & 0x80) == 0x00)				\
+			utf8_n_ = 1;					\
+		else if (((s)[0] & 0xE0) == 0xC0)			\
+			utf8_n_ = 2;					\
+		else if (((s)[0] & 0xF0) == 0xE0)			\
+			utf8_n_ = 3;					\
+		else if (((s)[0] & 0xF8) == 0xF0)			\
+			utf8_n_ = 4;					\
+		else if (((s)[0] & 0xFC) == 0xF8)			\
+			utf8_n_ = 5;					\
+		else if (((s)[0] & 0xFE) == 0xFC)			\
+			utf8_n_ = 6;					\
+		else {							\
+			(error) = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+		/* A truncated final sequence still counts; stay in bounds */ \
+		(s) += utf8_n_ < utf8_left_ ? utf8_n_ : utf8_left_;	\
+		utf8_l_++;						\
+	}								\
+									\
+	if ((error) == PARSERUTILS_OK)					\
+		*(len) = utf8_l_;					\
+} while(0)
+
+/**
+ * Determine how many bytes a UTF-8 character occupies
+ *
+ * \param s      Pointer to the character's start byte
+ * \param len    Pointer to location to receive the byte count: one more
+ *               than the numContinuations[] entry for the start byte
+ * \param error  Location to receive PARSERUTILS_OK, or PARSERUTILS_BADPARM
+ *               if \a s or \a len is NULL
+ */
+#define UTF8_CHAR_BYTE_LENGTH(s, len, error)				\
+do {									\
+	if (s != NULL && len != NULL) {					\
+		*len = 1 /* Start byte */ + numContinuations[s[0]];	\
+		error = PARSERUTILS_OK;					\
+	} else {							\
+		error = PARSERUTILS_BADPARM;				\
+	}								\
+} while(0)
+
+/**
+ * Step back to the start of the previous UTF-8 character
+ *
+ * \param s        The string
+ * \param off      Offset to step back from (an lvalue; modified in place)
+ * \param prevoff  Pointer to location to receive the offset of the nearest
+ *                 earlier byte that is not a continuation byte, else 0
+ * \param error    Location to receive PARSERUTILS_OK or PARSERUTILS_BADPARM
+ */
+#define UTF8_PREV(s, off, prevoff, error)				\
+do {									\
+	if (s != NULL && prevoff != NULL) {				\
+		/* Retreat over continuation bytes (10xxxxxx) */	\
+		while (off != 0) {					\
+			if ((s[--off] & 0xC0) != 0x80)			\
+				break;					\
+		}							\
+		*prevoff = off;						\
+		error = PARSERUTILS_OK;					\
+	} else {							\
+		error = PARSERUTILS_BADPARM;				\
+	}								\
+} while(0)
+
+/**
+ * Advance to the start of the next UTF-8 character
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset to advance from (an lvalue; modified in place);
+ *                 may refer to a start byte or lie mid-sequence
+ * \param nextoff  Pointer to location to receive the offset of the next byte
+ *                 that is not a continuation byte, or \a len if none
+ * \param error    Location to receive PARSERUTILS_OK or PARSERUTILS_BADPARM
+ */
+#define UTF8_NEXT(s, len, off, nextoff, error)				\
+do {									\
+	if (s != NULL && off < len && nextoff != NULL) {		\
+		/* Step over a start byte; mid-sequence there is none */ \
+		if ((s[off] & 0xC0) != 0x80)				\
+			off++;						\
+		/* Then over every continuation byte that follows */	\
+		for (; off < len; off++) {				\
+			if ((s[off] & 0xC0) != 0x80)			\
+				break;					\
+		}							\
+		*nextoff = off;						\
+		error = PARSERUTILS_OK;					\
+	} else {							\
+		error = PARSERUTILS_BADPARM;				\
+	}								\
+} while(0)
+
+/**
+ * Skip to start of next sequence in UTF-8 input
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset to start at (an lvalue; modified in place)
+ * \param nextoff  Pointer to location to receive offset of the byte after
+ *                 the current sequence, or after its valid leading part
+ * \param error    Location to receive PARSERUTILS_OK, PARSERUTILS_BADPARM,
+ *                 or PARSERUTILS_NEEDDATA (sequence reaches \a len)
+ */
+#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error)			\
+do {									\
+	uint8_t lead_byte;						\
+									\
+	error = PARSERUTILS_OK;						\
+									\
+	if (s == NULL || off >= len || nextoff == NULL) {		\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	lead_byte = s[off];						\
+									\
+	if ((lead_byte & 0xC0) == 0x80) {				\
+		/* Mid-sequence: step over this continuation byte */	\
+		off++;							\
+	} else {							\
+		uint32_t n_cont = numContinuations[lead_byte];		\
+		uint32_t n_skip = 1;					\
+									\
+		/* Sequence and one further byte must lie below len */	\
+		if (off + n_cont + 1 >= len) {				\
+			error = PARSERUTILS_NEEDDATA;			\
+			break;						\
+		}							\
+									\
+		/* Count the start byte and each valid continuation */	\
+		while (n_skip <= n_cont &&				\
+				(s[off + n_skip] & 0xC0) == 0x80)	\
+			n_skip++;					\
+									\
+		off += n_skip;						\
+	}								\
+									\
+	*nextoff = off;							\
+} while(0)
+
+#endif
author	John Mark Bell <jmb@netsurf-browser.org>	2008-05-01 16:34:46 +0000
committer	John Mark Bell <jmb@netsurf-browser.org>	2008-05-01 16:34:46 +0000
commit	2777a04ed2ba4fd36138b991d66a32a283361f7e (patch)
tree	b0c3730533c36ca41402b6d0c5b98413f0a57bee /src/charset
download	libparserutils-2777a04ed2ba4fd36138b991d66a32a283361f7e.tar.gz