diff options
author | John Mark Bell <jmb@netsurf-browser.org> | 2008-05-01 16:34:46 +0000 |
---|---|---|
committer | John Mark Bell <jmb@netsurf-browser.org> | 2008-05-01 16:34:46 +0000 |
commit | 2777a04ed2ba4fd36138b991d66a32a283361f7e (patch) | |
tree | b0c3730533c36ca41402b6d0c5b98413f0a57bee /src/charset | |
download | libparserutils-2777a04ed2ba4fd36138b991d66a32a283361f7e.tar.gz |
Import parser construction utility library
svn path=/trunk/libparserutils/; revision=4111
Diffstat (limited to 'src/charset')
-rw-r--r-- | src/charset/Makefile | 49 | ||||
-rw-r--r-- | src/charset/aliases.c | 410 | ||||
-rw-r--r-- | src/charset/aliases.h | 36 | ||||
-rw-r--r-- | src/charset/charset.c | 54 | ||||
-rw-r--r-- | src/charset/charset.h | 24 | ||||
-rw-r--r-- | src/charset/codec.c | 185 | ||||
-rw-r--r-- | src/charset/codecs/Makefile | 46 | ||||
-rw-r--r-- | src/charset/codecs/codec_iconv.c | 683 | ||||
-rw-r--r-- | src/charset/codecs/codec_impl.h | 48 | ||||
-rw-r--r-- | src/charset/codecs/codec_utf16.c | 544 | ||||
-rw-r--r-- | src/charset/codecs/codec_utf8.c | 546 | ||||
-rw-r--r-- | src/charset/encodings/Makefile | 46 | ||||
-rw-r--r-- | src/charset/encodings/utf16.c | 239 | ||||
-rw-r--r-- | src/charset/encodings/utf8.c | 175 | ||||
-rw-r--r-- | src/charset/encodings/utf8impl.h | 339 |
15 files changed, 3424 insertions, 0 deletions
diff --git a/src/charset/Makefile b/src/charset/Makefile new file mode 100644 index 0000000..fc34d7c --- /dev/null +++ b/src/charset/Makefile @@ -0,0 +1,49 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Manipulate include paths +override CFLAGS := $(CFLAGS) -I$(d) + +# Sources +SRCS_$(d) := aliases.c charset.c codec.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/charset/aliases.c b/src/charset/aliases.c new file mode 100644 index 0000000..1e7e6ea --- /dev/null +++ b/src/charset/aliases.c @@ -0,0 +1,410 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <ctype.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "charset/aliases.h" +#include "utils/utils.h" + +struct alias { + struct alias *next; + parserutils_charset_aliases_canon *canon; + uint16_t name_len; + char name[1]; +}; + +#define HASH_SIZE (43) +static parserutils_charset_aliases_canon *canon_tab[HASH_SIZE]; +static struct alias *alias_tab[HASH_SIZE]; + +static parserutils_error parserutils_charset_create_alias(const char *alias, + parserutils_charset_aliases_canon *c, + parserutils_alloc alloc, void *pw); +static parserutils_charset_aliases_canon *parserutils_charset_create_canon( + const char *canon, uint16_t mibenum, + parserutils_alloc alloc, void *pw); +static uint32_t parserutils_charset_hash_val(const char *alias, size_t len); + +/** + * Create alias data from Aliases file + * + * \param filename The path to the Aliases file + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return PARSERUTILS_OK on success, appropriate error otherwise. + */ +parserutils_error parserutils_charset_aliases_create(const char *filename, + parserutils_alloc alloc, void *pw) +{ + char buf[300]; + FILE *fp; + + if (filename == NULL || alloc == NULL) + return PARSERUTILS_BADPARM; + + fp = fopen(filename, "r"); + if (fp == NULL) + return PARSERUTILS_FILENOTFOUND; + + while (fgets(buf, sizeof buf, fp)) { + char *p, *aliases = 0, *mib, *end; + parserutils_charset_aliases_canon *cf; + + if (buf[0] == 0 || buf[0] == '#') + /* skip blank lines or comments */ + continue; + + buf[strlen(buf) - 1] = 0; /* lose terminating newline */ + end = buf + strlen(buf); + + /* find end of canonical form */ + for (p = buf; *p && !isspace(*p) && !iscntrl(*p); p++) + ; /* do nothing */ + if (p >= end) + continue; + *p++ = '\0'; /* terminate canonical form */ + + /* skip whitespace */ + for (; *p && isspace(*p); p++) + ; /* do nothing */ + if (p >= end) + continue; + mib = p; + + /* find end of mibenum */ + for (; *p && !isspace(*p) && !iscntrl(*p); p++) + ; /* do nothing */ + if (p < end) + *p++ = '\0'; /* terminate mibenum */ + + cf = parserutils_charset_create_canon(buf, atoi(mib), alloc, pw); + if (cf == NULL) + continue; + + /* skip whitespace */ + for (; p < end && *p && isspace(*p); p++) + ; /* do nothing */ + if (p >= end) + continue; + aliases = p; + + while (p < end) { + /* find end of alias */ + for (; *p && !isspace(*p) && !iscntrl(*p); p++) + ; /* do nothing */ + if (p > end) + /* stop if we've gone past the end */ + break; + /* terminate current alias */ + *p++ = '\0'; + + if (parserutils_charset_create_alias(aliases, cf, + alloc, pw) != PARSERUTILS_OK) + break; + + /* in terminating, we may have advanced + * past the end - check this here */ + if (p >= end) + break; + + /* skip whitespace */ + for (; *p && isspace(*p); p++) + ; /* do nothing */ + + if (p >= end) + /* gone past end => stop */ + break; + + /* update pointer to current alias */ + aliases = p; + } + } + + fclose(fp); + + return PARSERUTILS_OK; +} + +/** + * Free all alias data + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + */ +void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw) +{ + parserutils_charset_aliases_canon *c, *d; + struct alias *a, *b; + int i; + + for (i = 0; i != HASH_SIZE; i++) { + for (c = canon_tab[i]; c; c = d) { + d = c->next; + alloc(c, 0, pw); + } + canon_tab[i] = NULL; + + for (a = alias_tab[i]; a; a = b) { + b = a->next; + alloc(a, 0, pw); + } + alias_tab[i] = NULL; + } +} + +/** + * Retrieve the MIB enum value assigned to an encoding name + * + * \param alias The alias to lookup + * \param len The length of the alias string + * \return The MIB enum value, or 0 if not found + */ +uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len) +{ + parserutils_charset_aliases_canon *c; + + if (alias == NULL) + return 0; + + c = parserutils_charset_alias_canonicalise(alias, len); + if (c == NULL) + return 0; + + return c->mib_enum; +} + +/** + * Retrieve the canonical name of an encoding from the MIB enum + * + * \param mibenum The MIB enum value + * \return Pointer to canonical name, or NULL if not found + */ +const char *parserutils_charset_mibenum_to_name(uint16_t mibenum) +{ + int i; + parserutils_charset_aliases_canon *c; + + for (i = 0; i != HASH_SIZE; i++) + for (c = canon_tab[i]; c; c = c->next) + if (c->mib_enum == mibenum) + return c->name; + + return NULL; +} + +/** + * Detect if a parserutils_charset is Unicode + * + * \param mibenum The MIB enum to consider + * \return true if a Unicode variant, false otherwise + */ +bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum) +{ + static uint16_t ucs4; + static uint16_t ucs2; + static uint16_t utf8; + static uint16_t utf16; + static uint16_t utf16be; + static uint16_t utf16le; + static uint16_t utf32; + static uint16_t utf32be; + static uint16_t utf32le; + + if (ucs4 == 0) { + ucs4 = parserutils_charset_mibenum_from_name("UCS-4", + SLEN("UCS-4")); + ucs2 = parserutils_charset_mibenum_from_name("UCS-2", + SLEN("UCS-2")); + utf8 = parserutils_charset_mibenum_from_name("UTF-8", + SLEN("UTF-8")); + utf16 = parserutils_charset_mibenum_from_name("UTF-16", + SLEN("UTF-16")); + utf16be = parserutils_charset_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + utf16le = parserutils_charset_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + utf32 = parserutils_charset_mibenum_from_name("UTF-32", + SLEN("UTF-32")); + utf32be = parserutils_charset_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + utf32le = parserutils_charset_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } + + return (mibenum == ucs4 || mibenum == ucs2 || mibenum == utf8 || + mibenum == utf16 || mibenum == utf16be || + mibenum == utf16le || mibenum == utf32 || + mibenum == utf32be || mibenum == utf32le); +} + +/** + * Retrieve the canonical form of an alias name + * + * \param alias The alias name + * \param len The length of the alias name + * \return Pointer to canonical form or NULL if not found + */ +parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise( + const char *alias, size_t len) +{ + uint32_t hash; + parserutils_charset_aliases_canon *c; + struct alias *a; + + if (alias == NULL) + return NULL; + + hash = parserutils_charset_hash_val(alias, len); + + for (c = canon_tab[hash]; c; c = c->next) + if (c->name_len == len && + strncasecmp(c->name, alias, len) == 0) + break; + if (c) + return c; + + for (a = alias_tab[hash]; a; a = a->next) + if (a->name_len == len && + strncasecmp(a->name, alias, len) == 0) + break; + if (a) + return a->canon; + + return NULL; +} + + +/** + * Create an alias + * + * \param alias The alias name + * \param c The canonical form + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_create_alias(const char *alias, + parserutils_charset_aliases_canon *c, + parserutils_alloc alloc, void *pw) +{ + struct alias *a; + uint32_t hash; + + if (alias == NULL || c == NULL || alloc == NULL) + return PARSERUTILS_BADPARM; + + a = alloc(NULL, sizeof(struct alias) + strlen(alias) + 1, pw); + if (a == NULL) + return PARSERUTILS_NOMEM; + + a->canon = c; + a->name_len = strlen(alias); + strcpy(a->name, alias); + a->name[a->name_len] = '\0'; + + hash = parserutils_charset_hash_val(alias, a->name_len); + + a->next = alias_tab[hash]; + alias_tab[hash] = a; + + return PARSERUTILS_OK; +} + +/** + * Create a canonical form + * + * \param canon The canonical name + * \param mibenum The MIB enum value + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to canonical form or NULL on error + */ +parserutils_charset_aliases_canon *parserutils_charset_create_canon( + const char *canon, uint16_t mibenum, + parserutils_alloc alloc, void *pw) +{ + parserutils_charset_aliases_canon *c; + uint32_t hash, len; + + if (canon == NULL || alloc == NULL) + return NULL; + + len = strlen(canon); + + c = alloc(NULL, sizeof(parserutils_charset_aliases_canon) + len + 1, pw); + if (c == NULL) + return NULL; + + c->mib_enum = mibenum; + c->name_len = len; + strcpy(c->name, canon); + c->name[len] = '\0'; + + hash = parserutils_charset_hash_val(canon, len); + + c->next = canon_tab[hash]; + canon_tab[hash] = c; + + return c; +} + +/** + * Hash function + * + * \param alias String to hash + * \return The hashed value + */ +uint32_t parserutils_charset_hash_val(const char *alias, size_t len) +{ + const char *s = alias; + uint32_t h = 5381; + + if (alias == NULL) + return 0; + + while (len--) + h = (h * 33) ^ (*s++ & ~0x20); /* case insensitive */ + + return h % HASH_SIZE; +} + + +#ifndef NDEBUG +/** + * Dump all alias data to stdout + */ +void parserutils_charset_aliases_dump(void) +{ + parserutils_charset_aliases_canon *c; + struct alias *a; + int i; + size_t size = 0; + + for (i = 0; i != HASH_SIZE; i++) { + for (c = canon_tab[i]; c; c = c->next) { + printf("%d %s\n", i, c->name); + size += offsetof(parserutils_charset_aliases_canon, + name) + c->name_len; + } + + for (a = alias_tab[i]; a; a = a->next) { + printf("%d %s\n", i, a->name); + size += offsetof(struct alias, name) + a->name_len; + } + } + + size += (sizeof(canon_tab) / sizeof(canon_tab[0])); + size += (sizeof(alias_tab) / sizeof(alias_tab[0])); + + printf("%u\n", (unsigned int) size); +} +#endif diff --git a/src/charset/aliases.h b/src/charset/aliases.h new file mode 100644 index 0000000..9abd2c8 --- /dev/null +++ b/src/charset/aliases.h @@ -0,0 +1,36 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef parserutils_charset_aliases_h_ +#define parserutils_charset_aliases_h_ + +#include <inttypes.h> + +#include <parserutils/charset/mibenum.h> + +typedef struct parserutils_charset_aliases_canon { + struct parserutils_charset_aliases_canon *next; + uint16_t mib_enum; + uint16_t name_len; + char name[1]; +} parserutils_charset_aliases_canon; + +/* Load encoding aliases from file */ +parserutils_error parserutils_charset_aliases_create(const char *filename, + parserutils_alloc alloc, void *pw); +/* Destroy encoding aliases */ +void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw); + +/* Canonicalise an alias name */ +parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise( + const char *alias, size_t len); + +#ifndef NDEBUG +void parserutils_charset_aliases_dump(void); +#endif + +#endif diff --git a/src/charset/charset.c b/src/charset/charset.c new file mode 100644 index 0000000..3ef1a71 --- /dev/null +++ b/src/charset/charset.c @@ -0,0 +1,54 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include "charset/aliases.h" +#include "charset/charset.h" + +/** + * Initialise the Charset library for use. + * + * This _must_ be called before using any libparserutils charset functions + * + * \param aliases_file Pointer to name of file containing encoding alias data + * \param alloc Pointer to (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return PARSERUTILS_OK on success, applicable error otherwise. + */ +parserutils_error parserutils_charset_initialise(const char *aliases_file, + parserutils_alloc alloc, void *pw) +{ + parserutils_error error; + + if (aliases_file == NULL || alloc == NULL) + return PARSERUTILS_BADPARM; + + error = parserutils_charset_aliases_create(aliases_file, alloc, pw); + if (error != PARSERUTILS_OK) + return error; + + return PARSERUTILS_OK; +} + +/** + * Clean up after Libparserutils + * + * \param alloc Pointer to (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return PARSERUTILS_OK on success, applicable error otherwise. + */ +parserutils_error parserutils_charset_finalise(parserutils_alloc alloc, + void *pw) +{ + if (alloc == NULL) + return PARSERUTILS_BADPARM; + + parserutils_charset_aliases_destroy(alloc, pw); + + return PARSERUTILS_OK; +} + + diff --git a/src/charset/charset.h b/src/charset/charset.h new file mode 100644 index 0000000..4b07577 --- /dev/null +++ b/src/charset/charset.h @@ -0,0 +1,24 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef parserutils_charset_charset_h_ +#define parserutils_charset_charset_h_ + +#include <parserutils/errors.h> +#include <parserutils/functypes.h> +#include <parserutils/types.h> + +/* Initialise the Charset library for use */ +parserutils_error parserutils_charset_initialise(const char *aliases_file, + parserutils_alloc alloc, void *pw); + +/* Clean up after Charset */ +parserutils_error parserutils_charset_finalise(parserutils_alloc alloc, + void *pw); + +#endif + diff --git a/src/charset/codec.c b/src/charset/codec.c new file mode 100644 index 0000000..5c3fb3a --- /dev/null +++ b/src/charset/codec.c @@ -0,0 +1,185 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <string.h> + +#include "charset/aliases.h" +#include "charset/codecs/codec_impl.h" + +#ifdef WITH_ICONV_CODEC +extern parserutils_charset_handler iconv_codec_handler; +#endif + +extern parserutils_charset_handler charset_utf8_codec_handler; +extern parserutils_charset_handler charset_utf16_codec_handler; + +static parserutils_charset_handler *handler_table[] = { + &charset_utf8_codec_handler, + &charset_utf16_codec_handler, +#ifdef WITH_ICONV_CODEC + &iconv_codec_handler, +#endif + NULL, +}; + +/** + * Create a charset codec + * + * \param charset Target charset + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec instance, or NULL on failure + */ +parserutils_charset_codec *parserutils_charset_codec_create(const char *charset, + parserutils_alloc alloc, void *pw) +{ + parserutils_charset_codec *codec; + parserutils_charset_handler **handler; + const parserutils_charset_aliases_canon * canon; + + if (charset == NULL || alloc == NULL) + return NULL; + + /* Canonicalise parserutils_charset name. */ + canon = parserutils_charset_alias_canonicalise(charset, + strlen(charset)); + if (canon == NULL) + return NULL; + + /* Search for handler class */ + for (handler = handler_table; *handler != NULL; handler++) { + if ((*handler)->handles_charset(canon->name)) + break; + } + + /* None found */ + if ((*handler) == NULL) + return NULL; + + /* Instantiate class */ + codec = (*handler)->create(canon->name, alloc, pw); + if (codec == NULL) + return NULL; + + /* and initialise it */ + codec->mibenum = canon->mib_enum; + + codec->errormode = PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE; + + codec->alloc = alloc; + codec->alloc_pw = pw; + + return codec; +} + +/** + * Destroy a charset codec + * + * \param codec The codec to destroy + */ +void parserutils_charset_codec_destroy(parserutils_charset_codec *codec) +{ + if (codec == NULL) + return; + + codec->handler.destroy(codec); + + codec->alloc(codec, 0, codec->alloc_pw); +} + +/** + * Configure a charset codec + * + * \param codec The codec to configure + * \parem type The codec option type to configure + * \param params Option-specific parameters + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_codec_setopt( + parserutils_charset_codec *codec, + parserutils_charset_codec_opttype type, + parserutils_charset_codec_optparams *params) +{ + if (codec == NULL || params == NULL) + return PARSERUTILS_BADPARM; + + switch (type) { + case PARSERUTILS_CHARSET_CODEC_ERROR_MODE: + codec->errormode = params->error_mode.mode; + break; + } + + return PARSERUTILS_OK; +} + +/** + * Encode a chunk of UCS4 data into a codec's charset + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, appropriate error otherwise. + * + * source, sourcelen, dest and destlen will be updated appropriately on exit + */ +parserutils_error parserutils_charset_codec_encode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + if (codec == NULL || source == NULL || *source == NULL || + sourcelen == NULL || dest == NULL || *dest == NULL || + destlen == NULL) + return PARSERUTILS_BADPARM; + + return codec->handler.encode(codec, source, sourcelen, dest, destlen); +} + +/** + * Decode a chunk of data in a codec's charset into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, appropriate error otherwise. + * + * source, sourcelen, dest and destlen will be updated appropriately on exit + * + * Call this with a source length of 0 to flush any buffers. + */ +parserutils_error parserutils_charset_codec_decode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + if (codec == NULL || source == NULL || *source == NULL || + sourcelen == NULL || dest == NULL || *dest == NULL || + destlen == NULL) + return PARSERUTILS_BADPARM; + + return codec->handler.decode(codec, source, sourcelen, dest, destlen); +} + +/** + * Clear a charset codec's encoding state + * + * \param codec The codec to reset + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_codec_reset( + parserutils_charset_codec *codec) +{ + if (codec == NULL) + return PARSERUTILS_BADPARM; + + return codec->handler.reset(codec); +} + diff --git a/src/charset/codecs/Makefile b/src/charset/codecs/Makefile new file mode 100644 index 0000000..6d3b78e --- /dev/null +++ b/src/charset/codecs/Makefile @@ -0,0 +1,46 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Sources +SRCS_$(d) := codec_iconv.c codec_utf8.c codec_utf16.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/charset/codecs/codec_iconv.c b/src/charset/codecs/codec_iconv.c new file mode 100644 index 0000000..bbe8bc4 --- /dev/null +++ b/src/charset/codecs/codec_iconv.c @@ -0,0 +1,683 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +/* This codec is hideously slow. Only use it as a last resort */ + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +/* We put this here rather than at the top as GCC complains + * about the source file being empty otherwise. */ +#ifdef WITH_ICONV_CODEC + +#include <iconv.h> + +/* These two are for htonl / ntohl */ +#include <arpa/inet.h> +#include <netinet/in.h> + +#include <parserutils/charset/mibenum.h> + +#include "charset/codecs/codec_impl.h" +#include "utils/utils.h" + +/** + * Iconv-based charset codec + */ +typedef struct iconv_codec { + parserutils_charset_codec base; /**< Base class */ + + iconv_t read_cd; /**< Iconv handle for reading */ +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /**< Number of bytes in inval_buf */ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + */ + size_t read_len; /**< Number of characters in + * read_buf */ + + iconv_t write_cd; /**< Iconv handle for writing */ +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + */ + size_t write_len; /**< Number of characters in + * write_buf */ +} iconv_codec; + + +static bool iconv_codec_handles_charset(const char *charset); +static parserutils_charset_codec *iconv_codec_create(const char *charset, + parserutils_alloc alloc, void *pw); +static void iconv_codec_destroy (parserutils_charset_codec *codec); +static parserutils_error iconv_codec_encode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error iconv_codec_decode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error iconv_codec_reset(parserutils_charset_codec *codec); +static parserutils_error iconv_codec_output_decoded_char( + iconv_codec *c, uint32_t ucs4, uint8_t **dest, + size_t *destlen); +static parserutils_error iconv_codec_read_char(iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error iconv_codec_write_char(iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this codec handles a specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool iconv_codec_handles_charset(const char *charset) +{ + iconv_t cd; + bool ret; + + cd = iconv_open("UCS-4", charset); + + ret = (cd != (iconv_t) -1); + + if (ret) + iconv_close(cd); + + return ret; +} + +/** + * Create an iconv-based codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +parserutils_charset_codec *iconv_codec_create(const char *charset, + parserutils_alloc alloc, void *pw) +{ + iconv_codec *codec; + + codec = alloc(NULL, sizeof(iconv_codec), pw); + if (codec == NULL) + return NULL; + + codec->read_cd = iconv_open("UCS-4", charset); + if (codec->read_cd == (iconv_t) -1) { + alloc(codec, 0, pw); + return NULL; + } + + codec->write_cd = iconv_open(charset, "UCS-4"); + if (codec->write_cd == (iconv_t) -1) { + iconv_close(codec->read_cd); + alloc(codec, 0, pw); + return NULL; + } + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = iconv_codec_destroy; + codec->base.handler.encode = iconv_codec_encode; + codec->base.handler.decode = iconv_codec_decode; + codec->base.handler.reset = iconv_codec_reset; + + return (parserutils_charset_codec *) codec; +} + +/** + * Destroy an iconv-based codec + * + * \param codec The codec to destroy + */ +void iconv_codec_destroy (parserutils_charset_codec *codec) +{ + iconv_codec *c = (iconv_codec *) codec; + + iconv_close(c->read_cd); + iconv_close(c->write_cd); + + return; +} + +/** + * Encode a chunk of UCS4 data into an iconv-based codec's charset + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error iconv_codec_encode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + iconv_codec *c = (iconv_codec *) codec; + uint32_t ucs4; + const uint32_t *towrite; + size_t towritelen; + parserutils_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + + while (c->write_len > 0) { + error = iconv_codec_write_char(c, pwrite[0], + dest, destlen); + if (error != PARSERUTILS_OK) { + /* Copy outstanding chars down, skipping + * invalid one, if present, so as to avoid + * reprocessing the invalid character */ + if (error == PARSERUTILS_INVALID) { + for (ucs4 = 1; ucs4 < c->write_len; + ucs4++) { + c->write_buf[ucs4] = + pwrite[ucs4]; + } + } + + return error; + } + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + towrite = (const uint32_t *) (const void *) *source; + towritelen = 1; + ucs4 = *towrite; + + /* Output current character(s) */ + while (towritelen > 0) { + error = iconv_codec_write_char(c, towrite[0], + dest, destlen); + + if (error != PARSERUTILS_OK) { + ucs4 = (error == PARSERUTILS_INVALID) ? 1 : 0; + + if (towritelen - ucs4 >= WRITE_BUFSIZE) + abort(); + + c->write_len = towritelen - ucs4; + + /* Copy pending chars to save area, for + * processing next call; skipping invalid + * character, if present, so it's not + * reprocessed. */ + for (; ucs4 < towritelen; ucs4++) { + c->write_buf[ucs4] = towrite[ucs4]; + } + + /* Claim character we've just buffered, + * so it's not repreocessed */ + *source += 4; + *sourcelen -= 4; + + return error; + } + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return PARSERUTILS_OK; +} + +/** + * Decode a chunk of data in an iconv-based codec's charset into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. + */ +parserutils_error iconv_codec_decode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + iconv_codec *c = (iconv_codec *) codec; + parserutils_error error; + + if (c->read_len > 0) { + /* Output left over from last decode + * Attempt to finish this here */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = pread[0]; + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Run out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) { + c->read_buf[i] = pread[i]; + } + + return PARSERUTILS_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. */ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = iconv_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { + return error; + } + + + /* And now, fix everything up so the normal processing + * does the right thing. */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. No recovery strategy + * possible, so explode everywhere. */ + if ((orig_l + ol) - l == 0) + abort(); + + /* Handle memry exhaustion case from above */ + if (error != PARSERUTILS_OK) + return error; + } + + while (*sourcelen > 0) { + error = iconv_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != PARSERUTILS_OK) { + return error; + } + } + + return PARSERUTILS_OK; +} + +/** + * Clear an iconv-based codec's encoding state + * + * \param codec The codec to reset + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error iconv_codec_reset(parserutils_charset_codec *codec) +{ + iconv_codec *c = (iconv_codec *) codec; + + iconv(c->read_cd, NULL, NULL, NULL, NULL); + iconv(c->write_cd, NULL, NULL, NULL, NULL); + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return PARSERUTILS_OK; +} + +/** + * Output a UCS4 character + * + * \param c Codec to use + * \param ucs4 UCS4 character (big endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + */ +parserutils_error iconv_codec_output_decoded_char(iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (*destlen < 4) { + /* Run out of output buffer */ + + c->read_len = 1; + c->read_buf[0] = ucs4; + + return PARSERUTILS_NOMEM; + } + + *((uint32_t *) (void *) *dest) = ucs4; + *dest += 4; + *destlen -= 4; + + return PARSERUTILS_OK; +} + +/** + * Read a character from the codec's native charset to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error iconv_codec_read_char(iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + const uint8_t *origsrc = *source; + size_t origsrclen = *sourcelen; + uint32_t ucs4; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + parserutils_error error; + + /* Use iconv to convert a single character + * Side effect: Updates *source to point at next input + * character and *sourcelen to reflect reduced input length + */ + iconv_ret = iconv(c->read_cd, (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + + if (iconv_ret != (size_t) -1 || + (*source != origsrc && sucs4 == 0)) { + /* Read a character */ + error = iconv_codec_output_decoded_char(c, ucs4, dest, destlen); + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { + /* output failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (errno == E2BIG) { + /* Should never happen */ + abort(); + } else if (errno == EINVAL) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (const char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return PARSERUTILS_OK; + } else if (errno == EILSEQ) { + /* Illegal input sequence */ + bool found = false; + const uint8_t *oldsrc; + size_t oldsrclen; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { + /* restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + + return PARSERUTILS_INVALID; + } + + /* Ok, this becomes problematic. The iconv API here + * is particularly unhelpful; *source will point at + * the _start_ of the illegal sequence. This means + * that we must find the end of the sequence */ + + /* Search for the start of the next valid input + * sequence (or the end of the input stream) */ + while (*sourcelen > 1) { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + (*source)++; + (*sourcelen)--; + + oldsrc = *source; + oldsrclen = *sourcelen; + + iconv_ret = iconv(c->read_cd, + (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + if (iconv_ret != (size_t) -1 || errno != EILSEQ) { + found = true; + break; + } + } + + if (found) { + /* Found start of next valid sequence */ + *source = oldsrc; + *sourcelen = oldsrclen; + } else { + /* Not found - skip last byte in buffer */ + (*source)++; + (*sourcelen)--; + + if (*sourcelen != 0) + abort(); + } + + /* output U+FFFD and continue processing. */ + error = iconv_codec_output_decoded_char(c, + htonl(0xFFFD), dest, destlen); + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { + /* output failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + return error; + } + + return PARSERUTILS_OK; +} + +/** + * Write a UCS4 character in a codec's native charset + * + * \param c The codec + * \param ucs4 The UCS4 character to write (big endian) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if character cannot be represented and the + * codec's error handling mode is set to STRICT. + */ +parserutils_error iconv_codec_write_char(iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + uint8_t *origdest = *dest; + + iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4, + &sucs4, (char **) dest, destlen); + + if (iconv_ret == (size_t) -1 && errno == E2BIG) { + /* Output buffer is too small */ + return PARSERUTILS_NOMEM; + } else if (iconv_ret == (size_t) -1 && errno == EILSEQ) { + /* Illegal multibyte sequence */ + /* This should never happen */ + abort(); + } else if (iconv_ret == (size_t) -1 && errno == EINVAL) { + /* Incomplete input character */ + /* This should never happen */ + abort(); + } else if (*dest == origdest) { + /* Nothing was output */ + switch (c->base.errormode) { + case PARSERUTILS_CHARSET_CODEC_ERROR_STRICT: + return PARSERUTILS_INVALID; + + case PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT: + /** \todo transliteration */ + case PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE: + { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + ucs4 = parserutils_charset_mibenum_is_unicode( + c->base.mibenum) + ? htonl(0xFFFD) : htonl(0x3F); + + iconv_ret = iconv(c->write_cd, + (char **) (void *) &pucs4, &sucs4, + (char **) dest, destlen); + + if (iconv_ret == (size_t) -1 && errno == E2BIG) { + return PARSERUTILS_NOMEM; + } else if (iconv_ret == (size_t) -1 && + errno == EILSEQ) { + /* Illegal multibyte sequence */ + /* This should never happen */ + abort(); + } else if (iconv_ret == (size_t) -1 && + errno == EINVAL) { + /* Incomplete input character */ + /* This should never happen */ + abort(); + } + } + break; + } + } + + return PARSERUTILS_OK; +} + +const parserutils_charset_handler iconv_codec_handler = { + iconv_codec_handles_charset, + iconv_codec_create +}; + +#endif diff --git a/src/charset/codecs/codec_impl.h b/src/charset/codecs/codec_impl.h new file mode 100644 index 0000000..9183594 --- /dev/null +++ b/src/charset/codecs/codec_impl.h @@ -0,0 +1,48 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef parserutils_charset_codecs_codecimpl_h_ +#define parserutils_charset_codecs_codecimpl_h_ + +#include <stdbool.h> +#include <inttypes.h> + +#include <parserutils/charset/codec.h> + +/** + * Core charset codec definition; implementations extend this + */ +struct parserutils_charset_codec { + uint16_t mibenum; /**< MIB enum for charset */ + + parserutils_charset_codec_errormode errormode; /**< error mode */ + + parserutils_alloc alloc; /**< allocation function */ + void *alloc_pw; /**< private word */ + + struct { + void (*destroy)(parserutils_charset_codec *codec); + parserutils_error (*encode)(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + parserutils_error (*decode)(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + parserutils_error (*reset)(parserutils_charset_codec *codec); + } handler; /**< Vtable for handler code */ +}; + +/** + * Codec factory component definition + */ +typedef struct parserutils_charset_handler { + bool (*handles_charset)(const char *charset); + parserutils_charset_codec *(*create)(const char *charset, + parserutils_alloc alloc, void *pw); +} parserutils_charset_handler; + +#endif diff --git a/src/charset/codecs/codec_utf16.c b/src/charset/codecs/codec_utf16.c new file mode 100644 index 0000000..0dd7a07 --- /dev/null +++ b/src/charset/codecs/codec_utf16.c @@ -0,0 +1,544 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <stdlib.h> +#include <string.h> + +/* These two are for htonl / ntohl */ +#include <arpa/inet.h> +#include <netinet/in.h> + +#include <parserutils/charset/mibenum.h> +#include <parserutils/charset/utf16.h> + +#include "charset/codecs/codec_impl.h" +#include "utils/utils.h" + +/** + * UTF-16 charset codec + */ +typedef struct charset_utf16_codec { + parserutils_charset_codec base; /**< Base class */ + +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /*< Byte length of inval_buf **/ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + * (host-endian) */ + size_t read_len; /**< Character length of read_buf */ + +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + * (host-endian) */ + size_t write_len; /**< Character length of write_buf */ + +} charset_utf16_codec; + +static bool charset_utf16_codec_handles_charset(const char *charset); +static parserutils_charset_codec *charset_utf16_codec_create( + const char *charset, parserutils_alloc alloc, void *pw); +static void charset_utf16_codec_destroy (parserutils_charset_codec *codec); +static parserutils_error charset_utf16_codec_encode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error charset_utf16_codec_decode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error charset_utf16_codec_reset( + parserutils_charset_codec *codec); +static inline parserutils_error charset_utf16_codec_read_char( + charset_utf16_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static inline parserutils_error charset_utf16_codec_output_decoded_char( + charset_utf16_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this codec handles a specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool charset_utf16_codec_handles_charset(const char *charset) +{ + return parserutils_charset_mibenum_from_name(charset, strlen(charset)) + == + parserutils_charset_mibenum_from_name("UTF-16", SLEN("UTF-16")); +} + +/** + * Create a utf16 codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +parserutils_charset_codec *charset_utf16_codec_create(const char *charset, + parserutils_alloc alloc, void *pw) +{ + charset_utf16_codec *codec; + + UNUSED(charset); + + codec = alloc(NULL, sizeof(charset_utf16_codec), pw); + if (codec == NULL) + return NULL; + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = charset_utf16_codec_destroy; + codec->base.handler.encode = charset_utf16_codec_encode; + codec->base.handler.decode = charset_utf16_codec_decode; + codec->base.handler.reset = charset_utf16_codec_reset; + + return (parserutils_charset_codec *) codec; +} + +/** + * Destroy a utf16 codec + * + * \param codec The codec to destroy + */ +void charset_utf16_codec_destroy (parserutils_charset_codec *codec) +{ + UNUSED(codec); +} + +/** + * Encode a chunk of UCS4 data into utf16 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + charset_utf16_codec *c = (charset_utf16_codec *) codec; + uint32_t ucs4; + uint32_t *towrite; + size_t towritelen; + parserutils_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + uint8_t buf[4]; + size_t len; + + while (c->write_len > 0) { + error = parserutils_charset_utf16_from_ucs4( + pwrite[0], buf, &len); + if (error != PARSERUTILS_OK) + abort(); + + if (*destlen < len) { + /* Insufficient output buffer space */ + for (len = 0; len < c->write_len; len++) + c->write_buf[len] = pwrite[len]; + + return PARSERUTILS_NOMEM; + } + + memcpy(*dest, buf, len); + + *dest += len; + *destlen -= len; + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + ucs4 = ntohl(*((uint32_t *) (void *) *source)); + towrite = &ucs4; + towritelen = 1; + + /* Output current characters */ + while (towritelen > 0) { + uint8_t buf[4]; + size_t len; + + error = parserutils_charset_utf16_from_ucs4( + towrite[0], buf, &len); + if (error != PARSERUTILS_OK) + abort(); + + if (*destlen < len) { + /* Insufficient output space */ + if (towritelen >= WRITE_BUFSIZE) + abort(); + + c->write_len = towritelen; + + /* Copy pending chars to save area, for + * processing next call. */ + for (len = 0; len < towritelen; len++) + c->write_buf[len] = towrite[len]; + + /* Claim character we've just buffered, + * so it's not reprocessed */ + *source += 4; + *sourcelen -= 4; + + return PARSERUTILS_NOMEM; + } + + memcpy(*dest, buf, len); + + *dest += len; + *destlen -= len; + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return PARSERUTILS_OK; +} + +/** + * Decode a chunk of utf16 data into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. + */ +parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + charset_utf16_codec *c = (charset_utf16_codec *) codec; + parserutils_error error; + + if (c->read_len > 0) { + /* Output left over from last decode */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = htonl(pread[0]); + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Ran out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) + c->read_buf[i] = pread[i]; + + return PARSERUTILS_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. */ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = charset_utf16_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { + return error; + } + + /* And now, fix up source pointers */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. No recovery strategy + * possible, so explode everywhere. */ + if ((orig_l + ol) - l == 0) + abort(); + + /* Report memory exhaustion case from above */ + if (error != PARSERUTILS_OK) + return error; + } + + /* Finally, the "normal" case; process all outstanding characters */ + while (*sourcelen > 0) { + error = charset_utf16_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != PARSERUTILS_OK) { + return error; + } + } + + return PARSERUTILS_OK; +} + +/** + * Clear a utf16 codec's encoding state + * + * \param codec The codec to reset + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec) +{ + charset_utf16_codec *c = (charset_utf16_codec *) codec; + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return PARSERUTILS_OK; +} + + +/** + * Read a character from the UTF-16 to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + uint32_t ucs4; + size_t sucs4; + parserutils_error error; + + /* Convert a single character */ + error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen, + &ucs4, &sucs4); + if (error == PARSERUTILS_OK) { + /* Read a character */ + error = charset_utf16_codec_output_decoded_char(c, + ucs4, dest, destlen); + if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { + /* output succeeded; update source pointers */ + *source += sucs4; + *sourcelen -= sucs4; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (error == PARSERUTILS_NEEDDATA) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return PARSERUTILS_OK; + } else if (error == PARSERUTILS_INVALID) { + /* Illegal input sequence */ + uint32_t nextchar; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { + return PARSERUTILS_INVALID; + } + + /* Find next valid UTF-16 sequence. + * We're processing client-provided data, so let's + * be paranoid about its validity. */ + error = parserutils_charset_utf16_next_paranoid( + *source, *sourcelen, 0, &nextchar); + if (error != PARSERUTILS_OK) { + if (error == PARSERUTILS_NEEDDATA) { + /* Need more data to be sure */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, + *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + nextchar = 0; + } else { + return error; + } + } + + /* output U+FFFD and continue processing. */ + error = charset_utf16_codec_output_decoded_char(c, + 0xFFFD, dest, destlen); + if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { + /* output succeeded; update source pointers */ + *source += nextchar; + *sourcelen -= nextchar; + } + + return error; + } + + return PARSERUTILS_OK; +} + +/** + * Output a UCS4 character + * + * \param c Codec to use + * \param ucs4 UCS4 character (host endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + */ +parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (*destlen < 4) { + /* Run out of output buffer */ + c->read_len = 1; + c->read_buf[0] = ucs4; + + return PARSERUTILS_NOMEM; + } + + *((uint32_t *) (void *) *dest) = htonl(ucs4); + *dest += 4; + *destlen -= 4; + + return PARSERUTILS_OK; +} + + +const parserutils_charset_handler charset_utf16_codec_handler = { + charset_utf16_codec_handles_charset, + charset_utf16_codec_create +}; diff --git a/src/charset/codecs/codec_utf8.c b/src/charset/codecs/codec_utf8.c new file mode 100644 index 0000000..838d051 --- /dev/null +++ b/src/charset/codecs/codec_utf8.c @@ -0,0 +1,546 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <stdlib.h> +#include <string.h> + +/* These two are for htonl / ntohl */ +#include <arpa/inet.h> +#include <netinet/in.h> + +#include <parserutils/charset/mibenum.h> + +#include "charset/codecs/codec_impl.h" +#include "charset/encodings/utf8impl.h" +#include "utils/utils.h" + +/** + * UTF-8 charset codec + */ +typedef struct charset_utf8_codec { + parserutils_charset_codec base; /**< Base class */ + +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /*< Byte length of inval_buf **/ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + * (host-endian) */ + size_t read_len; /**< Character length of read_buf */ + +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + * (host-endian) */ + size_t write_len; /**< Character length of write_buf */ + +} charset_utf8_codec; + +static bool charset_utf8_codec_handles_charset(const char *charset); +static parserutils_charset_codec *charset_utf8_codec_create(const char *charset, + parserutils_alloc alloc, void *pw); +static void charset_utf8_codec_destroy (parserutils_charset_codec *codec); +static parserutils_error charset_utf8_codec_encode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error charset_utf8_codec_decode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error charset_utf8_codec_reset( + parserutils_charset_codec *codec); +static inline parserutils_error charset_utf8_codec_read_char( + charset_utf8_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static inline parserutils_error charset_utf8_codec_output_decoded_char( + charset_utf8_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this codec handles a specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool charset_utf8_codec_handles_charset(const char *charset) +{ + return parserutils_charset_mibenum_from_name(charset, + strlen(charset)) == + parserutils_charset_mibenum_from_name("UTF-8", + SLEN("UTF-8")); +} + +/** + * Create a utf8 codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +parserutils_charset_codec *charset_utf8_codec_create(const char *charset, + parserutils_alloc alloc, void *pw) +{ + charset_utf8_codec *codec; + + UNUSED(charset); + + codec = alloc(NULL, sizeof(charset_utf8_codec), pw); + if (codec == NULL) + return NULL; + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = charset_utf8_codec_destroy; + codec->base.handler.encode = charset_utf8_codec_encode; + codec->base.handler.decode = charset_utf8_codec_decode; + codec->base.handler.reset = charset_utf8_codec_reset; + + return (parserutils_charset_codec *) codec; +} + +/** + * Destroy a utf8 codec + * + * \param codec The codec to destroy + */ +void charset_utf8_codec_destroy (parserutils_charset_codec *codec) +{ + UNUSED(codec); +} + +/** + * Encode a chunk of UCS4 data into utf8 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + charset_utf8_codec *c = (charset_utf8_codec *) codec; + uint32_t ucs4; + uint32_t *towrite; + size_t towritelen; + parserutils_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + + while (c->write_len > 0) { + UTF8_FROM_UCS4(pwrite[0], dest, destlen, error); + if (error != PARSERUTILS_OK) { + if (error != PARSERUTILS_NOMEM) + abort(); + + /* Insufficient output buffer space */ + for (uint32_t len = 0; + len < c->write_len; len++) { + c->write_buf[len] = pwrite[len]; + } + + return PARSERUTILS_NOMEM; + } + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + ucs4 = ntohl(*((uint32_t *) (void *) *source)); + towrite = &ucs4; + towritelen = 1; + + /* Output current characters */ + while (towritelen > 0) { + UTF8_FROM_UCS4(towrite[0], dest, destlen, error); + if (error != PARSERUTILS_OK) { + if (error != PARSERUTILS_NOMEM) + abort(); + + /* Insufficient output space */ + if (towritelen >= WRITE_BUFSIZE) + abort(); + + c->write_len = towritelen; + + /* Copy pending chars to save area, for + * processing next call. */ + for (uint32_t len = 0; len < towritelen; len++) + c->write_buf[len] = towrite[len]; + + /* Claim character we've just buffered, + * so it's not reprocessed */ + *source += 4; + *sourcelen -= 4; + + return PARSERUTILS_NOMEM; + } + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return PARSERUTILS_OK; +} + +/** + * Decode a chunk of utf8 data into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. + */ +parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + charset_utf8_codec *c = (charset_utf8_codec *) codec; + parserutils_error error; + + if (c->read_len > 0) { + /* Output left over from last decode */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = htonl(pread[0]); + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Ran out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) + c->read_buf[i] = pread[i]; + + return PARSERUTILS_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. */ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = charset_utf8_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { + return error; + } + + /* And now, fix up source pointers */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. No recovery strategy + * possible, so explode everywhere. */ + if ((orig_l + ol) - l == 0) + abort(); + + /* Report memory exhaustion case from above */ + if (error != PARSERUTILS_OK) + return error; + } + + /* Finally, the "normal" case; process all outstanding characters */ + while (*sourcelen > 0) { + error = charset_utf8_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != PARSERUTILS_OK) { + return error; + } + } + + return PARSERUTILS_OK; +} + +/** + * Clear a utf8 codec's encoding state + * + * \param codec The codec to reset + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec) +{ + charset_utf8_codec *c = (charset_utf8_codec *) codec; + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return PARSERUTILS_OK; +} + + +/** + * Read a character from the UTF-8 to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + uint32_t ucs4; + size_t sucs4; + parserutils_error error; + + /* Convert a single character */ + { + const uint8_t *src = *source; + size_t srclen = *sourcelen; + uint32_t *uptr = &ucs4; + size_t *usptr = &sucs4; + UTF8_TO_UCS4(src, srclen, uptr, usptr, error); + } + if (error == PARSERUTILS_OK) { + /* Read a character */ + error = charset_utf8_codec_output_decoded_char(c, + ucs4, dest, destlen); + if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { + /* output succeeded; update source pointers */ + *source += sucs4; + *sourcelen -= sucs4; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (error == PARSERUTILS_NEEDDATA) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return PARSERUTILS_OK; + } else if (error == PARSERUTILS_INVALID) { + /* Illegal input sequence */ + uint32_t nextchar; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return PARSERUTILS_INVALID; + } + + /* Find next valid UTF-8 sequence. + * We're processing client-provided data, so let's + * be paranoid about its validity. */ + { + const uint8_t *src = *source; + size_t srclen = *sourcelen; + uint32_t off = 0; + uint32_t *ncptr = &nextchar; + + UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error); + } + if (error != PARSERUTILS_OK) { + if (error == PARSERUTILS_NEEDDATA) { + /* Need more data to be sure */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, + *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + nextchar = 0; + } else { + return error; + } + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* output U+FFFD and continue processing. */ + error = charset_utf8_codec_output_decoded_char(c, + 0xFFFD, dest, destlen); + if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { + /* output succeeded; update source pointers */ + *source += nextchar; + *sourcelen -= nextchar; + } + + return error; + } + + return PARSERUTILS_OK; +} + +/** + * Output a UCS4 character + * + * \param c Codec to use + * \param ucs4 UCS4 character (host endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + */ +parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (*destlen < 4) { + /* Run out of output buffer */ + c->read_len = 1; + c->read_buf[0] = ucs4; + + return PARSERUTILS_NOMEM; + } + + *((uint32_t *) (void *) *dest) = htonl(ucs4); + *dest += 4; + *destlen -= 4; + + return PARSERUTILS_OK; +} + + +const parserutils_charset_handler charset_utf8_codec_handler = { + charset_utf8_codec_handles_charset, + charset_utf8_codec_create +}; + diff --git a/src/charset/encodings/Makefile b/src/charset/encodings/Makefile new file mode 100644 index 0000000..47d9210 --- /dev/null +++ b/src/charset/encodings/Makefile @@ -0,0 +1,46 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Sources +SRCS_$(d) := utf8.c utf16.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/charset/encodings/utf16.c b/src/charset/encodings/utf16.c new file mode 100644 index 0000000..95dc64f --- /dev/null +++ b/src/charset/encodings/utf16.c @@ -0,0 +1,239 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +/** \file + * UTF-16 manipulation functions (implementation). + */ + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +#include <parserutils/charset/utf16.h> + +/** + * Convert a UTF-16 sequence into a single UCS4 character + * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-16 sequence + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, + size_t len, uint32_t *ucs4, size_t *clen) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || ucs4 == NULL || clen == NULL) + return PARSERUTILS_BADPARM; + + if (len < 2) + return PARSERUTILS_NEEDDATA; + + if (*ss < 0xD800 || *ss > 0xDFFF) { + *ucs4 = *ss; + *clen = 2; + } else if (0xD800 <= *ss && *ss <= 0xBFFF) { + if (len < 4) + return PARSERUTILS_NEEDDATA; + + if (0xDC00 <= ss[1] && ss[1] <= 0xE000) { + *ucs4 = (((s[0] >> 6) & 0x1f) + 1) | + ((s[0] & 0x3f) | (s[1] & 0x3ff)); + *clen = 4; + } else { + return PARSERUTILS_INVALID; + } + } + + return PARSERUTILS_OK; +} + +/** + * Convert a single UCS4 character into a UTF-16 sequence + * + * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) + * \param s Pointer to 4 byte long output buffer + * \param len Pointer to location to receive length of multibyte sequence + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, + size_t *len) +{ + uint16_t *ss = (uint16_t *) (void *) s; + uint32_t l = 0; + + if (s == NULL || len == NULL) + return PARSERUTILS_BADPARM; + else if (ucs4 < 0x10000) { + *ss = (uint16_t) ucs4; + l = 2; + } else if (ucs4 < 0x110000) { + ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10); + ss[1] = 0xDC00 | (ucs4 & 0x3ff); + l = 4; + } else { + return PARSERUTILS_INVALID; + } + + *len = l; + + return PARSERUTILS_OK; +} + +/** + * Calculate the length (in characters) of a bounded UTF-16 string + * + * \param s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max, + size_t *len) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + const uint16_t *end = (const uint16_t *) (const void *) (s + max); + int l = 0; + + if (s == NULL || len == NULL) + return PARSERUTILS_BADPARM; + + while (ss < end) { + if (*ss < 0xD800 || 0xDFFF < *ss) + ss++; + else + ss += 2; + + l++; + } + + *len = l; + + return PARSERUTILS_OK; +} + +/** + * Calculate the length (in bytes) of a UTF-16 character + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s, + size_t *len) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || len == NULL) + return PARSERUTILS_BADPARM; + + if (*ss < 0xD800 || 0xDFFF < *ss) + *len = 2; + else + *len = 4; + + return PARSERUTILS_OK; +} + +/** + * Find previous legal UTF-16 char in string + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || prevoff == NULL) + return PARSERUTILS_BADPARM; + + if (off < 2) + *prevoff = 0; + else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF) + *prevoff = off - 2; + else + *prevoff = (off < 4) ? 0 : off - 4; + + return PARSERUTILS_OK; +} + +/** + * Find next legal UTF-16 char in string + * + * \param s The string (assumed valid) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || off >= len || nextoff == NULL) + return PARSERUTILS_BADPARM; + + if (len - off < 4) + *nextoff = len; + else if (ss[1] < 0xD800 || ss[1] > 0xDBFF) + *nextoff = off + 2; + else + *nextoff = (len - off < 6) ? len : off + 4; + + return PARSERUTILS_OK; +} + +/** + * Find next legal UTF-16 char in string + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || off >= len || nextoff == NULL) + return PARSERUTILS_BADPARM; + + while (1) { + if (len - off < 4) { + return PARSERUTILS_NEEDDATA; + } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) { + *nextoff = off + 2; + break; + } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) { + if (len - off < 6) + return PARSERUTILS_NEEDDATA; + + if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) { + *nextoff = off + 4; + break; + } else { + ss++; + off += 2; + } + } + } + + return PARSERUTILS_OK; +} + diff --git a/src/charset/encodings/utf8.c b/src/charset/encodings/utf8.c new file mode 100644 index 0000000..5b4ba95 --- /dev/null +++ b/src/charset/encodings/utf8.c @@ -0,0 +1,175 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +/** \file + * UTF-8 manipulation functions (implementation). + */ + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +#include <parserutils/charset/utf8.h> +#include "charset/encodings/utf8impl.h" + +/** Number of continuation bytes for a given start byte */ +const uint8_t numContinuations[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, +}; + +/** + * Convert a UTF-8 multibyte sequence into a single UCS4 character + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. + * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-8 sequence + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen) +{ + parserutils_error error; + + UTF8_TO_UCS4(s, len, ucs4, clen, error); + + return error; +} + +/** + * Convert a single UCS4 character into a UTF-8 multibyte sequence + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. + * + * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) + * \param s Pointer to pointer to output buffer, updated on exit + * \param len Pointer to length, in bytes, of output buffer, updated on exit + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, + uint8_t **s, size_t *len) +{ + parserutils_error error; + + UTF8_FROM_UCS4(ucs4, s, len, error); + + return error; +} + +/** + * Calculate the length (in characters) of a bounded UTF-8 string + * + * \param s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max, + size_t *len) +{ + parserutils_error error; + + UTF8_LENGTH(s, max, len, error); + + return error; +} + +/** + * Calculate the length (in bytes) of a UTF-8 character + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s, + size_t *len) +{ + parserutils_error error; + + UTF8_CHAR_BYTE_LENGTH(s, len, error); + + return error; +} + +/** + * Find previous legal UTF-8 char in string + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff) +{ + parserutils_error error; + + UTF8_PREV(s, off, prevoff, error); + + return error; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed valid) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + parserutils_error error; + + UTF8_NEXT(s, len, off, nextoff, error); + + return error; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff) +{ + parserutils_error error; + + UTF8_NEXT_PARANOID(s, len, off, nextoff, error); + + return error; +} + diff --git a/src/charset/encodings/utf8impl.h b/src/charset/encodings/utf8impl.h new file mode 100644 index 0000000..1ca9de7 --- /dev/null +++ b/src/charset/encodings/utf8impl.h @@ -0,0 +1,339 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef parserutils_charset_encodings_utf8impl_h_ +#define parserutils_charset_encodings_utf8impl_h_ + +/** \file + * UTF-8 manipulation macros (implementation). + */ + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +/** Number of continuation bytes for a given start byte */ +extern const uint8_t numContinuations[256]; + +/** + * Convert a UTF-8 multibyte sequence into a single UCS4 character + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This macro conforms to RFC2279, however. + * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-8 sequence + * \param error Location to receive error code + */ +#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \ +do { \ + uint32_t c, min; \ + uint8_t n; \ + \ + error = PARSERUTILS_OK; \ + \ + if (s == NULL || ucs4 == NULL || clen == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + if (len == 0) { \ + error = PARSERUTILS_NEEDDATA; \ + break; \ + } \ + \ + c = s[0]; \ + \ + if (c < 0x80) { \ + n = 1; \ + min = 0; \ + } else if ((c & 0xE0) == 0xC0) { \ + c &= 0x1F; \ + n = 2; \ + min = 0x80; \ + } else if ((c & 0xF0) == 0xE0) { \ + c &= 0x0F; \ + n = 3; \ + min = 0x800; \ + } else if ((c & 0xF8) == 0xF0) { \ + c &= 0x07; \ + n = 4; \ + min = 0x10000; \ + } else if ((c & 0xFC) == 0xF8) { \ + c &= 0x03; \ + n = 5; \ + min = 0x200000; \ + } else if ((c & 0xFE) == 0xFC) { \ + c &= 0x01; \ + n = 6; \ + min = 0x4000000; \ + } else { \ + error = PARSERUTILS_INVALID; \ + break; \ + } \ + \ + if (len < n) { \ + error = PARSERUTILS_NEEDDATA; \ + break; \ + } \ + \ + for (uint8_t i = 1; i < n; i++) { \ + uint32_t t = s[i]; \ + \ + if ((t & 0xC0) != 0x80) { \ + error = PARSERUTILS_INVALID; \ + break; \ + } \ + \ + c <<= 6; \ + c |= t & 0x3F; \ + } \ + \ + if (error == PARSERUTILS_OK) { \ + /* Detect overlong sequences, surrogates and fffe/ffff */ \ + if (c < min || (c >= 0xD800 && c <= 0xDFFF) || \ + c == 0xFFFE || c == 0xFFFF) { \ + error = PARSERUTILS_INVALID; \ + break; \ + } \ + \ + *ucs4 = c; \ + *clen = n; \ + } \ +} while(0) + +/** + * Convert a single UCS4 character into a UTF-8 multibyte sequence + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This macro conforms to RFC2279, however. + * + * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) + * \param s Pointer to pointer to output buffer, updated on exit + * \param len Pointer to length, in bytes, of output buffer, updated on exit + * \param error Location to receive error code + */ +#define UTF8_FROM_UCS4(ucs4, s, len, error) \ +do { \ + uint8_t *buf; \ + uint8_t l = 0; \ + \ + error = PARSERUTILS_OK; \ + \ + if (s == NULL || *s == NULL || len == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + if (ucs4 < 0x80) { \ + l = 1; \ + } else if (ucs4 < 0x800) { \ + l = 2; \ + } else if (ucs4 < 0x10000) { \ + l = 3; \ + } else if (ucs4 < 0x200000) { \ + l = 4; \ + } else if (ucs4 < 0x4000000) { \ + l = 5; \ + } else if (ucs4 <= 0x7FFFFFFF) { \ + l = 6; \ + } else { \ + error = PARSERUTILS_INVALID; \ + break; \ + } \ + \ + if (l > *len) { \ + error = PARSERUTILS_NOMEM; \ + break; \ + } \ + \ + buf = *s; \ + \ + if (l == 1) { \ + buf[0] = (uint8_t) ucs4; \ + } else { \ + for (uint8_t i = l; i > 1; i--) { \ + buf[i - 1] = 0x80 | (ucs4 & 0x3F); \ + ucs4 >>= 6; \ + } \ + buf[0] = ~((1 << (8 - l)) - 1) | ucs4; \ + } \ + \ + *s += l; \ + *len -= l; \ +} while(0) + +/** + * Calculate the length (in characters) of a bounded UTF-8 string + * + * \param s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \param error Location to receive error code + */ +#define UTF8_LENGTH(s, max, len, error) \ +do { \ + const uint8_t *end = s + max; \ + int l = 0; \ + \ + error = PARSERUTILS_OK; \ + \ + if (s == NULL || len == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + while (s < end) { \ + uint32_t c = s[0]; \ + \ + if ((c & 0x80) == 0x00) \ + s += 1; \ + else if ((c & 0xE0) == 0xC0) \ + s += 2; \ + else if ((c & 0xF0) == 0xE0) \ + s += 3; \ + else if ((c & 0xF8) == 0xF0) \ + s += 4; \ + else if ((c & 0xFC) == 0xF8) \ + s += 5; \ + else if ((c & 0xFE) == 0xFC) \ + s += 6; \ + else { \ + error = PARSERUTILS_INVALID; \ + break; \ + } \ + \ + l++; \ + } \ + \ + if (error == PARSERUTILS_OK) \ + *len = l; \ +} while(0) + +/** + * Calculate the length (in bytes) of a UTF-8 character + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \param error Location to receive error code + */ +#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \ +do { \ + if (s == NULL || len == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + *len = numContinuations[s[0]] + 1 /* Start byte */; \ + \ + error = PARSERUTILS_OK; \ +} while(0) + +/** + * Find previous legal UTF-8 char in string + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \param error Location to receive error code + */ +#define UTF8_PREV(s, off, prevoff, error) \ +do { \ + if (s == NULL || prevoff == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + while (off != 0 && (s[--off] & 0xC0) == 0x80) \ + /* do nothing */; \ + \ + *prevoff = off; \ + \ + error = PARSERUTILS_OK; \ +} while(0) + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed valid) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \param error Location to receive error code + */ +#define UTF8_NEXT(s, len, off, nextoff, error) \ +do { \ + if (s == NULL || off >= len || nextoff == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + /* Skip current start byte (if present - may be mid-sequence) */\ + if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) \ + off++; \ + \ + while (off < len && (s[off] & 0xC0) == 0x80) \ + off++; \ + \ + *nextoff = off; \ + \ + error = PARSERUTILS_OK; \ +} while(0) + +/** + * Skip to start of next sequence in UTF-8 input + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \param error Location to receive error code + */ +#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \ +do { \ + uint8_t c; \ + \ + error = PARSERUTILS_OK; \ + \ + if (s == NULL || off >= len || nextoff == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + c = s[off]; \ + \ + /* If we're mid-sequence, simply advance to next byte */ \ + if (!(c < 0x80 || (c & 0xC0) == 0xC0)) { \ + off++; \ + } else { \ + uint32_t nCont = numContinuations[c]; \ + uint32_t nToSkip; \ + \ + if (off + nCont + 1 >= len) { \ + error = PARSERUTILS_NEEDDATA; \ + break; \ + } \ + \ + /* Verify continuation bytes */ \ + for (nToSkip = 1; nToSkip <= nCont; nToSkip++) { \ + if ((s[off + nToSkip] & 0xC0) != 0x80) \ + break; \ + } \ + \ + /* Skip over the valid bytes */ \ + off += nToSkip; \ + } \ + \ + *nextoff = off; \ +} while(0) + +#endif |