diff options
author | Daniel Veillard <veillard@src.gnome.org> | 2001-05-30 19:14:17 +0000 |
---|---|---|
committer | Daniel Veillard <veillard@src.gnome.org> | 2001-05-30 19:14:17 +0000 |
commit | 97ac13197ce5a6a754a7071a0e95b07f1f54ac6c (patch) | |
tree | ef41f68f6d6861de2a3aaa5cc6147ae13e673ed6 /encoding.c | |
parent | 2d70372ce33920712a2a4b0ebdae61c826418324 (diff) | |
download | libxml2-97ac13197ce5a6a754a7071a0e95b07f1f54ac6c.tar.gz |
- xpath.c encoding.[ch]: William M. Brack provided a set of UTF8
string oriented functions and started cleaning the related areas
in xpath.c which needed fixing in this respect
Daniel
Diffstat (limited to 'encoding.c')
-rw-r--r-- | encoding.c | 234 |
1 files changed, 216 insertions, 18 deletions
@@ -13,11 +13,14 @@
  * [US-ASCII] Coded Character Set--7-bit American Standard Code for
  * Information Interchange, ANSI X3.4-1986.
  *
- * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
- *
  * See Copyright for the status of this software.
  *
  * Daniel.Veillard@w3.org
+ *
+ * UTF8 string routines from:
+ * "William M. Brack" <wbrack@mmm.com.hk>
+ *
+ * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
  */

 #include "libxml.h"
@@ -64,16 +67,20 @@ static int xmlCharEncodingAliasesMax = 0;

 static int xmlLittleEndian = 1;

-/*
- * From rfc2044: encoding of the Unicode values on UTF-8:
- *
- * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
- * 0000 0000-0000 007F   0xxxxxxx
- * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
- * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
- *
- * I hope we won't use values > 0xFFFF anytime soon !
- */
+/************************************************************************
+ *                                                                      *
+ *              Generic UTF8 handling routines                          *
+ *                                                                      *
+ * From rfc2044: encoding of the Unicode values on UTF-8:               *
+ *                                                                      *
+ * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
+ * 0000 0000-0000 007F   0xxxxxxx                                       *
+ * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
+ * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
+ *                                                                      *
+ * I hope we won't use values > 0xFFFF anytime soon !                   *
+ *                                                                      *
+ ************************************************************************/

 /**
  * xmlUTF8Strlen:
@@ -85,7 +92,7 @@ static int xmlLittleEndian = 1;
  * Returns the number of characters in the string or -1 in case of error
  */
 int
-xmlUTF8Strlen(const unsigned char *utf) {
+xmlUTF8Strlen(const xmlChar *utf) {
     int ret = 0;

     if (utf == NULL)
@@ -228,6 +235,178 @@ xmlCheckUTF8(const unsigned char *utf)
 }

 /**
+ * xmlUTF8Strsize:
+ * @utf: a sequence of UTF-8 encoded bytes
+ * @len: the number of characters in the array
+ *
+ * storage size of an UTF8 string
+ *
+ * Returns the storage size of
+ * the first 'len' characters of ARRAY
+ *
+ */
+
+int
+xmlUTF8Strsize(const xmlChar *utf, int len) {
+    const xmlChar *ptr=utf;
+    xmlChar ch;
+
+    if (len <= 0)
+        return(0);
+
+    while ( len-- > 0) {
+        if ( !*ptr )
+            break;
+        if ( (ch = *ptr++) & 0x80)
+            while ( (ch<<=1) & 0x80 )
+                ptr++;
+    }
+    return (ptr - utf);
+}
+
+
+/**
+ * xmlUTF8Strndup:
+ * @utf: the input UTF8 *
+ * @len: the len of @utf (in chars)
+ *
+ * a strndup for array of UTF8's
+ *
+ * Returns a new UTF8 * or NULL
+ */
+xmlChar *
+xmlUTF8Strndup(const xmlChar *utf, int len) {
+    xmlChar *ret;
+    int i;
+
+    if ((utf == NULL) || (len < 0)) return(NULL);
+    i = xmlUTF8Strsize(utf, len);
+    ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
+    if (ret == NULL) {
+        xmlGenericError(xmlGenericErrorContext,
+                "malloc of %ld byte failed\n",
+                (len + 1) * (long)sizeof(xmlChar));
+        return(NULL);
+    }
+    memcpy(ret, utf, i * sizeof(xmlChar));
+    ret[i] = 0;
+    return(ret);
+}
+
+/**
+ * xmlUTF8Strpos:
+ * @utf: the input UTF8 *
+ * @pos: the position of the desired UTF8 char (in chars)
+ *
+ * a function to provide the equivalent of fetching a
+ * character from a string array
+ *
+ * Returns a pointer to the UTF8 character or NULL
+ */
+xmlChar *
+xmlUTF8Strpos(const xmlChar *utf, int pos) {
+    xmlChar ch;
+
+    if (utf == NULL) return(NULL);
+    if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
+        return(NULL);
+    while (pos--) {
+        if ((ch=*utf++) == 0) return(NULL);
+        if ( ch & 0x80 ) {
+            /* if not simple ascii, verify proper format */
+            if ( (ch & 0xc0) != 0xc0 )
+                return(NULL);
+            /* then skip over remaining bytes for this char */
+            while ( (ch <<= 1) & 0x80 )
+                if ( (*utf++ & 0xc0) != 0x80 )
+                    return(NULL);
+        }
+    }
+    return((xmlChar *)utf);
+}
+
+/**
+ * xmlUTF8Strloc:
+ * @utf: the input UTF8 *
+ * @utfchar: the UTF8 character to be found
+ *
+ * a function to provide relative location of a UTF8 char
+ *
+ * Returns the relative character position of the desired char
+ * or -1 if not found
+ */
+int
+xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
+    int i, size;
+    xmlChar ch;
+
+    if (utf==NULL || utfchar==NULL) return -1;
+    size = xmlUTF8Strsize(utfchar, 1);
+    for(i=0; (ch=*utf) != 0; i++) {
+        if (xmlStrncmp(utf, utfchar, size)==0)
+            return(i);
+        utf++;
+        if ( ch & 0x80 ) {
+            /* if not simple ascii, verify proper format */
+            if ( (ch & 0xc0) != 0xc0 )
+                return(-1);
+            /* then skip over remaining bytes for this char */
+            while ( (ch <<= 1) & 0x80 )
+                if ( (*utf++ & 0xc0) != 0x80 )
+                    return(-1);
+        }
+    }
+
+    return(-1);
+}
+/**
+ * xmlUTF8Strsub:
+ * @utf: a sequence of UTF-8 encoded bytes
+ *
+ * @start: relative pos of first char
+ * @len: total number to copy
+ *
+ * Note: positions are given in units of UTF-8 chars
+ *
+ * Returns a pointer to a newly created string
+ * or NULL if any problem
+ */
+
+xmlChar *
+xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
+    int i;
+    xmlChar ch;
+
+    if (utf == NULL) return(NULL);
+    if (start < 0) return(NULL);
+    if (len < 0) return(NULL);
+
+    /*
+     * Skip over any leading chars
+     */
+    for (i = 0;i < start;i++) {
+        if ((ch=*utf++) == 0) return(NULL);
+        if ( ch & 0x80 ) {
+            /* if not simple ascii, verify proper format */
+            if ( (ch & 0xc0) != 0xc0 )
+                return(NULL);
+            /* then skip over remaining bytes for this char */
+            while ( (ch <<= 1) & 0x80 )
+                if ( (*utf++ & 0xc0) != 0x80 )
+                    return(NULL);
+        }
+    }
+
+    return(xmlUTF8Strndup(utf, len));
+}
+
+/************************************************************************
+ *                                                                      *
+ *              Conversions To/From UTF8 encoding                       *
+ *                                                                      *
+ ************************************************************************/
+
+/**
  * asciiToUTF8:
  * @out: a pointer to an array of bytes to store the result
  * @outlen: the length of @out
@@ -912,6 +1091,12 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
     return(0);
 }

+/************************************************************************
+ *                                                                      *
+ *              Generic encoding handling routines                      *
+ *                                                                      *
+ ************************************************************************/
+
 /**
  * xmlDetectCharEncoding:
  * @in: a pointer to the first bytes of the XML entity, must be at least
@@ -1256,11 +1441,12 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
     return(NULL);
 }

-/****************************************************************
- *                                                              *
- *              Char encoding handlers                          *
- *                                                              *
- ****************************************************************/
+/************************************************************************
+ *                                                                      *
+ *              Char encoding handlers                                  *
+ *                                                                      *
+ ************************************************************************/
+

 /* the size should be growable, but it's not a big deal ... */
 #define MAX_ENCODING_HANDLERS 50
@@ -1669,6 +1855,12 @@ xmlFindCharEncodingHandler(const char *name) {
     return(NULL);
 }

+/************************************************************************
+ *                                                                      *
+ *              ICONV based generic conversion functions                *
+ *                                                                      *
+ ************************************************************************/
+
 #ifdef LIBXML_ICONV_ENABLED
 /**
  * xmlIconvWrapper:
@@ -1730,6 +1922,12 @@ xmlIconvWrapper(iconv_t cd,
 }
 #endif /* LIBXML_ICONV_ENABLED */

+/************************************************************************
+ *                                                                      *
+ *      The real API used by libxml for on-the-fly conversion           *
+ *                                                                      *
+ ************************************************************************/
+
 /**
  * xmlCharEncFirstLine:
  * @handler: char enconding transformation data structure |