summaryrefslogtreecommitdiff
path: root/encoding.c
diff options
context:
space:
mode:
authorDaniel Veillard <veillard@src.gnome.org>2001-05-30 19:14:17 +0000
committerDaniel Veillard <veillard@src.gnome.org>2001-05-30 19:14:17 +0000
commit97ac13197ce5a6a754a7071a0e95b07f1f54ac6c (patch)
treeef41f68f6d6861de2a3aaa5cc6147ae13e673ed6 /encoding.c
parent2d70372ce33920712a2a4b0ebdae61c826418324 (diff)
downloadlibxml2-97ac13197ce5a6a754a7071a0e95b07f1f54ac6c.tar.gz
- xpath.c encoding.[ch]: William M. Brack provided a set of UTF8
string oriented functions and started cleaning the related areas in xpath.c which needed fixing in this respect Daniel
Diffstat (limited to 'encoding.c')
-rw-r--r--encoding.c234
1 files changed, 216 insertions, 18 deletions
diff --git a/encoding.c b/encoding.c
index 020f4de8..c0b73163 100644
--- a/encoding.c
+++ b/encoding.c
@@ -13,11 +13,14 @@
* [US-ASCII] Coded Character Set--7-bit American Standard Code for
* Information Interchange, ANSI X3.4-1986.
*
- * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
- *
* See Copyright for the status of this software.
*
* Daniel.Veillard@w3.org
+ *
+ * UTF8 string routines from:
+ * "William M. Brack" <wbrack@mmm.com.hk>
+ *
+ * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
*/
#include "libxml.h"
@@ -64,16 +67,20 @@ static int xmlCharEncodingAliasesMax = 0;
static int xmlLittleEndian = 1;
-/*
- * From rfc2044: encoding of the Unicode values on UTF-8:
- *
- * UCS-4 range (hex.) UTF-8 octet sequence (binary)
- * 0000 0000-0000 007F 0xxxxxxx
- * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
- * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
- *
- * I hope we won't use values > 0xFFFF anytime soon !
- */
+/************************************************************************
+ * *
+ * Generic UTF8 handling routines *
+ * *
+ * From rfc2044: encoding of the Unicode values on UTF-8: *
+ * *
+ * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
+ * 0000 0000-0000 007F 0xxxxxxx *
+ * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
+ * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
+ * *
+ * I hope we won't use values > 0xFFFF anytime soon ! *
+ * *
+ ************************************************************************/
/**
* xmlUTF8Strlen:
@@ -85,7 +92,7 @@ static int xmlLittleEndian = 1;
* Returns the number of characters in the string or -1 in case of error
*/
int
-xmlUTF8Strlen(const unsigned char *utf) {
+xmlUTF8Strlen(const xmlChar *utf) {
int ret = 0;
if (utf == NULL)
@@ -228,6 +235,178 @@ xmlCheckUTF8(const unsigned char *utf)
}
/**
+ * xmlUTF8Strsize:
+ * @utf: a NUL-terminated sequence of UTF-8 encoded bytes (may be NULL)
+ * @len: the number of characters (not bytes) to measure
+ *
+ * Computes the storage size, in bytes, of the first @len characters
+ * of @utf. Measuring stops early at the terminating NUL, so the result
+ * never exceeds the byte length of the string. No validation of the
+ * continuation bytes is performed.
+ *
+ * Returns the number of bytes occupied by the first @len characters
+ * of @utf, or 0 if @utf is NULL or @len is not positive
+ */
+
+int
+xmlUTF8Strsize(const xmlChar *utf, int len) {
+    const xmlChar *ptr = utf;
+    xmlChar ch;
+
+    if (utf == NULL)
+        return(0);
+    if (len <= 0)
+        return(0);
+
+    while (len-- > 0) {
+        if (!*ptr)
+            break;
+        if ((ch = *ptr++) & 0x80) {
+            /*
+             * Each additional leading 1 bit of the first byte announces
+             * one continuation byte; shifting walks through those bits.
+             */
+            while ((ch <<= 1) & 0x80) {
+                /* truncated sequence: never step past the terminator */
+                if (*ptr == 0)
+                    break;
+                ptr++;
+            }
+        }
+    }
+    return (ptr - utf);
+}
+
+
+/**
+ * xmlUTF8Strndup:
+ * @utf: the input UTF-8 string
+ * @len: the maximum number of characters (not bytes) to copy
+ *
+ * A strndup for UTF-8 strings: duplicates the first @len characters
+ * of @utf into a buffer obtained from xmlMalloc(). The byte count is
+ * computed with xmlUTF8Strsize(), so fewer characters are copied when
+ * @utf is shorter than @len. The copy is always NUL-terminated.
+ *
+ * Returns the newly allocated copy, or NULL if @utf is NULL, @len is
+ * negative or the allocation failed
+ */
+xmlChar *
+xmlUTF8Strndup(const xmlChar *utf, int len) {
+    xmlChar *ret;
+    int i;
+
+    if ((utf == NULL) || (len < 0)) return(NULL);
+    /* i is the size in bytes of the first len characters */
+    i = xmlUTF8Strsize(utf, len);
+    ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
+    if (ret == NULL) {
+        /* report the byte count actually requested, not the char count */
+        xmlGenericError(xmlGenericErrorContext,
+                "malloc of %ld byte failed\n",
+                (i + 1) * (long)sizeof(xmlChar));
+        return(NULL);
+    }
+    memcpy(ret, utf, i * sizeof(xmlChar));
+    ret[i] = 0;
+    return(ret);
+}
+
+/**
+ * xmlUTF8Strpos:
+ * @utf: the input UTF-8 string
+ * @pos: the index of the desired character (in characters, from 0)
+ *
+ * The UTF-8 equivalent of indexing into a string array: walks over
+ * @pos characters, checking the lead and continuation bytes of each
+ * multi-byte sequence on the way.
+ *
+ * Returns a pointer to the first byte of the character at index @pos,
+ * or NULL if @utf is NULL, @pos is negative, @pos is not smaller than
+ * xmlUTF8Strlen(@utf), or a malformed sequence is met
+ */
+xmlChar *
+xmlUTF8Strpos(const xmlChar *utf, int pos) {
+    const xmlChar *cur = utf;
+    xmlChar lead;
+    int remaining;
+
+    if (utf == NULL)
+        return(NULL);
+    if (pos < 0)
+        return(NULL);
+    if (pos >= xmlUTF8Strlen(utf))
+        return(NULL);
+
+    for (remaining = pos; remaining > 0; remaining--) {
+        lead = *cur++;
+        if (lead == 0)
+            return(NULL);
+        /* plain ASCII occupies a single byte */
+        if ((lead & 0x80) == 0)
+            continue;
+        /* a multi-byte lead byte must start with the bits 11 */
+        if ((lead & 0xc0) != 0xc0)
+            return(NULL);
+        /* one 10xxxxxx continuation byte per further leading 1 bit */
+        for (lead <<= 1; lead & 0x80; lead <<= 1) {
+            if ((*cur++ & 0xc0) != 0x80)
+                return(NULL);
+        }
+    }
+    return((xmlChar *) cur);
+}
+
+/**
+ * xmlUTF8Strloc:
+ * @utf: the input UTF-8 string
+ * @utfchar: the UTF-8 character to look for
+ *
+ * Searches @utf, character by character, for the first character of
+ * @utfchar. The comparison covers the bytes of that single character,
+ * as measured by xmlUTF8Strsize(@utfchar, 1).
+ *
+ * Returns the index (in characters) of the first match, or -1 if
+ * either argument is NULL, the character is not found, or a malformed
+ * sequence is met before a match
+ */
+int
+xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
+    const xmlChar *cur;
+    xmlChar lead;
+    int index = 0;
+    int nbytes;
+
+    if ((utf == NULL) || (utfchar == NULL))
+        return(-1);
+
+    nbytes = xmlUTF8Strsize(utfchar, 1);
+    cur = utf;
+    while ((lead = *cur) != 0) {
+        if (xmlStrncmp(cur, utfchar, nbytes) == 0)
+            return(index);
+        cur++;
+        if (lead & 0x80) {
+            /* a multi-byte lead byte must start with the bits 11 */
+            if ((lead & 0xc0) != 0xc0)
+                return(-1);
+            /* one 10xxxxxx continuation byte per further leading 1 bit */
+            for (lead <<= 1; lead & 0x80; lead <<= 1) {
+                if ((*cur++ & 0xc0) != 0x80)
+                    return(-1);
+            }
+        }
+        index++;
+    }
+
+    return(-1);
+}
+/**
+ * xmlUTF8Strsub:
+ * @utf: a sequence of UTF-8 encoded bytes
+ * @start: index of the first character to copy
+ * @len: maximum number of characters to copy
+ *
+ * Extracts a substring; @start and @len are both expressed in UTF-8
+ * characters, not bytes. The leading @start characters are skipped
+ * with format checking, then the copy is made by xmlUTF8Strndup().
+ *
+ * Returns a pointer to the newly created string, or NULL if @utf is
+ * NULL, @start or @len is negative, the string ends or is malformed
+ * within the first @start characters, or the duplication fails
+ */
+
+xmlChar *
+xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
+    const xmlChar *cur = utf;
+    xmlChar lead;
+    int skipped = 0;
+
+    if ((utf == NULL) || (start < 0) || (len < 0))
+        return(NULL);
+
+    /*
+     * Step over the characters located before the start position
+     */
+    while (skipped < start) {
+        lead = *cur++;
+        if (lead == 0)
+            return(NULL);
+        if (lead & 0x80) {
+            /* a multi-byte lead byte must start with the bits 11 */
+            if ((lead & 0xc0) != 0xc0)
+                return(NULL);
+            /* one 10xxxxxx continuation byte per further leading 1 bit */
+            for (lead <<= 1; lead & 0x80; lead <<= 1) {
+                if ((*cur++ & 0xc0) != 0x80)
+                    return(NULL);
+            }
+        }
+        skipped++;
+    }
+
+    return(xmlUTF8Strndup(cur, len));
+}
+
+/************************************************************************
+ * *
+ * Conversions To/From UTF8 encoding *
+ * *
+ ************************************************************************/
+
+/**
* asciiToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
@@ -912,6 +1091,12 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
return(0);
}
+/************************************************************************
+ * *
+ * Generic encoding handling routines *
+ * *
+ ************************************************************************/
+
/**
* xmlDetectCharEncoding:
* @in: a pointer to the first bytes of the XML entity, must be at least
@@ -1256,11 +1441,12 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
return(NULL);
}
-/****************************************************************
- * *
- * Char encoding handlers *
- * *
- ****************************************************************/
+/************************************************************************
+ * *
+ * Char encoding handlers *
+ * *
+ ************************************************************************/
+
/* the size should be growable, but it's not a big deal ... */
#define MAX_ENCODING_HANDLERS 50
@@ -1669,6 +1855,12 @@ xmlFindCharEncodingHandler(const char *name) {
return(NULL);
}
+/************************************************************************
+ * *
+ * ICONV based generic conversion functions *
+ * *
+ ************************************************************************/
+
#ifdef LIBXML_ICONV_ENABLED
/**
* xmlIconvWrapper:
@@ -1730,6 +1922,12 @@ xmlIconvWrapper(iconv_t cd,
}
#endif /* LIBXML_ICONV_ENABLED */
+/************************************************************************
+ * *
+ * The real API used by libxml for on-the-fly conversion *
+ * *
+ ************************************************************************/
+
/**
* xmlCharEncFirstLine:
* @handler: char enconding transformation data structure