summaryrefslogtreecommitdiff
path: root/encoding.c
diff options
context:
space:
mode:
authorDaniel Veillard <veillard@src.gnome.org>2000-07-14 14:49:25 +0000
committerDaniel Veillard <veillard@src.gnome.org>2000-07-14 14:49:25 +0000
commit32bc74ef98c7bd6172327fb03f68766f6ee4b6a2 (patch)
treeddef7c9058df5c69694c2de2f6ae3be6eb966f65 /encoding.c
parent8d86964a4aa0f89344ba2065830ad30e6d0e0d47 (diff)
downloadlibxml2-32bc74ef98c7bd6172327fb03f68766f6ee4b6a2.tar.gz
- doc/encoding.html doc/xml.html: added I18N doc
- encoding.[ch] HTMLtree.[ch] parser.c HTMLparser.c: I18N encoding improvements, both parser and filters, added ASCII & HTML, fixed the ISO-Latin-1 one - xmllint.c testHTML.c: added/made visible --encode - debugXML.c : cleanup - most .c files: applied patches due to warnings on Windows and when using Sun Pro cc compiler - xpath.c : cleanup memleaks - nanoftp.c : added a TESTING preprocessor flag for standalone compile so that people can report bugs more easily - nanohttp.c : ditched socklen_t which was a portability mess and replaced it with unsigned int. - tree.[ch]: added xmlHasProp() - TODO: updated - test/ : added more tests for entities, NS, encoding, HTML, wap - configure.in: preparing for 2.2.0 release Daniel
Diffstat (limited to 'encoding.c')
-rw-r--r--encoding.c300
1 files changed, 259 insertions, 41 deletions
diff --git a/encoding.c b/encoding.c
index dd367201..3d997345 100644
--- a/encoding.c
+++ b/encoding.c
@@ -43,6 +43,9 @@
#endif
#include <libxml/encoding.h>
#include <libxml/xmlmemory.h>
+#ifdef LIBXML_HTML_ENABLED
+#include <libxml/HTMLparser.h>
+#endif
xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
@@ -178,6 +181,140 @@ xmlCheckUTF8(const unsigned char *utf)
}
/**
+ * asciiToUTF8:
+ * @out:  a pointer to an array of bytes to store the result
+ * @outlen:  the length of @out
+ * @in:  a pointer to an array of ASCII chars
+ * @inlen:  the length of @in
+ *
+ * Take a block of ASCII chars in and try to convert it to an UTF-8
+ * block of chars out. ASCII is a strict subset of UTF-8, so every input
+ * byte below 0x80 is copied unchanged and any other byte is an error.
+ * Conversion stops early, without error, when @out is full.
+ *
+ * Returns 0 if success, or -1 if a byte >= 0x80 is found in @in.
+ * The value of @inlen after return is the number of octets consumed.
+ * The value of @outlen after return is the number of octets produced.
+ */
+int
+asciiToUTF8(unsigned char* out, int *outlen,
+              const unsigned char* in, int *inlen) {
+    unsigned char* outstart = out;
+    unsigned char* outend = out + *outlen;
+    const unsigned char* base = in;
+    const unsigned char* inend = in + (*inlen);
+
+    /* one input octet yields exactly one output octet, so the only
+     * space requirement is a single free byte in @out */
+    while ((in < inend) && (out < outend)) {
+        unsigned int c = *in;
+
+        if (c >= 0x80) {
+            /* not an ASCII char: report what was converted so far */
+            *outlen = (int) (out - outstart);
+            *inlen = (int) (in - base);
+            return(-1);
+        }
+        *out++ = (unsigned char) c;
+        in++;
+    }
+    *outlen = (int) (out - outstart);
+    *inlen = (int) (in - base);
+    return(0);
+}
+
+/**
+ * UTF8Toascii:
+ * @out:  a pointer to an array of bytes to store the result
+ * @outlen:  the length of @out
+ * @in:  a pointer to an array of UTF-8 chars
+ * @inlen:  the length of @in
+ *
+ * Take a block of UTF-8 chars in and try to convert it to an ASCII
+ * block of chars out. Only single-octet UTF-8 sequences (values below
+ * 0x80) can be represented; everything else makes the transcoding fail.
+ * Conversion stops early, without error, when @out is full or when @in
+ * ends in the middle of a multi-octet sequence; the incomplete sequence
+ * is left unconsumed.
+ * If @in is NULL nothing is done and both lengths are set to 0.
+ *
+ * Returns 0 if success, or -2 if the transcoding fails (malformed UTF-8
+ * or a character that does not exist in ASCII).
+ * The value of @inlen after return is the number of octets consumed.
+ * The value of @outlen after return is the number of octets produced.
+ */
+int
+UTF8Toascii(unsigned char* out, int *outlen,
+              const unsigned char* in, int *inlen) {
+    const unsigned char* outstart = out;
+    const unsigned char* outend;
+    const unsigned char* instart = in;
+    const unsigned char* inend;
+    unsigned int d;
+    int trailing;
+
+    if (in == NULL) {
+        /*
+         * initialization nothing to do
+         */
+        *outlen = 0;
+        *inlen = 0;
+        return(0);
+    }
+    inend = in + (*inlen);
+    outend = out + (*outlen);
+    while (in < inend) {
+        d = *in;
+        if (d < 0x80) {
+            if (out >= outend)
+                break;
+            *out++ = (unsigned char) d;
+            in++;
+            continue;
+        }
+
+        /*
+         * From here on the sequence can never yield an ASCII char:
+         * a stray trailing byte or an out of range leading byte is
+         * malformed, a multi-octet sequence is either above 0x7F or
+         * an overlong (hence invalid) encoding of an ASCII value.
+         */
+        if ((d < 0xC0) || (d >= 0xF8)) {
+            *outlen = (int) (out - outstart);
+            *inlen = (int) (in - instart);
+            return(-2);
+        }
+        if (d < 0xE0) trailing = 1;
+        else if (d < 0xF0) trailing = 2;
+        else trailing = 3;
+
+        /* sequence cut by the end of the block: leave it unconsumed
+         * so that the caller can retry once more input is available */
+        if (inend - in - 1 < trailing)
+            break;
+
+        *outlen = (int) (out - outstart);
+        *inlen = (int) (in - instart);
+        return(-2);
+    }
+    *outlen = (int) (out - outstart);
+    *inlen = (int) (in - instart);
+    return(0);
+}
+
+/**
* isolat1ToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
@@ -195,28 +332,32 @@ int
isolat1ToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
unsigned char* outstart = out;
+ const unsigned char* base = in;
const unsigned char* processed = in;
unsigned char* outend = out + *outlen;
- const unsigned char* inend = in + *inlen;
- unsigned char c;
+ const unsigned char* inend;
+ unsigned int c;
+ int bits;
- while (in < inend) {
- c= *in++;
- if (c < 0x80) {
+ inend = in + (*inlen);
+ while ((in < inend) && (out - outstart + 5 < *outlen)) {
+ c= *in++;
+
+ /* assertion: c is a single UCS-4 value */
+ if (out >= outend)
+ break;
+ if (c < 0x80) { *out++= c; bits= -6; }
+ else { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
+
+ for ( ; bits >= 0; bits-= 6) {
if (out >= outend)
- break;
- *out++ = c;
- }
- else {
- if (out + 1 >= outend) break;
- *out++ = 0xC0 | (c >> 6);
- *out++ = 0x80 | (0x3F & c);
+ break;
+ *out++= ((c >> bits) & 0x3F) | 0x80;
}
- processed = in;
+ processed = (const unsigned char*) in;
}
*outlen = out - outstart;
- *inlen = processed - in;
-
+ *inlen = processed - base;
return(0);
}
@@ -229,7 +370,6 @@ isolat1ToUTF8(unsigned char* out, int *outlen,
*
* Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
* block of chars out.
- * TODO: UTF8Toisolat1 need a fallback mechanism ...
*
* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
* The value of @inlen after return is the number of octets consumed
@@ -239,34 +379,68 @@ isolat1ToUTF8(unsigned char* out, int *outlen,
int
UTF8Toisolat1(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
- unsigned char* outstart = out;
const unsigned char* processed = in;
- unsigned char* outend = out + *outlen;
- const unsigned char* inend = in + *inlen;
- unsigned char c;
+ const unsigned char* outend;
+ const unsigned char* outstart = out;
+ const unsigned char* instart = in;
+ const unsigned char* inend;
+ unsigned int c, d;
+ int trailing;
+ if (in == NULL) {
+ /*
+ * initialization nothing to do
+ */
+ *outlen = 0;
+ *inlen = 0;
+ return(0);
+ }
+ inend = in + (*inlen);
+ outend = out + (*outlen);
while (in < inend) {
- c= *in++;
- if (c < 0x80) {
- if (out >= outend) return(-1);
- *out++= c;
- }
- else if (in == inend) {
- break;
+ d = *in++;
+ if (d < 0x80) { c= d; trailing= 0; }
+ else if (d < 0xC0) {
+ /* trailing byte in leading position */
+ *outlen = out - outstart;
+ *inlen = processed - instart;
+ return(-2);
+ } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
+ else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
+ else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
+ else {
+ /* no chance for this in IsoLat1 */
+ *outlen = out - outstart;
+ *inlen = processed - instart;
+ return(-2);
}
- else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
- /* a two byte utf-8 and can be encoding as isolate1 */
- *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
+
+ if (inend - in < trailing) {
+ break;
+ }
+
+ for ( ; trailing; trailing--) {
+ if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
+ break;
+ c <<= 6;
+ c |= d & 0x3F;
}
- else {
+
+ /* assertion: c is a single UCS-4 value */
+ if (c <= 0xFF) {
+ if (out >= outend)
+ break;
+ *out++ = c;
+ } else {
+ /* no chance for this in IsoLat1 */
*outlen = out - outstart;
- *inlen = processed - in;
+ *inlen = processed - instart;
return(-2);
}
processed = in;
}
*outlen = out - outstart;
- *inlen = processed - in;
+ *inlen = processed - instart;
return(0);
}
@@ -367,7 +541,6 @@ UTF16LEToUTF8(unsigned char* out, int *outlen,
*
* Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
* block of chars out.
- * TODO: UTF8ToUTF16LE need a fallback mechanism ...
*
* Returns the number of bytes written, or -1 for lack of space, or -2
* if the transcoding failed.
@@ -410,7 +583,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) {
/* trailing byte in leading position */
- *outlen = out - outstart;
+ *outlen = (out - outstart) * 2;
*inlen = processed - in;
return(-2);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
@@ -418,7 +591,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else {
/* no chance for this in UTF-16 */
- *outlen = out - outstart;
+ *outlen = (out - outstart) * 2;
*inlen = processed - in;
return(-2);
}
@@ -578,7 +751,6 @@ UTF16BEToUTF8(unsigned char* out, int *outlen,
*
* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
* block of chars out.
- * TODO: UTF8ToUTF16BE need a fallback mechanism ...
*
* Returns the number of bytes written, or -1 for lack of space, or -2
* if the transcoding failed.
@@ -861,6 +1033,8 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
return("Shift-JIS");
case XML_CHAR_ENCODING_EUC_JP:
return("EUC-JP");
+ case XML_CHAR_ENCODING_ASCII:
+ return("ASCII");
}
return(NULL);
}
@@ -974,6 +1148,10 @@ xmlInitCharEncodingHandlers(void) {
xmlUTF16BEHandler =
xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
+ xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
+#ifdef LIBXML_HTML_ENABLED
+ xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
+#endif
}
/**
@@ -1081,16 +1259,51 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
handler = xmlFindCharEncodingHandler("UCS2");
if (handler != NULL) return(handler);
break;
+
+ /*
+ * We used to keep ISO Latin encodings native in the
+ * generated data. This led to so many problems that
+ * this has been removed. One can still change this
+ * back by registering no-op encoders for those encodings.
+ */
case XML_CHAR_ENCODING_8859_1:
+ handler = xmlFindCharEncodingHandler("ISO-8859-1");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_8859_2:
+ handler = xmlFindCharEncodingHandler("ISO-8859-2");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_8859_3:
+ handler = xmlFindCharEncodingHandler("ISO-8859-3");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_8859_4:
+ handler = xmlFindCharEncodingHandler("ISO-8859-4");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_8859_5:
+ handler = xmlFindCharEncodingHandler("ISO-8859-5");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_8859_6:
+ handler = xmlFindCharEncodingHandler("ISO-8859-6");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_8859_7:
+ handler = xmlFindCharEncodingHandler("ISO-8859-7");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_8859_8:
+ handler = xmlFindCharEncodingHandler("ISO-8859-8");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_8859_9:
- return(NULL);
+ handler = xmlFindCharEncodingHandler("ISO-8859-9");
+ if (handler != NULL) return(handler);
+ break;
+
+
case XML_CHAR_ENCODING_2022_JP:
handler = xmlFindCharEncodingHandler("ISO-2022-JP");
if (handler != NULL) return(handler);
@@ -1161,7 +1374,8 @@ xmlFindCharEncodingHandler(const char *name) {
icv_in = iconv_open("UTF-8", name);
icv_out = iconv_open(name, "UTF-8");
if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
- enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
+ enc = (xmlCharEncodingHandlerPtr)
+ xmlMalloc(sizeof(xmlCharEncodingHandler));
if (enc == NULL) {
iconv_close(icv_in);
iconv_close(icv_out);
@@ -1506,6 +1720,10 @@ retry:
if (ret == -1) ret = -3;
}
#endif /* LIBXML_ICONV_ENABLED */
+ else {
+ fprintf(stderr, "xmlCharEncOutFunc: no output function !\n");
+ return(-1);
+ }
if (ret >= 0) output += ret;
@@ -1528,7 +1746,7 @@ retry:
#endif
case -2: {
int len = in->use;
- const char *utf = (const char *) in->content;
+ const xmlChar *utf = (const xmlChar *) in->content;
int cur;
cur = xmlGetUTF8Char(utf, &len);
@@ -1546,7 +1764,7 @@ retry:
* and continue the transcoding phase, hoping the error
* did not mangle the encoder state.
*/
- sprintf(charref, "&#x%X;", cur);
+ sprintf((char *) charref, "&#x%X;", cur);
xmlBufferShrink(in, len);
xmlBufferAddHead(in, charref, -1);