diff options
author | Daniel Veillard <veillard@src.gnome.org> | 2000-07-14 14:49:25 +0000 |
---|---|---|
committer | Daniel Veillard <veillard@src.gnome.org> | 2000-07-14 14:49:25 +0000 |
commit | 32bc74ef98c7bd6172327fb03f68766f6ee4b6a2 (patch) | |
tree | ddef7c9058df5c69694c2de2f6ae3be6eb966f65 /encoding.c | |
parent | 8d86964a4aa0f89344ba2065830ad30e6d0e0d47 (diff) | |
download | libxml2-32bc74ef98c7bd6172327fb03f68766f6ee4b6a2.tar.gz |
- doc/encoding.html doc/xml.html: added I18N doc
- encoding.[ch] HTMLtree.[ch] parser.c HTMLparser.c: I18N encoding
improvements, both parser and filters, added ASCII & HTML,
fixed the ISO-Latin-1 one
- xmllint.c testHTML.c: added/made visible --encode
- debugXML.c : cleanup
- most .c files: applied patches due to warning on Windows and
when using Sun Pro cc compiler
- xpath.c : cleanup memleaks
- nanoftp.c : added a TESTING preprocessor flag for standalong
compile so that people can report bugs more easilly
- nanohttp.c : ditched socklen_t which was a portability mess
and replaced it with unsigned int.
- tree.[ch]: added xmlHasProp()
- TODO: updated
- test/ : added more test for entities, NS, encoding, HTML, wap
- configure.in: preparing for 2.2.0 release
Daniel
Diffstat (limited to 'encoding.c')
-rw-r--r-- | encoding.c | 300 |
1 files changed, 259 insertions, 41 deletions
@@ -43,6 +43,9 @@ #endif #include <libxml/encoding.h> #include <libxml/xmlmemory.h> +#ifdef LIBXML_HTML_ENABLED +#include <libxml/HTMLparser.h> +#endif xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; @@ -178,6 +181,140 @@ xmlCheckUTF8(const unsigned char *utf) } /** + * asciiToUTF8: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of ASCII chars + * @inlen: the length of @in + * + * Take a block of ASCII chars in and try to convert it to an UTF-8 + * block of chars out. + * Returns 0 if success, or -1 otherwise + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictiable. + * The value of @outlen after return is the number of ocetes consumed. + */ +int +asciiToUTF8(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen) { + unsigned char* outstart = out; + const unsigned char* base = in; + const unsigned char* processed = in; + unsigned char* outend = out + *outlen; + const unsigned char* inend; + unsigned int c; + int bits; + + inend = in + (*inlen); + while ((in < inend) && (out - outstart + 5 < *outlen)) { + c= *in++; + + /* assertion: c is a single UTF-4 value */ + if (out >= outend) + break; + if (c < 0x80) { *out++= c; bits= -6; } + else { + *outlen = out - outstart; + *inlen = processed - base; + return(-1); + } + + for ( ; bits >= 0; bits-= 6) { + if (out >= outend) + break; + *out++= ((c >> bits) & 0x3F) | 0x80; + } + processed = (const unsigned char*) in; + } + *outlen = out - outstart; + *inlen = processed - base; + return(0); +} + +/** + * UTF8Toascii: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of UTF-8 chars + * @inlen: the length of @in + * + * Take a block of UTF-8 chars in and try to convert it to an ASCII + * block of chars out. + * + * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictiable. + * The value of @outlen after return is the number of ocetes consumed. + */ +int +UTF8Toascii(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen) { + const unsigned char* processed = in; + const unsigned char* outend; + const unsigned char* outstart = out; + const unsigned char* instart = in; + const unsigned char* inend; + unsigned int c, d; + int trailing; + + if (in == NULL) { + /* + * initialization nothing to do + */ + *outlen = 0; + *inlen = 0; + return(0); + } + inend = in + (*inlen); + outend = out + (*outlen); + while (in < inend) { + d = *in++; + if (d < 0x80) { c= d; trailing= 0; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } + else if (d < 0xF8) { c= d & 0x07; trailing= 3; } + else { + /* no chance for this in Ascii */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + + if (inend - in < trailing) { + break; + } + + for ( ; trailing; trailing--) { + if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) + break; + c <<= 6; + c |= d & 0x3F; + } + + /* assertion: c is a single UTF-4 value */ + if (c < 0x80) { + if (out >= outend) + break; + *out++ = c; + } else { + /* no chance for this in Ascii */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + processed = in; + } + *outlen = out - outstart; + *inlen = processed - instart; + return(0); +} + +/** * isolat1ToUTF8: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out @@ -195,28 +332,32 @@ int isolat1ToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { unsigned char* outstart = out; + const unsigned char* base = in; const unsigned char* processed = in; unsigned char* outend = out + *outlen; - const unsigned char* inend = in + *inlen; - unsigned char c; + const unsigned char* inend; + unsigned int c; + int bits; - while (in < inend) { - c= *in++; - if (c < 0x80) { + inend = in + (*inlen); + while ((in < inend) && (out - outstart + 5 < *outlen)) { + c= *in++; + + /* assertion: c is a single UTF-4 value */ + if (out >= outend) + break; + if (c < 0x80) { *out++= c; bits= -6; } + else { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; } + + for ( ; bits >= 0; bits-= 6) { if (out >= outend) - break; - *out++ = c; - } - else { - if (out + 1 >= outend) break; - *out++ = 0xC0 | (c >> 6); - *out++ = 0x80 | (0x3F & c); + break; + *out++= ((c >> bits) & 0x3F) | 0x80; } - processed = in; + processed = (const unsigned char*) in; } *outlen = out - outstart; - *inlen = processed - in; - + *inlen = processed - base; return(0); } @@ -229,7 +370,6 @@ isolat1ToUTF8(unsigned char* out, int *outlen, * * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 * block of chars out. - * TODO: UTF8Toisolat1 need a fallback mechanism ... * * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise * The value of @inlen after return is the number of octets consumed @@ -239,34 +379,68 @@ isolat1ToUTF8(unsigned char* out, int *outlen, int UTF8Toisolat1(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { - unsigned char* outstart = out; const unsigned char* processed = in; - unsigned char* outend = out + *outlen; - const unsigned char* inend = in + *inlen; - unsigned char c; + const unsigned char* outend; + const unsigned char* outstart = out; + const unsigned char* instart = in; + const unsigned char* inend; + unsigned int c, d; + int trailing; + if (in == NULL) { + /* + * initialization nothing to do + */ + *outlen = 0; + *inlen = 0; + return(0); + } + inend = in + (*inlen); + outend = out + (*outlen); while (in < inend) { - c= *in++; - if (c < 0x80) { - if (out >= outend) return(-1); - *out++= c; - } - else if (in == inend) { - break; + d = *in++; + if (d < 0x80) { c= d; trailing= 0; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } + else if (d < 0xF8) { c= d & 0x07; trailing= 3; } + else { + /* no chance for this in IsoLat1 */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); } - else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) { - /* a two byte utf-8 and can be encoding as isolate1 */ - *out++= ((c & 0x03) << 6) | (*in++ & 0x3F); + + if (inend - in < trailing) { + break; + } + + for ( ; trailing; trailing--) { + if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) + break; + c <<= 6; + c |= d & 0x3F; } - else { + + /* assertion: c is a single UTF-4 value */ + if (c <= 0xFF) { + if (out >= outend) + break; + *out++ = c; + } else { + /* no chance for this in IsoLat1 */ *outlen = out - outstart; - *inlen = processed - in; + *inlen = processed - instart; return(-2); } processed = in; } *outlen = out - outstart; - *inlen = processed - in; + *inlen = processed - instart; return(0); } @@ -367,7 +541,6 @@ UTF16LEToUTF8(unsigned char* out, int *outlen, * * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE * block of chars out. - * TODO: UTF8ToUTF16LE need a fallback mechanism ... * * Returns the number of byte written, or -1 by lack of space, or -2 * if the transcoding failed. @@ -410,7 +583,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen, if (d < 0x80) { c= d; trailing= 0; } else if (d < 0xC0) { /* trailing byte in leading position */ - *outlen = out - outstart; + *outlen = (out - outstart) * 2; *inlen = processed - in; return(-2); } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } @@ -418,7 +591,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen, else if (d < 0xF8) { c= d & 0x07; trailing= 3; } else { /* no chance for this in UTF-16 */ - *outlen = out - outstart; + *outlen = (out - outstart) * 2; *inlen = processed - in; return(-2); } @@ -578,7 +751,6 @@ UTF16BEToUTF8(unsigned char* out, int *outlen, * * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE * block of chars out. - * TODO: UTF8ToUTF16BE need a fallback mechanism ... * * Returns the number of byte written, or -1 by lack of space, or -2 * if the transcoding failed. @@ -861,6 +1033,8 @@ xmlGetCharEncodingName(xmlCharEncoding enc) { return("Shift-JIS"); case XML_CHAR_ENCODING_EUC_JP: return("EUC-JP"); + case XML_CHAR_ENCODING_ASCII: + return("ASCII"); } return(NULL); } @@ -974,6 +1148,10 @@ xmlInitCharEncodingHandlers(void) { xmlUTF16BEHandler = xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE); xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1); + xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii); +#ifdef LIBXML_HTML_ENABLED + xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml); +#endif } /** @@ -1081,16 +1259,51 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) { handler = xmlFindCharEncodingHandler("UCS2"); if (handler != NULL) return(handler); break; + + /* + * We used to keep ISO Latin encodings native in the + * generated data. This led to so many problems that + * this has been removed. One can still change this + * back by registering no-ops encoders for those + */ case XML_CHAR_ENCODING_8859_1: + handler = xmlFindCharEncodingHandler("ISO-8859-1"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_2: + handler = xmlFindCharEncodingHandler("ISO-8859-2"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_3: + handler = xmlFindCharEncodingHandler("ISO-8859-3"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_4: + handler = xmlFindCharEncodingHandler("ISO-8859-4"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_5: + handler = xmlFindCharEncodingHandler("ISO-8859-5"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_6: + handler = xmlFindCharEncodingHandler("ISO-8859-6"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_7: + handler = xmlFindCharEncodingHandler("ISO-8859-7"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_8: + handler = xmlFindCharEncodingHandler("ISO-8859-8"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_9: - return(NULL); + handler = xmlFindCharEncodingHandler("ISO-8859-9"); + if (handler != NULL) return(handler); + break; + + case XML_CHAR_ENCODING_2022_JP: handler = xmlFindCharEncodingHandler("ISO-2022-JP"); if (handler != NULL) return(handler); @@ -1161,7 +1374,8 @@ xmlFindCharEncodingHandler(const char *name) { icv_in = iconv_open("UTF-8", name); icv_out = iconv_open(name, "UTF-8"); if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) { - enc = xmlMalloc(sizeof(xmlCharEncodingHandler)); + enc = (xmlCharEncodingHandlerPtr) + xmlMalloc(sizeof(xmlCharEncodingHandler)); if (enc == NULL) { iconv_close(icv_in); iconv_close(icv_out); @@ -1506,6 +1720,10 @@ retry: if (ret == -1) ret = -3; } #endif /* LIBXML_ICONV_ENABLED */ + else { + fprintf(stderr, "xmlCharEncOutFunc: no output function !\n"); + return(-1); + } if (ret >= 0) output += ret; @@ -1528,7 +1746,7 @@ retry: #endif case -2: { int len = in->use; - const char *utf = (const char *) in->content; + const xmlChar *utf = (const xmlChar *) in->content; int cur; cur = xmlGetUTF8Char(utf, &len); @@ -1546,7 +1764,7 @@ retry: * and continue the transcoding phase, hoping the error * did not mangle the encoder state. */ - sprintf(charref, "&#x%X;", cur); + sprintf((char *) charref, "&#x%X;", cur); xmlBufferShrink(in, len); xmlBufferAddHead(in, charref, -1); |