diff options
author | Daniel Veillard <veillard@src.gnome.org> | 2000-07-14 14:49:25 +0000 |
---|---|---|
committer | Daniel Veillard <veillard@src.gnome.org> | 2000-07-14 14:49:25 +0000 |
commit | 32bc74ef98c7bd6172327fb03f68766f6ee4b6a2 (patch) | |
tree | ddef7c9058df5c69694c2de2f6ae3be6eb966f65 | |
parent | 8d86964a4aa0f89344ba2065830ad30e6d0e0d47 (diff) | |
download | libxml2-32bc74ef98c7bd6172327fb03f68766f6ee4b6a2.tar.gz |
- doc/encoding.html doc/xml.html: added I18N doc
- encoding.[ch] HTMLtree.[ch] parser.c HTMLparser.c: I18N encoding
improvements, both parser and filters, added ASCII & HTML,
fixed the ISO-Latin-1 one
- xmllint.c testHTML.c: added/made visible --encode
- debugXML.c : cleanup
- most .c files: applied patches due to warnings on Windows and
when using the Sun Pro cc compiler
- xpath.c : cleanup memleaks
- nanoftp.c : added a TESTING preprocessor flag for standalone
compile so that people can report bugs more easily
- nanohttp.c : ditched socklen_t which was a portability mess
and replaced it with unsigned int.
- tree.[ch]: added xmlHasProp()
- TODO: updated
- test/ : added more tests for entities, NS, encoding, HTML, wap
- configure.in: preparing for 2.2.0 release
Daniel
41 files changed, 2065 insertions, 925 deletions
@@ -1,3 +1,23 @@ +Fri Jul 14 16:12:20 MEST 2000 Daniel Veillard <Daniel.Veillard@w3.org> + + * doc/encoding.html doc/xml.html: added I18N doc + * encoding.[ch] HTMLtree.[ch] parser.c HTMLparser.c: I18N encoding + improvements, both parser and filters, added ASCII & HTML, + fixed the ISO-Latin-1 one + * xmllint.c testHTML.c: added/made visible --encode + * debugXML.c : cleanup + * most .c files: applied patches due to warning on Windows and + when using Sun Pro cc compiler + * xpath.c : cleanup memleaks + * nanoftp.c : added a TESTING preprocessor flag for standalong + compile so that people can report bugs more easilly + * nanohttp.c : ditched socklen_t which was a portability mess + and replaced it with unsigned int. + * tree.[ch]: added xmlHasProp() + * TODO: updated + * test/ : added more test for entities, NS, encoding, HTML, wap + * configure.in: preparing for 2.2.0 release + Mon Jul 10 16:17:18 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org> * nanoftp.c: fixed the way the control connection is handled diff --git a/HTMLparser.c b/HTMLparser.c index ff331488..0877f4cc 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -16,7 +16,7 @@ #ifdef LIBXML_HTML_ENABLED #include <stdio.h> -#include <string.h> /* for memset() only */ +#include <string.h> #ifdef HAVE_CTYPE_H #include <ctype.h> #endif @@ -41,6 +41,7 @@ #include <libxml/HTMLparser.h> #include <libxml/entities.h> #include <libxml/encoding.h> +#include <libxml/parser.h> #include <libxml/valid.h> #include <libxml/parserInternals.h> #include <libxml/xmlIO.h> @@ -48,7 +49,7 @@ #define HTML_MAX_NAMELEN 1000 #define INPUT_CHUNK 50 -#define HTML_PARSER_BIG_BUFFER_SIZE 1024 +#define HTML_PARSER_BIG_BUFFER_SIZE 1000 #define HTML_PARSER_BUFFER_SIZE 100 /* #define DEBUG */ @@ -68,7 +69,7 @@ scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \ if (ctxt->name##Nr >= ctxt->name##Max) { \ ctxt->name##Max *= 2; \ - ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \ + ctxt->name##Tab = (type *) 
xmlRealloc(ctxt->name##Tab, \ ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \ if (ctxt->name##Tab == NULL) { \ fprintf(stderr, "realloc failed !\n"); \ @@ -124,8 +125,6 @@ PUSH_AND_POP(extern, xmlChar*, name) * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly */ -#define CUR ((int) (*ctxt->input->cur)) - #define UPPER (toupper(*ctxt->input->cur)) #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val) @@ -142,9 +141,172 @@ PUSH_AND_POP(extern, xmlChar*, name) #define CURRENT ((int) (*ctxt->input->cur)) +#define SKIP_BLANKS htmlSkipBlankChars(ctxt); + +#if 0 +#define CUR ((int) (*ctxt->input->cur)) #define NEXT htmlNextChar(ctxt); +#else +/* Inported from XML */ -#define SKIP_BLANKS htmlSkipBlankChars(ctxt); +/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ +#define CUR ((int) (*ctxt->input->cur)) +#define NEXT xmlNextChar(ctxt);ctxt->nbChars++; + +#define RAW (ctxt->token ? -1 : (*ctxt->input->cur)) +#define NXT(val) ctxt->input->cur[(val)] +#define CUR_PTR ctxt->input->cur + + +#define NEXTL(l) \ + if (*(ctxt->input->cur) == '\n') { \ + ctxt->input->line++; ctxt->input->col = 1; \ + } else ctxt->input->col++; \ + ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; + +/************ + \ + if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ + if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); + ************/ + +#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l); +#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l); + +#define COPY_BUF(l,b,i,v) \ + if (l == 1) b[i++] = (xmlChar) v; \ + else i += xmlCopyChar(l,&b[i],v); +#endif + +/** + * htmlCurrentChar: + * @ctxt: the HTML parser context + * @len: pointer to the length of the char read + * + * The current char value, if using UTF-8 this may actaully span multiple + * bytes in the input buffer. 
Implement the end of line normalization: + * 2.11 End-of-Line Handling + * If the encoding is unspecified, in the case we find an ISO-Latin-1 + * char, then the encoding converter is plugged in automatically. + * + * Returns the current char value and its lenght + */ + +int +htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { + if (ctxt->instate == XML_PARSER_EOF) + return(0); + + if (ctxt->token != 0) { + *len = 0; + return(ctxt->token); + } + if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { + /* + * We are supposed to handle UTF8, check it's valid + * From rfc2044: encoding of the Unicode values on UTF-8: + * + * UCS-4 range (hex.) UTF-8 octet sequence (binary) + * 0000 0000-0000 007F 0xxxxxxx + * 0000 0080-0000 07FF 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx + * + * Check for the 0x110000 limit too + */ + const unsigned char *cur = ctxt->input->cur; + unsigned char c; + unsigned int val; + + c = *cur; + if (c & 0x80) { + if (cur[1] == 0) + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + if ((cur[1] & 0xc0) != 0x80) + goto encoding_error; + if ((c & 0xe0) == 0xe0) { + + if (cur[2] == 0) + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + if ((cur[2] & 0xc0) != 0x80) + goto encoding_error; + if ((c & 0xf0) == 0xf0) { + if (cur[3] == 0) + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + if (((c & 0xf8) != 0xf0) || + ((cur[3] & 0xc0) != 0x80)) + goto encoding_error; + /* 4-byte code */ + *len = 4; + val = (cur[0] & 0x7) << 18; + val |= (cur[1] & 0x3f) << 12; + val |= (cur[2] & 0x3f) << 6; + val |= cur[3] & 0x3f; + } else { + /* 3-byte code */ + *len = 3; + val = (cur[0] & 0xf) << 12; + val |= (cur[1] & 0x3f) << 6; + val |= cur[2] & 0x3f; + } + } else { + /* 2-byte code */ + *len = 2; + val = (cur[0] & 0x1f) << 6; + val |= cur[1] & 0x3f; + } + if (!IS_CHAR(val)) { + if ((ctxt->sax != NULL) && + (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Char 0x%X out of allowed range\n", val); + ctxt->errNo = XML_ERR_INVALID_ENCODING; + 
ctxt->wellFormed = 0; + ctxt->disableSAX = 1; + } + return(val); + } else { + /* 1-byte code */ + *len = 1; + return((int) *ctxt->input->cur); + } + } + /* + * Assume it's a fixed lenght encoding (1) with + * a compatibke encoding for the ASCII set, since + * XML constructs only use < 128 chars + */ + *len = 1; + if ((int) *ctxt->input->cur < 0x80) + return((int) *ctxt->input->cur); + + /* + * Humm this is bad, do an automatic flow conversion + */ + xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); + ctxt->charset = XML_CHAR_ENCODING_UTF8; + return(xmlCurrentChar(ctxt, len)); + +encoding_error: + /* + * If we detect an UTF8 error that probably mean that the + * input encoding didn't get properly advertized in the + * declaration header. Report the error and switch the encoding + * to ISO-Latin-1 (if you don't like this policy, just declare the + * encoding !) + */ + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { + ctxt->sax->error(ctxt->userData, + "Input is not proper UTF-8, indicate encoding !\n"); + ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + ctxt->input->cur[0], ctxt->input->cur[1], + ctxt->input->cur[2], ctxt->input->cur[3]); + } + ctxt->errNo = XML_ERR_INVALID_ENCODING; + + ctxt->charset = XML_CHAR_ENCODING_8859_1; + *len = 1; + return((int) *ctxt->input->cur); +} /** * htmlNextChar: @@ -443,8 +605,8 @@ htmlTagLookup(const xmlChar *tag) { /** * htmlCheckAutoClose: - * @new: The new tag name - * @old: The old tag name + * @newtag: The new tag name + * @oldtag: The old tag name * * Checks wether the new tag is one of the registered valid tags for closing old. * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. @@ -452,7 +614,7 @@ htmlTagLookup(const xmlChar *tag) { * Returns 0 if no, 1 if yes. 
*/ int -htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) { +htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) { int i, index; char **close; @@ -462,13 +624,13 @@ htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) { for (index = 0; index < 100;index++) { close = htmlStartCloseIndex[index]; if (close == NULL) return(0); - if (!xmlStrcmp(BAD_CAST *close, new)) break; + if (!xmlStrcmp(BAD_CAST *close, newtag)) break; } i = close - htmlStartClose; i++; while (htmlStartClose[i] != NULL) { - if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) { + if (!xmlStrcmp(BAD_CAST htmlStartClose[i], oldtag)) { return(1); } i++; @@ -477,24 +639,73 @@ htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) { } /** + * htmlAutoCloseOnClose: + * @ctxt: an HTML parser context + * @newtag: The new tag name + * + * The HTmL DtD allows an ending tag to implicitely close other tags. + */ +void +htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { + htmlElemDescPtr info; + xmlChar *oldname; + int i; + +#ifdef DEBUG + fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr); + for (i = 0;i < ctxt->nameNr;i++) + fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]); +#endif + + for (i = (ctxt->nameNr - 1);i >= 0;i--) { + if (!xmlStrcmp(newtag, ctxt->nameTab[i])) break; + } + if (i < 0) return; + + while (xmlStrcmp(newtag, ctxt->name)) { + info = htmlTagLookup(ctxt->name); + if ((info == NULL) || (info->endTag == 1)) { +#ifdef DEBUG + fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name); +#endif + } else { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Opening and ending tag mismatch: %s and %s\n", + newtag, ctxt->name); + ctxt->wellFormed = 0; + } + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, ctxt->name); + oldname = htmlnamePop(ctxt); + if (oldname != NULL) { +#ifdef DEBUG + 
fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname); +#endif + xmlFree(oldname); + } + } +} + +/** * htmlAutoClose: * @ctxt: an HTML parser context - * @new: The new tag name or NULL + * @newtag: The new tag name or NULL * * The HTmL DtD allows a tag to implicitely close other tags. * The list is kept in htmlStartClose array. This function is * called when a new tag has been detected and generates the * appropriates closes if possible/needed. - * If new is NULL this mean we are at the end of the resource + * If newtag is NULL this mean we are at the end of the resource * and we should check */ void -htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) { +htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { xmlChar *oldname; - while ((new != NULL) && (ctxt->name != NULL) && - (htmlCheckAutoClose(new, ctxt->name))) { + while ((newtag != NULL) && (ctxt->name != NULL) && + (htmlCheckAutoClose(newtag, ctxt->name))) { #ifdef DEBUG - fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name); + fprintf(stderr,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name); #endif if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) ctxt->sax->endElement(ctxt->userData, ctxt->name); @@ -506,7 +717,12 @@ htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) { xmlFree(oldname); } } - while ((new == NULL) && (ctxt->name != NULL) && + if (newtag == NULL) { + htmlAutoCloseOnClose(ctxt, BAD_CAST"head"); + htmlAutoCloseOnClose(ctxt, BAD_CAST"body"); + htmlAutoCloseOnClose(ctxt, BAD_CAST"html"); + } + while ((newtag == NULL) && (ctxt->name != NULL) && ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) || (!xmlStrcmp(ctxt->name, BAD_CAST"body")) || (!xmlStrcmp(ctxt->name, BAD_CAST"html")))) { @@ -579,66 +795,17 @@ htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { } /** - * htmlAutoCloseOnClose: - * @ctxt: an HTML parser context - * @new: The new tag name - * - * The HTmL DtD allows an ending tag to implicitely close other tags. 
- */ -void -htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) { - htmlElemDescPtr info; - xmlChar *oldname; - int i; - -#ifdef DEBUG - fprintf(stderr,"Close of %s stack: %d elements\n", new, ctxt->nameNr); - for (i = 0;i < ctxt->nameNr;i++) - fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]); -#endif - - for (i = (ctxt->nameNr - 1);i >= 0;i--) { - if (!xmlStrcmp(new, ctxt->nameTab[i])) break; - } - if (i < 0) return; - - while (xmlStrcmp(new, ctxt->name)) { - info = htmlTagLookup(ctxt->name); - if ((info == NULL) || (info->endTag == 1)) { -#ifdef DEBUG - fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name); -#endif - } else { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "Opening and ending tag mismatch: %s and %s\n", - new, ctxt->name); - ctxt->wellFormed = 0; - } - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, ctxt->name); - oldname = htmlnamePop(ctxt); - if (oldname != NULL) { -#ifdef DEBUG - fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname); -#endif - xmlFree(oldname); - } - } -} - -/** * htmlCheckImplied: * @ctxt: an HTML parser context - * @new: The new tag name + * @newtag: The new tag name * * The HTmL DtD allows a tag to exists only implicitely * called when a new tag has been detected and generates the * appropriates implicit tags if missing */ void -htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *new) { - if (!xmlStrcmp(new, BAD_CAST"html")) +htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { + if (!xmlStrcmp(newtag, BAD_CAST"html")) return; if (ctxt->nameNr <= 0) { #ifdef DEBUG @@ -648,15 +815,15 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *new) { if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); } - if ((!xmlStrcmp(new, BAD_CAST"body")) || (!xmlStrcmp(new, BAD_CAST"head"))) + if ((!xmlStrcmp(newtag, 
BAD_CAST"body")) || (!xmlStrcmp(newtag, BAD_CAST"head"))) return; if (ctxt->nameNr <= 1) { - if ((!xmlStrcmp(new, BAD_CAST"script")) || - (!xmlStrcmp(new, BAD_CAST"style")) || - (!xmlStrcmp(new, BAD_CAST"meta")) || - (!xmlStrcmp(new, BAD_CAST"link")) || - (!xmlStrcmp(new, BAD_CAST"title")) || - (!xmlStrcmp(new, BAD_CAST"base"))) { + if ((!xmlStrcmp(newtag, BAD_CAST"script")) || + (!xmlStrcmp(newtag, BAD_CAST"style")) || + (!xmlStrcmp(newtag, BAD_CAST"meta")) || + (!xmlStrcmp(newtag, BAD_CAST"link")) || + (!xmlStrcmp(newtag, BAD_CAST"title")) || + (!xmlStrcmp(newtag, BAD_CAST"base"))) { /* * dropped OBJECT ... i you put it first BODY will be * assumed ! @@ -1006,6 +1173,114 @@ htmlEntityLookup(const xmlChar *name) { return(NULL); } +/** + * UTF8ToHtml: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of UTF-8 chars + * @inlen: the length of @in + * + * Take a block of UTF-8 chars in and try to convert it to an ASCII + * plus HTML entities block of chars out. + * + * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictiable. + * The value of @outlen after return is the number of octets consumed. 
+ */ +int +UTF8ToHtml(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen) { + const unsigned char* processed = in; + const unsigned char* outend; + const unsigned char* outstart = out; + const unsigned char* instart = in; + const unsigned char* inend; + unsigned int c, d; + int trailing; + + if (in == NULL) { + /* + * initialization nothing to do + */ + *outlen = 0; + *inlen = 0; + return(0); + } + inend = in + (*inlen); + outend = out + (*outlen); + while (in < inend) { + d = *in++; + if (d < 0x80) { c= d; trailing= 0; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } + else if (d < 0xF8) { c= d & 0x07; trailing= 3; } + else { + /* no chance for this in Ascii */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + + if (inend - in < trailing) { + break; + } + + for ( ; trailing; trailing--) { + if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) + break; + c <<= 6; + c |= d & 0x3F; + } + + /* assertion: c is a single UTF-4 value */ + if (c < 0x80) { + if (out >= outend) + break; + *out++ = c; + } else { + int i, j, len; + /* + * Try to lookup a predefined HTML entity for it + */ + + for (i = 0;i < (sizeof(html40EntitiesTable)/ + sizeof(html40EntitiesTable[0]));i++) { + if (html40EntitiesTable[i].value == c) { +#ifdef DEBUG + fprintf(stderr,"Found entity %s\n", name); +#endif + goto found_ent; + } + if (html40EntitiesTable[i].value > c) + break; + } + + /* no chance for this in Ascii */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); +found_ent: + len = strlen(html40EntitiesTable[i].name); + if (out + 2 + len >= outend) + break; + *out++ = '&'; + for (j = 0;j < len;j++) + *out++ = html40EntitiesTable[i].name[j]; + *out++ = ';'; + } + processed = in; + } + *outlen = out - outstart; + *inlen = processed 
- instart; + return(0); +} + /** * htmlDecodeEntities: @@ -1025,15 +1300,23 @@ htmlEntityLookup(const xmlChar *name) { xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, xmlChar end, xmlChar end2, xmlChar end3) { - xmlChar *buffer = NULL; - int buffer_size = 0; - xmlChar *out = NULL; xmlChar *name = NULL; - - xmlChar *cur = NULL; + xmlChar *buffer = NULL; + unsigned int buffer_size = 0; + unsigned int nbchars = 0; htmlEntityDescPtr ent; - int nbchars = 0; unsigned int max = (unsigned int) len; + int c,l; + + if (ctxt->depth > 40) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Detected entity reference loop\n"); + ctxt->wellFormed = 0; + ctxt->disableSAX = 1; + ctxt->errNo = XML_ERR_ENTITY_LOOP; + return(NULL); + } /* * allocate a translation buffer. @@ -1041,68 +1324,52 @@ htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, buffer_size = HTML_PARSER_BIG_BUFFER_SIZE; buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar)); if (buffer == NULL) { - perror("htmlDecodeEntities: malloc failed"); + perror("xmlDecodeEntities: malloc failed"); return(NULL); } - out = buffer; /* * Ok loop until we reach one of the ending char or a size limit. */ - while ((nbchars < (int) max) && (CUR != end) && - (CUR != end2) && (CUR != end3)) { - - if (CUR == '&') { - if (NXT(1) == '#') { - int val = htmlParseCharRef(ctxt); - /* invalid for UTF-8 variable encoding !!!!! */ - *out++ = val; - nbchars += 3; /* !!!! */ - } else { - ent = htmlParseEntityRef(ctxt, &name); - if (name != NULL) { - if ((ent == NULL) || (ent->value <= 0) || - (ent->value >= 255)) { - *out++ = '&'; - cur = name; - while (*cur != 0) { - if (out - buffer > buffer_size - 100) { - int index = out - buffer; - - growBuffer(buffer); - out = &buffer[index]; - } - *out++ = *cur++; - } - *out++ = ';'; - } else { - /* invalid for UTF-8 variable encoding !!!!! 
*/ - *out++ = (xmlChar)ent->value; - if (out - buffer > buffer_size - 100) { - int index = out - buffer; + c = CUR_CHAR(l); + while ((nbchars < max) && (c != end) && + (c != end2) && (c != end3)) { + + if (c == 0) break; + if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) { + int val = htmlParseCharRef(ctxt); + COPY_BUF(0,buffer,nbchars,val); + NEXTL(l); + } else if ((c == '&') && (ctxt->token != '&')) { + ent = htmlParseEntityRef(ctxt, &name); + if (name != NULL) { + if (ent != NULL) { + int val = ent->value; + COPY_BUF(0,buffer,nbchars,val); + NEXTL(l); + } else { + const xmlChar *cur = name; - growBuffer(buffer); - out = &buffer[index]; - } + buffer[nbchars++] = '&'; + if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) { + growBuffer(buffer); } - nbchars += 2 + xmlStrlen(name); - xmlFree(name); + while (*cur != 0) { + buffer[nbchars++] = *cur++; + } + buffer[nbchars++] = ';'; } } } else { - /* invalid for UTF-8 , use COPY(out); !!!!! */ - *out++ = CUR; - nbchars++; - if (out - buffer > buffer_size - 100) { - int index = out - buffer; - + COPY_BUF(l,buffer,nbchars,c); + NEXTL(l); + if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) { growBuffer(buffer); - out = &buffer[index]; } - NEXT; } + c = CUR_CHAR(l); } - *out++ = 0; + buffer[nbchars++] = 0; return(buffer); } @@ -1152,6 +1419,7 @@ htmlNewInputStream(htmlParserCtxtPtr ctxt) { ctxt->errNo = XML_ERR_NO_MEMORY; return(NULL); } + memset(input, 0, sizeof(htmlParserInput)); input->filename = NULL; input->directory = NULL; input->base = NULL; @@ -1161,6 +1429,7 @@ htmlNewInputStream(htmlParserCtxtPtr ctxt) { input->col = 1; input->buf = NULL; input->free = NULL; + input->version = NULL; input->consumed = 0; input->length = 0; return(input); @@ -1191,6 +1460,7 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { for (i = 0;i < len;i++) if (!(IS_BLANK(str[i]))) return(0); + if (CUR == 0) return(1); if (CUR != '<') return(0); if (ctxt->node == NULL) return(0); lastChild = 
xmlGetLastChild(ctxt->node); @@ -1427,8 +1697,22 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { if ((stop == 0) && (IS_BLANK(CUR))) break; if (CUR == '&') { if (NXT(1) == '#') { - int val = htmlParseCharRef(ctxt); - *out++ = val; + unsigned int c; + int bits; + + c = htmlParseCharRef(ctxt); + if (c < 0x80) + { *out++ = c; bits= -6; } + else if (c < 0x800) + { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } + else if (c < 0x10000) + { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } + else + { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } + + for ( ; bits >= 0; bits-= 6) { + *out++ = ((c >> bits) & 0x3F) | 0x80; + } } else { ent = htmlParseEntityRef(ctxt, &name); if (name == NULL) { @@ -1439,8 +1723,7 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { growBuffer(buffer); out = &buffer[index]; } - } else if ((ent == NULL) || (ent->value <= 0) || - (ent->value >= 255)) { + } else if (ent == NULL) { *out++ = '&'; cur = name; while (*cur != 0) { @@ -1454,23 +1737,53 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { } xmlFree(name); } else { - *out++ = ent->value; + unsigned int c; + int bits; + if (out - buffer > buffer_size - 100) { int index = out - buffer; growBuffer(buffer); out = &buffer[index]; } + c = (xmlChar)ent->value; + if (c < 0x80) + { *out++ = c; bits= -6; } + else if (c < 0x800) + { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } + else if (c < 0x10000) + { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } + else + { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } + + for ( ; bits >= 0; bits-= 6) { + *out++ = ((c >> bits) & 0x3F) | 0x80; + } xmlFree(name); } } } else { - *out++ = CUR; + unsigned int c; + int bits; + if (out - buffer > buffer_size - 100) { - int index = out - buffer; - - growBuffer(buffer); - out = &buffer[index]; + int index = out - buffer; + + growBuffer(buffer); + out = &buffer[index]; + } + c = CUR; + if (c < 0x80) + { *out++ = c; bits= -6; } + else if (c < 0x800) + { *out++ =((c 
>> 6) & 0x1F) | 0xC0; bits= 0; } + else if (c < 0x10000) + { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } + else + { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } + + for ( ; bits >= 0; bits-= 6) { + *out++ = ((c >> bits) & 0x3F) | 0x80; } NEXT; } @@ -1729,60 +2042,49 @@ htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { void htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) { - xmlChar *buf = NULL; - int len = 0; - int size = HTML_PARSER_BUFFER_SIZE; - xmlChar q; - - buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar)); - if (buf == NULL) { - fprintf(stderr, "malloc of %d byte failed\n", size); - return; - } - - q = CUR; - while ((IS_CHAR(q)) && (q != '<') && - (q != '&')) { - if ((q == ']') && (NXT(1) == ']') && - (NXT(2) == '>')) { - if (cdata) break; - else { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "Sequence ']]>' not allowed in content\n"); - ctxt->wellFormed = 0; + xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; + int nbchar = 0; + int cur, l; + + SHRINK; + cur = CUR_CHAR(l); + while (((cur != '<') || (ctxt->token == '<')) && + ((cur != '&') || (ctxt->token == '&')) && + (IS_CHAR(cur))) { + COPY_BUF(l,buf,nbchar,cur); + if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { + /* + * Ok the segment is to be consumed as chars. + */ + if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { + if (areBlanks(ctxt, buf, nbchar)) { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, + buf, nbchar); + } else { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, nbchar); + } } + nbchar = 0; } - if (len + 1 >= size) { - size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); - if (buf == NULL) { - fprintf(stderr, "realloc of %d byte failed\n", size); - return; + NEXTL(l); + cur = CUR_CHAR(l); + } + if (nbchar != 0) { + /* + * Ok the segment is to be consumed as chars. 
+ */ + if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { + if (areBlanks(ctxt, buf, nbchar)) { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); + } else { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, nbchar); } } - buf[len++] = q; - NEXT; - q = CUR; - } - if (len == 0) { - xmlFree(buf); - return; - } - - /* - * Ok the buffer is to be consumed as chars. - */ - if (ctxt->sax != NULL) { - if (areBlanks(ctxt, buf, len)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, buf, len); - } else { - if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, buf, len); - } } - xmlFree(buf); } /** @@ -1889,7 +2191,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { ((s != '>') || (r != '-') || (q != '-'))) { if (len + 1 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte failed\n", size); return; @@ -2130,8 +2432,6 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { if ((ctxt == NULL) || (attvalue == NULL)) return; -fprintf(stderr, "htmlCheckEncoding: \"%s\"\n", attvalue); - encoding = xmlStrstr(attvalue, BAD_CAST"charset="); if (encoding == NULL) encoding = xmlStrstr(attvalue, BAD_CAST"Charset="); @@ -2164,6 +2464,7 @@ fprintf(stderr, "htmlCheckEncoding: \"%s\"\n", attvalue); */ if (enc != XML_CHAR_ENCODING_ERROR) { xmlSwitchEncoding(ctxt, enc); + ctxt->charset = XML_CHAR_ENCODING_UTF8; } else { /* * fallback for unknown encodings @@ -2492,36 +2793,60 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) { void htmlParseReference(htmlParserCtxtPtr ctxt) { htmlEntityDescPtr ent; - xmlChar out[2]; + xmlChar out[6]; xmlChar *name; - int val; if (CUR != '&') return; if (NXT(1) == '#') { - val = htmlParseCharRef(ctxt); - /* invalid for UTF-8 variable encoding !!!!! 
*/ - out[0] = val; - out[1] = 0; + unsigned int c; + int bits, i = 0; + + c = htmlParseCharRef(ctxt); + if (c < 0x80) { out[i++]= c; bits= -6; } + else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } + else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } + else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } + + for ( ; bits >= 0; bits-= 6) { + out[i++]= ((c >> bits) & 0x3F) | 0x80; + } + out[i] = 0; + if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) - ctxt->sax->characters(ctxt->userData, out, 1); + ctxt->sax->characters(ctxt->userData, out, i); } else { ent = htmlParseEntityRef(ctxt, &name); if (name == NULL) { ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); return; } - if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) { + if ((ent == NULL) || (ent->value <= 0)) { if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */ } } else { - /* invalid for UTF-8 variable encoding !!!!! 
*/ - out[0] = ent->value; - out[1] = 0; + unsigned int c; + int bits, i = 0; + + c = ent->value; + if (c < 0x80) + { out[i++]= c; bits= -6; } + else if (c < 0x800) + { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } + else if (c < 0x10000) + { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } + else + { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } + + for ( ; bits >= 0; bits-= 6) { + out[i++]= ((c >> bits) & 0x3F) | 0x80; + } + out[i] = 0; + if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) - ctxt->sax->characters(ctxt->userData, out, 1); + ctxt->sax->characters(ctxt->userData, out, i); } xmlFree(name); } @@ -2761,10 +3086,12 @@ htmlParseElement(htmlParserCtxtPtr ctxt) { } if (!IS_CHAR(CUR)) { + /************ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Premature end of data in tag %s\n", currentNode); ctxt->wellFormed = 0; + *************/ /* * end of parsing of this node. @@ -3458,9 +3785,17 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if ((avail == 1) && (terminate)) { cur = in->cur[0]; if ((cur != '<') && (cur != '&')) { - if ((ctxt->sax != NULL) && - (ctxt->sax->characters != NULL)) - ctxt->sax->characters(ctxt->userData, &cur, 1); + if (ctxt->sax != NULL) { + if (IS_BLANK(cur)) { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace( + ctxt->userData, &cur, 1); + } else { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters( + ctxt->userData, &cur, 1); + } + } ctxt->token = 0; ctxt->checkIndex = 0; NEXT; @@ -3599,6 +3934,14 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { fprintf(stderr, "HPP: entering START_TAG\n"); #endif break; + case XML_PARSER_SYSTEM_LITERAL: + fprintf(stderr, "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n"); + ctxt->instate = XML_PARSER_CONTENT; + ctxt->checkIndex = 0; +#ifdef DEBUG_PUSH + fprintf(stderr, "HPP: entering CONTENT\n"); +#endif + break; } } done: diff --git a/HTMLparser.h b/HTMLparser.h index 
44d9c271..b04e3b09 100644 --- a/HTMLparser.h +++ b/HTMLparser.h @@ -81,6 +81,10 @@ htmlDocPtr htmlSAXParseFile(const char *filename, void *userData); htmlDocPtr htmlParseFile (const char *filename, const char *encoding); +int UTF8ToHtml (unsigned char* out, + int *outlen, + const unsigned char* in, + int *inlen); /** * Interfaces for the Push mode @@ -32,6 +32,305 @@ #include <libxml/entities.h> #include <libxml/valid.h> +/************************************************************************ + * * + * Getting/Setting encoding meta tags * + * * + ************************************************************************/ + +/** + * htmlGetMetaEncoding: + * @doc: the document + * + * Encoding definition lookup in the Meta tags + * + * Returns the current encoding as flagged in the HTML source + */ +const xmlChar * +htmlGetMetaEncoding(htmlDocPtr doc) { + htmlNodePtr cur; + const xmlChar *content; + const xmlChar *encoding; + + if (doc == NULL) + return(NULL); + cur = doc->children; + + /* + * Search the html + */ + while (cur != NULL) { + if (cur->name != NULL) { + if (!xmlStrcmp(cur->name, BAD_CAST"html")) + break; + if (!xmlStrcmp(cur->name, BAD_CAST"head")) + goto found_head; + if (!xmlStrcmp(cur->name, BAD_CAST"meta")) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(NULL); + cur = cur->children; + + /* + * Search the head + */ + while (cur != NULL) { + if (cur->name != NULL) { + if (!xmlStrcmp(cur->name, BAD_CAST"head")) + break; + if (!xmlStrcmp(cur->name, BAD_CAST"meta")) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(NULL); +found_head: + cur = cur->children; + + /* + * Search the meta elements + */ +found_meta: + while (cur != NULL) { + if (cur->name != NULL) { + if (!xmlStrcmp(cur->name, BAD_CAST"meta")) { + xmlAttrPtr attr = cur->properties; + int http; + const xmlChar *value; + + content = NULL; + http = 0; + while (attr != NULL) { + if ((attr->children != NULL) && + (attr->children->type == 
XML_TEXT_NODE) && + (attr->children->next == NULL)) { +#ifndef XML_USE_BUFFER_CONTENT + value = attr->children->content; +#else + value = xmlBufferContent(attr->children->content); +#endif + if (((!xmlStrcmp(attr->name, BAD_CAST"http-equiv")) || + (!xmlStrcmp(attr->name, BAD_CAST"Http-Equiv")) || + (!xmlStrcmp(attr->name, BAD_CAST"HTTP-EQUIV"))) && + ((!xmlStrcmp(value, BAD_CAST"Content-Type")) || + (!xmlStrcmp(value, BAD_CAST"content-type")) || + (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE")))) + http = 1; + else if ((value != NULL) && + ((!xmlStrcmp(attr->name, BAD_CAST"content")) || + (!xmlStrcmp(attr->name, BAD_CAST"Content")) || + (!xmlStrcmp(attr->name, BAD_CAST"CONTENT")))) + content = value; + if ((http != 0) && (content != NULL)) + goto found_content; + } + attr = attr->next; + } + } + } + cur = cur->next; + } + return(NULL); + +found_content: + encoding = xmlStrstr(content, BAD_CAST"charset="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"Charset="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"CHARSET="); + if (encoding != NULL) { + encoding += 8; + } else { + encoding = xmlStrstr(content, BAD_CAST"charset ="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"Charset ="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); + if (encoding != NULL) + encoding += 9; + } + if (encoding != NULL) { + while ((*encoding == ' ') || (*encoding == '\t')) encoding++; + } + return(encoding); +} + +/** + * htmlSetMetaEncoding: + * @doc: the document + * @encoding: the encoding string + * + * Sets the current encoding in the Meta tags + * NOTE: this will not change the document content encoding, just + * the META flag associated. 
+ * + * Returns 0 in case of success and -1 in case of error + */ +int +htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { + htmlNodePtr cur, meta; + const xmlChar *content; + char newcontent[100]; + + + if (doc == NULL) + return(-1); + + if (encoding != NULL) { +#ifndef HAVE_SNPRINTF + sprintf(newcontent, "text/html; charset=%s", encoding); +#else /* HAVE_SNPRINTF */ + snprintf(newcontent, 99, "text/html; charset=%s", encoding); +#endif /* HAVE_SNPRINTF */ + newcontent[99] = 0; + } + + cur = doc->children; + + /* + * Search the html + */ + while (cur != NULL) { + if (cur->name != NULL) { + if (!xmlStrcmp(cur->name, BAD_CAST"html")) + break; + if (!xmlStrcmp(cur->name, BAD_CAST"body")) { + if (encoding == NULL) + return(0); + meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL); + xmlAddPrevSibling(cur, meta); + cur = meta; + meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); + xmlAddChild(cur, meta); + xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); + xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); + return(0); + } + if (!xmlStrcmp(cur->name, BAD_CAST"head")) + goto found_head; + if (!xmlStrcmp(cur->name, BAD_CAST"meta")) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(-1); + cur = cur->children; + + /* + * Search the head + */ + while (cur != NULL) { + if (cur->name != NULL) { + if (!xmlStrcmp(cur->name, BAD_CAST"head")) + break; + if (!xmlStrcmp(cur->name, BAD_CAST"body")) { + if (encoding == NULL) + return(0); + meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL); + xmlAddPrevSibling(cur, meta); + cur = meta; + meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); + xmlAddChild(cur, meta); + xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); + xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); + return(0); + } + if (!xmlStrcmp(cur->name, BAD_CAST"meta")) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(-1); +found_head: + if (cur->children == 
NULL) { + if (encoding == NULL) + return(0); + meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); + xmlAddChild(cur, meta); + xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); + xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); + return(0); + } + cur = cur->children; + +found_meta: + if (encoding != NULL) { + /* + * Create a new Meta element with the right aatributes + */ + + meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); + xmlAddPrevSibling(cur, meta); + xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); + xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); + } + + /* + * Search and destroy all the remaining the meta elements carrying + * encoding informations + */ + while (cur != NULL) { + if (cur->name != NULL) { + if (!xmlStrcmp(cur->name, BAD_CAST"meta")) { + xmlAttrPtr attr = cur->properties; + int http; + const xmlChar *value; + + content = NULL; + http = 0; + while (attr != NULL) { + if ((attr->children != NULL) && + (attr->children->type == XML_TEXT_NODE) && + (attr->children->next == NULL)) { +#ifndef XML_USE_BUFFER_CONTENT + value = attr->children->content; +#else + value = xmlBufferContent(attr->children->content); +#endif + if (((!xmlStrcmp(attr->name, BAD_CAST"http-equiv")) || + (!xmlStrcmp(attr->name, BAD_CAST"Http-Equiv")) || + (!xmlStrcmp(attr->name, BAD_CAST"HTTP-EQUIV"))) && + ((!xmlStrcmp(value, BAD_CAST"Content-Type")) || + (!xmlStrcmp(value, BAD_CAST"content-type")) || + (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE")))) + http = 1; + else if ((value != NULL) && + ((!xmlStrcmp(attr->name, BAD_CAST"content")) || + (!xmlStrcmp(attr->name, BAD_CAST"Content")) || + (!xmlStrcmp(attr->name, BAD_CAST"CONTENT")))) + content = value; + if ((http != 0) && (content != NULL)) + break; + } + attr = attr->next; + } + if ((http != 0) && (content != NULL)) { + meta = cur; + cur = cur->next; + xmlUnlinkNode(meta); + xmlFreeNode(meta); + continue; + } + + } + } + cur = cur->next; + } + return(0); +} + 
+/************************************************************************ + * * + * Dumping HTML tree content to a simple buffer * + * * + ************************************************************************/ + static void htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur); @@ -168,7 +467,6 @@ htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { if (cur->content != NULL) { xmlChar *buffer; - /* uses the HTML encoding routine !!!!!!!!!! */ #ifndef XML_USE_BUFFER_CONTENT buffer = xmlEncodeEntitiesReentrant(doc, cur->content); #else @@ -319,7 +617,7 @@ htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) { htmlNodeListDump(buf, cur, cur->children); } xmlBufferWriteChar(buf, "\n"); - cur->type = type; + cur->type = (xmlElementType) type; } /** @@ -357,59 +655,470 @@ htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { } +/************************************************************************ + * * + * Dumping HTML tree content to an I/O output buffer * + * * + ************************************************************************/ + +static void +htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding); + +/** + * htmlDtdDump: + * @buf: the HTML buffer output + * @doc: the document + * + * Dump the HTML document DTD, if any. 
+ */ +static void +htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, const char *encoding) { + xmlDtdPtr cur = doc->intSubset; + + if (cur == NULL) { + fprintf(stderr, "htmlDtdDump : no internal subset\n"); + return; + } + xmlOutputBufferWriteString(buf, "<!DOCTYPE "); + xmlOutputBufferWriteString(buf, (const char *)cur->name); + if (cur->ExternalID != NULL) { + xmlOutputBufferWriteString(buf, " PUBLIC "); + xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID); + if (cur->SystemID != NULL) { + xmlOutputBufferWriteString(buf, " "); + xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); + } + } else if (cur->SystemID != NULL) { + xmlOutputBufferWriteString(buf, " SYSTEM "); + xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); + } + xmlOutputBufferWriteString(buf, ">\n"); +} + +/** + * htmlAttrDump: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the attribute pointer + * + * Dump an HTML attribute + */ +static void +htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) { + xmlChar *value; + + if (cur == NULL) { + fprintf(stderr, "htmlAttrDump : property == NULL\n"); + return; + } + xmlOutputBufferWriteString(buf, " "); + xmlOutputBufferWriteString(buf, (const char *)cur->name); + if (cur->children != NULL) { + value = xmlNodeListGetString(doc, cur->children, 0); + if (value) { + xmlOutputBufferWriteString(buf, "="); + xmlBufferWriteQuotedString(buf->buffer, value); + xmlFree(value); + } else { + xmlOutputBufferWriteString(buf, "=\"\""); + } + } +} + +/** + * htmlAttrListDump: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the first attribute pointer + * + * Dump a list of HTML attributes + */ +static void +htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) { + if (cur == NULL) { + fprintf(stderr, "htmlAttrListDump : property == NULL\n"); + return; + } + while (cur != NULL) { + htmlAttrDumpOutput(buf, doc, cur, encoding); + 
cur = cur->next; + } +} + + +void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + xmlNodePtr cur, const char *encoding); + +/** + * htmlNodeListDump: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the first node + * + * Dump an HTML node list, recursive behaviour,children are printed too. + */ +static void +htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) { + if (cur == NULL) { + fprintf(stderr, "htmlNodeListDump : node == NULL\n"); + return; + } + while (cur != NULL) { + htmlNodeDumpOutput(buf, doc, cur, encoding); + cur = cur->next; + } +} + +/** + * htmlNodeDump: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the current node + * + * Dump an HTML node, recursive behaviour,children are printed too. + */ +void +htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) { + htmlElemDescPtr info; + + if (cur == NULL) { + fprintf(stderr, "htmlNodeDump : node == NULL\n"); + return; + } + /* + * Special cases. 
+ */ + if (cur->type == XML_DTD_NODE) + return; + if (cur->type == XML_HTML_DOCUMENT_NODE) { + htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding); + return; + } + if (cur->type == HTML_TEXT_NODE) { + if (cur->content != NULL) { + xmlChar *buffer; + +#ifndef XML_USE_BUFFER_CONTENT + buffer = xmlEncodeEntitiesReentrant(doc, cur->content); +#else + buffer = xmlEncodeEntitiesReentrant(doc, + xmlBufferContent(cur->content)); +#endif + if (buffer != NULL) { + xmlOutputBufferWriteString(buf, (const char *)buffer); + xmlFree(buffer); + } + } + return; + } + if (cur->type == HTML_COMMENT_NODE) { + if (cur->content != NULL) { + xmlOutputBufferWriteString(buf, "<!--"); +#ifndef XML_USE_BUFFER_CONTENT + xmlOutputBufferWriteString(buf, (const char *)cur->content); +#else + xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content)); +#endif + xmlOutputBufferWriteString(buf, "-->"); + } + return; + } + if (cur->type == HTML_ENTITY_REF_NODE) { + xmlOutputBufferWriteString(buf, "&"); + xmlOutputBufferWriteString(buf, (const char *)cur->name); + xmlOutputBufferWriteString(buf, ";"); + return; + } + + /* + * Get specific HTmL info for taht node. 
+ */ + info = htmlTagLookup(cur->name); + + xmlOutputBufferWriteString(buf, "<"); + xmlOutputBufferWriteString(buf, (const char *)cur->name); + if (cur->properties != NULL) + htmlAttrListDumpOutput(buf, doc, cur->properties, encoding); + + if ((info != NULL) && (info->empty)) { + xmlOutputBufferWriteString(buf, ">"); + if (cur->next != NULL) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE)) + xmlOutputBufferWriteString(buf, "\n"); + } + return; + } + if ((cur->content == NULL) && (cur->children == NULL)) { + if ((info != NULL) && (info->endTag != 0)) + xmlOutputBufferWriteString(buf, ">"); + else { + xmlOutputBufferWriteString(buf, "></"); + xmlOutputBufferWriteString(buf, (const char *)cur->name); + xmlOutputBufferWriteString(buf, ">"); + } + if (cur->next != NULL) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE)) + xmlOutputBufferWriteString(buf, "\n"); + } + return; + } + xmlOutputBufferWriteString(buf, ">"); + if (cur->content != NULL) { +#if 0 + xmlChar *buffer; + +#ifndef XML_USE_BUFFER_CONTENT + buffer = xmlEncodeEntitiesReentrant(doc, cur->content); +#else + buffer = xmlEncodeEntitiesReentrant(doc, + xmlBufferContent(cur->content)); +#endif + if (buffer != NULL) { + xmlOutputBufferWriteString(buf, buffer); + xmlFree(buffer); + } +#else + /* + * Uses the OutputBuffer property to automatically convert + * invalids to charrefs + */ + +#ifndef XML_USE_BUFFER_CONTENT + xmlOutputBufferWriteString(buf, (const char *) cur->content); +#else + xmlOutputBufferWriteString(buf, + (const char *) xmlBufferContent(cur->content)); +#endif +#endif + } + if (cur->children != NULL) { + if ((cur->children->type != HTML_TEXT_NODE) && + (cur->children->type != HTML_ENTITY_REF_NODE) && + (cur->children != cur->last)) + xmlOutputBufferWriteString(buf, "\n"); + htmlNodeListDumpOutput(buf, doc, cur->children, encoding); + if ((cur->last->type != HTML_TEXT_NODE) && + (cur->last->type != 
HTML_ENTITY_REF_NODE) && + (cur->children != cur->last)) + xmlOutputBufferWriteString(buf, "\n"); + } + if (!htmlIsAutoClosed(doc, cur)) { + xmlOutputBufferWriteString(buf, "</"); + xmlOutputBufferWriteString(buf, (const char *)cur->name); + xmlOutputBufferWriteString(buf, ">"); + } + if (cur->next != NULL) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE)) + xmlOutputBufferWriteString(buf, "\n"); + } +} + +/** + * htmlDocContentDump: + * @buf: the HTML buffer output + * @cur: the document + * + * Dump an HTML document. + */ +static void +htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding) { + int type; + + /* + * force to output the stuff as HTML, especially for entities + */ + type = cur->type; + cur->type = XML_HTML_DOCUMENT_NODE; + if (cur->intSubset != NULL) + htmlDtdDumpOutput(buf, cur, NULL); + else { + /* Default to HTML-4.0 transitionnal @@@@ */ + xmlOutputBufferWriteString(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">"); + + } + if (cur->children != NULL) { + htmlNodeListDumpOutput(buf, cur, cur->children, encoding); + } + xmlOutputBufferWriteString(buf, "\n"); + cur->type = (xmlElementType) type; +} + + +/************************************************************************ + * * + * Saving functions front-ends * + * * + ************************************************************************/ + /** * htmlDocDump: * @f: the FILE* * @cur: the document * * Dump an HTML document to an open FILE. + * + * returns: the number of byte written or -1 in case of failure. 
*/ -void +int htmlDocDump(FILE *f, xmlDocPtr cur) { - xmlBufferPtr buf; + xmlOutputBufferPtr buf; + xmlCharEncodingHandlerPtr handler = NULL; + const char *encoding; + int ret; if (cur == NULL) { #ifdef DEBUG_TREE fprintf(stderr, "htmlDocDump : document == NULL\n"); #endif - return; + return(-1); } - buf = xmlBufferCreate(); - if (buf == NULL) return; - htmlDocContentDump(buf, cur); - xmlBufferDump(f, buf); - xmlBufferFree(buf); + + encoding = (const char *) htmlGetMetaEncoding(cur); + + if (encoding != NULL) { + xmlCharEncoding enc; + + enc = xmlParseCharEncoding(encoding); + if (enc != cur->charset) { + if (cur->charset != XML_CHAR_ENCODING_UTF8) { + /* + * Not supported yet + */ + return(-1); + } + + handler = xmlFindCharEncodingHandler(encoding); + if (handler == NULL) + return(-1); + } + } + + /* + * Fallback to HTML or ASCII when the encoding is unspecified + */ + if (handler == NULL) + handler = xmlFindCharEncodingHandler("HTML"); + if (handler == NULL) + handler = xmlFindCharEncodingHandler("ascii"); + + buf = xmlOutputBufferCreateFile(f, handler); + if (buf == NULL) return(-1); + htmlDocContentDumpOutput(buf, cur, NULL); + + ret = xmlOutputBufferClose(buf); + return(ret); } /** * htmlSaveFile: - * @filename: the filename + * @filename: the filename (or URL) * @cur: the document * - * Dump an HTML document to a file. - * + * Dump an HTML document to a file. If @filename is "-" the stdout file is + * used. * returns: the number of byte written or -1 in case of failure. 
*/ int htmlSaveFile(const char *filename, xmlDocPtr cur) { - xmlBufferPtr buf; - FILE *output = NULL; + xmlOutputBufferPtr buf; + xmlCharEncodingHandlerPtr handler = NULL; + const char *encoding; int ret; + encoding = (const char *) htmlGetMetaEncoding(cur); + + if (encoding != NULL) { + xmlCharEncoding enc; + + enc = xmlParseCharEncoding(encoding); + if (enc != cur->charset) { + if (cur->charset != XML_CHAR_ENCODING_UTF8) { + /* + * Not supported yet + */ + return(-1); + } + + handler = xmlFindCharEncodingHandler(encoding); + if (handler == NULL) + return(-1); + } + } + + /* + * Fallback to HTML or ASCII when the encoding is unspecified + */ + if (handler == NULL) + handler = xmlFindCharEncodingHandler("HTML"); + if (handler == NULL) + handler = xmlFindCharEncodingHandler("ascii"); + /* * save the content to a temp buffer. */ - buf = xmlBufferCreate(); + buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); if (buf == NULL) return(0); - htmlDocContentDump(buf, cur); - output = fopen(filename, "w"); - if (output == NULL) return(-1); - ret = xmlBufferDump(output, buf); - fclose(output); + htmlDocContentDumpOutput(buf, cur, NULL); - xmlBufferFree(buf); - return(ret * sizeof(xmlChar)); + ret = xmlOutputBufferClose(buf); + return(ret); } +/** + * htmlSaveFileEnc: + * @filename: the filename + * @cur: the document + * + * Dump an HTML document to a file using a given encoding. + * + * returns: the number of byte written or -1 in case of failure. 
+ */ +int +htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { + xmlOutputBufferPtr buf; + xmlCharEncodingHandlerPtr handler = NULL; + int ret; + + if (encoding != NULL) { + xmlCharEncoding enc; + + enc = xmlParseCharEncoding(encoding); + if (enc != cur->charset) { + if (cur->charset != XML_CHAR_ENCODING_UTF8) { + /* + * Not supported yet + */ + return(-1); + } + + handler = xmlFindCharEncodingHandler(encoding); + if (handler == NULL) + return(-1); + htmlSetMetaEncoding(cur, (const xmlChar *) encoding); + } + } + + /* + * Fallback to HTML or ASCII when the encoding is unspecified + */ + if (handler == NULL) + handler = xmlFindCharEncodingHandler("HTML"); + if (handler == NULL) + handler = xmlFindCharEncodingHandler("ascii"); + + /* + * save the content to a temp buffer. + */ + buf = xmlOutputBufferCreateFilename(filename, handler, 0); + if (buf == NULL) return(0); + + htmlDocContentDumpOutput(buf, cur, encoding); + + ret = xmlOutputBufferClose(buf); + return(ret); +} #endif /* LIBXML_HTML_ENABLED */ @@ -23,12 +23,27 @@ extern "C" { #define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE #define HTML_COMMENT_NODE XML_COMMENT_NODE -void htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size); -void htmlDocDump(FILE *f, xmlDocPtr cur); -int htmlSaveFile(const char *filename, xmlDocPtr cur); -void htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur); -void htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur); -htmlDocPtr htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID); +htmlDocPtr htmlNewDoc (const xmlChar *URI, + const xmlChar *ExternalID); +const xmlChar * htmlGetMetaEncoding (htmlDocPtr doc); +int htmlSetMetaEncoding (htmlDocPtr doc, + const xmlChar *encoding); +void htmlDocDumpMemory (xmlDocPtr cur, + xmlChar**mem, + int *size); +int htmlDocDump (FILE *f, + xmlDocPtr cur); +int htmlSaveFile (const char *filename, + xmlDocPtr cur); +void htmlNodeDump (xmlBufferPtr buf, + xmlDocPtr doc, + xmlNodePtr cur); +void 
htmlNodeDumpFile (FILE *out, + xmlDocPtr doc, + xmlNodePtr cur); +int htmlSaveFileEnc (const char *filename, + xmlDocPtr cur, + const char *encoding); #ifdef __cplusplus } @@ -14,13 +14,14 @@ #endif #include <stdio.h> #include <stdlib.h> +#include <string.h> #include <libxml/xmlmemory.h> #include <libxml/tree.h> #include <libxml/parser.h> #include <libxml/parserInternals.h> #include <libxml/valid.h> #include <libxml/entities.h> -#include "xml-error.h" +#include <libxml/xml-error.h> #include <libxml/debugXML.h> #include <libxml/xmlIO.h> #include <libxml/SAX.h> @@ -206,7 +207,7 @@ externalSubset(void *ctx, const xmlChar *name, int oldwellFormed; xmlParserInputPtr input = NULL; xmlCharEncoding enc; - xmlCharEncoding oldcharset; + int oldcharset; /* * Ask the Entity resolver to load the damn thing @@ -426,10 +427,12 @@ attributeDecl(void *ctx, const xmlChar *elem, const xmlChar *fullname, name = xmlSplitQName(ctxt, fullname, &prefix); if (ctxt->inSubset == 1) attr = xmlAddAttributeDecl(&ctxt->vctxt, ctxt->myDoc->intSubset, elem, - name, prefix, type, def, defaultValue, tree); + name, prefix, (xmlAttributeType) type, + (xmlAttributeDefault) def, defaultValue, tree); else if (ctxt->inSubset == 2) attr = xmlAddAttributeDecl(&ctxt->vctxt, ctxt->myDoc->extSubset, elem, - name, prefix, type, def, defaultValue, tree); + name, prefix, (xmlAttributeType) type, + (xmlAttributeDefault) def, defaultValue, tree); else { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt, @@ -470,10 +473,10 @@ elementDecl(void *ctx, const xmlChar *name, int type, if (ctxt->inSubset == 1) elem = xmlAddElementDecl(&ctxt->vctxt, ctxt->myDoc->intSubset, - name, type, content); + name, (xmlElementTypeVal) type, content); else if (ctxt->inSubset == 2) elem = xmlAddElementDecl(&ctxt->vctxt, ctxt->myDoc->extSubset, - name, type, content); + name, (xmlElementTypeVal) type, content); else { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt, @@ -16,7 +16,7 
@@ #include <libxml/xlink.h> #ifdef __cplusplus -#define extern "C" { +extern "C" { #endif const xmlChar * getPublicId (void *ctx); const xmlChar * getSystemId (void *ctx); @@ -6,8 +6,6 @@ TODO: ===== -- If the internal encoding is not UTF8 saving to a given encoding doesn't - work - problem when parsing hrefs with & with the HTML parser (IRC ac) - DOM needs xmlAttrPtr xmlNewDocProp(xmlDocPtr doc, const xmlChar *name, const xmlChar *value) @@ -26,11 +24,8 @@ TODO: http://www.w3.org/XML/xml-19980210-errata ... bummmer - Handle undefined namespaces in entity contents better ... at least issue a warning -- Issue warning when using non-absolute namespaces URI. - fix --disable-corba configure switch handling, and use XML_WITHOUT_CORBA not WITHOUT_CORBA flag -- the html parser should add <head> and <body> if they don't exist - started, not finished. TODO: ===== @@ -96,8 +91,16 @@ EXTENSIONS: Done: ===== +- If the internal encoding is not UTF8 saving to a given encoding doesn't + work => fix to force UTF8 encoding ... + done, added documentation too +- Add an ASCII I/O encoder (asciiToUTF8 and UTF8Toascii) +- Issue warning when using non-absolute namespaces URI. +- the html parser should add <head> and <body> if they don't exist + started, not finished. + Done, the automatic closing is added and 3 testcases were inserted - Command to force the parser to stop parsing and ignore the rest of the file. - xmlStopParser() should allow this + xmlStopParser() should allow this, mostly untested - support for HTML empty attributes like <hr noshade> - plugged iconv() in for support of a large set of encodings. 
- xmlSwitchToEncoding() rewrite done diff --git a/configure.in b/configure.in index 7c253234..0ae4e05d 100644 --- a/configure.in +++ b/configure.in @@ -4,8 +4,8 @@ AC_INIT(entities.h) AM_CONFIG_HEADER(config.h) LIBXML_MAJOR_VERSION=2 -LIBXML_MINOR_VERSION=1 -LIBXML_MICRO_VERSION=1 +LIBXML_MINOR_VERSION=2 +LIBXML_MICRO_VERSION=0 LIBXML_VERSION=$LIBXML_MAJOR_VERSION.$LIBXML_MINOR_VERSION.$LIBXML_MICRO_VERSION LIBXML_VERSION_INFO=`expr $LIBXML_MAJOR_VERSION + $LIBXML_MINOR_VERSION`:$LIBXML_MICRO_VERSION:$LIBXML_MINOR_VERSION @@ -17,6 +17,7 @@ #ifdef LIBXML_DEBUG_ENABLED #include <stdio.h> +#include <string.h> #ifdef HAVE_STDLIB_H #include <stdlib.h> #endif @@ -39,6 +40,8 @@ void xmlDebugDumpString(FILE *output, const xmlChar *str) { for (i = 0;i < 40;i++) if (str[i] == 0) return; else if (IS_BLANK(str[i])) fputc(' ', output); + else if (str[i] >= 0x80) + fprintf(output, "#%X", str[i]); else fputc(str[i], output); fprintf(output, "..."); } @@ -221,9 +224,11 @@ void xmlDebugDumpElemDecl(FILE *output, xmlElementPtr elem, int depth) { fprintf(output, "PBM: not a Elem\n"); return; } - if (elem->name != NULL) - fprintf(output, "ELEMDECL(%s)", elem->name); - else + if (elem->name != NULL) { + fprintf(output, "ELEMDECL("); + xmlDebugDumpString(output, elem->name); + fprintf(output, ")"); + } else fprintf(output, "PBM ELEMDECL noname!!!"); switch (elem->etype) { case XML_ELEMENT_TYPE_EMPTY: @@ -288,9 +293,11 @@ void xmlDebugDumpEntityDecl(FILE *output, xmlEntityPtr ent, int depth) { fprintf(output, "PBM: not a Entity decl\n"); return; } - if (ent->name != NULL) - fprintf(output, "ENTITYDECL(%s)", ent->name); - else + if (ent->name != NULL) { + fprintf(output, "ENTITYDECL("); + xmlDebugDumpString(output, ent->name); + fprintf(output, ")"); + } else fprintf(output, "PBM ENTITYDECL noname!!!"); switch (ent->etype) { case XML_INTERNAL_GENERAL_ENTITY: @@ -434,7 +441,9 @@ void xmlDebugDumpAttr(FILE *output, xmlAttrPtr attr, int depth) { fprintf(output, shift); - fprintf(output, 
"ATTRIBUTE %s\n", attr->name); + fprintf(output, "ATTRIBUTE "); + xmlDebugDumpString(output, attr->name); + fprintf(output, "\n"); if (attr->children != NULL) xmlDebugDumpNodeList(output, attr->children, depth + 1); @@ -479,10 +488,12 @@ void xmlDebugDumpOneNode(FILE *output, xmlNodePtr node, int depth) { case XML_ELEMENT_NODE: fprintf(output, shift); fprintf(output, "ELEMENT "); - if (node->ns != NULL) - fprintf(output, "%s:%s\n", node->ns->prefix, node->name); - else - fprintf(output, "%s\n", node->name); + if (node->ns != NULL) { + xmlDebugDumpString(output, node->ns->prefix); + fprintf(output, ":"); + } + xmlDebugDumpString(output, node->name); + fprintf(output, "\n"); break; case XML_ATTRIBUTE_NODE: fprintf(output, shift); diff --git a/doc/html/book1.html b/doc/html/book1.html index e066da65..e69de29b 100644 --- a/doc/html/book1.html +++ b/doc/html/book1.html @@ -1,276 +0,0 @@ -<HTML -><HEAD -><TITLE ->Gnome XML Library Reference Manual</TITLE -><META -NAME="GENERATOR" -CONTENT="Modular DocBook HTML Stylesheet Version 1.33"><LINK -REL="NEXT" -TITLE="Libxml Programming Notes" -HREF="libxml-notes.html"></HEAD -><BODY -BGCOLOR="#FFFFFF" -TEXT="#000000" -><DIV -CLASS="BOOK" -><DIV -CLASS="TITLEPAGE" -><TABLE -WIDTH="100%" -BORDER="0" -BGCOLOR="#000000" -CELLPADDING="1" -CELLSPACING="0" -><TR -><TH -ALIGN="center" -VALIGN="center" -><FONT -COLOR="#FFFFFF" -SIZE="7" -><P -CLASS="TITLE" -><A -NAME="AEN2" ->Gnome XML Library Reference Manual</A -></P -></FONT -></TH -></TR -></TABLE -><H3 -CLASS="AUTHOR" ->Daniel Veillard</H3 -><DIV -CLASS="AFFILIATION" -><DIV -CLASS="ADDRESS" -><P -CLASS="LITERALLAYOUT" -> Daniel.Veillard@w3.org<br> - </P -></DIV -></DIV -><P -CLASS="COPYRIGHT" ->Copyright © 1999 by <SPAN -CLASS="HOLDER" ->Daniel Veillard</SPAN -></P -><DIV -><DIV -CLASS="ABSTRACT" -><P -></P -><P ->This manual documents the interfaces of the libxml - library and has some short notes to help get you up to speed - with using the library.</P -><P -></P -></DIV -></DIV 
-><DIV -CLASS="LEGALNOTICE" -><P -></P -><P ->Permission is granted to make and distribute verbatim - copies of this manual provided the copyright notice and this - permission notice are preserved on all copies.</P -><P ->Permission is granted to copy and distribute modified - versions of this manual under the conditions for verbatim - copying, provided also that the entire resulting derived work is - distributed under the terms of a permission notice identical to - this one.</P -><P ->Permission is granted to copy and distribute translations - of this manual into another language, under the above conditions - for modified versions.</P -><P -></P -></DIV -></DIV -><DIV -CLASS="TOC" -><DL -><DT -><B ->Table of Contents</B -></DT -><DT -><A -HREF="libxml-notes.html" ->Libxml Programming Notes</A -></DT -><DT -><A -HREF="libxml-lib.html" ->Libxml Library Reference</A -></DT -><DD -><DL -><DT -><A -HREF="gnome-xml-parser.html" ->parser</A -> — </DT -><DT -><A -HREF="gnome-xml-sax.html" ->SAX</A -> — </DT -><DT -><A -HREF="gnome-xml-tree.html" ->tree</A -> — </DT -><DT -><A -HREF="gnome-xml-entities.html" ->entities</A -> — </DT -><DT -><A -HREF="gnome-xml-valid.html" ->valid</A -> — </DT -><DT -><A -HREF="gnome-xml-uri.html" ->uri</A -> — </DT -><DT -><A -HREF="gnome-xml-xml-error.html" ->xml-error</A -> — </DT -><DT -><A -HREF="gnome-xml-htmlparser.html" ->HTMLparser</A -> — </DT -><DT -><A -HREF="gnome-xml-htmltree.html" ->HTMLtree</A -> — </DT -><DT -><A -HREF="gnome-xml-xpath.html" ->xpath</A -> — </DT -><DT -><A -HREF="gnome-xml-nanohttp.html" ->nanohttp</A -> — </DT -><DT -><A -HREF="gnome-xml-nanoftp.html" ->nanoftp</A -> — </DT -><DT -><A -HREF="gnome-xml-xmlio.html" ->xmlIO</A -> — </DT -><DT -><A -HREF="gnome-xml-parserinternals.html" ->parserInternals</A -> — </DT -><DT -><A -HREF="gnome-xml-encoding.html" ->encoding</A -> — </DT -><DT -><A -HREF="gnome-xml-debugxml.html" ->debugXML</A -> — </DT -><DT -><A -HREF="gnome-xml-xmlmemory.html" ->xmlmemory</A -> — 
</DT -></DL -></DD -></DL -></DIV -></DIV -><DIV -CLASS="NAVFOOTER" -><BR -CLEAR="all"><BR><TABLE -WIDTH="100%" -BORDER="0" -BGCOLOR="#000000" -CELLPADDING="1" -CELLSPACING="0" -><TR -><TD -WIDTH="25%" -BGCOLOR="#C00000" -ALIGN="left" -> </TD -><TD -WIDTH="25%" -BGCOLOR="#0000C0" -ALIGN="center" -><FONT -COLOR="#FFFFFF" -SIZE="3" -><B -> </B -></FONT -></TD -><TD -WIDTH="25%" -BGCOLOR="#00C000" -ALIGN="center" -><FONT -COLOR="#FFFFFF" -SIZE="3" -><B -> </B -></FONT -></TD -><TD -WIDTH="25%" -BGCOLOR="#C00000" -ALIGN="right" -><A -HREF="libxml-notes.html" -><FONT -COLOR="#FFFFFF" -SIZE="3" -><B ->Next Page >>></B -></FONT -></A -></TD -></TR -><TR -><TD -COLSPAN="2" -ALIGN="left" -> </TD -><TD -COLSPAN="2" -ALIGN="right" -><FONT -COLOR="#FFFFFF" -SIZE="3" -><B ->Libxml Programming Notes</B -></FONT -></TD -></TR -></TABLE -></DIV -></BODY -></HTML ->
\ No newline at end of file diff --git a/doc/html/libxml-notes.html b/doc/html/libxml-notes.html index 985cbef7..9c591047 100644 --- a/doc/html/libxml-notes.html +++ b/doc/html/libxml-notes.html @@ -4,7 +4,7 @@ >Libxml Programming Notes</TITLE ><META NAME="GENERATOR" -CONTENT="Modular DocBook HTML Stylesheet Version 1.33"><LINK +CONTENT="Modular DocBook HTML Stylesheet Version 1.52"><LINK REL="HOME" TITLE="Gnome XML Library Reference Manual" HREF="book1.html"><LINK @@ -15,8 +15,12 @@ REL="NEXT" TITLE="Libxml Library Reference" HREF="libxml-lib.html"></HEAD ><BODY +CLASS="CHAPTER" BGCOLOR="#FFFFFF" TEXT="#000000" +LINK="#0000FF" +VLINK="#840084" +ALINK="#0000FF" ><DIV CLASS="NAVHEADER" ><TABLE @@ -43,6 +43,9 @@ #endif #include <libxml/encoding.h> #include <libxml/xmlmemory.h> +#ifdef LIBXML_HTML_ENABLED +#include <libxml/HTMLparser.h> +#endif xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; @@ -178,6 +181,140 @@ xmlCheckUTF8(const unsigned char *utf) } /** + * asciiToUTF8: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of ASCII chars + * @inlen: the length of @in + * + * Take a block of ASCII chars in and try to convert it to an UTF-8 + * block of chars out. + * Returns 0 if success, or -1 otherwise + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictiable. + * The value of @outlen after return is the number of ocetes consumed. 
+ */ +int +asciiToUTF8(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen) { + unsigned char* outstart = out; + const unsigned char* base = in; + const unsigned char* processed = in; + unsigned char* outend = out + *outlen; + const unsigned char* inend; + unsigned int c; + int bits; + + inend = in + (*inlen); + while ((in < inend) && (out - outstart + 5 < *outlen)) { + c= *in++; + + /* assertion: c is a single UTF-4 value */ + if (out >= outend) + break; + if (c < 0x80) { *out++= c; bits= -6; } + else { + *outlen = out - outstart; + *inlen = processed - base; + return(-1); + } + + for ( ; bits >= 0; bits-= 6) { + if (out >= outend) + break; + *out++= ((c >> bits) & 0x3F) | 0x80; + } + processed = (const unsigned char*) in; + } + *outlen = out - outstart; + *inlen = processed - base; + return(0); +} + +/** + * UTF8Toascii: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of UTF-8 chars + * @inlen: the length of @in + * + * Take a block of UTF-8 chars in and try to convert it to an ASCII + * block of chars out. + * + * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictiable. + * The value of @outlen after return is the number of ocetes consumed. 
+ */ +int +UTF8Toascii(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen) { + const unsigned char* processed = in; + const unsigned char* outend; + const unsigned char* outstart = out; + const unsigned char* instart = in; + const unsigned char* inend; + unsigned int c, d; + int trailing; + + if (in == NULL) { + /* + * initialization nothing to do + */ + *outlen = 0; + *inlen = 0; + return(0); + } + inend = in + (*inlen); + outend = out + (*outlen); + while (in < inend) { + d = *in++; + if (d < 0x80) { c= d; trailing= 0; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } + else if (d < 0xF8) { c= d & 0x07; trailing= 3; } + else { + /* no chance for this in Ascii */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + + if (inend - in < trailing) { + break; + } + + for ( ; trailing; trailing--) { + if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) + break; + c <<= 6; + c |= d & 0x3F; + } + + /* assertion: c is a single UTF-4 value */ + if (c < 0x80) { + if (out >= outend) + break; + *out++ = c; + } else { + /* no chance for this in Ascii */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + processed = in; + } + *outlen = out - outstart; + *inlen = processed - instart; + return(0); +} + +/** * isolat1ToUTF8: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out @@ -195,28 +332,32 @@ int isolat1ToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { unsigned char* outstart = out; + const unsigned char* base = in; const unsigned char* processed = in; unsigned char* outend = out + *outlen; - const unsigned char* inend = in + *inlen; - unsigned char c; + const unsigned char* inend; + unsigned int c; + int bits; - while (in < inend) { - c= *in++; 
- if (c < 0x80) { + inend = in + (*inlen); + while ((in < inend) && (out - outstart + 5 < *outlen)) { + c= *in++; + + /* assertion: c is a single UTF-4 value */ + if (out >= outend) + break; + if (c < 0x80) { *out++= c; bits= -6; } + else { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; } + + for ( ; bits >= 0; bits-= 6) { if (out >= outend) - break; - *out++ = c; - } - else { - if (out + 1 >= outend) break; - *out++ = 0xC0 | (c >> 6); - *out++ = 0x80 | (0x3F & c); + break; + *out++= ((c >> bits) & 0x3F) | 0x80; } - processed = in; + processed = (const unsigned char*) in; } *outlen = out - outstart; - *inlen = processed - in; - + *inlen = processed - base; return(0); } @@ -229,7 +370,6 @@ isolat1ToUTF8(unsigned char* out, int *outlen, * * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 * block of chars out. - * TODO: UTF8Toisolat1 need a fallback mechanism ... * * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise * The value of @inlen after return is the number of octets consumed @@ -239,34 +379,68 @@ isolat1ToUTF8(unsigned char* out, int *outlen, int UTF8Toisolat1(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { - unsigned char* outstart = out; const unsigned char* processed = in; - unsigned char* outend = out + *outlen; - const unsigned char* inend = in + *inlen; - unsigned char c; + const unsigned char* outend; + const unsigned char* outstart = out; + const unsigned char* instart = in; + const unsigned char* inend; + unsigned int c, d; + int trailing; + if (in == NULL) { + /* + * initialization nothing to do + */ + *outlen = 0; + *inlen = 0; + return(0); + } + inend = in + (*inlen); + outend = out + (*outlen); while (in < inend) { - c= *in++; - if (c < 0x80) { - if (out >= outend) return(-1); - *out++= c; - } - else if (in == inend) { - break; + d = *in++; + if (d < 0x80) { c= d; trailing= 0; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = 
processed - instart; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } + else if (d < 0xF8) { c= d & 0x07; trailing= 3; } + else { + /* no chance for this in IsoLat1 */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); } - else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) { - /* a two byte utf-8 and can be encoding as isolate1 */ - *out++= ((c & 0x03) << 6) | (*in++ & 0x3F); + + if (inend - in < trailing) { + break; + } + + for ( ; trailing; trailing--) { + if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) + break; + c <<= 6; + c |= d & 0x3F; } - else { + + /* assertion: c is a single UTF-4 value */ + if (c <= 0xFF) { + if (out >= outend) + break; + *out++ = c; + } else { + /* no chance for this in IsoLat1 */ *outlen = out - outstart; - *inlen = processed - in; + *inlen = processed - instart; return(-2); } processed = in; } *outlen = out - outstart; - *inlen = processed - in; + *inlen = processed - instart; return(0); } @@ -367,7 +541,6 @@ UTF16LEToUTF8(unsigned char* out, int *outlen, * * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE * block of chars out. - * TODO: UTF8ToUTF16LE need a fallback mechanism ... * * Returns the number of byte written, or -1 by lack of space, or -2 * if the transcoding failed. 
@@ -410,7 +583,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen, if (d < 0x80) { c= d; trailing= 0; } else if (d < 0xC0) { /* trailing byte in leading position */ - *outlen = out - outstart; + *outlen = (out - outstart) * 2; *inlen = processed - in; return(-2); } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } @@ -418,7 +591,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen, else if (d < 0xF8) { c= d & 0x07; trailing= 3; } else { /* no chance for this in UTF-16 */ - *outlen = out - outstart; + *outlen = (out - outstart) * 2; *inlen = processed - in; return(-2); } @@ -578,7 +751,6 @@ UTF16BEToUTF8(unsigned char* out, int *outlen, * * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE * block of chars out. - * TODO: UTF8ToUTF16BE need a fallback mechanism ... * * Returns the number of byte written, or -1 by lack of space, or -2 * if the transcoding failed. @@ -861,6 +1033,8 @@ xmlGetCharEncodingName(xmlCharEncoding enc) { return("Shift-JIS"); case XML_CHAR_ENCODING_EUC_JP: return("EUC-JP"); + case XML_CHAR_ENCODING_ASCII: + return("ASCII"); } return(NULL); } @@ -974,6 +1148,10 @@ xmlInitCharEncodingHandlers(void) { xmlUTF16BEHandler = xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE); xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1); + xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii); +#ifdef LIBXML_HTML_ENABLED + xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml); +#endif } /** @@ -1081,16 +1259,51 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) { handler = xmlFindCharEncodingHandler("UCS2"); if (handler != NULL) return(handler); break; + + /* + * We used to keep ISO Latin encodings native in the + * generated data. This led to so many problems that + * this has been removed. 
One can still change this + * back by registering no-ops encoders for those + */ case XML_CHAR_ENCODING_8859_1: + handler = xmlFindCharEncodingHandler("ISO-8859-1"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_2: + handler = xmlFindCharEncodingHandler("ISO-8859-2"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_3: + handler = xmlFindCharEncodingHandler("ISO-8859-3"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_4: + handler = xmlFindCharEncodingHandler("ISO-8859-4"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_5: + handler = xmlFindCharEncodingHandler("ISO-8859-5"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_6: + handler = xmlFindCharEncodingHandler("ISO-8859-6"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_7: + handler = xmlFindCharEncodingHandler("ISO-8859-7"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_8: + handler = xmlFindCharEncodingHandler("ISO-8859-8"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_9: - return(NULL); + handler = xmlFindCharEncodingHandler("ISO-8859-9"); + if (handler != NULL) return(handler); + break; + + case XML_CHAR_ENCODING_2022_JP: handler = xmlFindCharEncodingHandler("ISO-2022-JP"); if (handler != NULL) return(handler); @@ -1161,7 +1374,8 @@ xmlFindCharEncodingHandler(const char *name) { icv_in = iconv_open("UTF-8", name); icv_out = iconv_open(name, "UTF-8"); if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) { - enc = xmlMalloc(sizeof(xmlCharEncodingHandler)); + enc = (xmlCharEncodingHandlerPtr) + xmlMalloc(sizeof(xmlCharEncodingHandler)); if (enc == NULL) { iconv_close(icv_in); iconv_close(icv_out); @@ -1506,6 +1720,10 @@ retry: if (ret == -1) ret = -3; } #endif /* LIBXML_ICONV_ENABLED */ + else { + fprintf(stderr, "xmlCharEncOutFunc: no output function !\n"); + 
return(-1); + } if (ret >= 0) output += ret; @@ -1528,7 +1746,7 @@ retry: #endif case -2: { int len = in->use; - const char *utf = (const char *) in->content; + const xmlChar *utf = (const xmlChar *) in->content; int cur; cur = xmlGetUTF8Char(utf, &len); @@ -1546,7 +1764,7 @@ retry: * and continue the transcoding phase, hoping the error * did not mangle the encoder state. */ - sprintf(charref, "&#x%X;", cur); + sprintf((char *) charref, "&#x%X;", cur); xmlBufferShrink(in, len); xmlBufferAddHead(in, charref, -1); @@ -70,7 +70,8 @@ typedef enum { XML_CHAR_ENCODING_8859_9= 18,/* ISO-8859-9 */ XML_CHAR_ENCODING_2022_JP= 19,/* ISO-2022-JP */ XML_CHAR_ENCODING_SHIFT_JIS=20,/* Shift_JIS */ - XML_CHAR_ENCODING_EUC_JP= 21 /* EUC-JP */ + XML_CHAR_ENCODING_EUC_JP= 21,/* EUC-JP */ + XML_CHAR_ENCODING_ASCII= 22 /* pure ASCII */ } xmlCharEncoding; /** @@ -128,7 +128,7 @@ xmlAddEntity(xmlEntitiesTablePtr table, const xmlChar *name, int type, * fill the structure. */ ret->name = xmlStrdup(name); - ret->etype = type; + ret->etype = (xmlEntityType) type; if (ExternalID != NULL) ret->ExternalID = xmlStrdup(ExternalID); if (SystemID != NULL) @@ -754,9 +754,6 @@ xmlEncodeEntities(xmlDocPtr doc, const xmlChar *input) { * Contrary to xmlEncodeEntities, this routine is reentrant, and result * must be deallocated. * - * TODO !!!! Once moved to UTF-8 internal encoding, the encoding of non-ascii - * get erroneous. - * * Returns A newly allocated string with the substitution done. */ xmlChar * @@ -832,20 +829,7 @@ xmlEncodeEntitiesReentrant(xmlDocPtr doc, const xmlChar *input) { */ *out++ = *cur; } else if (*cur >= 0x80) { - if (html) { - char buf[15], *ptr; - - /* - * TODO: improve by searching in html40EntitiesTable - */ -#ifdef HAVE_SNPRINTF - snprintf(buf, 9, "&#%d;", *cur); -#else - sprintf(buf, "&#%d;", *cur); -#endif - ptr = buf; - while (*ptr != 0) *out++ = *ptr++; - } else if (doc->encoding != NULL) { + if ((doc->encoding != NULL) || (html)) { /* * TODO !!! 
*/ @@ -900,6 +884,7 @@ xmlEncodeEntitiesReentrant(xmlDocPtr doc, const xmlChar *input) { #else sprintf(buf, "&#%d;", *cur); #endif + buf[9] = 0; ptr = buf; while (*ptr != 0) *out++ = *ptr++; cur++; @@ -909,11 +894,11 @@ xmlEncodeEntitiesReentrant(xmlDocPtr doc, const xmlChar *input) { * We could do multiple things here. Just save as a char ref */ #ifdef HAVE_SNPRINTF - snprintf(buf, 14, "&#x%X;", val); + snprintf(buf, 9, "&#x%X;", val); #else sprintf(buf, "&#x%X;", val); #endif - buf[14] = 0; + buf[9] = 0; ptr = buf; while (*ptr != 0) *out++ = *ptr++; cur += l; @@ -927,6 +912,7 @@ xmlEncodeEntitiesReentrant(xmlDocPtr doc, const xmlChar *input) { #else sprintf(buf, "&#%d;", *cur); #endif + buf[9] = 0; ptr = buf; while (*ptr != 0) *out++ = *ptr++; } diff --git a/include/libxml/HTMLparser.h b/include/libxml/HTMLparser.h index 44d9c271..b04e3b09 100644 --- a/include/libxml/HTMLparser.h +++ b/include/libxml/HTMLparser.h @@ -81,6 +81,10 @@ htmlDocPtr htmlSAXParseFile(const char *filename, void *userData); htmlDocPtr htmlParseFile (const char *filename, const char *encoding); +int UTF8ToHtml (unsigned char* out, + int *outlen, + const unsigned char* in, + int *inlen); /** * Interfaces for the Push mode diff --git a/include/libxml/HTMLtree.h b/include/libxml/HTMLtree.h index d41d8d9b..feff3a47 100644 --- a/include/libxml/HTMLtree.h +++ b/include/libxml/HTMLtree.h @@ -23,12 +23,27 @@ extern "C" { #define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE #define HTML_COMMENT_NODE XML_COMMENT_NODE -void htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size); -void htmlDocDump(FILE *f, xmlDocPtr cur); -int htmlSaveFile(const char *filename, xmlDocPtr cur); -void htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur); -void htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur); -htmlDocPtr htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID); +htmlDocPtr htmlNewDoc (const xmlChar *URI, + const xmlChar *ExternalID); +const xmlChar * htmlGetMetaEncoding (htmlDocPtr 
doc); +int htmlSetMetaEncoding (htmlDocPtr doc, + const xmlChar *encoding); +void htmlDocDumpMemory (xmlDocPtr cur, + xmlChar**mem, + int *size); +int htmlDocDump (FILE *f, + xmlDocPtr cur); +int htmlSaveFile (const char *filename, + xmlDocPtr cur); +void htmlNodeDump (xmlBufferPtr buf, + xmlDocPtr doc, + xmlNodePtr cur); +void htmlNodeDumpFile (FILE *out, + xmlDocPtr doc, + xmlNodePtr cur); +int htmlSaveFileEnc (const char *filename, + xmlDocPtr cur, + const char *encoding); #ifdef __cplusplus } diff --git a/include/libxml/SAX.h b/include/libxml/SAX.h index a3bd1025..3c0f4cbd 100644 --- a/include/libxml/SAX.h +++ b/include/libxml/SAX.h @@ -16,7 +16,7 @@ #include <libxml/xlink.h> #ifdef __cplusplus -#define extern "C" { +extern "C" { #endif const xmlChar * getPublicId (void *ctx); const xmlChar * getSystemId (void *ctx); diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index ce0ab755..5b6af9fa 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -70,7 +70,8 @@ typedef enum { XML_CHAR_ENCODING_8859_9= 18,/* ISO-8859-9 */ XML_CHAR_ENCODING_2022_JP= 19,/* ISO-2022-JP */ XML_CHAR_ENCODING_SHIFT_JIS=20,/* Shift_JIS */ - XML_CHAR_ENCODING_EUC_JP= 21 /* EUC-JP */ + XML_CHAR_ENCODING_EUC_JP= 21,/* EUC-JP */ + XML_CHAR_ENCODING_ASCII= 22 /* pure ASCII */ } xmlCharEncoding; /** diff --git a/include/libxml/parserInternals.h b/include/libxml/parserInternals.h index e7e6fa0a..f0f7561c 100644 --- a/include/libxml/parserInternals.h +++ b/include/libxml/parserInternals.h @@ -572,6 +572,16 @@ int inputPush (xmlParserCtxtPtr ctxt, xmlParserInputPtr value); xmlParserInputPtr inputPop (xmlParserCtxtPtr ctxt); +/* + * Really core function shared with HTML parser + */ +int xmlCurrentChar (xmlParserCtxtPtr ctxt, + int *len); +int xmlCopyChar (int len, + xmlChar *out, + int val); +void xmlNextChar (xmlParserCtxtPtr ctxt); +void xmlParserInputShrink (xmlParserInputPtr in); #ifdef __cplusplus } #endif diff --git a/include/libxml/tree.h 
b/include/libxml/tree.h index 6c68dc31..00f4ee6d 100644 --- a/include/libxml/tree.h +++ b/include/libxml/tree.h @@ -549,6 +549,8 @@ xmlAttrPtr xmlSetProp (xmlNodePtr node, const xmlChar *value); xmlChar * xmlGetProp (xmlNodePtr node, const xmlChar *name); +xmlAttrPtr xmlHasProp (xmlNodePtr node, + const xmlChar *name); xmlChar * xmlGetNsProp (xmlNodePtr node, const xmlChar *name, const xmlChar *nameSpace); diff --git a/include/libxml/xlink.h b/include/libxml/xlink.h index 68a35fee..37a54151 100644 --- a/include/libxml/xlink.h +++ b/include/libxml/xlink.h @@ -16,7 +16,7 @@ #include <libxml/tree.h> #ifdef __cplusplus -#define extern "C" { +extern "C" { #endif /** * Various defines for the various Link properties. @@ -4,12 +4,22 @@ * Reference: RFC 959 */ +#ifdef TESTING +#define STANDALONE +#define HAVE_STDLIB_H +#define HAVE_UNISTD_H +#define HAVE_SYS_SOCKET_H +#define HAVE_NETINET_IN_H +#define HAVE_NETDB_H +#define HAVE_SYS_TIME_H +#else /* STANDALONE */ #ifdef WIN32 #define INCLUDE_WINSOCK #include "win32config.h" #else #include "config.h" #endif +#endif /* STANDALONE */ #include "xmlversion.h" @@ -656,7 +656,7 @@ xmlNanoHTTPConnectAttempt(struct in_addr ia, int port) } if ( FD_ISSET(s, &wfd) ) { - socklen_t len; + unsigned int len; /* was socklen_t barfed on some systems :-( */ len = sizeof(status); if (getsockopt(s, SOL_SOCKET, SO_ERROR, &status, &len) < 0 ) { /* Solaris error code */ @@ -13,7 +13,7 @@ #endif #include <stdio.h> -#include <string.h> /* for memset() only */ +#include <string.h> #ifdef HAVE_CTYPE_H #include <ctype.h> #endif @@ -306,7 +306,7 @@ xmlEntityPtr xmlParseStringEntityRef(xmlParserCtxtPtr ctxt, scope int name##Push(xmlParserCtxtPtr ctxt, type value) { \ if (ctxt->name##Nr >= ctxt->name##Max) { \ ctxt->name##Max *= 2; \ - ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \ + ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \ ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \ if (ctxt->name##Tab == NULL) { \ fprintf(stderr, "realloc 
failed !\n"); \ @@ -337,7 +337,7 @@ PUSH_AND_POP(extern, xmlChar*, name) int spacePush(xmlParserCtxtPtr ctxt, int val) { if (ctxt->spaceNr >= ctxt->spaceMax) { ctxt->spaceMax *= 2; - ctxt->spaceTab = (void *) xmlRealloc(ctxt->spaceTab, + ctxt->spaceTab = (int *) xmlRealloc(ctxt->spaceTab, ctxt->spaceMax * sizeof(ctxt->spaceTab[0])); if (ctxt->spaceTab == NULL) { fprintf(stderr, "realloc failed !\n"); @@ -449,7 +449,7 @@ xmlNextChar(xmlParserCtxtPtr ctxt) { * the single character #xA. */ if (ctxt->token != 0) ctxt->token = 0; - else { + else if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { if ((*ctxt->input->cur == 0) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) && (ctxt->instate != XML_PARSER_COMMENT)) { @@ -540,9 +540,16 @@ xmlNextChar(xmlParserCtxtPtr ctxt) { if (*ctxt->input->cur == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); } - } - if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); - if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); + } else { + ctxt->input->cur++; + ctxt->nbChars++; + if (*ctxt->input->cur == 0) + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + } + if ((*ctxt->input->cur == '%') && (!ctxt->html)) + xmlParserHandlePEReference(ctxt); + if ((*ctxt->input->cur == '&')&& (!ctxt->html)) + xmlParserHandleReference(ctxt); if ((*ctxt->input->cur == 0) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) xmlPopInput(ctxt); @@ -2373,6 +2380,10 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) /* let's assume it's UTF-8 without the XML decl */ ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); + case XML_CHAR_ENCODING_ASCII: + /* default encoding, no conversion should be needed */ + ctxt->charset = XML_CHAR_ENCODING_UTF8; + return(0); case XML_CHAR_ENCODING_UTF8: /* default encoding, no conversion should be needed */ ctxt->charset = XML_CHAR_ENCODING_UTF8; @@ -2427,7 +2438,10 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) case XML_CHAR_ENCODING_8859_8: case 
XML_CHAR_ENCODING_8859_9: /* - * Keep the internal content in the document encoding + * We used to keep the internal content in the + * document encoding however this turns being unmaintainable + * So xmlGetCharEncodingHandler() will return non-null + * values for this now. */ if ((ctxt->inputNr == 1) && (ctxt->encoding == NULL) && @@ -2625,7 +2639,7 @@ xmlStrndup(const xmlChar *cur, int len) { xmlChar *ret; if ((cur == NULL) || (len < 0)) return(NULL); - ret = xmlMalloc((len + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlMalloc((len + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "malloc of %ld byte failed\n", (len + 1) * (long)sizeof(xmlChar)); @@ -2671,7 +2685,7 @@ xmlCharStrndup(const char *cur, int len) { xmlChar *ret; if ((cur == NULL) || (len < 0)) return(NULL); - ret = xmlMalloc((len + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlMalloc((len + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "malloc of %ld byte failed\n", (len + 1) * (long)sizeof(xmlChar)); @@ -2872,7 +2886,7 @@ xmlStrncat(xmlChar *cur, const xmlChar *add, int len) { return(xmlStrndup(add, len)); size = xmlStrlen(cur); - ret = xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlStrncat: realloc of %ld byte failed\n", (size + len + 1) * (long)sizeof(xmlChar)); @@ -3113,7 +3127,7 @@ xmlNamespaceParseQName(xmlParserCtxtPtr ctxt, xmlChar **prefix) { * @name: an XML parser context * @prefix: a xmlChar ** * - * parse an XML qualified name string + * parse an UTF8 encoded XML qualified name string * * [NS 5] QName ::= (Prefix ':')? 
LocalPart * @@ -3131,7 +3145,7 @@ xmlSplitQName(xmlParserCtxtPtr ctxt, const xmlChar *name, xmlChar **prefix) { int len = 0; xmlChar *ret = NULL; const xmlChar *cur = name; - int c,l; + int c; *prefix = NULL; @@ -3144,36 +3158,23 @@ xmlSplitQName(xmlParserCtxtPtr ctxt, const xmlChar *name, xmlChar **prefix) { if (cur[0] == ':') return(xmlStrdup(name)); - c = CUR_SCHAR(cur, l); - if (!IS_LETTER(c) && (c != '_')) return(NULL); - - while ((IS_LETTER(c)) || (IS_DIGIT(c)) || - (c == '.') || (c == '-') || - (c == '_') || - (IS_COMBINING(c)) || - (IS_EXTENDER(c))) { - COPY_BUF(l,buf,len,c); - cur += l; - c = CUR_SCHAR(cur, l); + c = *cur++; + while ((c != 0) && (c != ':')) { + buf[len++] = c; + c = *cur++; } ret = xmlStrndup(buf, len); if (c == ':') { - cur += l; - c = CUR_SCHAR(cur, l); - if (!IS_LETTER(c) && (c != '_')) return(ret); + c = *cur++; + if (c == 0) return(ret); *prefix = ret; len = 0; - while ((IS_LETTER(c)) || (IS_DIGIT(c)) || - (c == '.') || (c == '-') || - (c == '_') || - (IS_COMBINING(c)) || - (IS_EXTENDER(c))) { - COPY_BUF(l,buf,len,c); - cur += l; - c = CUR_SCHAR(cur, l); + while (c != 0) { + buf[len++] = c; + c = *cur++; } ret = xmlStrndup(buf, len); @@ -3181,6 +3182,7 @@ xmlSplitQName(xmlParserCtxtPtr ctxt, const xmlChar *name, xmlChar **prefix) { return(ret); } + /** * xmlNamespaceParseNSDef: * @ctxt: an XML parser context @@ -3237,7 +3239,7 @@ xmlParseQuotedString(xmlParserCtxtPtr ctxt) { while (IS_CHAR(c) && (c != '"')) { if (len + 5 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte failed\n", size); return(NULL); @@ -3263,7 +3265,7 @@ xmlParseQuotedString(xmlParserCtxtPtr ctxt) { while (IS_CHAR(c) && (c != '\'')) { if (len + 1 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte 
failed\n", size); return(NULL); @@ -3675,7 +3677,7 @@ xmlParseEntityValue(xmlParserCtxtPtr ctxt, xmlChar **orig) { while (IS_CHAR(c) && ((c != stop) || (ctxt->input != input))) { if (len + 5 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte failed\n", size); return(NULL); @@ -3841,7 +3843,18 @@ xmlParseAttValue(xmlParserCtxtPtr ctxt) { c = CUR_CHAR(l); while (((NXT(0) != limit) && (c != '<')) || (ctxt->token != 0)) { if (c == 0) break; - if ((c == '&') && (NXT(1) == '#')) { + if (ctxt->token == '&') { + static xmlChar buffer[6] = "&"; + + if (len > buf_size - 10) { + growBuffer(buf); + } + current = &buffer[0]; + while (*current != 0) { + buf[len++] = *current++; + } + ctxt->token = 0; + } else if ((c == '&') && (NXT(1) == '#')) { int val = xmlParseCharRef(ctxt); COPY_BUF(l,buf,len,val); NEXTL(l); @@ -3978,10 +3991,10 @@ xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) { while ((IS_CHAR(cur)) && (cur != stop)) { if (len + 5 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte failed\n", size); - ctxt->instate = state; + ctxt->instate = (xmlParserInputState) state; return(NULL); } } @@ -3995,7 +4008,7 @@ xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) { } } buf[len] = 0; - ctxt->instate = state; + ctxt->instate = (xmlParserInputState) state; if (!IS_CHAR(cur)) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n"); @@ -4052,7 +4065,7 @@ xmlParsePubidLiteral(xmlParserCtxtPtr ctxt) { while ((IS_PUBIDCHAR(cur)) && (cur != stop)) { if (len + 1 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte failed\n", size); 
return(NULL); @@ -4324,7 +4337,7 @@ xmlParseComment(xmlParserCtxtPtr ctxt) { } if (len + 5 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte failed\n", size); ctxt->instate = state; @@ -4502,7 +4515,7 @@ xmlParsePI(xmlParserCtxtPtr ctxt) { ((cur != '?') || (NXT(1) != '>'))) { if (len + 5 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte failed\n", size); ctxt->instate = state; @@ -7774,7 +7787,7 @@ xmlParseCDSect(xmlParserCtxtPtr ctxt) { ((r != ']') || (s != ']') || (cur != '>'))) { if (len + 5 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte failed\n", size); return; @@ -8099,7 +8112,7 @@ xmlParseVersionNum(xmlParserCtxtPtr ctxt) { (cur == ':') || (cur == '-')) { if (len + 1 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte failed\n", size); return(NULL); @@ -8222,7 +8235,7 @@ xmlParseEncName(xmlParserCtxtPtr ctxt) { (cur == '-')) { if (len + 1 >= size) { size *= 2; - buf = xmlRealloc(buf, size * sizeof(xmlChar)); + buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); if (buf == NULL) { fprintf(stderr, "realloc of %d byte failed\n", size); return(NULL); @@ -8345,7 +8358,9 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) { xmlSwitchToEncoding(ctxt, handler); } else { ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - xmlFree(encoding); + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Unsupported encoding %s\n", encoding); return(NULL); } } diff --git a/parserInternals.h 
b/parserInternals.h index e7e6fa0a..f0f7561c 100644 --- a/parserInternals.h +++ b/parserInternals.h @@ -572,6 +572,16 @@ int inputPush (xmlParserCtxtPtr ctxt, xmlParserInputPtr value); xmlParserInputPtr inputPop (xmlParserCtxtPtr ctxt); +/* + * Really core function shared with HTML parser + */ +int xmlCurrentChar (xmlParserCtxtPtr ctxt, + int *len); +int xmlCopyChar (int len, + xmlChar *out, + int val); +void xmlNextChar (xmlParserCtxtPtr ctxt); +void xmlParserInputShrink (xmlParserInputPtr in); #ifdef __cplusplus } #endif diff --git a/result/HTML/fp40.htm b/result/HTML/fp40.htm index 15bc076f..95a51873 100644 --- a/result/HTML/fp40.htm +++ b/result/HTML/fp40.htm @@ -8,7 +8,7 @@ <body> <font face="Verdana"> <h1><a name="top">Microsoft FrontPage 2000 Server Extensions, UNIX</a></h1> -<font size="2"><i>© Copyright Microsoft Corporation, 1999 </i></font> +<font size="2"><i>© Copyright Microsoft Corporation, 1999 </i></font> <p>The FrontPage Server Extensions are a set of programs on the Web server that support: </p> @@ -17,11 +17,11 @@ <li>Administering FrontPage webs</li> <li>Browse-time FrontPage web functionality</li> </ul> -<h2>Contents </h2> +<h2>Contents </h2> <a href="#relnotes">Release Notes</a> <br> <a href="#moreinfo">Resources for More Information</a> -<p> </p> +<p> </p> <hr> <h2><a name="relnotes">Release Notes</a></h2> <p>This section provides complementary or late-breaking @@ -41,7 +41,7 @@ configuration file, usually http.conf. 
To prevent the server extensions from usi configuration files (access.conf, srm.conf), add the following lines to http.conf:</p> </font> <blockquote><font face="Courier New"> -ResourceConfig /dev/null <br> +ResourceConfig /dev/null <br> AccessConfig /dev/null</font></blockquote> <font face="Verdana"> <p>If you have some settings stored in secondary configuration files, move them to http.conf.</p> @@ -100,7 +100,7 @@ keywords or the site's natural language search engine, which uses normal everyda answering inquiries, so you can write your question in your own words. To begin, go to <a href="http://support.microsoft.com/support/">http://support.microsoft.com/support/</a>.</p> <p align="right"><font size="1"><a href="#moreinfo">Top of Section</a></font></p> -<p> </p> +<p> </p> </font> </body> </html> diff --git a/result/HTML/wired.html b/result/HTML/wired.html index d4439bbc..6a523fb5 100644 --- a/result/HTML/wired.html +++ b/result/HTML/wired.html @@ -80,7 +80,7 @@ <td bgcolor="#FF0000" align="left" valign="center"><nobr> <img src="http://static.wired.com/news/images/spacer.gif" width="344" height="1"> <br> -<font size="1" face="Verdana, Arial, Geneva, sans-serif" color="#FFFFFF">   <b>updated 10:15 a.m.  15.Oct.99.PDT</b> +<font size="1" face="Verdana, Arial, Geneva, sans-serif" color="#FFFFFF"> <b>updated 10:15 a.m. 
15.Oct.99.PDT</b> </font> </nobr></td> </tr> @@ -132,14 +132,14 @@ <input type="hidden" name="LIST" value="wn_ascii"> <input type="hidden" name="SOURCE" value="other"> <input type="hidden" name="ACTION" value="subscribe"> -<input type="TEXT" name="from" size="10" value="enter email">  +<input type="TEXT" name="from" size="10" value="enter email"> </form></td> <td valign="top" bgcolor="#99FF99"><input type="SUBMIT" name="SUBMIT" value="GO"></td> </tr></table></tr> <tr><td bgcolor="#FF0000"><font face="Verdana, Arial, Helvetica, sans-serif" color="#FFFFFF"><b><font size="1">STOCKS</font></b></font></td></tr> <tr><td bgcolor="#99FF99"><font face="Verdana, Arial, Helvetica, sans-serif" size="1">Get Quote:</font></td></tr> <tr><td bgcolor="#99FF99" marginwidth="0" marginheight="0"><form method="get" action="http://r.wired.com/r/10020/http://stocks.wired.com/stocks_quotes.asp"> -<input type="TEXT" name="Symbol" size="12"> <input type="SUBMIT" name="submit" value="GO"> +<input type="TEXT" name="Symbol" size="12"> <input type="SUBMIT" name="submit" value="GO"> </form></td></tr> <!-- BEGIN BUTTON ADS --><tr><td bgcolor="#CCFFCC"> <font size="1" face="Verdana, Arial, Helvetica, sans-serif" color="#000000">Financial Services</font> @@ -286,17 +286,17 @@ or <a href="/news/pointcast/0,1366,,00.html">PointCast</a> <br> <!-- IBD_SUBJECT: Homeless, but ID'd, in Seattle --><font face="Arial, Helvetica, sans-serif" size="5"><b><a href="/news/politics/0,1283,31911,00.html">Homeless, but ID'd, in Seattle</a></b></font> <br> -<font size="1" face="Verdana, Arial, Geneva, sans-serif" color="#FF0000">8:15 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The city council approves a plan to track the homeless by a numbering system, saying it'll improve services. The implications worry privacy advocates, naturally. 
By Craig Bicknell.</font> +<font size="1" face="Verdana, Arial, Geneva, sans-serif" color="#FF0000">8:15 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The city council approves a plan to track the homeless by a numbering system, saying it'll improve services. The implications worry privacy advocates, naturally. By Craig Bicknell.</font> <br> -<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/politics/0,1283,,00.html">in Politics</a></i></font> +<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/politics/0,1283,,00.html">in Politics</a></i></font> <br> <table bgcolor="#F0F0F0" cellpadding="0" cellspacing="0" border="0" width="147" align="RIGHT"> <!-- Commentary Frag Begin --><tr> -<td bgcolor="#000000"> </td> +<td bgcolor="#000000"> </td> <td bgcolor="#000000"><font size="1" face="Verdana, Arial, Helvetica, sans-serif" color="#FFFFFF"><b>HITS & MISC.</b></font></td> </tr> <tr> -<td> </td> +<td> </td> <td> <img src="http://static.wired.com/news/images/spacer.gif" height="5" width="5" alt=""> <br> @@ -317,11 +317,11 @@ or <a href="/news/pointcast/0,1366,,00.html">PointCast</a> </td> </tr> <!-- Commentary Frag End --><tr> -<td align="left" bgcolor="#000000"> </td> +<td align="left" bgcolor="#000000"> </td> <td bgcolor="#000000"><font size="1" face="Verdana, Arial, Helvetica, sans-serif" color="#FFFFFF"><b>CURRENT HOO-HA</b></font></td> </tr> <tr> -<td> </td> +<td> </td> <td> <img src="http://static.wired.com/news/images/spacer.gif" height="5" width="5" alt=""> <br> @@ -418,7 +418,7 @@ wired.com&BANNER=Sprint" style="text-decoration:none"><font color="#000000"> <br> <br> <font face="Arial, Helvetica, sans-serif" size="2"><b><i><a href="/news/special_reports/1,1293,,00.html">More Hoo-Ha</a></i></b></font> -<br> <br> +<br> <br> </font> </font> </font> @@ -430,19 +430,19 @@ wired.com&BANNER=Sprint" style="text-decoration:none"><font color="#000000"> </td> </tr> <!-- start of Gen News --><tr> -<td 
bgcolor="#000000"> </td> +<td bgcolor="#000000"> </td> <td bgcolor="#000000"><font size="1" face="Verdana, Arial, Helvetica, sans-serif" color="#FFFFFF"><b>MEANWHILE...</b></font></td> </tr> <tr> -<td> </td> +<td> </td> <td align="left" valign="top"> <img src="http://static.wired.com/news/images/spacer.gif" height="5" width="5" alt=""> <br> -<!-- 31942 --><font size="2" face="Arial, Helvetica, sans-serif" color="#000000"><b>Führer Furor</b></font> +<!-- 31942 --><font size="2" face="Arial, Helvetica, sans-serif" color="#000000"><b>Führer Furor</b></font> <br> <font size="1" face="Arial, Geneva, sans-serif" color="#000000"> <p>
-Contruction workers in Berlin opened an old wound in the German psyche this week when they accidentally stumbled across Adolf Hitler's bunker while excavating near the Brandenburg Gate. The bunker, just south of the Gate, was where Hitler and his closest associates barricaded themselves as the Red Army approached Berlin in the waning days of World War II. It is also where the Führer and his bride, Eva Braun, committed suicide rather than fall into the hands of the Russians. Although the bunker's location has never been a mystery, it has been sealed off since the end of the war to keep neo-Nazis from turning it into a shrine.
+Contruction workers in Berlin opened an old wound in the German psyche this week when they accidentally stumbled across Adolf Hitler's bunker while excavating near the Brandenburg Gate. The bunker, just south of the Gate, was where Hitler and his closest associates barricaded themselves as the Red Army approached Berlin in the waning days of World War II. It is also where the Führer and his bride, Eva Braun, committed suicide rather than fall into the hands of the Russians. Although the bunker's location has never been a mystery, it has been sealed off since the end of the war to keep neo-Nazis from turning it into a shrine.
<br> </p> <li>More from <a href="http://www.lycos.com/news/flash/hitlerbunker.html?v=wn1015&lpv=1">Lycos</a> @@ -454,7 +454,7 @@ Contruction workers in Berlin opened an old wound in the German psyche this week </tr> <!-- end of Gen News --> </table> -<font size="1"> <br> +<font size="1"> <br> </font> <br> <font face="Verdana, Arial, Geneva, sans-serif" size="2"><b><i>Other Top Stories</i></b></font> @@ -463,67 +463,67 @@ Contruction workers in Berlin opened an old wound in the German psyche this week <br> <!-- SQL query here --><!-- IBD_SUBJECT:Wall Street Keeps Reeling --><font face="Arial, Helvetica, sans-serif" size="3"><b><a href="/news/reuters/0,1349,31934,00.html">Wall Street Keeps Reeling</a></b></font> <br> -<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">10:15 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The Dow and Nasdaq suffer sizeable losses during the first half of Friday trading. Why? Wholesale prices are the highest this decade, and Greenspan is concerned about stock prices.</font> +<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">10:15 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The Dow and Nasdaq suffer sizeable losses during the first half of Friday trading. Why? 
Wholesale prices are the highest this decade, and Greenspan is concerned about stock prices.</font> <br> -<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/reuters/0,1349,,00.html">in Reuters</a></i></font> +<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/reuters/0,1349,,00.html">in Reuters</a></i></font> <br> <br> <!-- IBD_SUBJECT:The Market's Madness --><font face="Arial, Helvetica, sans-serif" size="3"><b><a href="/news/reuters/0,1349,31935,00.html">The Market's Madness</a></b></font> <br> -<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">9:10 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The bulls and the bears are in the midst of a Battle Royale, and all this turbulence is not a healthy thing. So say the experts.</font> +<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">9:10 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The bulls and the bears are in the midst of a Battle Royale, and all this turbulence is not a healthy thing. So say the experts.</font> <br> -<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/reuters/0,1349,,00.html">in Reuters</a></i></font> +<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/reuters/0,1349,,00.html">in Reuters</a></i></font> <br> <br> <!-- IBD_SUBJECT:'Want a Loan? What's Your Race?' --><font face="Arial, Helvetica, sans-serif" size="3"><b><a href="/news/politics/0,1283,31533,00.html">'Want a Loan? What's Your Race?'</a></b></font> <br> -<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The Federal Reserve is in the middle of changing banking regulations to let banks collect data on the race, sex, religion, and national origin of their customers. By Declan McCullagh. 
</font> +<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The Federal Reserve is in the middle of changing banking regulations to let banks collect data on the race, sex, religion, and national origin of their customers. By Declan McCullagh. </font> <br> -<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/politics/0,1283,,00.html">in Politics</a></i></font> +<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/politics/0,1283,,00.html">in Politics</a></i></font> <br> <br> <!-- IBD_SUBJECT:Music Regs: A Bagful of Noise --><font face="Arial, Helvetica, sans-serif" size="3"><b><a href="/news/business/0,1367,31832,00.html">Music Regs: A Bagful of Noise</a></b></font> <br> -<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The struggle to come up with a digital music standard that would minimize download piracy is pushing right up against the holiday gift-giving season. By Jennifer Sullivan.</font> +<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The struggle to come up with a digital music standard that would minimize download piracy is pushing right up against the holiday gift-giving season. By Jennifer Sullivan.</font> <br> -<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/business/0,1367,,00.html">in Business</a></i></font> +<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/business/0,1367,,00.html">in Business</a></i></font> <br> <br> <!-- IBD_SUBJECT:Can't Beat 'Em? Green 'Em --><font face="Arial, Helvetica, sans-serif" size="3"><b><a href="/news/technology/0,1282,31927,00.html">Can't Beat 'Em? 
Green 'Em</a></b></font> <br> -<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">High-tech companies are notoriously environmentally unfriendly, and a growing number of "Greenies" are trying to change things from the inside ... with varying results. By Chris Gaither.</font> +<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">High-tech companies are notoriously environmentally unfriendly, and a growing number of "Greenies" are trying to change things from the inside ... with varying results. By Chris Gaither.</font> <br> -<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/technology/0,1282,,00.html">in Technology</a></i></font> +<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/technology/0,1282,,00.html">in Technology</a></i></font> <br> <br> <!-- IBD_SUBJECT:Y2K Cloud Over MS Office --><font face="Arial, Helvetica, sans-serif" size="3"><b><a href="/news/business/0,1367,31932,00.html">Y2K Cloud Over MS Office</a></b></font> <br> -<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">Windows NT sales remain strong, but corporate clients are wary of upgrading to MS Office 2000. Analysts say that means strong, but not stunning, Microsoft earnings. </font> +<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">Windows NT sales remain strong, but corporate clients are wary of upgrading to MS Office 2000. Analysts say that means strong, but not stunning, Microsoft earnings. 
</font> <br> -<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/business/0,1367,,00.html">in Business</a></i></font> +<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/business/0,1367,,00.html">in Business</a></i></font> <br> <br> <font color="#FF0000" face="Verdana, Arial, Geneva, sans-serif" size="1">Med-Tech</font> <br> <!-- IBD_SUBJECT:Biochips for Custom Chemo --><font face="Arial, Helvetica, sans-serif" size="3"><b><a href="/news/technology/0,1282,31914,00.html">Biochips for Custom Chemo</a></b></font> <br> -<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">Different cancer patients need different medicine, but doctors can rarely determine the best match. New biochip technology promises chemotherapy tailored to a tumor's genetic make-up. By Kristen Philipkoski.</font> +<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">Different cancer patients need different medicine, but doctors can rarely determine the best match. New biochip technology promises chemotherapy tailored to a tumor's genetic make-up. By Kristen Philipkoski.</font> <br> -<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/technology/0,1282,,00.html">in Technology</a></i></font> +<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/technology/0,1282,,00.html">in Technology</a></i></font> <br> <br> <!-- IBD_SUBJECT:High Stakes in Priceline Suit --><font face="Arial, Helvetica, sans-serif" size="3"><b><a href="/news/business/0,1367,31916,00.html">High Stakes in Priceline Suit</a></b></font> <br> -<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">It's not just another round of Redmond-bashing. 
A Priceline.com lawsuit against Microsoft's Expedia.com may have a big impact on how Net companies protect their business models. By Joanna Glasner.</font> +<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">It's not just another round of Redmond-bashing. A Priceline.com lawsuit against Microsoft's Expedia.com may have a big impact on how Net companies protect their business models. By Joanna Glasner.</font> <br> -<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/business/0,1367,,00.html">in Business</a></i></font> +<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/business/0,1367,,00.html">in Business</a></i></font> <br> <br> <!-- IBD_SUBJECT:Biodiversity Merges Online --><font face="Arial, Helvetica, sans-serif" size="3"><b><a href="/news/technology/0,1282,31918,00.html">Biodiversity Merges Online</a></b></font> <br> -<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The far-flung databases on global biodiversity get together to form one monster database. Soon the red-eyed tree frog will be eyeing those Swedish lingonberries. From the Environment News Service.</font> +<font color="#ff0000" face="Verdana, Arial, Geneva, sans-serif" size="1">3:00 a.m.</font> <font face="Verdana, Arial, Geneva, sans-serif" size="2">The far-flung databases on global biodiversity get together to form one monster database. Soon the red-eyed tree frog will be eyeing those Swedish lingonberries. 
From the Environment News Service.</font> <br> -<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/technology/0,1282,,00.html">in Technology</a></i></font> +<font face="Verdana, Arial, Helvetica, sans-serif" size="1"><i><a href="/news/technology/0,1282,,00.html">in Technology</a></i></font> <br> <br> <!-- SQL above --><!------TRADES---------><br> @@ -597,18 +597,18 @@ Contruction workers in Berlin opened an old wound in the German psyche this week <br> <p><font face="Verdana, Arial, Geneva, sans-serif" size="1"> <a href="http://www.wired.com/news/feedback.html">Send us feedback</a> - |  + | <a href="http://www.hotwired.com/jobs/">Work at Wired Digital</a> - |  + | <a href="http://home.wired.com/advertising/">Advertise with us</a> <br> <a href="http://home.wired.com/">About Wired Digital</a> - |  + | <a href="http://www.wired.com/home/digital/privacy/">Our Privacy Policy</a> </font></p> <p> <font face="Verdana, Arial, Geneva" size="1"> -<a href="http://www.wired.com/home/copyright.html">Copyright</a> © 1994-99 Wired Digital Inc. All rights reserved.</font> +<a href="http://www.wired.com/home/copyright.html">Copyright</a> © 1994-99 Wired Digital Inc. 
All rights reserved.</font> <br> <!-- TRACKING --><img src="http://www.wired.com/special/modx/news.gif" height="1" width="1" alt=""> <map NAME="navstrip.map"> diff --git a/result/HTML/wired.html.err b/result/HTML/wired.html.err index b65f43db..357dc0a1 100644 --- a/result/HTML/wired.html.err +++ b/result/HTML/wired.html.err @@ -181,75 +181,75 @@ option value="http://www.hotbot.com/?SM=MC&DV=0&LG=any&RD=RG&DC=10&DE=2&_v=2&OP ./test/HTML/wired.html:97: error: htmlParseEntityRef: expecting ';' lue="http://www.hotbot.com/?SM=MC&DV=0&LG=any&RD=RG&DC=10&DE=2&_v=2&OPs=MDRTP&M ^ -./test/HTML/wired.html:165: error: Opening and ending tag mismatch: td and form +./test/HTML/wired.html:159: error: Opening and ending tag mismatch: td and form </td> ^ -./test/HTML/wired.html:170: error: Unexpected end tag : form +./test/HTML/wired.html:164: error: Unexpected end tag : form </tr> </form> ^ -./test/HTML/wired.html:244: error: Opening and ending tag mismatch: td and form +./test/HTML/wired.html:238: error: Opening and ending tag mismatch: td and form </select></font></td></tr> ^ -./test/HTML/wired.html:248: error: htmlParseEntityRef: expecting ';' +./test/HTML/wired.html:242: error: htmlParseEntityRef: expecting ';' MG SRC="http://barnesandnoble.bfast.com/booklink/serve?sourceid=383471&is_searc ^ -./test/HTML/wired.html:265: error: Unexpected end tag : form +./test/HTML/wired.html:257: error: Unexpected end tag : form </tr> </form> ^ -./test/HTML/wired.html:346: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:338: error: Opening and ending tag mismatch: td and font </td> ^ -./test/HTML/wired.html:374: error: htmlParseEntityRef: no name +./test/HTML/wired.html:366: error: htmlParseEntityRef: no name a, sans-serif"><b><a href="/news/commentarySection/0,1292,31926,00.html">Rants ^ -./test/HTML/wired.html:374: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:366: error: Opening and ending tag mismatch: td and font Readers on 
Apple's G4 ... AOL's passwords ... MS vs. Linux.</font><br><br> </t ^ -./test/HTML/wired.html:374: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:366: error: Opening and ending tag mismatch: td and font Readers on Apple's G4 ... AOL's passwords ... MS vs. Linux.</font><br><br> </t ^ -./test/HTML/wired.html:402: error: Opening and ending tag mismatch: a and font +./test/HTML/wired.html:394: error: Opening and ending tag mismatch: a and font w.vignette.com/" style="text-decoration:none"><font color="#000000">Vignette</a ^ -./test/HTML/wired.html:407: error: htmlParseEntityRef: expecting ';' +./test/HTML/wired.html:398: error: htmlParseEntityRef: expecting ';' ervlet/appservlet?from=/wired/sprint/&template=/security/security.html&SITE= ^ -./test/HTML/wired.html:407: error: htmlParseEntityRef: expecting ';' +./test/HTML/wired.html:398: error: htmlParseEntityRef: expecting ';' ervlet/appservlet?from=/wired/sprint/&template=/security/security.html&SITE= ^ -./test/HTML/wired.html:408: error: htmlParseEntityRef: expecting ';' +./test/HTML/wired.html:398: error: htmlParseEntityRef: expecting ';' wired.com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Spr ^ -./test/HTML/wired.html:408: error: Opening and ending tag mismatch: a and font +./test/HTML/wired.html:398: error: Opening and ending tag mismatch: a and font com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</a ^ -./test/HTML/wired.html:408: error: End tag : expected '>' +./test/HTML/wired.html:398: error: End tag : expected '>' =Sprint" style="text-decoration:none"><font color="#000000">Sprint</a></i></fon ^ -./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:404: error: Opening and ending tag mismatch: td and font </td> ^ -./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:404: error: Opening and ending tag mismatch: td and font </td> ^ 
-./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:404: error: Opening and ending tag mismatch: td and font </td> ^ -./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:404: error: Opening and ending tag mismatch: td and font </td> ^ -./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:404: error: Opening and ending tag mismatch: td and font </td> ^ -./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:404: error: Opening and ending tag mismatch: td and font </td> ^ -./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:404: error: Opening and ending tag mismatch: td and font </td> ^ -./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:404: error: Opening and ending tag mismatch: td and font </td> ^ -./test/HTML/wired.html:432: error: htmlParseEntityRef: expecting ';' +./test/HTML/wired.html:422: error: htmlParseEntityRef: expecting ';' href="http://www.lycos.com/news/flash/hitlerbunker.html?v=wn1015&lpv=1">Lycos</ ^ diff --git a/result/valid/REC-xml-19980210.xml b/result/valid/REC-xml-19980210.xml index a27855db..2d4f035f 100644 --- a/result/valid/REC-xml-19980210.xml +++ b/result/valid/REC-xml-19980210.xml @@ -2574,7 +2574,7 @@ For example, given the following declarations: © 1947 %pub;. &rights;" >]]></eg> then the replacement text for the entity "<code>book</code>" is: <eg>La Peste: Albert Camus, -© 1947 Éditions Gallimard. &rights;</eg> +© 1947 Éditions Gallimard. 
&rights;</eg> The general-entity reference "<code>&rights;</code>" would be expanded should the reference "<code>&book;</code>" appear in the document's content or an attribute value.</p> @@ -49,6 +49,7 @@ static int sax = 0; static int repeat = 0; static int noout = 0; static int push = 0; +static char *encoding = NULL; xmlSAXHandler emptySAXHandlerStruct = { NULL, /* internalSubset */ @@ -638,12 +639,18 @@ void parseAndPrintFile(char *filename) { */ if (!noout) { #ifdef LIBXML_DEBUG_ENABLED - if (!debug) - htmlDocDump(stdout, doc); - else + if (!debug) { + if (encoding) + htmlSaveFileEnc("-", doc, encoding); + else + htmlDocDump(stdout, doc); + } else xmlDebugDumpDocument(stdout, doc); #else - htmlDocDump(stdout, doc); + if (encoding) + htmlSaveFileEnc("-", doc, encoding); + else + htmlDocDump(stdout, doc); #endif } @@ -674,8 +681,18 @@ int main(int argc, char **argv) { else if ((!strcmp(argv[i], "-repeat")) || (!strcmp(argv[i], "--repeat"))) repeat++; + else if ((!strcmp(argv[i], "-encode")) || + (!strcmp(argv[i], "--encode"))) { + i++; + encoding = argv[i]; + } } for (i = 1; i < argc ; i++) { + if ((!strcmp(argv[i], "-encode")) || + (!strcmp(argv[i], "--encode"))) { + i++; + continue; + } if (argv[i][0] != '-') { if (repeat) { for (count = 0;count < 100 * repeat;count++) { @@ -705,6 +722,7 @@ int main(int argc, char **argv) { printf("\t--repeat : parse the file 100 times, for timing\n"); printf("\t--noout : do not print the result\n"); printf("\t--push : use the push mode parser\n"); + printf("\t--encode encoding : output in the given encoding\n"); } xmlCleanupParser(); xmlMemoryDump(); @@ -36,6 +36,7 @@ #endif +#include <libxml/xml-error.h> #include <libxml/parser.h> #include <libxml/parserInternals.h> /* only for xmlNewInputFromFile() */ #include <libxml/tree.h> @@ -2984,7 +2984,7 @@ xmlNodeAddContentLen(xmlNodePtr cur, const xmlChar *content, int len) { switch (cur->type) { case XML_DOCUMENT_FRAG_NODE: case XML_ELEMENT_NODE: { - xmlNodePtr last = NULL, new; 
+ xmlNodePtr last = NULL, newNode; if (cur->children != NULL) { last = cur->last; @@ -3006,11 +3006,11 @@ xmlNodeAddContentLen(xmlNodePtr cur, const xmlChar *content, int len) { last = cur->last; } } - new = xmlNewTextLen(content, len); - if (new != NULL) { - xmlAddChild(cur, new); - if ((last != NULL) && (last->next == new)) { - xmlTextMerge(last, new); + newNode = xmlNewTextLen(content, len); + if (newNode != NULL) { + xmlAddChild(cur, newNode); + if ((last != NULL) && (last->next == newNode)) { + xmlTextMerge(last, newNode); } } break; @@ -3470,6 +3470,54 @@ xmlReconciliateNs(xmlDocPtr doc, xmlNodePtr tree) { } /** + * xmlHasProp: + * @node: the node + * @name: the attribute name + * + * Search an attribute associated to a node + * This function also looks in DTD attribute declaration for #FIXED or + * default declaration values unless DTD use has been turned off. + * + * Returns the attribute or the attribute declaration or NULL if + * neither was found. + */ +xmlAttrPtr +xmlHasProp(xmlNodePtr node, const xmlChar *name) { + xmlAttrPtr prop; + xmlDocPtr doc; + + if ((node == NULL) || (name == NULL)) return(NULL); + /* + * Check on the properties attached to the node + */ + prop = node->properties; + while (prop != NULL) { + if (!xmlStrcmp(prop->name, name)) { + return(prop); + } + prop = prop->next; + } + if (!xmlCheckDTD) return(NULL); + + /* + * Check if there is a default declaration in the internal + * or external subsets + */ + doc = node->doc; + if (doc != NULL) { + xmlAttributePtr attrDecl; + if (doc->intSubset != NULL) { + attrDecl = xmlGetDtdAttrDesc(doc->intSubset, node->name, name); + if ((attrDecl == NULL) && (doc->extSubset != NULL)) + attrDecl = xmlGetDtdAttrDesc(doc->extSubset, node->name, name); + if (attrDecl != NULL) + return((xmlAttrPtr) attrDecl); + } + } + return(NULL); +} + +/** * xmlGetProp: * @node: the node * @name: the attribute name @@ -3652,7 +3700,9 @@ xmlNodeIsText(xmlNodePtr node) { * xmlIsBlankNode: * @node: the node * - * Is this 
node a Text node ? + * Checks whether this node is an empty or whitespace only + * (and possibly ignorable) text-node. + * * Returns 1 yes, 0 no */ int @@ -3863,7 +3913,7 @@ xmlBufferGrow(xmlBufferPtr buf, int len) { size = buf->use + len + 100; - newbuf = xmlRealloc(buf->content, size); + newbuf = (xmlChar *) xmlRealloc(buf->content, size); if (newbuf == NULL) return(-1); buf->content = newbuf; buf->size = size; @@ -5099,7 +5149,7 @@ xmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, if (cur->encoding != NULL) encoding = (const char *) cur->encoding; else if (cur->charset != XML_CHAR_ENCODING_UTF8) - encoding = xmlGetCharEncodingName(cur->charset); + encoding = xmlGetCharEncodingName((xmlCharEncoding) cur->charset); } if (encoding != NULL) { xmlOutputBufferWriteString(buf, " encoding="); @@ -5224,91 +5274,6 @@ xmlSetCompressMode(int mode) { else xmlCompressMode = mode; } -#if 0 -/** - * xmlDocDump: - * @f: the FILE* - * @cur: the document - * - * Dump an XML document to an open FILE. - */ -void -xmlDocDump(FILE *f, xmlDocPtr cur) { - xmlBufferPtr buf; - - if (cur == NULL) { -#ifdef DEBUG_TREE - fprintf(stderr, "xmlDocDump : document == NULL\n"); -#endif - return; - } - buf = xmlBufferCreate(); - if (buf == NULL) return; - xmlDocContentDump(buf, cur); - xmlBufferDump(f, buf); - xmlBufferFree(buf); -} - -/** - * xmlSaveFile: - * @filename: the filename - * @cur: the document - * - * Dump an XML document to a file. Will use compression if - * compiled in and enabled. If @filename is "-" the stdout file is - * used. - * returns: the number of file written or -1 in case of failure. - */ -int -xmlSaveFile(const char *filename, xmlDocPtr cur) { - xmlBufferPtr buf; -#ifdef HAVE_ZLIB_H - gzFile zoutput = NULL; - char mode[15]; -#endif - FILE *output = NULL; - int ret; - - /* - * save the content to a temp buffer. 
- */ - buf = xmlBufferCreate(); - if (buf == NULL) return(0); - xmlDocContentDump(buf, cur); - -#ifdef HAVE_ZLIB_H - if (cur->compression < 0) cur->compression = xmlCompressMode; - if ((cur->compression > 0) && (cur->compression <= 9)) { - sprintf(mode, "w%d", cur->compression); - if (!strcmp(filename, "-")) - zoutput = gzdopen(1, mode); - else - zoutput = gzopen(filename, mode); - } - if (zoutput == NULL) { -#endif - output = fopen(filename, "w"); - if (output == NULL) { - xmlBufferFree(buf); - return(-1); - } -#ifdef HAVE_ZLIB_H - } - - if (zoutput != NULL) { - ret = gzwrite(zoutput, buf->content, sizeof(xmlChar) * buf->use); - gzclose(zoutput); - } else { -#endif - ret = xmlBufferDump(output, buf); - fclose(output); -#ifdef HAVE_ZLIB_H - } -#endif - xmlBufferFree(buf); - return(ret * sizeof(xmlChar)); -} -#else /** * xmlDocDump: * @f: the FILE* @@ -5316,11 +5281,13 @@ xmlSaveFile(const char *filename, xmlDocPtr cur) { * * Dump an XML document to an open FILE. * - * returns: the number of file written or -1 in case of failure. + * returns: the number of byte written or -1 in case of failure. */ int xmlDocDump(FILE *f, xmlDocPtr cur) { xmlOutputBufferPtr buf; + const char * encoding; + xmlCharEncodingHandlerPtr handler = NULL; int ret; if (cur == NULL) { @@ -5329,38 +5296,27 @@ xmlDocDump(FILE *f, xmlDocPtr cur) { #endif return(-1); } - buf = xmlOutputBufferCreateFile(f, NULL); - if (buf == NULL) return(-1); - xmlDocContentDumpOutput(buf, cur, NULL); + encoding = (const char *) cur->encoding; - ret = xmlOutputBufferClose(buf); - return(ret); -} - -/** - * xmlSaveFile: - * @filename: the filename (or URL) - * @cur: the document - * - * Dump an XML document to a file. Will use compression if - * compiled in and enabled. If @filename is "-" the stdout file is - * used. - * returns: the number of file written or -1 in case of failure. 
- */ -int -xmlSaveFile(const char *filename, xmlDocPtr cur) { - xmlOutputBufferPtr buf; - int ret; + if (encoding != NULL) { + xmlCharEncoding enc; - /* - * save the content to a temp buffer. - */ -#ifdef HAVE_ZLIB_H - if (cur->compression < 0) cur->compression = xmlCompressMode; -#endif - buf = xmlOutputBufferCreateFilename(filename, NULL, cur->compression); - if (buf == NULL) return(0); + enc = xmlParseCharEncoding(encoding); + if (cur->charset != XML_CHAR_ENCODING_UTF8) { + fprintf(stderr, "xmlDocDump: document not in UTF8\n"); + return(-1); + } + if (enc != XML_CHAR_ENCODING_UTF8) { + handler = xmlFindCharEncodingHandler(encoding); + if (handler == NULL) { + xmlFree((char *) cur->encoding); + cur->encoding = NULL; + } + } + } + buf = xmlOutputBufferCreateFile(f, handler); + if (buf == NULL) return(-1); xmlDocContentDumpOutput(buf, cur, NULL); ret = xmlOutputBufferClose(buf); @@ -5375,7 +5331,7 @@ xmlSaveFile(const char *filename, xmlDocPtr cur) { * * Dump an XML document to an I/O buffer. * - * returns: the number of file written or -1 in case of failure. + * returns: the number of byte written or -1 in case of failure. */ int xmlSaveFileTo(xmlOutputBuffer *buf, xmlDocPtr cur, const char *encoding) { @@ -5395,7 +5351,7 @@ xmlSaveFileTo(xmlOutputBuffer *buf, xmlDocPtr cur, const char *encoding) { * * Dump an XML document, converting it to the given encoding * - * returns: the number of file written or -1 in case of failure. + * returns: the number of byte written or -1 in case of failure. 
*/ int xmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { @@ -5407,17 +5363,15 @@ xmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { xmlCharEncoding enc; enc = xmlParseCharEncoding(encoding); - if (enc != cur->charset) { - if (cur->charset != XML_CHAR_ENCODING_UTF8) { - /* - * Not supported yet - */ - return(-1); - } - + if (cur->charset != XML_CHAR_ENCODING_UTF8) { + fprintf(stderr, "xmlSaveFileEnc: document not in UTF8\n"); + return(-1); + } + if (enc != XML_CHAR_ENCODING_UTF8) { handler = xmlFindCharEncodingHandler(encoding); - if (handler == NULL) + if (handler == NULL) { return(-1); + } } } @@ -5432,4 +5386,58 @@ xmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { ret = xmlOutputBufferClose(buf); return(ret); } + +/** + * xmlSaveFile: + * @filename: the filename (or URL) + * @cur: the document + * + * Dump an XML document to a file. Will use compression if + * compiled in and enabled. If @filename is "-" the stdout file is + * used. + * returns: the number of byte written or -1 in case of failure. + */ +int +xmlSaveFile(const char *filename, xmlDocPtr cur) { + xmlOutputBufferPtr buf; + const char *encoding; + xmlCharEncodingHandlerPtr handler = NULL; + int ret; + + if (cur == NULL) + return(-1); + encoding = (const char *) cur->encoding; + + /* + * save the content to a temp buffer. 
+ */ +#ifdef HAVE_ZLIB_H + if (cur->compression < 0) cur->compression = xmlCompressMode; #endif + if (encoding != NULL) { + xmlCharEncoding enc; + + enc = xmlParseCharEncoding(encoding); + + if (cur->charset != XML_CHAR_ENCODING_UTF8) { + fprintf(stderr, "xmlSaveFile: document not in UTF8\n"); + return(-1); + } + if (enc != XML_CHAR_ENCODING_UTF8) { + handler = xmlFindCharEncodingHandler(encoding); + if (handler == NULL) { + xmlFree((char *) cur->encoding); + cur->encoding = NULL; + } + } + } + + buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); + if (buf == NULL) return(0); + + xmlDocContentDumpOutput(buf, cur, NULL); + + ret = xmlOutputBufferClose(buf); + return(ret); +} + @@ -549,6 +549,8 @@ xmlAttrPtr xmlSetProp (xmlNodePtr node, const xmlChar *value); xmlChar * xmlGetProp (xmlNodePtr node, const xmlChar *name); +xmlAttrPtr xmlHasProp (xmlNodePtr node, + const xmlChar *name); xmlChar * xmlGetNsProp (xmlNodePtr node, const xmlChar *name, const xmlChar *nameSpace); @@ -207,7 +207,7 @@ xmlSaveUri(xmlURIPtr uri) { max = 80; - ret = xmlMalloc((max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlMalloc((max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -219,7 +219,7 @@ xmlSaveUri(xmlURIPtr uri) { while (*p != 0) { if (len >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -229,7 +229,7 @@ xmlSaveUri(xmlURIPtr uri) { } if (len >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -242,7 +242,7 @@ xmlSaveUri(xmlURIPtr uri) { while (*p != 0) { if (len + 3 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) 
xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -278,7 +278,7 @@ xmlSaveUri(xmlURIPtr uri) { } if (len >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -289,7 +289,7 @@ xmlSaveUri(xmlURIPtr uri) { if (uri->server != NULL) { if (len + 3 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -302,7 +302,7 @@ xmlSaveUri(xmlURIPtr uri) { while (*p != 0) { if (len + 3 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -338,7 +338,7 @@ xmlSaveUri(xmlURIPtr uri) { } if (len + 3 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -350,7 +350,7 @@ xmlSaveUri(xmlURIPtr uri) { while (*p != 0) { if (len >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -361,7 +361,7 @@ xmlSaveUri(xmlURIPtr uri) { if (uri->port > 0) { if (len + 10 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -372,7 +372,7 @@ xmlSaveUri(xmlURIPtr uri) { } else if (uri->authority != NULL) { if (len + 3 >= max) 
{ max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -384,7 +384,7 @@ xmlSaveUri(xmlURIPtr uri) { while (*p != 0) { if (len + 3 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -424,7 +424,7 @@ xmlSaveUri(xmlURIPtr uri) { while (*p != 0) { if (len + 3 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -462,7 +462,7 @@ xmlSaveUri(xmlURIPtr uri) { if (uri->query != NULL) { if (len + 3 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -473,7 +473,7 @@ xmlSaveUri(xmlURIPtr uri) { while (*p != 0) { if (len + 3 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -508,7 +508,7 @@ xmlSaveUri(xmlURIPtr uri) { if (uri->fragment != NULL) { if (len + 3 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -519,7 +519,7 @@ xmlSaveUri(xmlURIPtr uri) { while (*p != 0) { if (len + 3 >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of 
memory\n"); return(NULL); @@ -553,7 +553,7 @@ xmlSaveUri(xmlURIPtr uri) { } if (len >= max) { max *= 2; - ret = xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); + ret = (xmlChar *) xmlRealloc(ret, (max + 1) * sizeof(xmlChar)); if (ret == NULL) { fprintf(stderr, "xmlSaveUri: out of memory\n"); return(NULL); @@ -33,7 +33,7 @@ scope int name##VPush(xmlValidCtxtPtr ctxt, type value) { \ if (ctxt->name##Nr >= ctxt->name##Max) { \ ctxt->name##Max *= 2; \ - ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \ + ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \ ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \ if (ctxt->name##Tab == NULL) { \ fprintf(stderr, "realloc failed !\n"); \ @@ -538,9 +538,11 @@ xmlAddElementDecl(xmlValidCtxtPtr ctxt, xmlDtdPtr dtd, const xmlChar *name, /* * Create the Element table if needed. */ - table = dtd->elements; - if (table == NULL) - table = dtd->elements = xmlCreateElementTable(); + table = (xmlElementTablePtr) dtd->elements; + if (table == NULL) { + table = xmlCreateElementTable(); + dtd->elements = (void *) table; + } if (table == NULL) { fprintf(stderr, "xmlAddElementDecl: Table creation failed!\n"); return(NULL); @@ -909,7 +911,7 @@ xmlScanAttributeDecl(xmlDtdPtr dtd, const xmlChar *elem) { fprintf(stderr, "xmlScanAttributeDecl: elem == NULL\n"); return(NULL); } - table = dtd->attributes; + table = (xmlAttributeTablePtr) dtd->attributes; if (table == NULL) return(NULL); @@ -1029,9 +1031,11 @@ xmlAddAttributeDecl(xmlValidCtxtPtr ctxt, xmlDtdPtr dtd, const xmlChar *elem, /* * Create the Attribute table if needed. 
*/ - table = dtd->attributes; - if (table == NULL) - table = dtd->attributes = xmlCreateAttributeTable(); + table = (xmlAttributeTablePtr) dtd->attributes; + if (table == NULL) { + table = xmlCreateAttributeTable(); + dtd->attributes = (void *) table; + } if (table == NULL) { fprintf(stderr, "xmlAddAttributeDecl: Table creation failed!\n"); return(NULL); @@ -1388,9 +1392,9 @@ xmlAddNotationDecl(xmlValidCtxtPtr ctxt, xmlDtdPtr dtd, const xmlChar *name, /* * Create the Notation table if needed. */ - table = dtd->notations; + table = (xmlNotationTablePtr) dtd->notations; if (table == NULL) - table = dtd->notations = xmlCreateNotationTable(); + dtd->notations = table = xmlCreateNotationTable(); if (table == NULL) { fprintf(stderr, "xmlAddNotationDecl: Table creation failed!\n"); return(NULL); @@ -1657,9 +1661,9 @@ xmlAddID(xmlValidCtxtPtr ctxt, xmlDocPtr doc, const xmlChar *value, /* * Create the ID table if needed. */ - table = doc->ids; + table = (xmlIDTablePtr) doc->ids; if (table == NULL) - table = doc->ids = xmlCreateIDTable(); + doc->ids = table = xmlCreateIDTable(); if (table == NULL) { fprintf(stderr, "xmlAddID: Table creation failed!\n"); return(NULL); @@ -1804,7 +1808,7 @@ xmlRemoveID(xmlDocPtr doc, xmlAttrPtr attr) { if (doc == NULL) return(-1); if (attr == NULL) return(-1); - table = doc->ids; + table = (xmlIDTablePtr) doc->ids; if (table == NULL) return(-1); @@ -1848,7 +1852,7 @@ xmlGetID(xmlDocPtr doc, const xmlChar *ID) { return(NULL); } - table = doc->ids; + table = (xmlIDTablePtr) doc->ids; if (table == NULL) return(NULL); @@ -1935,9 +1939,9 @@ xmlAddRef(xmlValidCtxtPtr ctxt, xmlDocPtr doc, const xmlChar *value, /* * Create the Ref table if needed. 
*/ - table = doc->refs; + table = (xmlRefTablePtr) doc->refs; if (table == NULL) - table = doc->refs = xmlCreateRefTable(); + doc->refs = table = xmlCreateRefTable(); if (table == NULL) { fprintf(stderr, "xmlAddRef: Table creation failed!\n"); return(NULL); @@ -2065,7 +2069,7 @@ xmlRemoveRef(xmlDocPtr doc, xmlAttrPtr attr) { if (doc == NULL) return(-1); if (attr == NULL) return(-1); - table = doc->refs; + table = (xmlRefTablePtr) doc->refs; if (table == NULL) return(-1); @@ -2109,7 +2113,7 @@ xmlGetRef(xmlDocPtr doc, const xmlChar *Ref) { return(NULL); } - table = doc->refs; + table = (xmlRefTablePtr) doc->refs; if (table == NULL) return(NULL); @@ -2150,7 +2154,7 @@ xmlGetDtdElementDesc(xmlDtdPtr dtd, const xmlChar *name) { if (dtd == NULL) return(NULL); if (dtd->elements == NULL) return(NULL); - table = dtd->elements; + table = (xmlElementTablePtr) dtd->elements; for (i = 0;i < table->nb_elements;i++) { cur = table->table[i]; @@ -2200,7 +2204,7 @@ xmlGetDtdQElementDesc(xmlDtdPtr dtd, const xmlChar *name, if (dtd == NULL) return(NULL); if (dtd->elements == NULL) return(NULL); - table = dtd->elements; + table = (xmlElementTablePtr) dtd->elements; for (i = 0;i < table->nb_elements;i++) { cur = table->table[i]; @@ -2234,7 +2238,7 @@ xmlGetDtdAttrDesc(xmlDtdPtr dtd, const xmlChar *elem, const xmlChar *name) { if (dtd == NULL) return(NULL); if (dtd->attributes == NULL) return(NULL); - table = dtd->attributes; + table = (xmlAttributeTablePtr) dtd->attributes; for (i = 0;i < table->nb_attributes;i++) { cur = table->table[i]; @@ -2288,7 +2292,7 @@ xmlGetDtdQAttrDesc(xmlDtdPtr dtd, const xmlChar *elem, const xmlChar *name, if (dtd == NULL) return(NULL); if (dtd->attributes == NULL) return(NULL); - table = dtd->attributes; + table = (xmlAttributeTablePtr) dtd->attributes; for (i = 0;i < table->nb_attributes;i++) { cur = table->table[i]; @@ -2320,7 +2324,7 @@ xmlGetDtdNotationDesc(xmlDtdPtr dtd, const xmlChar *name) { if (dtd == NULL) return(NULL); if (dtd->notations == NULL) 
return(NULL); - table = dtd->notations; + table = (xmlNotationTablePtr) dtd->notations; for (i = 0;i < table->nb_notations;i++) { cur = table->table[i]; @@ -2890,7 +2894,7 @@ xmlValidateAttributeDecl(xmlValidCtxtPtr ctxt, xmlDocPtr doc, * element in the external subset. */ nbId = 0; - table = doc->intSubset->attributes; + table = (xmlAttributeTablePtr) doc->intSubset->attributes; if (table != NULL) { for (i = 0;i < table->nb_attributes;i++) { if ((table->table[i]->atype == XML_ATTRIBUTE_ID) && @@ -3902,7 +3906,7 @@ xmlValidateDocumentFinal(xmlValidCtxtPtr ctxt, xmlDocPtr doc) { /* * Check all the IDREF/IDREFS attributes definition for validity */ - table = doc->refs; + table = (xmlRefTablePtr) doc->refs; if (table != NULL) { for (i = 0; i < table->nb_refs; i++) { if (table->table[i]->attr->atype == XML_ATTRIBUTE_IDREF) { @@ -4008,7 +4012,7 @@ xmlValidateDtdFinal(xmlValidCtxtPtr ctxt, xmlDocPtr doc) { return(0); dtd = doc->intSubset; if ((dtd != NULL) && (dtd->attributes != NULL)) { - table = dtd->attributes; + table = (xmlAttributeTablePtr) dtd->attributes; for (i = 0;i < table->nb_attributes;i++) { cur = table->table[i]; @@ -4041,7 +4045,7 @@ xmlValidateDtdFinal(xmlValidCtxtPtr ctxt, xmlDocPtr doc) { } dtd = doc->extSubset; if ((dtd != NULL) && (dtd->attributes != NULL)) { - table = dtd->attributes; + table = (xmlAttributeTablePtr) dtd->attributes; for (i = 0;i < table->nb_attributes;i++) { cur = table->table[i]; @@ -16,7 +16,7 @@ #include <libxml/tree.h> #ifdef __cplusplus -#define extern "C" { +extern "C" { #endif /** * Various defines for the various Link properties. 
@@ -1207,7 +1207,7 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) { if (len > buffree) len = buffree; - buffer = xmlMalloc((len + 1) * sizeof(char)); + buffer = (char *) xmlMalloc((len + 1) * sizeof(char)); if (buffer == NULL) { fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n"); return(-1); @@ -666,6 +666,7 @@ int main(int argc, char **argv) { printf("\t--nowarning : do not emit warnings from parser/validator\n"); printf("\t--noblanks : drop (ignorable?) blanks spaces\n"); printf("\t--testIO : test user I/O support\n"); + printf("\t--encode encoding : output in the given encoding\n"); } xmlCleanupParser(); xmlMemoryDump(); diff --git a/xmlmemory.c b/xmlmemory.c index e03e51da..b82a6e07 100644 --- a/xmlmemory.c +++ b/xmlmemory.c @@ -325,7 +325,7 @@ xmlMemStrdupLoc(const char *str, const char *file, int line) #ifdef MEM_LIST debugmem_list_add(p); #endif - s = HDR_2_CLIENT(p); + s = (char *) HDR_2_CLIENT(p); if (xmlMemStopAtBlock == block) xmlMallocBreakpoint(); @@ -382,7 +382,7 @@ void xmlMemContentShow(FILE *fp, MEMHDR *p) { int i,j,len = p->mh_size; - const char *buf = HDR_2_CLIENT(p); + const char *buf = (const char *) HDR_2_CLIENT(p); for (i = 0;i < len;i++) { if (buf[i] == 0) break; @@ -183,7 +183,7 @@ void xmlXPathStringFunction(xmlXPathParserContextPtr ctxt, int nargs); extern int name##Push(xmlXPathParserContextPtr ctxt, type value) { \ if (ctxt->name##Nr >= ctxt->name##Max) { \ ctxt->name##Max *= 2; \ - ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \ + ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \ ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \ if (ctxt->name##Tab == NULL) { \ fprintf(xmlXPathDebug, "realloc failed !\n"); \ @@ -849,10 +849,8 @@ xmlXPathFreeContext(xmlXPathContextPtr ctxt) { if (ctxt->namespaces != NULL) xmlFree(ctxt->namespaces); - /*********** if (ctxt->nodelist != NULL) xmlXPathFreeNodeSet(ctxt->nodelist); - ***********/ #ifdef DEBUG memset(ctxt, 0xB , (size_t) sizeof(xmlXPathContext)); 
#endif @@ -2548,7 +2546,7 @@ xmlXPathStringLengthFunction(xmlXPathParserContextPtr ctxt, int nargs) { */ void xmlXPathConcatFunction(xmlXPathParserContextPtr ctxt, int nargs) { - xmlXPathObjectPtr cur, new; + xmlXPathObjectPtr cur, newobj; xmlChar *tmp; if (nargs < 2) { @@ -2563,17 +2561,17 @@ xmlXPathConcatFunction(xmlXPathParserContextPtr ctxt, int nargs) { nargs--; while (nargs > 0) { - new = valuePop(ctxt); - if ((new == NULL) || (new->type != XPATH_STRING)) { - xmlXPathFreeObject(new); + newobj = valuePop(ctxt); + if ((newobj == NULL) || (newobj->type != XPATH_STRING)) { + xmlXPathFreeObject(newobj); xmlXPathFreeObject(cur); XP_ERROR(XPATH_INVALID_TYPE); } - tmp = xmlStrcat(new->stringval, cur->stringval); - new->stringval = cur->stringval; + tmp = xmlStrcat(newobj->stringval, cur->stringval); + newobj->stringval = cur->stringval; cur->stringval = tmp; - xmlXPathFreeObject(new); + xmlXPathFreeObject(newobj); nargs--; } valuePush(ctxt, cur); |