From 9c63cea5a6154295ea953b125788ee7b97010bd1 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Sun, 20 Nov 2022 15:36:41 +0100 Subject: test: Add test for push parser boundaries --- runtest.c | 306 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) (limited to 'runtest.c') diff --git a/runtest.c b/runtest.c index 4c1c2677..eedd26e5 100644 --- a/runtest.c +++ b/runtest.c @@ -1970,6 +1970,306 @@ pushParseTest(const char *filename, const char *result, } return(0); } + +static int pushBoundaryCount; +static int pushBoundaryRefCount; +static int pushBoundaryCharsCount; +static int pushBoundaryCDataCount; + +static void +internalSubsetBnd(void *ctx, const xmlChar *name, const xmlChar *externalID, + const xmlChar *systemID) { + pushBoundaryCount++; + xmlSAX2InternalSubset(ctx, name, externalID, systemID); +} + +static void +referenceBnd(void *ctx, const xmlChar *name) { + pushBoundaryRefCount++; + xmlSAX2Reference(ctx, name); +} + +static void +charactersBnd(void *ctx, const xmlChar *ch, int len) { + pushBoundaryCount++; + pushBoundaryCharsCount++; + xmlSAX2Characters(ctx, ch, len); +} + +static void +cdataBlockBnd(void *ctx, const xmlChar *ch, int len) { + pushBoundaryCount++; + pushBoundaryCDataCount++; + xmlSAX2CDataBlock(ctx, ch, len); +} + +static void +processingInstructionBnd(void *ctx, const xmlChar *target, + const xmlChar *data) { + pushBoundaryCount++; + xmlSAX2ProcessingInstruction(ctx, target, data); +} + +static void +commentBnd(void *ctx, const xmlChar *value) { + xmlParserCtxtPtr ctxt = ctx; + if (ctxt->inSubset == 0) + pushBoundaryCount++; + xmlSAX2Comment(ctx, value); +} + +static void +startElementBnd(void *ctx, const xmlChar *xname, const xmlChar **atts) { + const char *name = (const char *)xname; + + /* Some elements might be created automatically. */ + if ((strcmp(name, "html") != 0) && + (strcmp(name, "body") != 0) && + (strcmp(name, "head") != 0) && + (strcmp(name, "p") != 0)) { + pushBoundaryCount++; + } + xmlSAX2StartElement(ctx, xname, atts); +} + +static void +endElementBnd(void *ctx, const xmlChar *name) { + /*pushBoundaryCount++;*/ + xmlSAX2EndElement(ctx, name); +} + +static void +startElementNsBnd(void *ctx, const xmlChar *localname, const xmlChar *prefix, + const xmlChar *URI, int nb_namespaces, + const xmlChar **namespaces, int nb_attributes, + int nb_defaulted, const xmlChar **attributes) { + pushBoundaryCount++; + xmlSAX2StartElementNs(ctx, localname, prefix, URI, nb_namespaces, + namespaces, nb_attributes, nb_defaulted, attributes); +} + +static void +endElementNsBnd(void *ctx, const xmlChar *localname, const xmlChar *prefix, + const xmlChar *URI) { + /*pushBoundaryCount++;*/ + xmlSAX2EndElementNs(ctx, localname, prefix, URI); +} + +/** + * pushBoundaryTest: + * @filename: the file to parse + * @result: the file with expected result + * @err: the file with error messages: unused + * + * Test whether the push parser detects boundaries between syntactical + * elements correctly. + * + * Returns 0 in case of success, an error code otherwise + */ +static int +pushBoundaryTest(const char *filename, const char *result, + const char *err ATTRIBUTE_UNUSED, + int options) { + xmlParserCtxtPtr ctxt; + xmlDocPtr doc; + xmlSAXHandler bndSAX; + const char *base; + int size, res, numCallbacks; + int cur = 0; + unsigned long avail, oldConsumed, consumed; + + /* + * If the parser made progress, check that exactly one construct was + * processed and that the input buffer is (almost) empty. + * Since we use a chunk size of 1, this tests whether content is + * processed as early as possible. + */ + + nb_tests++; + + memset(&bndSAX, 0, sizeof(bndSAX)); +#ifdef LIBXML_HTML_ENABLED + if (options & XML_PARSE_HTML) { + xmlSAX2InitHtmlDefaultSAXHandler(&bndSAX); + bndSAX.startElement = startElementBnd; + bndSAX.endElement = endElementBnd; + } else +#endif + { + xmlSAXVersion(&bndSAX, 2); + bndSAX.startElementNs = startElementNsBnd; + bndSAX.endElementNs = endElementNsBnd; + } + + bndSAX.internalSubset = internalSubsetBnd; + bndSAX.reference = referenceBnd; + bndSAX.characters = charactersBnd; + bndSAX.cdataBlock = cdataBlockBnd; + bndSAX.processingInstruction = processingInstructionBnd; + bndSAX.comment = commentBnd; + + /* + * load the document in memory and work from there. + */ + if (loadMem(filename, &base, &size) != 0) { + fprintf(stderr, "Failed to load %s\n", filename); + return(-1); + } + +#ifdef LIBXML_HTML_ENABLED + if (options & XML_PARSE_HTML) + ctxt = htmlCreatePushParserCtxt(&bndSAX, NULL, base, 1, filename, + XML_CHAR_ENCODING_NONE); + else +#endif + ctxt = xmlCreatePushParserCtxt(&bndSAX, NULL, base, 1, filename); + xmlCtxtUseOptions(ctxt, options); + cur = 1; + consumed = 0; + numCallbacks = 0; + avail = 0; + while ((cur < size) && (numCallbacks <= 1) && (avail <= 0)) { + int terminate = (cur + 1 >= size); + int isText = 0; + + if (ctxt->instate == XML_PARSER_CONTENT) { + int firstChar = (ctxt->input->end > ctxt->input->cur) ? + *ctxt->input->cur : + base[cur]; + + if ((firstChar != '<') && + ((options & XML_PARSE_HTML) || (firstChar != '&'))) + isText = 1; + } + + oldConsumed = ctxt->input->consumed + + (unsigned long) (ctxt->input->cur - ctxt->input->base); + + pushBoundaryCount = 0; + pushBoundaryRefCount = 0; + pushBoundaryCharsCount = 0; + pushBoundaryCDataCount = 0; + +#ifdef LIBXML_HTML_ENABLED + if (options & XML_PARSE_HTML) + htmlParseChunk(ctxt, base + cur, 1, terminate); + else +#endif + xmlParseChunk(ctxt, base + cur, 1, terminate); + cur += 1; + + /* + * Callback check: Check that only a single construct was parsed. + */ + if (pushBoundaryRefCount > 0) { + numCallbacks = 1; + } else { + numCallbacks = pushBoundaryCount; + if (pushBoundaryCharsCount > 1) { + if (options & XML_PARSE_HTML) { + /* + * The HTML parser can generate a mix of chars and + * references. + */ + numCallbacks -= pushBoundaryCharsCount - 1; + } else { + /* + * Allow two chars callbacks. This can happen when + * multi-byte chars are split across buffer boundaries. + */ + numCallbacks -= 1; + } + } + if (options & XML_PARSE_HTML) { + /* + * Allow multiple cdata callbacks in HTML mode. + */ + if (pushBoundaryCDataCount > 1) + numCallbacks -= pushBoundaryCDataCount - 1; + } + } + + /* + * Buffer check: If input was consumed, check that the input + * buffer is (almost) empty. + */ + consumed = ctxt->input->consumed + + (unsigned long) (ctxt->input->cur - ctxt->input->base); + if ((ctxt->instate != XML_PARSER_DTD) && + (consumed >= 4) && + (consumed != oldConsumed)) { + size_t max = 0; + + avail = ctxt->input->end - ctxt->input->cur; + + if ((options & XML_PARSE_HTML) && + (ctxt->instate == XML_PARSER_END_TAG)) { + /* Something related to script parsing. */ + max = 3; + } else if (isText) { + int c = *ctxt->input->cur; + + /* 3 bytes for partial UTF-8 */ + max = ((c == '<') || (c == '&')) ? 1 : 3; + } else if (ctxt->instate == XML_PARSER_CDATA_SECTION) { + /* 2 bytes for terminator, 3 bytes for UTF-8 */ + max = 5; + } + + if (avail <= max) + avail = 0; + } + } + doc = ctxt->myDoc; +#ifdef LIBXML_HTML_ENABLED + if (options & XML_PARSE_HTML) + res = 1; + else +#endif + res = ctxt->wellFormed; + xmlFreeParserCtxt(ctxt); + free((char *)base); + if (numCallbacks > 1) { + xmlFreeDoc(doc); + fprintf(stderr, "Failed push boundary callback test (%d@%lu-%lu): %s\n", + numCallbacks, oldConsumed, consumed, filename); + return(-1); + } + if (avail > 0) { + xmlFreeDoc(doc); + fprintf(stderr, "Failed push boundary buffer test (%lu@%lu): %s\n", + avail, consumed, filename); + return(-1); + } + if (!res) { + xmlFreeDoc(doc); + fprintf(stderr, "Failed to parse %s\n", filename); + return(-1); + } +#ifdef LIBXML_HTML_ENABLED + if (options & XML_PARSE_HTML) + htmlDocDumpMemory(doc, (xmlChar **) &base, &size); + else +#endif + xmlDocDumpMemory(doc, (xmlChar **) &base, &size); + xmlFreeDoc(doc); + res = compareFileMem(result, base, size); + if ((base == NULL) || (res != 0)) { + if (base != NULL) + xmlFree((char *)base); + fprintf(stderr, "Result for %s failed in %s\n", filename, result); + return(-1); + } + xmlFree((char *)base); + if (err != NULL) { + res = compareFileMem(err, testErrors, testErrorsSize); + if (res != 0) { + fprintf(stderr, "Error for %s failed\n", filename); + return(-1); + } + } + return(0); +} #endif /** @@ -4660,6 +4960,9 @@ testDesc testDescriptions[] = { { "XML push regression tests" , pushParseTest, "./test/*", "result/", "", NULL, 0 }, + { "XML push boundary tests" , + pushBoundaryTest, "./test/*", "result/", "", NULL, + 0 }, #endif #ifdef LIBXML_HTML_ENABLED { "HTML regression tests" , @@ -4672,6 +4975,9 @@ testDesc testDescriptions[] = { { "Push HTML regression tests" , pushParseTest, "./test/HTML/*", "result/HTML/", "", ".err", XML_PARSE_HTML }, + { "Push HTML boundary tests" , + pushBoundaryTest, "./test/HTML/*", "result/HTML/", "", NULL, + XML_PARSE_HTML }, #endif { "HTML SAX regression tests" , saxParseTest, "./test/HTML/*", "result/HTML/", ".sax", NULL, -- cgit v1.2.1