9 files changed, 489 insertions, 0 deletions
diff --git a/result/boundaries1.xml b/result/boundaries1.xml
new file mode 100644
index 00000000..dc1848f4
--- /dev/null
+++ b/result/boundaries1.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+<!DOCTYPE d [
+<!ENTITY a "]>">
+<!ENTITY b "]>">
+<!--> ]> -->]>
+<?pi p1?>
+<!--> c1 -->
+<d a="&gt;" b="&gt;">
+text&a;text
+<![CDATA[cdata]]>
+<?pi p2?>
+<!--> c2 -->
+</d>
+<?pi p3?>
+<!--> c3 -->
diff --git a/result/boundaries1.xml.rde b/result/boundaries1.xml.rde
new file mode 100644
index 00000000..113b487a
--- /dev/null
+++ b/result/boundaries1.xml.rde
@@ -0,0 +1,19 @@
+0 10 d 0 0
+0 7 pi 0 1 p1
+0 8 #comment 0 1 > c1 
+0 1 d 0 0
+1 3 #text 0 1 
+text]>text
+
+1 4 #cdata-section 0 1 cdata
+1 14 #text 0 1 
+
+1 7 pi 0 1 p2
+1 14 #text 0 1 
+
+1 8 #comment 0 1 > c2 
+1 14 #text 0 1 
+
+0 15 d 0 0
+0 7 pi 0 1 p3
+0 8 #comment 0 1 > c3 
diff --git a/result/boundaries1.xml.rdr b/result/boundaries1.xml.rdr
new file mode 100644
index 00000000..784ece01
--- /dev/null
+++ b/result/boundaries1.xml.rdr
@@ -0,0 +1,21 @@
+0 10 d 0 0
+0 7 pi 0 1 p1
+0 8 #comment 0 1 > c1 
+0 1 d 0 0
+1 3 #text 0 1 
+text
+1 5 a 0 0
+1 3 #text 0 1 text
+
+1 4 #cdata-section 0 1 cdata
+1 14 #text 0 1 
+
+1 7 pi 0 1 p2
+1 14 #text 0 1 
+
+1 8 #comment 0 1 > c2 
+1 14 #text 0 1 
+
+0 15 d 0 0
+0 7 pi 0 1 p3
+0 8 #comment 0 1 > c3 
diff --git a/result/boundaries1.xml.sax b/result/boundaries1.xml.sax
new file mode 100644
index 00000000..19e31815
--- /dev/null
+++ b/result/boundaries1.xml.sax
@@ -0,0 +1,32 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.internalSubset(d, , )
+SAX.entityDecl(a, 1, (null), (null), ]>)
+SAX.getEntity(a)
+SAX.entityDecl(b, 1, (null), (null), ]>)
+SAX.getEntity(b)
+SAX.comment(> ]> )
+SAX.externalSubset(d, , )
+SAX.processingInstruction(pi, p1)
+SAX.comment(> c1 )
+SAX.startElement(d, a='>', b='>')
+SAX.characters(
+text, 5)
+SAX.getEntity(a)
+SAX.characters(]>, 2)
+SAX.reference(a)
+SAX.characters(text
+, 5)
+SAX.pcdata(cdata, 5)
+SAX.characters(
+, 1)
+SAX.processingInstruction(pi, p2)
+SAX.characters(
+, 1)
+SAX.comment(> c2 )
+SAX.characters(
+, 1)
+SAX.endElement(d)
+SAX.processingInstruction(pi, p3)
+SAX.comment(> c3 )
+SAX.endDocument()
diff --git a/result/boundaries1.xml.sax2 b/result/boundaries1.xml.sax2
new file mode 100644
index 00000000..b3ad5e8e
--- /dev/null
+++ b/result/boundaries1.xml.sax2
@@ -0,0 +1,33 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.internalSubset(d, , )
+SAX.entityDecl(a, 1, (null), (null), ]>)
+SAX.getEntity(a)
+SAX.entityDecl(b, 1, (null), (null), ]>)
+SAX.getEntity(b)
+SAX.comment(> ]> )
+SAX.externalSubset(d, , )
+SAX.processingInstruction(pi, p1)
+SAX.comment(> c1 )
+SAX.startElementNs(d, NULL, NULL, 0, 2, 0, a='>" b...', 1, b='>'>
+...', 1)
+SAX.characters(
+text, 5)
+SAX.getEntity(a)
+SAX.characters(]>, 2)
+SAX.reference(a)
+SAX.characters(text
+, 5)
+SAX.pcdata(cdata, 5)
+SAX.characters(
+, 1)
+SAX.processingInstruction(pi, p2)
+SAX.characters(
+, 1)
+SAX.comment(> c2 )
+SAX.characters(
+, 1)
+SAX.endElementNs(d, NULL, NULL)
+SAX.processingInstruction(pi, p3)
+SAX.comment(> c3 )
+SAX.endDocument()
diff --git a/result/noent/boundaries1.xml b/result/noent/boundaries1.xml
new file mode 100644
index 00000000..da20872a
--- /dev/null
+++ b/result/noent/boundaries1.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+<!DOCTYPE d [
+<!ENTITY a "]>">
+<!ENTITY b "]>">
+<!--> ]> -->]>
+<?pi p1?>
+<!--> c1 -->
+<d a="&gt;" b="&gt;">
+text]&gt;text
+<![CDATA[cdata]]>
+<?pi p2?>
+<!--> c2 -->
+</d>
+<?pi p3?>
+<!--> c3 -->
diff --git a/result/noent/boundaries1.xml.sax2 b/result/noent/boundaries1.xml.sax2
new file mode 100644
index 00000000..093ac3be
--- /dev/null
+++ b/result/noent/boundaries1.xml.sax2
@@ -0,0 +1,32 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.internalSubset(d, , )
+SAX.entityDecl(a, 1, (null), (null), ]>)
+SAX.getEntity(a)
+SAX.entityDecl(b, 1, (null), (null), ]>)
+SAX.getEntity(b)
+SAX.comment(> ]> )
+SAX.externalSubset(d, , )
+SAX.processingInstruction(pi, p1)
+SAX.comment(> c1 )
+SAX.startElementNs(d, NULL, NULL, 0, 2, 0, a='>" b...', 1, b='>'>
+...', 1)
+SAX.characters(
+text, 5)
+SAX.getEntity(a)
+SAX.characters(]>, 2)
+SAX.characters(text
+, 5)
+SAX.pcdata(cdata, 5)
+SAX.characters(
+, 1)
+SAX.processingInstruction(pi, p2)
+SAX.characters(
+, 1)
+SAX.comment(> c2 )
+SAX.characters(
+, 1)
+SAX.endElementNs(d, NULL, NULL)
+SAX.processingInstruction(pi, p3)
+SAX.comment(> c3 )
+SAX.endDocument()
diff --git a/runtest.c b/runtest.c
index 4c1c2677..eedd26e5 100644
--- a/runtest.c
+++ b/runtest.c
@@ -1970,6 +1970,306 @@ pushParseTest(const char *filename, const char *result,
     }
     return(0);
 }
+
+static int pushBoundaryCount;      /* constructs counted for the current chunk */
+static int pushBoundaryRefCount;   /* reference callbacks for the current chunk */
+static int pushBoundaryCharsCount; /* characters callbacks for the current chunk */
+static int pushBoundaryCDataCount; /* cdataBlock callbacks for the current chunk */
+
+static void
+internalSubsetBnd(void *ctx, const xmlChar *name, const xmlChar *externalID,
+                  const xmlChar *systemID) {
+    pushBoundaryCount++; /* the internalSubset callback counts as one construct */
+    xmlSAX2InternalSubset(ctx, name, externalID, systemID);
+}
+
+static void
+referenceBnd(void *ctx, const xmlChar *name) {
+    pushBoundaryRefCount++; /* tracked apart from pushBoundaryCount */
+    xmlSAX2Reference(ctx, name);
+}
+
+static void
+charactersBnd(void *ctx, const xmlChar *ch, int len) {
+    pushBoundaryCount++;      /* every characters callback is a construct */
+    pushBoundaryCharsCount++; /* also tallied separately per chunk */
+    xmlSAX2Characters(ctx, ch, len);
+}
+
+static void
+cdataBlockBnd(void *ctx, const xmlChar *ch, int len) {
+    pushBoundaryCount++;      /* every cdataBlock callback is a construct */
+    pushBoundaryCDataCount++; /* also tallied separately per chunk */
+    xmlSAX2CDataBlock(ctx, ch, len);
+}
+
+static void
+processingInstructionBnd(void *ctx, const xmlChar *target,
+                         const xmlChar *data) {
+    pushBoundaryCount++; /* each processing instruction is one construct */
+    xmlSAX2ProcessingInstruction(ctx, target, data);
+}
+
+static void
+commentBnd(void *ctx, const xmlChar *value) {
+    xmlParserCtxtPtr parser = ctx;
+    /* Only comments reported while inSubset is 0 count as a construct. */
+    pushBoundaryCount += (parser->inSubset == 0) ? 1 : 0;
+    xmlSAX2Comment(ctx, value);
+}
+
+static void
+startElementBnd(void *ctx, const xmlChar *xname, const xmlChar **atts) {
+    const char *tag = (const char *) xname;
+    /* Elements that might be created automatically are not counted. */
+    int implied = (strcmp(tag, "html") == 0) ||
+                  (strcmp(tag, "body") == 0) ||
+                  (strcmp(tag, "head") == 0) ||
+                  (strcmp(tag, "p") == 0);
+
+    if (!implied)
+        pushBoundaryCount++;
+    xmlSAX2StartElement(ctx, xname, atts);
+}
+
+static void
+endElementBnd(void *ctx, const xmlChar *name) {
+    /* Not counted: the increment for end tags is left disabled. */
+    xmlSAX2EndElement(ctx, name);
+}
+
+static void
+startElementNsBnd(void *ctx, const xmlChar *localname, const xmlChar *prefix,
+                  const xmlChar *URI, int nb_namespaces,
+                  const xmlChar **namespaces, int nb_attributes,
+                  int nb_defaulted, const xmlChar **attributes) {
+    pushBoundaryCount++; /* each start tag is one construct */
+    xmlSAX2StartElementNs(ctx, localname, prefix, URI, nb_namespaces,
+                          namespaces, nb_attributes, nb_defaulted, attributes);
+}
+
+static void
+endElementNsBnd(void *ctx, const xmlChar *localname, const xmlChar *prefix,
+                const xmlChar *URI) {
+    /* Not counted: the increment for end tags is left disabled. */
+    xmlSAX2EndElementNs(ctx, localname, prefix, URI);
+}
+
+/**
+ * pushBoundaryTest:
+ * @filename: the file to parse
+ * @result: the file with expected result
+ * @err: the file with expected error messages, or NULL to skip that check
+ * @options: parser options; XML_PARSE_HTML selects the HTML push parser
+ *
+ * Check that the push parser finds boundaries between syntactic constructs.
+ *
+ * Returns 0 in case of success, -1 otherwise
+ */
+static int
+pushBoundaryTest(const char *filename, const char *result,
+                 const char *err, int options) {
+    xmlParserCtxtPtr ctxt;
+    xmlDocPtr doc;
+    xmlSAXHandler bndSAX;
+    const char *base;
+    int size, res, numCallbacks = 0;
+    int cur = 1; /* byte 0 is handed over when the context is created */
+    unsigned long avail = 0, oldConsumed = 0, consumed = 0;
+
+    /*
+     * If the parser made progress, check that exactly one construct was
+     * processed and that the input buffer is (almost) empty.
+     * Since we use a chunk size of 1, this tests whether content is
+     * processed as early as possible.
+     */
+
+    nb_tests++;
+
+    memset(&bndSAX, 0, sizeof(bndSAX));
+#ifdef LIBXML_HTML_ENABLED
+    if (options & XML_PARSE_HTML) {
+        xmlSAX2InitHtmlDefaultSAXHandler(&bndSAX);
+        bndSAX.startElement = startElementBnd;
+        bndSAX.endElement = endElementBnd;
+    } else
+#endif
+    {
+        xmlSAXVersion(&bndSAX, 2);
+        bndSAX.startElementNs = startElementNsBnd;
+        bndSAX.endElementNs = endElementNsBnd;
+    }
+
+    bndSAX.internalSubset = internalSubsetBnd;
+    bndSAX.reference = referenceBnd;
+    bndSAX.characters = charactersBnd;
+    bndSAX.cdataBlock = cdataBlockBnd;
+    bndSAX.processingInstruction = processingInstructionBnd;
+    bndSAX.comment = commentBnd;
+
+    /*
+     * load the document in memory and work from there.
+     */
+    if (loadMem(filename, &base, &size) != 0) {
+        fprintf(stderr, "Failed to load %s\n", filename);
+        return(-1);
+    }
+
+#ifdef LIBXML_HTML_ENABLED
+    if (options & XML_PARSE_HTML)
+        ctxt = htmlCreatePushParserCtxt(&bndSAX, NULL, base, 1, filename,
+                                        XML_CHAR_ENCODING_NONE);
+    else
+#endif
+    ctxt = xmlCreatePushParserCtxt(&bndSAX, NULL, base, 1, filename);
+    if (ctxt == NULL) {
+        fprintf(stderr, "Failed to create parser context for %s\n", filename);
+        free((char *)base);
+        return(-1);
+    }
+    xmlCtxtUseOptions(ctxt, options);
+    while ((cur < size) && (numCallbacks <= 1) && (avail <= 0)) {
+        int terminate = (cur + 1 >= size);
+        int isText = 0;
+
+        if (ctxt->instate == XML_PARSER_CONTENT) {
+            int firstChar = (ctxt->input->end > ctxt->input->cur) ?
+                            *ctxt->input->cur :
+                            base[cur];
+
+            if ((firstChar != '<') &&
+                ((options & XML_PARSE_HTML) || (firstChar != '&')))
+                isText = 1;
+        }
+
+        oldConsumed = ctxt->input->consumed +
+                      (unsigned long) (ctxt->input->cur - ctxt->input->base);
+
+        pushBoundaryCount = 0;
+        pushBoundaryRefCount = 0;
+        pushBoundaryCharsCount = 0;
+        pushBoundaryCDataCount = 0;
+
+#ifdef LIBXML_HTML_ENABLED
+        if (options & XML_PARSE_HTML)
+            htmlParseChunk(ctxt, base + cur, 1, terminate);
+        else
+#endif
+        xmlParseChunk(ctxt, base + cur, 1, terminate);
+        cur += 1;
+
+        /*
+         * Callback check: Check that only a single construct was parsed.
+         */
+        if (pushBoundaryRefCount > 0) {
+            numCallbacks = 1;
+        } else {
+            numCallbacks = pushBoundaryCount;
+            if (pushBoundaryCharsCount > 1) {
+                if (options & XML_PARSE_HTML) {
+                    /*
+                     * The HTML parser can generate a mix of chars and
+                     * references.
+                     */
+                    numCallbacks -= pushBoundaryCharsCount - 1;
+                } else {
+                    /*
+                     * Allow two chars callbacks. This can happen when
+                     * multi-byte chars are split across buffer boundaries.
+                     */
+                    numCallbacks -= 1;
+                }
+            }
+            if (options & XML_PARSE_HTML) {
+                /*
+                 * Allow multiple cdata callbacks in HTML mode.
+                 */
+                if (pushBoundaryCDataCount > 1)
+                    numCallbacks -= pushBoundaryCDataCount - 1;
+            }
+        }
+
+        /*
+         * Buffer check: If input was consumed, check that the input
+         * buffer is (almost) empty.
+         */
+        consumed = ctxt->input->consumed +
+                   (unsigned long) (ctxt->input->cur - ctxt->input->base);
+        if ((ctxt->instate != XML_PARSER_DTD) &&
+            (consumed >= 4) &&
+            (consumed != oldConsumed)) {
+            size_t max = 0;
+
+            avail = ctxt->input->end - ctxt->input->cur;
+
+            if ((options & XML_PARSE_HTML) &&
+                (ctxt->instate == XML_PARSER_END_TAG)) {
+                /* Something related to script parsing. */
+                max = 3;
+            } else if (isText) {
+                int c = *ctxt->input->cur;
+
+                /* 3 bytes for partial UTF-8 */
+                max = ((c == '<') || (c == '&')) ? 1 : 3;
+            } else if (ctxt->instate == XML_PARSER_CDATA_SECTION) {
+                /* 2 bytes for terminator, 3 bytes for UTF-8 */
+                max = 5;
+            }
+
+            if (avail <= max)
+                avail = 0;
+        }
+    }
+    doc = ctxt->myDoc;
+#ifdef LIBXML_HTML_ENABLED
+    if (options & XML_PARSE_HTML)
+        res = 1;
+    else
+#endif
+    res = ctxt->wellFormed;
+    xmlFreeParserCtxt(ctxt);
+    free((char *)base);
+    if (numCallbacks > 1) {
+        xmlFreeDoc(doc);
+        fprintf(stderr, "Failed push boundary callback test (%d@%lu-%lu): %s\n",
+                numCallbacks, oldConsumed, consumed, filename);
+        return(-1);
+    }
+    if (avail > 0) {
+        xmlFreeDoc(doc);
+        fprintf(stderr, "Failed push boundary buffer test (%lu@%lu): %s\n",
+                avail, consumed, filename);
+        return(-1);
+    }
+    if (!res) {
+        xmlFreeDoc(doc);
+        fprintf(stderr, "Failed to parse %s\n", filename);
+        return(-1);
+    }
+#ifdef LIBXML_HTML_ENABLED
+    if (options & XML_PARSE_HTML)
+        htmlDocDumpMemory(doc, (xmlChar **) &base, &size);
+    else
+#endif
+    xmlDocDumpMemory(doc, (xmlChar **) &base, &size);
+    xmlFreeDoc(doc);
+    res = compareFileMem(result, base, size);
+    if ((base == NULL) || (res != 0)) {
+        if (base != NULL)
+            xmlFree((char *)base);
+        fprintf(stderr, "Result for %s failed in %s\n", filename, result);
+        return(-1);
+    }
+    xmlFree((char *)base);
+    if (err != NULL) {
+        res = compareFileMem(err, testErrors, testErrorsSize);
+        if (res != 0) {
+            fprintf(stderr, "Error for %s failed\n", filename);
+            return(-1);
+        }
+    }
+    return(0);
+}
 #endif
 
 /**
@@ -4660,6 +4960,9 @@ testDesc testDescriptions[] = {
     { "XML push regression tests" ,
       pushParseTest, "./test/*", "result/", "", NULL,
       0 },
+    { "XML push boundary tests" ,
+      pushBoundaryTest, "./test/*", "result/", "", NULL,
+      0 },
 #endif
 #ifdef LIBXML_HTML_ENABLED
     { "HTML regression tests" ,
@@ -4672,6 +4975,9 @@ testDesc testDescriptions[] = {
     { "Push HTML regression tests" ,
       pushParseTest, "./test/HTML/*", "result/HTML/", "", ".err",
       XML_PARSE_HTML },
+    { "Push HTML boundary tests" ,
+      pushBoundaryTest, "./test/HTML/*", "result/HTML/", "", NULL,
+      XML_PARSE_HTML },
 #endif
     { "HTML SAX regression tests" ,
       saxParseTest, "./test/HTML/*", "result/HTML/", ".sax", NULL,
diff --git a/test/boundaries1.xml b/test/boundaries1.xml
new file mode 100644
index 00000000..c4301cd3
--- /dev/null
+++ b/test/boundaries1.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0"?>
+<!DOCTYPE d [
+    <!ENTITY a "]>">
+    <!ENTITY b ']>'>
+    <!--> ]> -->
+]    >
+<?pi p1?>
+<!--> c1 -->
+<d a=">" b='>'>
+text&a;text
+<![CDATA[cdata]]>
+<?pi p2?>
+<!--> c2 -->
+</d>
+<?pi p3?>
+<!--> c3 -->