summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nick Wellnhofer <wellnhofer@aevum.de> 2023-03-31 16:47:48 +0200
committer Nick Wellnhofer <wellnhofer@aevum.de> 2023-04-07 12:03:28 +0200
commit 73210eeda5782da3414517cab20550d137e2dd25 (patch)
tree 65998c7a7190d8287dc4f30e9675b66f1fe0e319
parent 9a76bfef0f8ea9731155dfa4ed0338792e374a79 (diff)
download libxml2-73210eeda5782da3414517cab20550d137e2dd25.tar.gz
SAX2: Ignore namespaces in HTML documents
In commit 21ca8829, we started to ignore namespaces in HTML element names, but we still called xmlSplitQName, effectively stripping the namespace prefix. This would cause elements like <o:p> to be parsed as <p>. Now we leave the name untouched. Fixes #508.
-rw-r--r-- SAX2.c 15
-rw-r--r-- result/HTML/names.html 6
-rw-r--r-- result/HTML/names.html.err 3
-rw-r--r-- result/HTML/names.html.sax 20
-rw-r--r-- test/HTML/names.html 5
5 files changed, 43 insertions, 6 deletions
diff --git a/SAX2.c b/SAX2.c
index 9faf5b46..0d685905 100644
--- a/SAX2.c
+++ b/SAX2.c
@@ -1608,12 +1608,15 @@ xmlSAX2StartElement(void *ctx, const xmlChar *fullname, const xmlChar **atts)
ctxt->validate = 0;
}
-
- /*
- * Split the full name into a namespace prefix and the tag name
- */
- name = xmlSplitQName(ctxt, fullname, &prefix);
-
+ if (ctxt->html) {
+ prefix = NULL;
+ name = xmlStrdup(fullname);
+ } else {
+ /*
+ * Split the full name into a namespace prefix and the tag name
+ */
+ name = xmlSplitQName(ctxt, fullname, &prefix);
+ }
/*
* Note : the namespace resolution is deferred until the end of the
diff --git a/result/HTML/names.html b/result/HTML/names.html
new file mode 100644
index 00000000..dd7dcc2e
--- /dev/null
+++ b/result/HTML/names.html
@@ -0,0 +1,6 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
+<html>
+<body>
+ <o:p></o:p>
+</body>
+</html>
diff --git a/result/HTML/names.html.err b/result/HTML/names.html.err
new file mode 100644
index 00000000..4d91a5d2
--- /dev/null
+++ b/result/HTML/names.html.err
@@ -0,0 +1,3 @@
+./test/HTML/names.html:3: HTML parser error : Tag o:p invalid
+ <o:p></o:p>
+ ^
diff --git a/result/HTML/names.html.sax b/result/HTML/names.html.sax
new file mode 100644
index 00000000..12a107f8
--- /dev/null
+++ b/result/HTML/names.html.sax
@@ -0,0 +1,20 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.startElement(html)
+SAX.characters(
+, 1)
+SAX.startElement(body)
+SAX.characters(
+ , 3)
+SAX.startElement(o:p)
+SAX.error: Tag o:p invalid
+SAX.endElement(o:p)
+SAX.characters(
+, 1)
+SAX.endElement(body)
+SAX.characters(
+, 1)
+SAX.endElement(html)
+SAX.characters(
+, 1)
+SAX.endDocument()
diff --git a/test/HTML/names.html b/test/HTML/names.html
new file mode 100644
index 00000000..0dac7a47
--- /dev/null
+++ b/test/HTML/names.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+ <o:p></o:p>
+</body>
+</html>