summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2016-12-19 20:34:19 -0500
committerLeonard Richardson <leonardr@segfault.org>2016-12-19 20:34:19 -0500
commite0b200dfa8d9282cd3d2c170ba11404c7839c9dc (patch)
treeb0be0e7bddd4a9ccaef3f8b7a45ac425804390d6
parent05b4ee8643c26bf9718fdfdfb460c7b9cf78082d (diff)
parentc91a18b10c9d34ae45192401b7a81f7139fb4745 (diff)
downloadbeautifulsoup4-e0b200dfa8d9282cd3d2c170ba11404c7839c9dc.tar.gz
Fixed foster parenting when html5lib is the tree builder. Thanks to Geoffrey Sneddon for a patch and test.
-rw-r--r--NEWS.txt3
-rw-r--r--bs4/builder/_html5lib.py77
-rw-r--r--bs4/tests/test_html5lib.py5
3 files changed, 77 insertions, 8 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 12f979f..c05f189 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
= Unreleased =
+* Fixed foster parenting when html5lib is the tree builder. Thanks to
+ Geoffrey Sneddon for a patch and test.
+
* Fixed yet another problem that caused the html5lib tree builder to
create a disconnected parse tree. [bug=1629825]
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index ad0deea..5f54893 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -6,6 +6,7 @@ __all__ = [
]
import warnings
+import re
from bs4.builder import (
PERMISSIVE,
HTML,
@@ -17,7 +18,10 @@ from bs4.element import (
whitespace_re,
)
import html5lib
-from html5lib.constants import namespaces
+from html5lib.constants import (
+ namespaces,
+ prefixes,
+ )
from bs4.element import (
Comment,
Doctype,
@@ -83,7 +87,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
- self.soup, namespaceHTMLElements)
+ namespaceHTMLElements, self.soup)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
@@ -93,8 +97,12 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
- def __init__(self, soup, namespaceHTMLElements):
- self.soup = soup
+ def __init__(self, namespaceHTMLElements, soup=None):
+ if soup:
+ self.soup = soup
+ else:
+ from bs4 import BeautifulSoup
+ self.soup = BeautifulSoup("", "html.parser")
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
@@ -117,7 +125,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
- self.soup = BeautifulSoup("")
+ from bs4 import BeautifulSoup
+ self.soup = BeautifulSoup("", "html.parser")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
@@ -131,6 +140,56 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def getFragment(self):
return treebuilder_base.TreeBuilder.getFragment(self).element
+ def testSerializer(self, element):
+ from bs4 import BeautifulSoup
+ rv = []
+ doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
+
+ def serializeElement(element, indent=0):
+ if isinstance(element, BeautifulSoup):
+ pass
+ if isinstance(element, Doctype):
+ m = doctype_re.match(element)
+ if m:
+ name = m.group(1)
+ if m.lastindex > 1:
+ publicId = m.group(2) or ""
+ systemId = m.group(3) or m.group(4) or ""
+ rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
+ (' ' * indent, name, publicId, systemId))
+ else:
+ rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
+ else:
+ rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
+ elif isinstance(element, Comment):
+ rv.append("|%s<!-- %s -->" % (' ' * indent, element))
+ elif isinstance(element, NavigableString):
+ rv.append("|%s\"%s\"" % (' ' * indent, element))
+ else:
+ if element.namespace:
+ name = "%s %s" % (prefixes[element.namespace],
+ element.name)
+ else:
+ name = element.name
+ rv.append("|%s<%s>" % (' ' * indent, name))
+ if element.attrs:
+ attributes = []
+ for name, value in element.attrs.items():
+ if isinstance(name, NamespacedAttribute):
+ name = "%s %s" % (prefixes[name.namespace], name.name)
+ if isinstance(value, list):
+ value = " ".join(value)
+ attributes.append((name, value))
+
+ for name, value in sorted(attributes):
+ rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+ indent += 2
+ for child in element.children:
+ serializeElement(child, indent)
+ serializeElement(element, 0)
+
+ return "\n".join(rv)
+
class AttrList(object):
def __init__(self, element):
self.element = element
@@ -182,8 +241,10 @@ class Element(treebuilder_base.Node):
child = node
elif node.element.__class__ == NavigableString:
string_child = child = node.element
+ node.parent = self
else:
child = node.element
+ node.parent = self
if not isinstance(child, basestring) and child.parent is not None:
node.element.extract()
@@ -250,11 +311,11 @@ class Element(treebuilder_base.Node):
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
+ text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
- text = TextNode(self.soup.new_string(data), self.soup)
- self.insertBefore(data, insertBefore)
+ self.insertBefore(text, insertBefore)
else:
- self.appendChild(data)
+ self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index eba7e84..0f89d62 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -123,3 +123,8 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
a1, a2 = soup.find_all('a')
self.assertEqual(a1, a2)
assert a1 is not a2
+
+ def test_foster_parenting(self):
+ markup = b"""<table><td></tbody>A"""
+ soup = self.soup(markup)
+ self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())