diff options
author | Leonard Richardson <leonardr@segfault.org> | 2016-12-19 20:34:19 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2016-12-19 20:34:19 -0500 |
commit | e0b200dfa8d9282cd3d2c170ba11404c7839c9dc (patch) | |
tree | b0be0e7bddd4a9ccaef3f8b7a45ac425804390d6 | |
parent | 05b4ee8643c26bf9718fdfdfb460c7b9cf78082d (diff) | |
parent | c91a18b10c9d34ae45192401b7a81f7139fb4745 (diff) | |
download | beautifulsoup4-e0b200dfa8d9282cd3d2c170ba11404c7839c9dc.tar.gz |
Fixed foster parenting when html5lib is the tree builder. Thanks to Geoffrey Sneddon for a patch and test.
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 77 | ||||
-rw-r--r-- | bs4/tests/test_html5lib.py | 5 |
3 files changed, 77 insertions, 8 deletions
```diff
@@ -1,5 +1,8 @@
 = Unreleased =
 
+* Fixed foster parenting when html5lib is the tree builder. Thanks to
+  Geoffrey Sneddon for a patch and test.
+
 * Fixed yet another problem that caused the html5lib tree builder to
   create a disconnected parse tree. [bug=1629825]
 
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index ad0deea..5f54893 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -6,6 +6,7 @@ __all__ = [
     ]
 
 import warnings
+import re
 from bs4.builder import (
     PERMISSIVE,
     HTML,
@@ -17,7 +18,10 @@ from bs4.element import (
     whitespace_re,
 )
 import html5lib
-from html5lib.constants import namespaces
+from html5lib.constants import (
+    namespaces,
+    prefixes,
+    )
 from bs4.element import (
     Comment,
     Doctype,
@@ -83,7 +87,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
 
     def create_treebuilder(self, namespaceHTMLElements):
         self.underlying_builder = TreeBuilderForHtml5lib(
-            self.soup, namespaceHTMLElements)
+            namespaceHTMLElements, self.soup)
         return self.underlying_builder
 
     def test_fragment_to_document(self, fragment):
@@ -93,8 +97,12 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
 
 class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
 
-    def __init__(self, soup, namespaceHTMLElements):
-        self.soup = soup
+    def __init__(self, namespaceHTMLElements, soup=None):
+        if soup:
+            self.soup = soup
+        else:
+            from bs4 import BeautifulSoup
+            self.soup = BeautifulSoup("", "html.parser")
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
 
     def documentClass(self):
@@ -117,7 +125,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
         return TextNode(Comment(data), self.soup)
 
     def fragmentClass(self):
-        self.soup = BeautifulSoup("")
+        from bs4 import BeautifulSoup
+        self.soup = BeautifulSoup("", "html.parser")
         self.soup.name = "[document_fragment]"
         return Element(self.soup, self.soup, None)
 
@@ -131,6 +140,56 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
     def getFragment(self):
         return treebuilder_base.TreeBuilder.getFragment(self).element
 
+    def testSerializer(self, element):
+        from bs4 import BeautifulSoup
+        rv = []
+        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
+
+        def serializeElement(element, indent=0):
+            if isinstance(element, BeautifulSoup):
+                pass
+            if isinstance(element, Doctype):
+                m = doctype_re.match(element)
+                if m:
+                    name = m.group(1)
+                    if m.lastindex > 1:
+                        publicId = m.group(2) or ""
+                        systemId = m.group(3) or m.group(4) or ""
+                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (' ' * indent, name, publicId, systemId))
+                    else:
+                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
+                else:
+                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
+            elif isinstance(element, Comment):
+                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
+            elif isinstance(element, NavigableString):
+                rv.append("|%s\"%s\"" % (' ' * indent, element))
+            else:
+                if element.namespace:
+                    name = "%s %s" % (prefixes[element.namespace],
+                                      element.name)
+                else:
+                    name = element.name
+                rv.append("|%s<%s>" % (' ' * indent, name))
+                if element.attrs:
+                    attributes = []
+                    for name, value in element.attrs.items():
+                        if isinstance(name, NamespacedAttribute):
+                            name = "%s %s" % (prefixes[name.namespace], name.name)
+                        if isinstance(value, list):
+                            value = " ".join(value)
+                        attributes.append((name, value))
+
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+                indent += 2
+                for child in element.children:
+                    serializeElement(child, indent)
+        serializeElement(element, 0)
+
+        return "\n".join(rv)
+
 class AttrList(object):
     def __init__(self, element):
         self.element = element
@@ -182,8 +241,10 @@ class Element(treebuilder_base.Node):
                 child = node
             elif node.element.__class__ == NavigableString:
                 string_child = child = node.element
+                node.parent = self
             else:
                 child = node.element
+                node.parent = self
 
         if not isinstance(child, basestring) and child.parent is not None:
             node.element.extract()
@@ -250,11 +311,11 @@ class Element(treebuilder_base.Node):
     attributes = property(getAttributes, setAttributes)
 
     def insertText(self, data, insertBefore=None):
+        text = TextNode(self.soup.new_string(data), self.soup)
         if insertBefore:
-            text = TextNode(self.soup.new_string(data), self.soup)
-            self.insertBefore(data, insertBefore)
+            self.insertBefore(text, insertBefore)
         else:
-            self.appendChild(data)
+            self.appendChild(text)
 
     def insertBefore(self, node, refNode):
         index = self.element.index(refNode.element)
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index eba7e84..0f89d62 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -123,3 +123,8 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         a1, a2 = soup.find_all('a')
         self.assertEqual(a1, a2)
         assert a1 is not a2
+
+    def test_foster_parenting(self):
+        markup = b"""<table><td></tbody>A"""
+        soup = self.soup(markup)
+        self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
```