diff options
author | Leonard Richardson <leonardr@segfault.org> | 2016-12-19 18:43:56 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2016-12-19 18:43:56 -0500 |
commit | 4f3f7ec5587f2f1f37416c6129dd1a6b3eafabf6 (patch) | |
tree | 925f83e2ffa6c7effd486018f9e1ce441f0bfaf6 | |
parent | fad293d237aadcbd2576e3fd347831724fe847cf (diff) | |
download | beautifulsoup4-4f3f7ec5587f2f1f37416c6129dd1a6b3eafabf6.tar.gz |
Fixed yet another problem that caused the html5lib tree builder to create a disconnected parse tree. [bug=1629825]
-rw-r--r-- | NEWS.txt | 5 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 21 | ||||
-rw-r--r-- | bs4/tests/test_html5lib.py | 17 |
3 files changed, 37 insertions, 6 deletions
```diff
diff --git a/NEWS.txt b/NEWS.txt
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,8 @@
+= Unreleased =
+
+* Fixed yet another problem that caused the html5lib tree builder to
+  create a disconnected parse tree. [bug=1629825]
+
 = 4.5.1 (20160802) =
 
 * Fixed a crash when passing Unicode markup that contained a
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index c46f882..ad0deea 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -221,6 +221,8 @@ class Element(treebuilder_base.Node):
                 most_recent_element=most_recent_element)
 
     def getAttributes(self):
+        if isinstance(self.element, Comment):
+            return {}
         return AttrList(self.element)
 
     def setAttributes(self, attributes):
@@ -274,6 +276,7 @@ class Element(treebuilder_base.Node):
         # print "MOVE", self.element.contents
         # print "FROM", self.element
         # print "TO", new_parent.element
+
         element = self.element
         new_parent_element = new_parent.element
         # Determine what this tag's next_element will be once all the children
@@ -292,7 +295,6 @@ class Element(treebuilder_base.Node):
             new_parents_last_descendant_next_element = new_parent_element.next_element
 
         to_append = element.contents
-        append_after = new_parent_element.contents
         if len(to_append) > 0:
             # Set the first child's previous_element and previous_sibling
             # to elements within the new parent
@@ -309,12 +311,19 @@ class Element(treebuilder_base.Node):
             if new_parents_last_child:
                 new_parents_last_child.next_sibling = first_child
 
-            # Fix the last child's next_element and next_sibling
-            last_child = to_append[-1]
-            last_child.next_element = new_parents_last_descendant_next_element
+            # Find the very last element being moved. It is now the
+            # parent's last descendant. It has no .next_sibling and
+            # its .next_element is whatever the previous last
+            # descendant had.
+            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
+
+            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
             if new_parents_last_descendant_next_element:
-                new_parents_last_descendant_next_element.previous_element = last_child
-            last_child.next_sibling = None
+                # TODO: This code has no test coverage and I'm not sure
+                # how to get html5lib to go through this path, but it's
+                # just the other side of the previous line.
+                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+            last_childs_last_descendant.next_sibling = None
 
             for child in to_append:
                 child.parent = new_parent_element
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 8e3cba6..2eb41b3 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -95,6 +95,23 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         assert space1.next_element is tbody1
         assert tbody2.next_element is space2
 
+    def test_reparented_markup_containing_children(self):
+        markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
+        soup = self.soup(markup)
+        noscript = soup.noscript
+        self.assertEqual("target", noscript.next_element)
+        target = soup.find(string='target')
+
+        # The 'aftermath' string was duplicated; we want the second one.
+        final_aftermath = soup.find_all(string='aftermath')[-1]
+        import pdb; pdb.set_trace()
+
+        # The <noscript> tag was moved beneath a copy of the <a> tag,
+        # but the 'target' string within is still connected to the
+        # (second) 'aftermath' string.
+        self.assertEqual(final_aftermath, target.next_element)
+        self.assertEqual(target, final_aftermath.previous_element)
+
     def test_processing_instruction(self):
         """Processing instructions become comments."""
         markup = b"""<?PITarget PIContent?>"""
```