diff options
author | Leonard Richardson <leonardr@segfault.org> | 2016-07-17 11:50:48 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2016-07-17 11:50:48 -0400 |
commit | d1365a072537d028e6513782298eb2a7334ee02c (patch) | |
tree | 49bfddcf1f5bcf4aa98b36b025bfb8cd0a69e81c | |
parent | 3cbfb6672f24c123fe52fcd6b28bd0a5478b0161 (diff) | |
download | beautifulsoup4-d1365a072537d028e6513782298eb2a7334ee02c.tar.gz |
Fixed a bug in the html5lib treebuilder that deranged the tree
when a whitespace element was reparented into a tag that contained
an identical whitespace element. [bug=1505351]
-rw-r--r-- | NEWS.txt | 4 | ||||
-rw-r--r-- | bs4/__init__.py | 13 | ||||
-rw-r--r-- | bs4/tests/test_html5lib.py | 11 |
3 files changed, 27 insertions, 1 deletion
@@ -3,6 +3,10 @@
 * Beautiful Soup will now work with versions of html5lib greater than
   0.99999999. [bug=1603299]
 
+* Fixed a bug in the html5lib treebuilder that deranged the tree
+  when a whitespace element was reparented into a tag that contained
+  an identical whitespace element. [bug=1505351]
+
 * Corrected handling of XML processing instructions. [bug=1504393]
 
 * The contents of <textarea> tags will no longer be modified when the
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 37993ce..003dccb 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -376,7 +376,18 @@ class BeautifulSoup(Tag):
         if parent.next_sibling:
             # This node is being inserted into an element that has
             # already been parsed. Deal with any dangling references.
-            index = parent.contents.index(o)
+            index = len(parent.contents)-1
+            while index >= 0:
+                if parent.contents[index] is o:
+                    break
+                index -= 1
+            else:
+                raise ValueError(
+                    "Error in html5lib tree builder: supposedly %r was "
+                    "inserted into %r, but I don't see it!" % (
+                        o, parent
+                    )
+                )
             if index == 0:
                 previous_element = parent
                 previous_sibling = None
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 65536c2..8e3cba6 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -84,6 +84,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
         self.assertEqual(2, len(soup.find_all('p')))
 
+    def test_reparented_markup_containing_identical_whitespace_nodes(self):
+        """Verify that we keep the two whitespace nodes in this
+        document distinct when reparenting the adjacent <tbody> tags.
+        """
+        markup = '<table> <tbody><tbody><ims></tbody> </table>'
+        soup = self.soup(markup)
+        space1, space2 = soup.find_all(string=' ')
+        tbody1, tbody2 = soup.find_all('tbody')
+        assert space1.next_element is tbody1
+        assert tbody2.next_element is space2
+
     def test_processing_instruction(self):
         """Processing instructions become comments."""
         markup = b"""<?PITarget PIContent?>"""