summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2016-07-17 11:50:48 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2016-07-17 11:50:48 -0400
commit: d1365a072537d028e6513782298eb2a7334ee02c (patch)
tree: 49bfddcf1f5bcf4aa98b36b025bfb8cd0a69e81c
parent: 3cbfb6672f24c123fe52fcd6b28bd0a5478b0161 (diff)
download: beautifulsoup4-d1365a072537d028e6513782298eb2a7334ee02c.tar.gz
Fixed a bug in the html5lib treebuilder that deranged the tree
when a whitespace element was reparented into a tag that contained an identical whitespace element. [bug=1505351]
-rw-r--r-- NEWS.txt 4
-rw-r--r-- bs4/__init__.py 13
-rw-r--r-- bs4/tests/test_html5lib.py 11
3 files changed, 27 insertions, 1 deletions
diff --git a/NEWS.txt b/NEWS.txt
index eda6251..25795ce 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -3,6 +3,10 @@
* Beautiful Soup will now work with versions of html5lib greater than
0.99999999. [bug=1603299]
+* Fixed a bug in the html5lib treebuilder that deranged the tree
+ when a whitespace element was reparented into a tag that contained
+ an identical whitespace element. [bug=1505351]
+
* Corrected handling of XML processing instructions. [bug=1504393]
* The contents of <textarea> tags will no longer be modified when the
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 37993ce..003dccb 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -376,7 +376,18 @@ class BeautifulSoup(Tag):
if parent.next_sibling:
# This node is being inserted into an element that has
# already been parsed. Deal with any dangling references.
- index = parent.contents.index(o)
+ index = len(parent.contents)-1
+ while index >= 0:
+ if parent.contents[index] is o:
+ break
+ index -= 1
+ else:
+ raise ValueError(
+ "Error in html5lib tree builder: supposedly %r was "
+ "inserted into %r, but I don't see it!" % (
+ o, parent
+ )
+ )
if index == 0:
previous_element = parent
previous_sibling = None
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 65536c2..8e3cba6 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -84,6 +84,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p')))
+ def test_reparented_markup_containing_identical_whitespace_nodes(self):
+ """Verify that we keep the two whitespace nodes in this
+ document distinct when reparenting the adjacent <tbody> tags.
+ """
+ markup = '<table> <tbody><tbody><ims></tbody> </table>'
+ soup = self.soup(markup)
+ space1, space2 = soup.find_all(string=' ')
+ tbody1, tbody2 = soup.find_all('tbody')
+ assert space1.next_element is tbody1
+ assert tbody2.next_element is space2
+
def test_processing_instruction(self):
"""Processing instructions become comments."""
markup = b"""<?PITarget PIContent?>"""