diff options
-rw-r--r-- | NEWS.txt | 2 | ||||
-rw-r--r-- | bs4/__init__.py | 143 | ||||
-rw-r--r-- | bs4/testing.py | 165 |
3 files changed, 265 insertions, 45 deletions
@@ -16,7 +16,7 @@ * Fix a number of problems with the tree builder that caused trees that were superficially okay, but which fell apart when bits - were extracted. Patch by Isaac Muse. [bug=1782928] + were extracted. Patch by Isaac Muse. [bug=1782928,1809910] * Fixed a problem with the tree builder in which elements that contained no content (such as empty comments and all-whitespace diff --git a/bs4/__init__.py b/bs4/__init__.py index db9ecf3..9b4a046 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -440,53 +440,108 @@ class BeautifulSoup(Tag): self._most_recent_element = o parent.contents.append(o) - if parent.next_sibling is not None: - # This node is being inserted into an element that has - # already been parsed. Deal with any dangling references. - index = len(parent.contents)-1 - while index >= 0: - if parent.contents[index] is o: - break - index -= 1 + # Check if we are inserting into an already parsed node. + if parent.next_element is not None: + + # Check that links are proper across tag parent boundaries + child = self._linkage_fixer(parent) + + def _linkage_fixer(self, el, _recursive_call=False): + """Make sure linkage of this fragment is sound.""" + descendant = None + + # If element is document element, + # it should have no previous element, previous sibling, or next sibling. + if el.parent is None: + if el.previous_element is not None: + el.previous_element = None + if el.previous_sibling is not None: + el.previous_element = None + if el.next_sibling is not None: + el.next_sibling = None + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + if el.next_element is not child: + el.next_element = child + + if child.previous_element is not el: + child.previous_element = el + + if child.previous_sibling is not None: + child.previous_sibling = None + + # If not the first child, previous index should link as sibling to last index. + # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling). else: - raise ValueError( - "Error building tree: supposedly %r was inserted " - "into %r after the fact, but I don't see it!" % ( - o, parent - ) - ) - if index == 0: - previous_element = parent - previous_sibling = None + if child.previous_sibling is not el.contents[idx - 1]: + child.previous_sibling = el.contents[idx - 1] + if el.contents[idx - 1].next_sibling is not child: + el.contents[idx - 1].next_sibling = child + + if last_child is not None: + if child.previous_element is not last_child: + child.previous_element = last_child + if last_child.next_element is not child: + last_child.next_element = child + + # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way + if isinstance(child, Tag) and child.contents: + descendant = self._linkage_fixer(child, True) + # A bubbled up descendant should have no next siblings + # as it is last in its content list. + if descendant.next_sibling is not None: + descendant.next_sibling = None + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant else: - previous_element = previous_sibling = parent.contents[index-1] - previous = previous_element - while isinstance(previous, Tag): - if previous.contents: - previous.next_element = previous.contents[0] - previous = previous.contents[-1] - else: - break - previous_element = previous + last_child = child + + # If last child in list, there are no next siblings + if idx == last_idx: + if child.next_sibling is not None: + child.next_sibling = None + idx += 1 + + # The child to return is either the last descendant (if available) + # or the last processed child (if any). If neither is available, + # the parent element is its own last descendant. + child = descendant if descendant is not None else child + if child is None: + child = el + + # If not a recursive call, we are done processing this element. + # As the final step, link last descendant. It should be linked + # to the parent's next sibling (if found), else walk up the chain + # and find a parent with a sibling. + if not _recursive_call and child is not None: + child.next_element = None + target = el + while True: + if target is None: + break + elif target.next_sibling is not None: + child.next_element = target.next_sibling + target.next_sibling.previous_element = child + break + target = target.parent - if index == len(parent.contents)-1: - next_element = parent.next_sibling - next_sibling = None - else: - next_element = next_sibling = parent.contents[index+1] - - o.previous_element = previous_element - if previous_element is not None: - previous_element.next_element = o - o.next_element = next_element - if next_element is not None: - next_element.previous_element = o - o.next_sibling = next_sibling - if next_sibling is not None: - next_sibling.previous_sibling = o - o.previous_sibling = previous_sibling - if previous_sibling is not None: - previous_sibling.next_sibling = o + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent diff --git a/bs4/testing.py b/bs4/testing.py index 13ff63f..9598f31 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -16,11 +16,48 @@ from bs4.element import ( ContentMetaAttributeValue, Doctype, SoupStrainer, + Tag ) from bs4.builder import HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder +BAD_DOCUMENT = u"""A bare string +<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"> +<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd"> +<div><![CDATA[A CDATA section where it doesn't belong]]></div> +<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div> +<div>A <meta> tag</div> +<div>A <br> tag that supposedly has contents.</br></div> +<div>AT&T</div> +<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div> +<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div> +<div>This numeric entity is missing the final semicolon: <x t="piñata"></div> +<div><a href="http://example.com/</a> that attribute value never got closed</div> +<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div> +<! This document starts with a bogus declaration ><div>a</div> +<div>This document contains <!an incomplete declaration <div>(do you see it?)</div> +<div>This document ends with <!an incomplete declaration +<div><a style={height:21px;}>That attribute value was bogus</a></div> +<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace +<div><table><td nowrap>That boolean attribute had no value</td></table></div> +<div>Here's a nonexistent entity: &#foo; (do you see it?)</div> +<div>This document ends before the entity finishes: > +<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p> +<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b> +<div><table><tr><td>Here's a table</td></tr></table></div> +<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div> +<div>This tag contains nothing but whitespace: <b> </b></div> +<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div> +<div><table><div>This table contains bare markup</div></table></div> +<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div> +<div>This document contains a <!DOCTYPE surprise>surprise doctype</div> +<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div> +<div><our\u2603>Tag name contains Unicode characters</our\u2603></div> +<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +""" + class SoupTest(unittest.TestCase): @@ -59,6 +96,121 @@ class SoupTest(unittest.TestCase): self.assertEqual(earlier, e.previous_element) earlier = e + def linkage_validator(self, el, _recursive_call=False): + """Ensure proper linkage throughout the document.""" + descendant = None + # Document element should have no previous element or previous sibling. + # It also shouldn't have a next sibling. + if el.parent is None: + assert el.previous_element is None,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_element, None + ) + assert el.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_sibling, None + ) + assert el.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_sibling, None + ) + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + assert el.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_element, child + ) + assert child.previous_element is el,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + child, child.previous_element, el + ) + assert child.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( + child, child.previous_sibling, None + ) + + # If not the first child, previous index should link as sibling to this index + # Previous element should match the last index or the last bubbled up descendant + else: + assert child.previous_sibling is el.contents[idx - 1],\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( + child, child.previous_sibling, el.contents[idx - 1] + ) + assert el.contents[idx - 1].next_sibling is child,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + el.contents[idx - 1], el.contents[idx - 1].next_sibling, child + ) + + if last_child is not None: + assert child.previous_element is last_child,\ + "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( + child, child.previous_element, last_child, child.parent.contents + ) + assert last_child.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + last_child, last_child.next_element, child + ) + + if isinstance(child, Tag) and child.contents: + descendant = self.linkage_validator(child, True) + # A bubbled up descendant should have no next siblings + assert descendant.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + descendant, descendant.next_sibling, None + ) + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant + else: + last_child = child + + # If last child, there are non next siblings + if idx == last_idx: + assert child.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_sibling, None + ) + idx += 1 + + child = descendant if descendant is not None else child + if child is None: + child = el + + if not _recursive_call and child is not None: + target = el + while True: + if target is None: + assert child.next_element is None, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, None + ) + break + elif target.next_sibling is not None: + assert child.next_element is target.next_sibling, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, target.next_sibling + ) + break + target = target.parent + + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child + + class HTMLTreeBuilderSmokeTest(object): """A basic test of a treebuilder's competence. @@ -614,6 +766,13 @@ Hello, world! data.a['foo'] = 'bar' self.assertEqual('<a foo="bar">text</a>', data.a.decode()) + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + class XMLTreeBuilderSmokeTest(object): def test_pickle_and_unpickle_identity(self): @@ -760,6 +919,12 @@ class XMLTreeBuilderSmokeTest(object): # The two tags have the same namespace prefix. self.assertEqual(tag.prefix, duplicate.prefix) + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" |