From e7f79c9408cfaa43b5250c0f29fb0f0e8f772f73 Mon Sep 17 00:00:00 2001 From: Isaac Muse Date: Tue, 25 Dec 2018 15:25:24 -0700 Subject: Ensure html5lib always has valid internal linkage html5lib, with malformed HTML, can end up with detached linkage internally. Improve the current code to ensure html5lib always has proper linkage. --- bs4/__init__.py | 143 +++++++++++++++++++++++++++++++++--------------- bs4/testing.py | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+), 44 deletions(-) diff --git a/bs4/__init__.py b/bs4/__init__.py index ea9d9eb..e29d28e 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -442,53 +442,108 @@ class BeautifulSoup(Tag): self._most_recent_element = o parent.contents.append(o) - if parent.next_sibling is not None: - # This node is being inserted into an element that has - # already been parsed. Deal with any dangling references. - index = len(parent.contents)-1 - while index >= 0: - if parent.contents[index] is o: - break - index -= 1 + # Check if we are inserting into an already parsed node. + if parent.next_element is not None: + + # Check that links are proper across tag parent boundaries + child = self._linkage_fixer(parent) + + def _linkage_fixer(self, el, _recursive_call=False): + """Make sure linkage of this fragment is sound.""" + descendant = None + + # If element is document element, + # it should have no previous element, previous sibling, or next sibling. + if el.parent is None: + if el.previous_element is not None: + el.previous_element = None + if el.previous_sibling is not None: + el.previous_element = None + if el.next_sibling is not None: + el.next_sibling = None + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + if el.next_element is not child: + el.next_element = child + + if child.previous_element is not el: + child.previous_element = el + + if child.previous_sibling is not None: + child.previous_sibling = None + + # If not the first child, previous index should link as sibling to last index. + # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling). else: - raise ValueError( - "Error building tree: supposedly %r was inserted " - "into %r after the fact, but I don't see it!" % ( - o, parent - ) - ) - if index == 0: - previous_element = parent - previous_sibling = None + if child.previous_sibling is not el.contents[idx - 1]: + child.previous_sibling = el.contents[idx - 1] + if el.contents[idx - 1].next_sibling is not child: + el.contents[idx - 1].next_sibling = child + + if last_child is not None: + if child.previous_element is not last_child: + child.previous_element = last_child + if last_child.next_element is not child: + last_child.next_element = child + + # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way + if isinstance(child, Tag) and child.contents: + descendant = self._linkage_fixer(child, True) + # A bubbled up descendant should have no next siblings + # as it is last in its content list. + if descendant.next_sibling is not None: + descendant.next_sibling = None + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant else: - previous_element = previous_sibling = parent.contents[index-1] - previous = previous_element - while isinstance(previous, Tag): - if previous.contents: - previous.next_element = previous.contents[0] - previous = previous.contents[-1] - else: - break - previous_element = previous + last_child = child + + # If last child in list, there are no next siblings + if idx == last_idx: + if child.next_sibling is not None: + child.next_sibling = None + idx += 1 + + # The child to return is either the last descendant (if available) + # or the last processed child (if any). If neither is available, + # the parent element is its own last descendant. + child = descendant if descendant is not None else child + if child is None: + child = el + + # If not a recursive call, we are done processing this element. + # As the final step, link last descendant. It should be linked + # to the parent's next sibling (if found), else walk up the chain + # and find a parent with a sibling. + if not _recursive_call and child is not None: + child.next_element = None + target = el + while True: + if target is None: + break + elif target.next_sibling is not None: + child.next_element = target.next_sibling + target.next_sibling.previous_element = child + break + target = target.parent - if index == len(parent.contents)-1: - next_element = parent.next_sibling - next_sibling = None - else: - next_element = next_sibling = parent.contents[index+1] - - o.previous_element = previous_element - if previous_element is not None: - previous_element.next_element = o - o.next_element = next_element - if next_element is not None: - next_element.previous_element = o - o.next_sibling = next_sibling - if next_sibling is not None: - next_sibling.previous_sibling = o - o.previous_sibling = previous_sibling - if previous_sibling is not None: - previous_sibling.next_sibling = o + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent diff --git a/bs4/testing.py b/bs4/testing.py index 745a9c4..18b985c 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -17,11 +17,48 @@ from bs4.element import ( ContentMetaAttributeValue, Doctype, SoupStrainer, + Tag ) from bs4.builder import HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder +BAD_DOCUMENT = u"""A bare string + + +
+
HTML5 does allow CDATA sections in SVG
+
A tag
+
A
tag that supposedly has contents.
+
AT&T
+
+
+
This numeric entity is missing the final semicolon:
+
, that attribute value was closed by the subsequent tag
+
a
+
This document contains (do you see it?)
+
This document ends with That attribute value was bogus
+The doctype is invalid because it contains extra whitespace +
That boolean attribute had no value
+
Here's a nonexistent entity: &#foo; (do you see it?)
+
This document ends before the entity finishes: > +

Paragraphs shouldn't contain block display elements, but this one does:

you see?

+Multiple values for the same attribute. +
Here's a table
+
+
This tag contains nothing but whitespace:
+

This p tag is cut off by

the end of the blockquote tag
+
Here's a nested table:
foo
This table contains bare markup
+ +
This document contains a surprise doctype
+ +
Tag name contains Unicode characters
+ + +""" + class SoupTest(unittest.TestCase): @@ -60,6 +97,123 @@ class SoupTest(unittest.TestCase): self.assertEqual(earlier, e.previous_element) earlier = e + def linkage_validator(self, el, _recursive_call=False): + """Ensure proper linkage throughout the document.""" + descendant = None + # Document element should have no previous element or previous sibling. + # It also shouldn't have a next sibling. + if el.parent is None: + assert el.previous_element is None,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_element, None + ) + assert el.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_sibling, None + ) + assert el.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_sibling, None + ) + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + assert el.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_element, child + ) + assert child.previous_element is el,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + child, child.previous_element, el + ) + assert child.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( + child, child.previous_sibling, None + ) + + # If not the first child, previous index should link as sibling to this index + # Previous element should match the last index or the last bubbled up descendant + else: + assert child.previous_sibling is el.contents[idx - 1],\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( + child, child.previous_sibling, el.contents[idx - 1] + ) + assert el.contents[idx - 1].next_sibling is child,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + el.contents[idx - 1], el.contents[idx - 1].next_sibling, child + ) + + if last_child is not None: + assert child.previous_element is last_child,\ + "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( + child, child.previous_element, last_child, child.parent.contents + ) + assert last_child.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + last_child, last_child.next_element, child + ) + + if isinstance(child, Tag) and child.contents: + descendant = self.linkage_validator(child, True) + # A bubbled up descendant should have no next siblings + assert descendant.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + descendant, descendant.next_sibling, None + ) + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant + else: + last_child = child + + # If last child, there are non next siblings + if idx == last_idx: + assert child.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_sibling, None + ) + idx += 1 + + child = descendant if descendant is not None else child + if child is None: + child = el + + if not _recursive_call and child is not None: + target = el + while True: + if target is None: + assert child.next_element is None, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, None + ) + break + elif target.next_sibling is not None: + assert child.next_element is target.next_sibling, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, target.next_sibling + ) + break + target = target.parent + + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child + + return descendant if descendant is not None else child + + class HTMLTreeBuilderSmokeTest(object): """A basic test of a treebuilder's competence. @@ -615,6 +769,13 @@ Hello, world! data.a['foo'] = 'bar' self.assertEqual('text', data.a.decode()) + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + class XMLTreeBuilderSmokeTest(object): def test_pickle_and_unpickle_identity(self): @@ -761,6 +922,12 @@ class XMLTreeBuilderSmokeTest(object): # The two tags have the same namespace prefix. self.assertEqual(tag.prefix, duplicate.prefix) + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" -- cgit v1.2.1 From b2b0acea8040e7953f25592863611fbb81c7ff00 Mon Sep 17 00:00:00 2001 From: Isaac Muse Date: Wed, 26 Dec 2018 15:10:58 -0700 Subject: Remove dead line of code --- bs4/testing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bs4/testing.py b/bs4/testing.py index 18b985c..dee214c 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -211,8 +211,6 @@ class SoupTest(unittest.TestCase): # Return the child to the recursive caller return child - return descendant if descendant is not None else child - class HTMLTreeBuilderSmokeTest(object): -- cgit v1.2.1