summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIsaac Muse <isaacmuse@gmail.com>2018-12-25 15:25:24 -0700
committerIsaac Muse <isaacmuse@gmail.com>2018-12-25 15:25:24 -0700
commite7f79c9408cfaa43b5250c0f29fb0f0e8f772f73 (patch)
treed6f17244294b98cc123c04390d8016e636834751
parent5b5540581396b07404018664f0954f2d13377753 (diff)
downloadbeautifulsoup4-e7f79c9408cfaa43b5250c0f29fb0f0e8f772f73.tar.gz
Ensure html5lib always has valid internal linkage
html5lib, with malformed HTML, can end up with detached linkage internally. Improve the current code to ensure html5lib always has proper linkage.
-rw-r--r--bs4/__init__.py143
-rw-r--r--bs4/testing.py167
2 files changed, 266 insertions, 44 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index ea9d9eb..e29d28e 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -442,53 +442,108 @@ class BeautifulSoup(Tag):
self._most_recent_element = o
parent.contents.append(o)
- if parent.next_sibling is not None:
- # This node is being inserted into an element that has
- # already been parsed. Deal with any dangling references.
- index = len(parent.contents)-1
- while index >= 0:
- if parent.contents[index] is o:
- break
- index -= 1
+ # Check if we are inserting into an already parsed node.
+ if parent.next_element is not None:
+
+ # Check that links are proper across tag parent boundaries
+ child = self._linkage_fixer(parent)
+
+ def _linkage_fixer(self, el, _recursive_call=False):
+ """Make sure linkage of this fragment is sound."""
+ descendant = None
+
+ # If element is document element,
+ # it should have no previous element, previous sibling, or next sibling.
+ if el.parent is None:
+ if el.previous_element is not None:
+ el.previous_element = None
+ if el.previous_sibling is not None:
+ el.previous_element = None
+ if el.next_sibling is not None:
+ el.next_sibling = None
+
+ idx = 0
+ child = None
+ last_child = None
+ last_idx = len(el.contents) - 1
+ for child in el.contents:
+ descendant = None
+
+ # Parent should link next element to their first child
+ # That child should have no previous sibling
+ if idx == 0:
+ if el.parent is not None:
+ if el.next_element is not child:
+ el.next_element = child
+
+ if child.previous_element is not el:
+ child.previous_element = el
+
+ if child.previous_sibling is not None:
+ child.previous_sibling = None
+
+ # If not the first child, previous index should link as sibling to last index.
+ # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling).
else:
- raise ValueError(
- "Error building tree: supposedly %r was inserted "
- "into %r after the fact, but I don't see it!" % (
- o, parent
- )
- )
- if index == 0:
- previous_element = parent
- previous_sibling = None
+ if child.previous_sibling is not el.contents[idx - 1]:
+ child.previous_sibling = el.contents[idx - 1]
+ if el.contents[idx - 1].next_sibling is not child:
+ el.contents[idx - 1].next_sibling = child
+
+ if last_child is not None:
+ if child.previous_element is not last_child:
+ child.previous_element = last_child
+ if last_child.next_element is not child:
+ last_child.next_element = child
+
+ # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way
+ if isinstance(child, Tag) and child.contents:
+ descendant = self._linkage_fixer(child, True)
+ # A bubbled up descendant should have no next siblings
+ # as it is last in its content list.
+ if descendant.next_sibling is not None:
+ descendant.next_sibling = None
+
+ # Mark last child as either the bubbled up descendant or the current child
+ if descendant is not None:
+ last_child = descendant
else:
- previous_element = previous_sibling = parent.contents[index-1]
- previous = previous_element
- while isinstance(previous, Tag):
- if previous.contents:
- previous.next_element = previous.contents[0]
- previous = previous.contents[-1]
- else:
- break
- previous_element = previous
+ last_child = child
+
+ # If last child in list, there are no next siblings
+ if idx == last_idx:
+ if child.next_sibling is not None:
+ child.next_sibling = None
+ idx += 1
+
+ # The child to return is either the last descendant (if available)
+ # or the last processed child (if any). If neither is available,
+ # the parent element is its own last descendant.
+ child = descendant if descendant is not None else child
+ if child is None:
+ child = el
+
+ # If not a recursive call, we are done processing this element.
+ # As the final step, link last descendant. It should be linked
+ # to the parent's next sibling (if found), else walk up the chain
+ # and find a parent with a sibling.
+ if not _recursive_call and child is not None:
+ child.next_element = None
+ target = el
+ while True:
+ if target is None:
+ break
+ elif target.next_sibling is not None:
+ child.next_element = target.next_sibling
+ target.next_sibling.previous_element = child
+ break
+ target = target.parent
- if index == len(parent.contents)-1:
- next_element = parent.next_sibling
- next_sibling = None
- else:
- next_element = next_sibling = parent.contents[index+1]
-
- o.previous_element = previous_element
- if previous_element is not None:
- previous_element.next_element = o
- o.next_element = next_element
- if next_element is not None:
- next_element.previous_element = o
- o.next_sibling = next_sibling
- if next_sibling is not None:
- next_sibling.previous_sibling = o
- o.previous_sibling = previous_sibling
- if previous_sibling is not None:
- previous_sibling.next_sibling = o
+ # We are done, so nothing to return
+ return None
+ else:
+ # Return the child to the recursive caller
+ return child
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
diff --git a/bs4/testing.py b/bs4/testing.py
index 745a9c4..18b985c 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -17,11 +17,48 @@ from bs4.element import (
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
+ Tag
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
+BAD_DOCUMENT = u"""A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
+<div>This document ends before the entity finishes: &gt
+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
+<div><table><tr><td>Here's a table</td></tr></table></div>
+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
+<div>This tag contains nothing but whitespace: <b> </b></div>
+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
+<div><table><div>This table contains bare markup</div></table></div>
+<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
+<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
+<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+"""
+
class SoupTest(unittest.TestCase):
@@ -60,6 +97,123 @@ class SoupTest(unittest.TestCase):
self.assertEqual(earlier, e.previous_element)
earlier = e
+ def linkage_validator(self, el, _recursive_call=False):
+ """Ensure proper linkage throughout the document."""
+ descendant = None
+ # Document element should have no previous element or previous sibling.
+ # It also shouldn't have a next sibling.
+ if el.parent is None:
+ assert el.previous_element is None,\
+ "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ el, el.previous_element, None
+ )
+ assert el.previous_sibling is None,\
+ "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ el, el.previous_sibling, None
+ )
+ assert el.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+ el, el.next_sibling, None
+ )
+
+ idx = 0
+ child = None
+ last_child = None
+ last_idx = len(el.contents) - 1
+ for child in el.contents:
+ descendant = None
+
+ # Parent should link next element to their first child
+ # That child should have no previous sibling
+ if idx == 0:
+ if el.parent is not None:
+ assert el.next_element is child,\
+ "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+ el, el.next_element, child
+ )
+ assert child.previous_element is el,\
+ "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ child, child.previous_element, el
+ )
+ assert child.previous_sibling is None,\
+ "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
+ child, child.previous_sibling, None
+ )
+
+ # If not the first child, previous index should link as sibling to this index
+ # Previous element should match the last index or the last bubbled up descendant
+ else:
+ assert child.previous_sibling is el.contents[idx - 1],\
+ "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
+ child, child.previous_sibling, el.contents[idx - 1]
+ )
+ assert el.contents[idx - 1].next_sibling is child,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
+ )
+
+ if last_child is not None:
+ assert child.previous_element is last_child,\
+ "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
+ child, child.previous_element, last_child, child.parent.contents
+ )
+ assert last_child.next_element is child,\
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ last_child, last_child.next_element, child
+ )
+
+ if isinstance(child, Tag) and child.contents:
+ descendant = self.linkage_validator(child, True)
+ # A bubbled up descendant should have no next siblings
+ assert descendant.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ descendant, descendant.next_sibling, None
+ )
+
+ # Mark last child as either the bubbled up descendant or the current child
+ if descendant is not None:
+ last_child = descendant
+ else:
+ last_child = child
+
+ # If last child, there are non next siblings
+ if idx == last_idx:
+ assert child.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_sibling, None
+ )
+ idx += 1
+
+ child = descendant if descendant is not None else child
+ if child is None:
+ child = el
+
+ if not _recursive_call and child is not None:
+ target = el
+ while True:
+ if target is None:
+ assert child.next_element is None, \
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_element, None
+ )
+ break
+ elif target.next_sibling is not None:
+ assert child.next_element is target.next_sibling, \
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_element, target.next_sibling
+ )
+ break
+ target = target.parent
+
+ # We are done, so nothing to return
+ return None
+ else:
+ # Return the child to the recursive caller
+ return child
+
+ return descendant if descendant is not None else child
+
+
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
@@ -615,6 +769,13 @@ Hello, world!
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
+ def test_worst_case(self):
+ """Test the worst case (currently) for linking issues."""
+
+ soup = self.soup(BAD_DOCUMENT)
+ self.linkage_validator(soup)
+
+
class XMLTreeBuilderSmokeTest(object):
def test_pickle_and_unpickle_identity(self):
@@ -761,6 +922,12 @@ class XMLTreeBuilderSmokeTest(object):
# The two tags have the same namespace prefix.
self.assertEqual(tag.prefix, duplicate.prefix)
+ def test_worst_case(self):
+ """Test the worst case (currently) for linking issues."""
+
+ soup = self.soup(BAD_DOCUMENT)
+ self.linkage_validator(soup)
+
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""