summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2015-06-26 07:06:55 -0400
committer Leonard Richardson <leonardr@segfault.org> 2015-06-26 07:06:55 -0400
commit 800d1971dcbdc6316a013a4c6ce86e8c18d48dca (patch)
tree bb0f4f28db26827247b60a23b6b1fa2965e82248
parent fc32a6eb0fe0e981b4f41362b97576099b8c4a4e (diff)
download beautifulsoup4-800d1971dcbdc6316a013a4c6ce86e8c18d48dca.tar.gz
Added a sanity check helper method that makes sure all the elements of a tree are properly connected via .next_element and .previous_element.
-rw-r--r-- bs4/builder/_html5lib.py 12
-rw-r--r-- bs4/dammit.py 3
-rw-r--r-- bs4/testing.py 15
3 files changed, 21 insertions, 9 deletions
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index ad3c6ef..0778dde 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -236,9 +236,9 @@ class Element(html5lib.treebuilders._base.Node):
def reparentChildren(self, new_parent):
"""Move all of this tag's children into another tag."""
- #print "MOVE", self.element.contents
- #print "FROM", self.element
- #print "TO", new_parent.element
+ # print "MOVE", self.element.contents
+ # print "FROM", self.element
+ # print "TO", new_parent.element
element = self.element
new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children
@@ -289,9 +289,9 @@ class Element(html5lib.treebuilders._base.Node):
element.contents = []
element.next_element = final_next_element
- #print "DONE WITH MOVE"
- #print "FROM", self.element
- #print "TO", new_parent_element
+ # print "DONE WITH MOVE"
+ # print "FROM", self.element
+ # print "TO", new_parent_element
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 68ed81f..7ced3a5 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -3,10 +3,11 @@
This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It works best on XML and XML, but it does not rewrite the
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
+from pdb import set_trace
import codecs
from htmlentitydefs import codepoint2name
import re
diff --git a/bs4/testing.py b/bs4/testing.py
index 8ca3878..7232513 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -43,6 +43,16 @@ class SoupTest(unittest.TestCase):
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
+    def assertConnectedness(self, element):
+        """Assert that each pair of consecutive items yielded by
+        element.descendants is linked via .next_element/.previous_element.
+        """
+        earlier = None
+        for e in element.descendants:
+            if earlier is not None:  # compare to None so a falsy node is not skipped
+                self.assertEqual(e, earlier.next_element)
+                self.assertEqual(earlier, e.previous_element)
+            earlier = e
class HTMLTreeBuilderSmokeTest(object):
@@ -283,6 +293,7 @@ Hello, world!
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name)
+ self.assertConnectedness(soup)
def test_head_tag_between_head_and_body(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
@@ -293,6 +304,7 @@ Hello, world!
"""
soup = self.soup(content)
self.assertNotEqual(None, soup.html.body)
+ self.assertConnectedness(soup)
def test_multiple_copies_of_a_tag(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
@@ -309,8 +321,7 @@ Hello, world!
</html>
"""
soup = self.soup(content)
- [x for x in soup.article.descendants]
-
+ self.assertConnectedness(soup.article)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the