diff options
author | Leonard Richardson <leonardr@segfault.org> | 2015-06-26 07:06:55 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2015-06-26 07:06:55 -0400 |
commit | 800d1971dcbdc6316a013a4c6ce86e8c18d48dca (patch) | |
tree | bb0f4f28db26827247b60a23b6b1fa2965e82248 | |
parent | fc32a6eb0fe0e981b4f41362b97576099b8c4a4e (diff) | |
download | beautifulsoup4-800d1971dcbdc6316a013a4c6ce86e8c18d48dca.tar.gz |
Added a sanity check helper method that makes sure all the elements of a tree are properly connected via .next_element and .previous_element.
-rw-r--r-- | bs4/builder/_html5lib.py | 12 | ||||
-rw-r--r-- | bs4/dammit.py | 3 | ||||
-rw-r--r-- | bs4/testing.py | 15 |
3 files changed, 21 insertions, 9 deletions
```diff
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index ad3c6ef..0778dde 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -236,9 +236,9 @@ class Element(html5lib.treebuilders._base.Node):
 
     def reparentChildren(self, new_parent):
         """Move all of this tag's children into another tag."""
-        #print "MOVE", self.element.contents
-        #print "FROM", self.element
-        #print "TO", new_parent.element
+        # print "MOVE", self.element.contents
+        # print "FROM", self.element
+        # print "TO", new_parent.element
         element = self.element
         new_parent_element = new_parent.element
         # Determine what this tag's next_element will be once all the children
@@ -289,9 +289,9 @@ class Element(html5lib.treebuilders._base.Node):
         element.contents = []
         element.next_element = final_next_element
 
-        #print "DONE WITH MOVE"
-        #print "FROM", self.element
-        #print "TO", new_parent_element
+        # print "DONE WITH MOVE"
+        # print "FROM", self.element
+        # print "TO", new_parent_element
 
     def cloneNode(self):
         tag = self.soup.new_tag(self.element.name, self.namespace)
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 68ed81f..7ced3a5 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -3,10 +3,11 @@
 
 This library converts a bytestream to Unicode through any means
 necessary. It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It works best on XML and XML, but it does not rewrite the
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
 
+from pdb import set_trace
 import codecs
 from htmlentitydefs import codepoint2name
 import re
diff --git a/bs4/testing.py b/bs4/testing.py
index 8ca3878..7232513 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -43,6 +43,16 @@ class SoupTest(unittest.TestCase):
 
         self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
 
+    def assertConnectedness(self, element):
+        """Ensure that next_element and previous_element are properly
+        set for all descendants of the given element.
+        """
+        earlier = None
+        for e in element.descendants:
+            if earlier:
+                self.assertEqual(e, earlier.next_element)
+                self.assertEqual(earlier, e.previous_element)
+            earlier = e
 
 class HTMLTreeBuilderSmokeTest(object):
 
@@ -283,6 +293,7 @@ Hello, world!
         soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
         self.assertEqual("p", soup.h2.string.next_element.name)
         self.assertEqual("p", soup.p.name)
+        self.assertConnectedness(soup)
 
     def test_head_tag_between_head_and_body(self):
         "Prevent recurrence of a bug in the html5lib treebuilder."
@@ -293,6 +304,7 @@ Hello, world!
 """
         soup = self.soup(content)
         self.assertNotEqual(None, soup.html.body)
+        self.assertConnectedness(soup)
 
     def test_multiple_copies_of_a_tag(self):
         "Prevent recurrence of a bug in the html5lib treebuilder."
@@ -309,8 +321,7 @@ Hello, world!
 </html>
 """
         soup = self.soup(content)
-        [x for x in soup.article.descendants]
-
+        self.assertConnectedness(soup.article)
 
     def test_basic_namespaces(self):
         """Parsers don't need to *understand* namespaces, but at the
```