From 49356a1f4d1ef2f56efdc791869706a01ea36bd1 Mon Sep 17 00:00:00 2001 From: Isaac Muse Date: Sat, 22 Dec 2018 14:25:35 -0700 Subject: Fix next and previous linkage issues. Fixes issues #1806598 and #1782928. --- bs4/__init__.py | 33 +++++++++++++++++++++++---------- bs4/builder/_html5lib.py | 10 +++++----- bs4/element.py | 14 +++++++------- bs4/tests/test_html5lib.py | 40 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 22 deletions(-) diff --git a/bs4/__init__.py b/bs4/__init__.py index 8c662f8..12308af 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -382,7 +382,7 @@ class BeautifulSoup(Tag): def pushTag(self, tag): #print "Push", tag.name - if self.currentTag: + if self.currentTag is not None: self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] @@ -421,15 +421,19 @@ class BeautifulSoup(Tag): def object_was_parsed(self, o, parent=None, most_recent_element=None): """Add an object to the parse tree.""" - parent = parent or self.currentTag - previous_element = most_recent_element or self._most_recent_element + if parent is None: + parent = self.currentTag + if most_recent_element is not None: + previous_element = most_recent_element + else: + previous_element = self._most_recent_element next_element = previous_sibling = next_sibling = None if isinstance(o, Tag): next_element = o.next_element next_sibling = o.next_sibling previous_sibling = o.previous_sibling - if not previous_element: + if previous_element is None: previous_element = o.previous_element o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) @@ -437,7 +441,7 @@ class BeautifulSoup(Tag): self._most_recent_element = o parent.contents.append(o) - if parent.next_sibling: + if parent.next_sibling is not None: # This node is being inserted into an element that has # already been parsed. Deal with any dangling references. index = len(parent.contents)-1 @@ -457,6 +461,15 @@ class BeautifulSoup(Tag): previous_sibling = None else: previous_element = previous_sibling = parent.contents[index-1] + previous = previous_element + while isinstance(previous, Tag): + if previous.contents: + previous.next_element = previous.contents[0] + previous = previous.contents[-1] + else: + break + previous_element = previous + if index == len(parent.contents)-1: next_element = parent.next_sibling next_sibling = None @@ -464,16 +477,16 @@ class BeautifulSoup(Tag): next_element = next_sibling = parent.contents[index+1] o.previous_element = previous_element - if previous_element: + if previous_element is not None: previous_element.next_element = o o.next_element = next_element - if next_element: + if next_element is not None: next_element.previous_element = o o.next_sibling = next_sibling - if next_sibling: + if next_sibling is not None: next_sibling.previous_sibling = o o.previous_sibling = previous_sibling - if previous_sibling: + if previous_sibling is not None: previous_sibling.next_sibling = o def _popToTag(self, name, nsprefix=None, inclusivePop=True): @@ -520,7 +533,7 @@ class BeautifulSoup(Tag): self.currentTag, self._most_recent_element) if tag is None: return tag - if self._most_recent_element: + if self._most_recent_element is not None: self._most_recent_element.next_element = tag self._most_recent_element = tag self.pushTag(tag) diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 5f54893..388c3d6 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -249,7 +249,7 @@ class Element(treebuilder_base.Node): if not isinstance(child, basestring) and child.parent is not None: node.element.extract() - if (string_child and self.element.contents + if (string_child is not None and self.element.contents and self.element.contents[-1].__class__ == NavigableString): # We are appending a string onto another string. # TODO This has O(n^2) performance, for input like @@ -360,16 +360,16 @@ class Element(treebuilder_base.Node): # Set the first child's previous_element and previous_sibling # to elements within the new parent first_child = to_append[0] - if new_parents_last_descendant: + if new_parents_last_descendant is not None: first_child.previous_element = new_parents_last_descendant else: first_child.previous_element = new_parent_element first_child.previous_sibling = new_parents_last_child - if new_parents_last_descendant: + if new_parents_last_descendant is not None: new_parents_last_descendant.next_element = first_child else: new_parent_element.next_element = first_child - if new_parents_last_child: + if new_parents_last_child is not None: new_parents_last_child.next_sibling = first_child # Find the very last element being moved. It is now the @@ -379,7 +379,7 @@ class Element(treebuilder_base.Node): last_childs_last_descendant = to_append[-1]._last_descendant(False, True) last_childs_last_descendant.next_element = new_parents_last_descendant_next_element - if new_parents_last_descendant_next_element: + if new_parents_last_descendant_next_element is not None: # TODO: This code has no test coverage and I'm not sure # how to get html5lib to go through this path, but it's # just the other side of the previous line. diff --git a/bs4/element.py b/bs4/element.py index 886eb91..8059c43 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -256,26 +256,26 @@ class PageElement(object): self.previous_element.next_element = self self.next_element = next_element - if self.next_element: + if self.next_element is not None: self.next_element.previous_element = self self.next_sibling = next_sibling - if self.next_sibling: + if self.next_sibling is not None: self.next_sibling.previous_sibling = self - if (not previous_sibling + if (previous_sibling is None and self.parent is not None and self.parent.contents): previous_sibling = self.parent.contents[-1] self.previous_sibling = previous_sibling - if previous_sibling: + if previous_sibling is not None: self.previous_sibling.next_sibling = self nextSibling = _alias("next_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3 def replace_with(self, replace_with): - if not self.parent: + if self.parent is None: raise ValueError( "Cannot replace one element with another when the" "element to be replaced is not part of a tree.") @@ -292,7 +292,7 @@ class PageElement(object): def unwrap(self): my_parent = self.parent - if not self.parent: + if self.parent is None: raise ValueError( "Cannot replace an element with its contents when that" "element is not part of a tree.") @@ -340,7 +340,7 @@ class PageElement(object): def _last_descendant(self, is_initialized=True, accept_self=True): "Finds the last element beneath this object to be parsed." - if is_initialized and self.next_sibling: + if is_initialized and self.next_sibling is not None: last_child = self.next_sibling.previous_element else: last_child = self diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index 0f89d62..3a04787 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -128,3 +128,43 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): markup = b"""A""" soup = self.soup(markup) self.assertEqual(u"A
", soup.body.decode()) + + def test_extraction(self): + """ + Test that extraction does not destroy the tree. + + https://bugs.launchpad.net/beautifulsoup/+bug/1782928 + """ + + markup = """ + +

hello

+""" + soup = self.soup(markup) + [s.extract() for s in soup('script')] + [s.extract() for s in soup('style')] + + self.assertEqual(len(soup.find_all("p")), 1) + + def test_empty_comment(self): + """ + Test that empty comment does not break structure. + + https://bugs.launchpad.net/beautifulsoup/+bug/1806598 + """ + + markup = """ + + +
+ +
+ + +""" + soup = self.soup(markup) + inputs = [] + for form in soup.find_all('form'): + inputs.extend(form.find_all('input')) + self.assertEqual(len(inputs), 1) -- cgit v1.2.1