summaryrefslogtreecommitdiff
path: root/bs4/__init__.py
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2018-12-30 20:31:33 -0500
committerLeonard Richardson <leonardr@segfault.org>2018-12-30 20:31:33 -0500
commitcc6de8c2b4bf4d41b7ab2a7f609e7493c9e0a859 (patch)
tree56c1a4ec1830db2d1cba303b5bdee33bbafdf329 /bs4/__init__.py
parent74971441d1f86411e5e72d834dd6fa87a6994007 (diff)
parentb2b0acea8040e7953f25592863611fbb81c7ff00 (diff)
downloadbeautifulsoup4-cc6de8c2b4bf4d41b7ab2a7f609e7493c9e0a859.tar.gz
Merging the linkage checker and html5lib fixes by Isaac Muse found in https://code.launchpad.net/~facelessuser/beautifulsoup/html5lib-fix/+merge/361282. [bug=1809910]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r--bs4/__init__.py143
1 files changed, 99 insertions, 44 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index db9ecf3..9b4a046 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -440,53 +440,108 @@ class BeautifulSoup(Tag):
self._most_recent_element = o
parent.contents.append(o)
- if parent.next_sibling is not None:
- # This node is being inserted into an element that has
- # already been parsed. Deal with any dangling references.
- index = len(parent.contents)-1
- while index >= 0:
- if parent.contents[index] is o:
- break
- index -= 1
+ # Check if we are inserting into an already parsed node.
+ if parent.next_element is not None:
+
+ # Check that links are proper across tag parent boundaries
+ child = self._linkage_fixer(parent)
+
+ def _linkage_fixer(self, el, _recursive_call=False):
+ """Make sure linkage of this fragment is sound."""
+ descendant = None
+
+ # If element is document element,
+ # it should have no previous element, previous sibling, or next sibling.
+ if el.parent is None:
+ if el.previous_element is not None:
+ el.previous_element = None
+ if el.previous_sibling is not None:
+ el.previous_element = None
+ if el.next_sibling is not None:
+ el.next_sibling = None
+
+ idx = 0
+ child = None
+ last_child = None
+ last_idx = len(el.contents) - 1
+ for child in el.contents:
+ descendant = None
+
+ # Parent should link next element to their first child
+ # That child should have no previous sibling
+ if idx == 0:
+ if el.parent is not None:
+ if el.next_element is not child:
+ el.next_element = child
+
+ if child.previous_element is not el:
+ child.previous_element = el
+
+ if child.previous_sibling is not None:
+ child.previous_sibling = None
+
+ # If not the first child, previous index should link as sibling to last index.
+ # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling).
else:
- raise ValueError(
- "Error building tree: supposedly %r was inserted "
- "into %r after the fact, but I don't see it!" % (
- o, parent
- )
- )
- if index == 0:
- previous_element = parent
- previous_sibling = None
+ if child.previous_sibling is not el.contents[idx - 1]:
+ child.previous_sibling = el.contents[idx - 1]
+ if el.contents[idx - 1].next_sibling is not child:
+ el.contents[idx - 1].next_sibling = child
+
+ if last_child is not None:
+ if child.previous_element is not last_child:
+ child.previous_element = last_child
+ if last_child.next_element is not child:
+ last_child.next_element = child
+
+ # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way
+ if isinstance(child, Tag) and child.contents:
+ descendant = self._linkage_fixer(child, True)
+ # A bubbled up descendant should have no next siblings
+ # as it is last in its content list.
+ if descendant.next_sibling is not None:
+ descendant.next_sibling = None
+
+ # Mark last child as either the bubbled up descendant or the current child
+ if descendant is not None:
+ last_child = descendant
else:
- previous_element = previous_sibling = parent.contents[index-1]
- previous = previous_element
- while isinstance(previous, Tag):
- if previous.contents:
- previous.next_element = previous.contents[0]
- previous = previous.contents[-1]
- else:
- break
- previous_element = previous
+ last_child = child
+
+ # If last child in list, there are no next siblings
+ if idx == last_idx:
+ if child.next_sibling is not None:
+ child.next_sibling = None
+ idx += 1
+
+ # The child to return is either the last descendant (if available)
+ # or the last processed child (if any). If neither is available,
+ # the parent element is its own last descendant.
+ child = descendant if descendant is not None else child
+ if child is None:
+ child = el
+
+ # If not a recursive call, we are done processing this element.
+ # As the final step, link last descendant. It should be linked
+ # to the parent's next sibling (if found), else walk up the chain
+ # and find a parent with a sibling.
+ if not _recursive_call and child is not None:
+ child.next_element = None
+ target = el
+ while True:
+ if target is None:
+ break
+ elif target.next_sibling is not None:
+ child.next_element = target.next_sibling
+ target.next_sibling.previous_element = child
+ break
+ target = target.parent
- if index == len(parent.contents)-1:
- next_element = parent.next_sibling
- next_sibling = None
- else:
- next_element = next_sibling = parent.contents[index+1]
-
- o.previous_element = previous_element
- if previous_element is not None:
- previous_element.next_element = o
- o.next_element = next_element
- if next_element is not None:
- next_element.previous_element = o
- o.next_sibling = next_sibling
- if next_sibling is not None:
- next_sibling.previous_sibling = o
- o.previous_sibling = previous_sibling
- if previous_sibling is not None:
- previous_sibling.next_sibling = o
+ # We are done, so nothing to return
+ return None
+ else:
+ # Return the child to the recursive caller
+ return child
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent