diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-12-30 20:31:33 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-12-30 20:31:33 -0500 |
commit | cc6de8c2b4bf4d41b7ab2a7f609e7493c9e0a859 (patch) | |
tree | 56c1a4ec1830db2d1cba303b5bdee33bbafdf329 /bs4/__init__.py | |
parent | 74971441d1f86411e5e72d834dd6fa87a6994007 (diff) | |
parent | b2b0acea8040e7953f25592863611fbb81c7ff00 (diff) | |
download | beautifulsoup4-cc6de8c2b4bf4d41b7ab2a7f609e7493c9e0a859.tar.gz |
Merging the linkage checker and html5lib fixes by Isaac Muse found in https://code.launchpad.net/~facelessuser/beautifulsoup/html5lib-fix/+merge/361282. [bug=1809910]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- | bs4/__init__.py | 143 |
1 files changed, 99 insertions, 44 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index db9ecf3..9b4a046 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -440,53 +440,108 @@ class BeautifulSoup(Tag): self._most_recent_element = o parent.contents.append(o) - if parent.next_sibling is not None: - # This node is being inserted into an element that has - # already been parsed. Deal with any dangling references. - index = len(parent.contents)-1 - while index >= 0: - if parent.contents[index] is o: - break - index -= 1 + # Check if we are inserting into an already parsed node. + if parent.next_element is not None: + + # Check that links are proper across tag parent boundaries + child = self._linkage_fixer(parent) + + def _linkage_fixer(self, el, _recursive_call=False): + """Make sure linkage of this fragment is sound.""" + descendant = None + + # If element is document element, + # it should have no previous element, previous sibling, or next sibling. + if el.parent is None: + if el.previous_element is not None: + el.previous_element = None + if el.previous_sibling is not None: + el.previous_element = None + if el.next_sibling is not None: + el.next_sibling = None + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + if el.next_element is not child: + el.next_element = child + + if child.previous_element is not el: + child.previous_element = el + + if child.previous_sibling is not None: + child.previous_sibling = None + + # If not the first child, previous index should link as sibling to last index. + # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling). else: - raise ValueError( - "Error building tree: supposedly %r was inserted " - "into %r after the fact, but I don't see it!" % ( - o, parent - ) - ) - if index == 0: - previous_element = parent - previous_sibling = None + if child.previous_sibling is not el.contents[idx - 1]: + child.previous_sibling = el.contents[idx - 1] + if el.contents[idx - 1].next_sibling is not child: + el.contents[idx - 1].next_sibling = child + + if last_child is not None: + if child.previous_element is not last_child: + child.previous_element = last_child + if last_child.next_element is not child: + last_child.next_element = child + + # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way + if isinstance(child, Tag) and child.contents: + descendant = self._linkage_fixer(child, True) + # A bubbled up descendant should have no next siblings + # as it is last in its content list. + if descendant.next_sibling is not None: + descendant.next_sibling = None + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant else: - previous_element = previous_sibling = parent.contents[index-1] - previous = previous_element - while isinstance(previous, Tag): - if previous.contents: - previous.next_element = previous.contents[0] - previous = previous.contents[-1] - else: - break - previous_element = previous + last_child = child + + # If last child in list, there are no next siblings + if idx == last_idx: + if child.next_sibling is not None: + child.next_sibling = None + idx += 1 + + # The child to return is either the last descendant (if available) + # or the last processed child (if any). If neither is available, + # the parent element is its own last descendant. + child = descendant if descendant is not None else child + if child is None: + child = el + + # If not a recursive call, we are done processing this element. + # As the final step, link last descendant. It should be linked + # to the parent's next sibling (if found), else walk up the chain + # and find a parent with a sibling. + if not _recursive_call and child is not None: + child.next_element = None + target = el + while True: + if target is None: + break + elif target.next_sibling is not None: + child.next_element = target.next_sibling + target.next_sibling.previous_element = child + break + target = target.parent - if index == len(parent.contents)-1: - next_element = parent.next_sibling - next_sibling = None - else: - next_element = next_sibling = parent.contents[index+1] - - o.previous_element = previous_element - if previous_element is not None: - previous_element.next_element = o - o.next_element = next_element - if next_element is not None: - next_element.previous_element = o - o.next_sibling = next_sibling - if next_sibling is not None: - next_sibling.previous_sibling = o - o.previous_sibling = previous_sibling - if previous_sibling is not None: - previous_sibling.next_sibling = o + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent |