From 334e562e1f19b166cc5f3be986329d97e08df6d9 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 28 Jul 2018 18:23:00 -0400 Subject: When markup contains duplicate elements, a select() call that includes multiple match clauses will match all relevant elements. [bug=1770596] --- NEWS.txt | 4 ++++ bs4/element.py | 22 ++++++++++++++++++---- bs4/tests/test_lxml.py | 2 +- bs4/tests/test_tree.py | 14 +++++++++++++- 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/NEWS.txt b/NEWS.txt index acdcc04..3884687 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -20,6 +20,10 @@ * Improved the warning given when no parser is specified. [bug=1780571] +* When markup contains duplicate elements, a select() call that + includes multiple match clauses will match all relevant + elements. [bug=1770596] + * Fixed code that was causing deprecation warnings in recent Python 3 versions. Includes a patch from Ville Skyttä. [bug=1778909] [bug=1689496] diff --git a/bs4/element.py b/bs4/element.py index c431a97..8383c3f 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1409,15 +1409,29 @@ class Tag(PageElement): # Handle grouping selectors if ',' exists, ie: p,a if ',' in selector: context = [] - for partial_selector in selector.split(','): - partial_selector = partial_selector.strip() + selectors = [x.strip() for x in selector.split(",")] + + # If a selector is mentioned multiple times we don't want + # to use it more than once. + used_selectors = set() + + # We also don't want to select the same element more than once, + # if it's matched by multiple selectors. + selected_object_ids = set() + for partial_selector in selectors: if partial_selector == '': raise ValueError('Invalid group selection syntax: %s' % selector) + if partial_selector in used_selectors: + continue + used_selectors.add(partial_selector) candidates = self.select(partial_selector, limit=limit) for candidate in candidates: - if candidate not in context: + # This lets us distinguish between distinct tags that + # represent the same markup. + object_id = id(candidate) + if object_id not in selected_object_ids: context.append(candidate) - + selected_object_ids.add(object_id) if limit and len(context) >= limit: break return context diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 23cbaef..8a8f690 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -46,7 +46,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals( "

foo�bar

", "

foobar

") - def test_entities_in_original_document_encoding(self): + def test_entities_in_foreign_document_encoding(self): # We can't implement this case correctly because by the time we # hear about markup like "“", it's been (incorrectly) converted into # a string like u'\x93' diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index e5dcfa7..68887b4 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -2074,5 +2074,17 @@ class TestSoupSelector(TreeTest): def test_multiple_select_nested(self): self.assertSelects('body > div > x, y > z', ['xid', 'zidb']) - + def test_select_duplicate_elements(self): + # When markup contains duplicate elements, a multiple select + # will find all of them. + markup = '
' + soup = BeautifulSoup(markup, 'html.parser') + selected = soup.select(".c1, .c2") + self.assertEquals(3, len(selected)) + + # Verify that find_all finds the same elements, though because + # of an implementation detail it finds them in a different + # order. + for element in soup.find_all(class_=['c1', 'c2']): + assert element in selected -- cgit v1.2.1