summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2016-07-18 22:23:22 -0400
committer Leonard Richardson <leonardr@segfault.org> 2016-07-18 22:23:22 -0400
commit beba77f8fccc699681102ce348e8e811946bbf2f (patch)
tree 1b536e366f324e6c3d7256458dd723b9e7efe8c1
parent bb1d95c9926fa4aa3e563c8f694cf5482a12da12 (diff)
download beautifulsoup4-beba77f8fccc699681102ce348e8e811946bbf2f.tar.gz
If a search against each individual value of a multi-valued
attribute fails, the search will be run one final time against the complete attribute value considered as a single string. [bug=1476868]
-rw-r--r-- NEWS.txt 10
-rw-r--r-- bs4/element.py 24
-rw-r--r-- bs4/tests/test_tree.py 17
3 files changed, 33 insertions, 18 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 73d737c..702623d 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -3,6 +3,16 @@
* Beautiful Soup will now work with versions of html5lib greater than
0.99999999. [bug=1603299]
+* If a search against each individual value of a multi-valued
+ attribute fails, the search will be run one final time against the
+ complete attribute value considered as a single string. That is, if
+ a tag has class="foo bar" and neither "foo" nor "bar" matches, but
+ "foo bar" does, the tag is now considered a match.
+
+ This happened in previous versions, but only when the value being
+ searched for was a string. Now it also works when searching for
+ regular expressions, lists of values, etc. [bug=1476868]
+
* Fixed a bug that deranged the tree when a whitespace element was
reparented into a tag that contained an identical whitespace
element. [bug=1505351]
diff --git a/bs4/element.py b/bs4/element.py
index ad13533..5a3665e 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1704,21 +1704,15 @@ class SoupStrainer(object):
if isinstance(markup, list) or isinstance(markup, tuple):
# This should only happen when searching a multi-valued attribute
# like 'class'.
- if (isinstance(match_against, unicode)
- and ' ' in match_against):
- # A bit of a special case. If they try to match "foo
- # bar" on a multivalue attribute's value, only accept
- # the literal value "foo bar"
- #
- # XXX This is going to be pretty slow because we keep
- # splitting match_against. But it shouldn't come up
- # too often.
- return (whitespace_re.split(match_against) == markup)
- else:
- for item in markup:
- if self._matches(item, match_against):
- return True
- return False
+ for item in markup:
+ if self._matches(item, match_against):
+ return True
+ # We didn't match any particular value of the multivalue
+ # attribute, but maybe we match the attribute value when
+ # considered as a string.
+ if self._matches(' '.join(markup), match_against):
+ return True
+ return False
if match_against is True:
# True matches any non-None value.
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 2f9aba1..6bcf8af 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -222,6 +222,17 @@ class TestFindAllByName(TreeTest):
self.assertSelects(
tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
+ def test_find_with_multi_valued_attribute(self):
+ soup = self.soup(
+ "<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>"
+ )
+ r1 = soup.find('div', 'a d');
+ r2 = soup.find('div', re.compile(r'a d'));
+ r3, r4 = soup.find_all('div', ['a b', 'a d']);
+ self.assertEqual('3', r1.string)
+ self.assertEqual('3', r2.string)
+ self.assertEqual('1', r3.string)
+ self.assertEqual('3', r4.string)
class TestFindAllByAttribute(TreeTest):
@@ -294,10 +305,10 @@ class TestFindAllByAttribute(TreeTest):
f = tree.find_all("gar", class_=re.compile("a"))
self.assertSelects(f, ["Found it"])
- # Since the class is not the string "foo bar", but the two
- # strings "foo" and "bar", this will not find anything.
+ # If the search fails to match the individual strings "foo" and "bar",
+ # it will be tried against the combined string "foo bar".
f = tree.find_all("gar", class_=re.compile("o b"))
- self.assertSelects(f, [])
+ self.assertSelects(f, ["Found it"])
def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
soup = self.soup("<a class='bar'>Found it</a>")