summary refs log tree commit diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2016-07-18 21:33:24 -0400
committer Leonard Richardson <leonardr@segfault.org> 2016-07-18 21:33:24 -0400
commit d6c1e826c8691aac8c3aaa1a44f9a04732462d9b (patch)
tree 02e5817b5ad0bf5b7533327d17d27f496c2bd75a
parent 3c769d98b64ba347dd7cb7e42d7d8b4ce1f8ed3c (diff)
download beautifulsoup4-d6c1e826c8691aac8c3aaa1a44f9a04732462d9b.tar.gz
Added support for CSS selector values that contain quoted spaces,
such as tag[style="display: foo"]. [bug=1540588]
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/element.py 14
-rw-r--r-- bs4/tests/test_tree.py 8
3 files changed, 17 insertions, 8 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 1fe2159..9e27d51 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -7,6 +7,9 @@
reparented into a tag that contained an identical whitespace
element. [bug=1505351]
+* Added support for CSS selector values that contain quoted spaces,
+ such as tag[style="display: foo"]. [bug=1540588]
+
* Corrected handling of XML processing instructions. [bug=1504393]
* The contents of <textarea> tags will no longer be modified when the
diff --git a/bs4/element.py b/bs4/element.py
index 7a3aa52..ad13533 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -4,6 +4,7 @@ __license__ = "MIT"
import collections
import re
+import shlex
import sys
import warnings
from bs4.dammit import EntitySubstitution
@@ -1319,6 +1320,7 @@ class Tag(PageElement):
_selector_combinators = ['>', '+', '~']
_select_debug = False
+ quoted_colon = re.compile('"[^"]*:[^"]*"')
def select_one(self, selector):
"""Perform a CSS selection operation on the current element."""
value = self.select(selector, limit=1)
@@ -1344,8 +1346,7 @@ class Tag(PageElement):
if limit and len(context) >= limit:
break
return context
-
- tokens = selector.split()
+ tokens = shlex.split(selector)
current_context = [self]
if tokens[-1] in self._selector_combinators:
@@ -1397,7 +1398,7 @@ class Tag(PageElement):
return classes.issubset(candidate.get('class', []))
checker = classes_match
- elif ':' in token:
+ elif ':' in token and not self.quoted_colon.search(token):
# Pseudo-class
tag_name, pseudo = token.split(':', 1)
if tag_name == '':
@@ -1428,11 +1429,8 @@ class Tag(PageElement):
self.count += 1
if self.count == self.destination:
return True
- if self.count > self.destination:
- # Stop the generator that's sending us
- # these things.
- raise StopIteration()
- return False
+ else:
+ return False
checker = Counter(pseudo_value).nth_child_of_type
else:
raise NotImplementedError(
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 8a05990..fc19046 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1909,6 +1909,14 @@ class TestSoupSelector(TreeTest):
('div[data-tag]', ['data1'])
)
+ def test_quoted_space_in_selector_name(self):
+ html = """<div style="display: wrong">nope</div>
+ <div style="display: right">yes</div>
+ """
+ soup = BeautifulSoup(html, 'html.parser')
+ [chosen] = soup.select('div[style="display: right"]')
+ self.assertEqual("yes", chosen.string)
+
def test_unsupported_pseudoclass(self):
self.assertRaises(
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")