summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2018-12-30 21:13:03 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2018-12-30 21:13:03 -0500
commit: d420395d8c70794d64efe07448283946ee5b074e (patch)
tree: ccc341799927593743bc7503ec3f1e5ca3dd378f
parent: cc6de8c2b4bf4d41b7ab2a7f609e7493c9e0a859 (diff)
download: beautifulsoup4-d420395d8c70794d64efe07448283946ee5b074e.tar.gz
Fixed a problem with multi-valued attributes where the value
contained whitespace. Thanks to Jens Svalgaard for the fix. [bug=1787453]
-rw-r--r--  NEWS.txt                   6
-rw-r--r--  bs4/builder/__init__.py    4
-rw-r--r--  bs4/builder/_html5lib.py   4
-rw-r--r--  bs4/element.py             4
-rw-r--r--  bs4/testing.py            12
5 files changed, 25 insertions, 5 deletions
diff --git a/NEWS.txt b/NEWS.txt
index b67335f..5052413 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -23,8 +23,12 @@
elements) were not being treated as part of the tree. Patch by Isaac
Muse. [bug=1798699]
+* Fixed a problem with multi-valued attributes where the value
+ contained whitespace. Thanks to Jens Svalgaard for the
+ fix. [bug=1787453]
+
* Clarified ambiguous license statements in the source code. Beautiful
- Soup is released under the MIT license, and has been for some time.
+ Soup is released under the MIT license, and has been since 4.4.0.
= 4.6.3 (20180812)
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 46b28bd..4207750 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -8,7 +8,7 @@ from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
HTMLAwareEntitySubstitution,
- whitespace_re
+ nonwhitespace_re
)
__all__ = [
@@ -173,7 +173,7 @@ class TreeBuilder(object):
# values. Split it into a list.
value = attrs[attr]
if isinstance(value, basestring):
- values = whitespace_re.split(value)
+ values = nonwhitespace_re.findall(value)
else:
# html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 2c929b9..6fa8593 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -15,7 +15,7 @@ from bs4.builder import (
)
from bs4.element import (
NamespacedAttribute,
- whitespace_re,
+ nonwhitespace_re,
)
import html5lib
from html5lib.constants import (
@@ -206,7 +206,7 @@ class AttrList(object):
# A node that is being cloned may have already undergone
# this procedure.
if not isinstance(value, list):
- value = whitespace_re.split(value)
+ value = nonwhitespace_re.findall(value)
self.element[name] = value
def items(self):
return list(self.attrs.items())
diff --git a/bs4/element.py b/bs4/element.py
index ba70b24..fb74f9c 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -21,6 +21,10 @@ from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
+nonwhitespace_re = re.compile(r"\S+")
+
+# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
+# the off chance someone imported it for their own use.
whitespace_re = re.compile(r"\s+")
def _alias(attr):
diff --git a/bs4/testing.py b/bs4/testing.py
index 9598f31..e4a0ffe 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -452,6 +452,18 @@ Hello, world!
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+ def test_multivalued_attribute_with_whitespace(self):
+ # Whitespace separating the values of a multi-valued attribute
+ # should be ignored.
+
+ markup = '<div class=" foo bar "></a>'
+ soup = self.soup(markup)
+ self.assertEqual(['foo', 'bar'], soup.div['class'])
+
+ # If you search by the literal name of the class it's like the whitespace
+ # wasn't there.
+ self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
+
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with