diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/builder/__init__.py | 4 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 4 | ||||
-rw-r--r-- | bs4/element.py | 4 | ||||
-rw-r--r-- | bs4/testing.py | 12 |
4 files changed, 20 insertions, 4 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 46b28bd..4207750 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -8,7 +8,7 @@ from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, HTMLAwareEntitySubstitution, - whitespace_re + nonwhitespace_re ) __all__ = [ @@ -173,7 +173,7 @@ class TreeBuilder(object): # values. Split it into a list. value = attrs[attr] if isinstance(value, basestring): - values = whitespace_re.split(value) + values = nonwhitespace_re.findall(value) else: # html5lib sometimes calls setAttributes twice # for the same tag when rearranging the parse diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 2c929b9..6fa8593 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -15,7 +15,7 @@ from bs4.builder import ( ) from bs4.element import ( NamespacedAttribute, - whitespace_re, + nonwhitespace_re, ) import html5lib from html5lib.constants import ( @@ -206,7 +206,7 @@ class AttrList(object): # A node that is being cloned may have already undergone # this procedure. if not isinstance(value, list): - value = whitespace_re.split(value) + value = nonwhitespace_re.findall(value) self.element[name] = value def items(self): return list(self.attrs.items()) diff --git a/bs4/element.py b/bs4/element.py index ba70b24..fb74f9c 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -21,6 +21,10 @@ from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) +nonwhitespace_re = re.compile(r"\S+") + +# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on +# the off chance someone imported it for their own use. whitespace_re = re.compile(r"\s+") def _alias(attr): diff --git a/bs4/testing.py b/bs4/testing.py index 9598f31..e4a0ffe 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -452,6 +452,18 @@ Hello, world! "<tbody><tr><td>Bar</td></tr></tbody>" "<tfoot><tr><td>Baz</td></tr></tfoot></table>") + def test_multivalued_attribute_with_whitespace(self): + # Whitespace separating the values of a multi-valued attribute + # should be ignored. + + markup = '<div class=" foo bar "></a>' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.div['class']) + + # If you search by the literal name of the class it's like the whitespace + # wasn't there. + self.assertEqual(soup.div, soup.find('div', class_="foo bar")) + def test_deeply_nested_multivalued_attribute(self): # html5lib can set the attributes of the same tag many times # as it rearranges the tree. This has caused problems with |