summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 4
-rw-r--r-- bs4/builder/_html5lib.py 14
-rw-r--r-- bs4/testing.py 8
3 files changed, 24 insertions, 2 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 92765e1..d49d451 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -40,6 +40,10 @@
displayed correctly even if the filename or URL is a Unicode
string. [bug=1268888]
+* If the initial <html> tag contains a CDATA list attribute such as
+ 'class', the html5lib tree builder will now turn its value into a
+ list, as it would with any other tag. [bug=1296481]
+
* Improved docstring for encode_contents() and
decode_contents(). [bug=1441543]
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index ea8ff43..ad3c6ef 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -9,7 +9,10 @@ from bs4.builder import (
HTML_5,
HTMLTreeBuilder,
)
-from bs4.element import NamespacedAttribute
+from bs4.element import (
+ NamespacedAttribute,
+ whitespace_re,
+)
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
@@ -103,7 +106,13 @@ class AttrList(object):
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
- "set attr", name, value
+ # If this attribute is a multi-valued attribute for this element,
+ # turn its value into a list.
+ list_attr = HTML5TreeBuilder.cdata_list_attributes
+ if (name in list_attr['*']
+ or (self.element.name in list_attr
+ and name in list_attr[self.element.name])):
+ value = whitespace_re.split(value)
self.element[name] = value
def items(self):
return list(self.attrs.items())
@@ -180,6 +189,7 @@ class Element(html5lib.treebuilders._base.Node):
return AttrList(self.element)
def setAttributes(self, attributes):
+
if attributes is not None and len(attributes) > 0:
converted_attributes = []
diff --git a/bs4/testing.py b/bs4/testing.py
index dfaa047..8ca3878 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -243,6 +243,14 @@ Hello, world!
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
+ def test_multivalued_attribute_on_html(self):
+ # html5lib uses a different API to set the attributes of the
+ # <html> tag. This has caused problems with multivalued
+ # attributes.
+ markup = '<html class="a b"></html>'
+ soup = self.soup(markup)
+ self.assertEqual(["a", "b"], soup.html['class'])
+
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')