diff options
author | Leonard Richardson <leonardr@segfault.org> | 2015-06-24 17:03:40 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2015-06-24 17:03:40 -0400 |
commit | 1d4d77e815a33fc0cafd83f9243f0e6626f59add (patch) | |
tree | a9329913264eb90b0a9e7cd90b870c6a5f400298 | |
parent | 5995aa450c78dd8c078df3176703f98cd51c780d (diff) | |
download | beautifulsoup4-1d4d77e815a33fc0cafd83f9243f0e6626f59add.tar.gz |
If the initial <html> tag contains a CDATA list attribute such as
'class', the html5lib tree builder will now turn its value into a
list, as it would with any other tag. [bug=1296481]
-rw-r--r-- | NEWS.txt | 4 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 14 | ||||
-rw-r--r-- | bs4/testing.py | 8 |
3 files changed, 24 insertions, 2 deletions
@@ -40,6 +40,10 @@ displayed correctly even if the filename or URL is a Unicode string. [bug=1268888] +* If the initial <html> tag contains a CDATA list attribute such as + 'class', the html5lib tree builder will now turn its value into a + list, as it would with any other tag. [bug=1296481] + * Improved docstring for encode_contents() and decode_contents(). [bug=1441543] diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index ea8ff43..ad3c6ef 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -9,7 +9,10 @@ from bs4.builder import ( HTML_5, HTMLTreeBuilder, ) -from bs4.element import NamespacedAttribute +from bs4.element import ( + NamespacedAttribute, + whitespace_re, +) import html5lib from html5lib.constants import namespaces from bs4.element import ( @@ -103,7 +106,13 @@ class AttrList(object): def __iter__(self): return list(self.attrs.items()).__iter__() def __setitem__(self, name, value): - "set attr", name, value + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. + list_attr = HTML5TreeBuilder.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + value = whitespace_re.split(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -180,6 +189,7 @@ class Element(html5lib.treebuilders._base.Node): return AttrList(self.element) def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: converted_attributes = [] diff --git a/bs4/testing.py b/bs4/testing.py index dfaa047..8ca3878 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -243,6 +243,14 @@ Hello, world! soup = self.soup(markup) self.assertEqual(["css"], soup.div.div['class']) + def test_multivalued_attribute_on_html(self): + # html5lib uses a different API to set the attributes of the + # <html> tag. This has caused problems with multivalued + # attributes. 
+ markup = '<html class="a b"></html>' + soup = self.soup(markup) + self.assertEqual(["a", "b"], soup.html['class']) + def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>') |