diff options
-rw-r--r-- | NEWS.txt | 6 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 3 | ||||
-rw-r--r-- | bs4/testing.py | 5 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 8 |
4 files changed, 17 insertions, 5 deletions
@@ -1,7 +1,7 @@ = 4.1.0 (unreleased) = * Added experimental support for fixing Windows-1252 characters - embedded in UTF-8 documents. + embedded in UTF-8 documents. (UnicodeDammit.detwingle()) * Fixed the handling of &quot; with the built-in parser. [bug=993871] @@ -9,6 +9,10 @@ markup declarations are now treated as preformatted strings, the way CData blocks are. [bug=1001025] +* Fixed a bug with the lxml treebuilder that prevented the user from + adding attributes to a tag that didn't originally have + any. [bug=1002378] Thanks to Oliver Beattie for the patch. + = 4.0.5 (20120427) = * Added a new method, wrap(), which wraps an element in a tag. diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 6491322..c78fdff 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -88,6 +88,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.nsmaps = None def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. + attrs = dict(attrs) + nsprefix = None # Invert each namespace map as it comes in. if len(nsmap) == 0 and self.nsmaps != None: diff --git a/bs4/testing.py b/bs4/testing.py index 40dc976..5a84b0b 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -421,6 +421,11 @@ class HTMLTreeBuilderSmokeTest(object): # encoding. 
self.assertEqual('utf8', charset.encode("utf8")) + def test_tag_with_no_attributes_can_have_attributes_added(self): + data = self.soup("<a>text</a>") + data.a['foo'] = 'bar' + self.assertEqual('<a foo="bar">text</a>', data.a.decode()) + class XMLTreeBuilderSmokeTest(object): def test_docstring_generated(self): diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index ef58521..23a664e 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -286,7 +286,7 @@ class TestUnicodeDammit(unittest.TestCase): self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding) - def test_fix_embedded_windows_1252(self): + def test_detwingle(self): # Here's a UTF8 document. utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") @@ -306,11 +306,11 @@ class TestUnicodeDammit(unittest.TestCase): # But if we run it through fix_embedded_windows_1252, it's fixed: - fixed = UnicodeDammit.fix_embedded_windows_1252(doc) + fixed = UnicodeDammit.detwingle(doc) self.assertEqual( u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) - def test_fix_embedded_windows_1252_ignores_multibyte_characters(self): + def test_detwingle_ignores_multibyte_characters(self): # Each of these characters has a UTF-8 representation ending # in \x93. \x93 is a smart quote if interpreted as # Windows-1252. But our code knows to skip over multibyte @@ -322,7 +322,7 @@ class TestUnicodeDammit(unittest.TestCase): ): input = tricky_unicode_char.encode("utf8") self.assertTrue(input.endswith(b'\x93')) - output = UnicodeDammit.fix_embedded_windows_1252(input) + output = UnicodeDammit.detwingle(input) self.assertEqual(output, input) class TestNamedspacedAttribute(SoupTest): |