summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt | 6
-rw-r--r-- bs4/builder/_lxml.py | 3
-rw-r--r-- bs4/testing.py | 5
-rw-r--r-- bs4/tests/test_soup.py | 8
4 files changed, 17 insertions, 5 deletions
diff --git a/NEWS.txt b/NEWS.txt
index acfb93d..b9c92fe 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,7 +1,7 @@
= 4.1.0 (unreleased) =
* Added experimental support for fixing Windows-1252 characters
- embedded in UTF-8 documents.
+ embedded in UTF-8 documents. (UnicodeDammit.detwingle())
* Fixed the handling of &quot; with the built-in parser. [bug=993871]
@@ -9,6 +9,10 @@
markup declarations are now treated as preformatted strings, the way
CData blocks are. [bug=1001025]
+* Fixed a bug with the lxml treebuilder that prevented the user from
+ adding attributes to a tag that didn't originally have
+ any. [bug=1002378] Thanks to Oliver Beattie for the patch.
+
= 4.0.5 (20120427) =
* Added a new method, wrap(), which wraps an element in a tag.
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 6491322..c78fdff 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -88,6 +88,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps = None
def start(self, name, attrs, nsmap={}):
+ # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
+ attrs = dict(attrs)
+
nsprefix = None
# Invert each namespace map as it comes in.
if len(nsmap) == 0 and self.nsmaps != None:
diff --git a/bs4/testing.py b/bs4/testing.py
index 40dc976..5a84b0b 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -421,6 +421,11 @@ class HTMLTreeBuilderSmokeTest(object):
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
+ def test_tag_with_no_attributes_can_have_attributes_added(self):
+ data = self.soup("<a>text</a>")
+ data.a['foo'] = 'bar'
+ self.assertEqual('<a foo="bar">text</a>', data.a.decode())
+
class XMLTreeBuilderSmokeTest(object):
def test_docstring_generated(self):
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index ef58521..23a664e 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -286,7 +286,7 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
- def test_fix_embedded_windows_1252(self):
+ def test_detwingle(self):
# Here's a UTF8 document.
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
@@ -306,11 +306,11 @@ class TestUnicodeDammit(unittest.TestCase):
# But if we run it through fix_embedded_windows_1252, it's fixed:
- fixed = UnicodeDammit.fix_embedded_windows_1252(doc)
+ fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
- def test_fix_embedded_windows_1252_ignores_multibyte_characters(self):
+ def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
@@ -322,7 +322,7 @@ class TestUnicodeDammit(unittest.TestCase):
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
- output = UnicodeDammit.fix_embedded_windows_1252(input)
+ output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
class TestNamedspacedAttribute(SoupTest):