diff options
author | Leonard Richardson <leonardr@segfault.org> | 2016-07-18 21:56:10 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2016-07-18 21:56:10 -0400 |
commit | bb1d95c9926fa4aa3e563c8f694cf5482a12da12 (patch) | |
tree | e38aea830f78c9f03295a76ae7e1c7946fbb1fa7 | |
parent | d6c1e826c8691aac8c3aaa1a44f9a04732462d9b (diff) | |
download | beautifulsoup4-bb1d95c9926fa4aa3e563c8f694cf5482a12da12.tar.gz |
Corrected an encoding error that happened when a BeautifulSoup
object was copied. [bug=1554439]
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/__init__.py | 11 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 7 |
3 files changed, 20 insertions, 1 deletion
@@ -12,6 +12,9 @@
 
 * Corrected handling of XML processing instructions. [bug=1504393]
 
+* Corrected an encoding error that happened when a BeautifulSoup
+  object was copied. [bug=1554439]
+
 * The contents of <textarea> tags will no longer be modified when the
   tree is prettified. [bug=1555829]
 
diff --git a/bs4/__init__.py b/bs4/__init__.py
index bc611c9..308428a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -236,7 +236,16 @@ class BeautifulSoup(Tag):
         self.builder.soup = None
 
     def __copy__(self):
-        return type(self)(self.encode(), builder=self.builder)
+        copy = type(self)(
+            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+        )
+
+        # Although we encoded the tree to UTF-8, that may not have
+        # been the encoding of the original markup. Set the copy's
+        # .original_encoding to reflect the original object's
+        # .original_encoding.
+        copy.original_encoding = self.original_encoding
+        return copy
 
     def __getstate__(self):
         # Frequently a tree builder can't be pickled.
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index fc19046..2f9aba1 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1328,6 +1328,13 @@ class TestPersistence(SoupTest):
         copied = copy.deepcopy(self.tree)
         self.assertEqual(copied.decode(), self.tree.decode())
 
+    def test_copy_preserves_encoding(self):
+        soup = BeautifulSoup('<p> </p>', 'html.parser')
+        self.assertEqual('ascii', soup.original_encoding)
+        copy = soup.__copy__()
+        self.assertEqual(u"<p> </p>", unicode(copy))
+        self.assertEqual('ascii', copy.original_encoding)
+
     def test_unicode_pickle(self):
         # A tree containing Unicode characters can be pickled.
         html = u"<b>\N{SNOWMAN}</b>"