Corrected an encoding error that happened when a BeautifulSoup object was copied. [bug=1554439]
author: Leonard Richardson <leonardr@segfault.org> 2016-07-18 21:56:10 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2016-07-18 21:56:10 -0400
commit: bb1d95c9926fa4aa3e563c8f694cf5482a12da12 (patch)
tree: e38aea830f78c9f03295a76ae7e1c7946fbb1fa7
parent: d6c1e826c8691aac8c3aaa1a44f9a04732462d9b (diff)
download: beautifulsoup4-bb1d95c9926fa4aa3e563c8f694cf5482a12da12.tar.gz
3 files changed, 20 insertions, 1 deletion
diff --git a/NEWS.txt b/NEWS.txt
index 9e27d51..73d737c 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -12,6 +12,9 @@
 
 * Corrected handling of XML processing instructions. [bug=1504393]
 
+* Corrected an encoding error that happened when a BeautifulSoup
+  object was copied. [bug=1554439]
+
 * The contents of <textarea> tags will no longer be modified when the
   tree is prettified. [bug=1555829]
 
diff --git a/bs4/__init__.py b/bs4/__init__.py
index bc611c9..308428a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -236,7 +236,16 @@ class BeautifulSoup(Tag):
         self.builder.soup = None
 
     def __copy__(self):
-        return type(self)(self.encode(), builder=self.builder)
+        copy = type(self)(
+            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+        )
+
+        # Although we encoded the tree to UTF-8, that may not have
+        # been the encoding of the original markup. Set the copy's
+        # .original_encoding to reflect the original object's
+        # .original_encoding.
+        copy.original_encoding = self.original_encoding
+        return copy
 
     def __getstate__(self):
         # Frequently a tree builder can't be pickled.
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index fc19046..2f9aba1 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1328,6 +1328,13 @@ class TestPersistence(SoupTest):
         copied = copy.deepcopy(self.tree)
         self.assertEqual(copied.decode(), self.tree.decode())
 
+    def test_copy_preserves_encoding(self):
+        soup = BeautifulSoup('<p>&nbsp;</p>', 'html.parser')
+        self.assertEqual('ascii', soup.original_encoding)
+        copy = soup.__copy__()
+        self.assertEqual(u"<p> </p>", unicode(copy))
+        self.assertEqual('ascii', copy.original_encoding)
+
     def test_unicode_pickle(self):
         # A tree containing Unicode characters can be pickled.
         html = u"<b>\N{SNOWMAN}</b>"
author	Leonard Richardson <leonardr@segfault.org>	2016-07-18 21:56:10 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2016-07-18 21:56:10 -0400
commit	bb1d95c9926fa4aa3e563c8f694cf5482a12da12 (patch)
tree	e38aea830f78c9f03295a76ae7e1c7946fbb1fa7
parent	d6c1e826c8691aac8c3aaa1a44f9a04732462d9b (diff)
download	beautifulsoup4-bb1d95c9926fa4aa3e563c8f694cf5482a12da12.tar.gz