summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2016-07-18 21:56:10 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2016-07-18 21:56:10 -0400
commit: bb1d95c9926fa4aa3e563c8f694cf5482a12da12 (patch)
tree: e38aea830f78c9f03295a76ae7e1c7946fbb1fa7
parent: d6c1e826c8691aac8c3aaa1a44f9a04732462d9b (diff)
download: beautifulsoup4-bb1d95c9926fa4aa3e563c8f694cf5482a12da12.tar.gz
Corrected an encoding error that happened when a BeautifulSoup
object was copied. [bug=1554439]
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/__init__.py 11
-rw-r--r-- bs4/tests/test_tree.py 7
3 files changed, 20 insertions, 1 deletion
diff --git a/NEWS.txt b/NEWS.txt
index 9e27d51..73d737c 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -12,6 +12,9 @@
* Corrected handling of XML processing instructions. [bug=1504393]
+* Corrected an encoding error that happened when a BeautifulSoup
+ object was copied. [bug=1554439]
+
* The contents of <textarea> tags will no longer be modified when the
tree is prettified. [bug=1555829]
diff --git a/bs4/__init__.py b/bs4/__init__.py
index bc611c9..308428a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -236,7 +236,16 @@ class BeautifulSoup(Tag):
self.builder.soup = None
def __copy__(self):
- return type(self)(self.encode(), builder=self.builder)
+ copy = type(self)(
+ self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+ )
+
+ # Although we encoded the tree to UTF-8, that may not have
+ # been the encoding of the original markup. Set the copy's
+ # .original_encoding to reflect the original object's
+ # .original_encoding.
+ copy.original_encoding = self.original_encoding
+ return copy
def __getstate__(self):
# Frequently a tree builder can't be pickled.
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index fc19046..2f9aba1 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1328,6 +1328,13 @@ class TestPersistence(SoupTest):
copied = copy.deepcopy(self.tree)
self.assertEqual(copied.decode(), self.tree.decode())
+ def test_copy_preserves_encoding(self):
+ soup = BeautifulSoup('<p>&nbsp;</p>', 'html.parser')
+ self.assertEqual('ascii', soup.original_encoding)
+ copy = soup.__copy__()
+ self.assertEqual(u"<p> </p>", unicode(copy))
+ self.assertEqual('ascii', copy.original_encoding)
+
def test_unicode_pickle(self):
# A tree containing Unicode characters can be pickled.
html = u"<b>\N{SNOWMAN}</b>"