summary | refs | log | tree | commit | diff
path: root/bs4/testing.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
commit: 37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (patch)
tree: 30826a9a744be8f2194c469484618a0326d0488e /bs4/testing.py
parent: 81f853622f808fba7cd89d02ec524abc8588f196 (diff)
download: beautifulsoup4-37e4159cb49d2f7c8fdafa0268adca5a1e2017e4.tar.gz
Correctly handle invalid HTML numeric character entities like &#147;
which reference code points that are not Unicode code points. Note that this is only fixed when Beautiful Soup is used with the html.parser parser -- html5lib already worked and I couldn't fix it with lxml. [bug=1782933]
Diffstat (limited to 'bs4/testing.py')
-rw-r--r-- bs4/testing.py 13
1 files changed, 13 insertions, 0 deletions
diff --git a/bs4/testing.py b/bs4/testing.py
index bbcc271..745a9c4 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
"""Helper classes for tests."""
# Use of this source code is governed by a BSD-style license that can be
@@ -326,6 +327,18 @@ Hello, world!
u"<p>&bull; AT&T is in the s&p 500</p>",
u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
)
+
+ def test_entities_in_foreign_document_encoding(self):
+ # &#147; and &#148; are invalid numeric entities referencing
+ # Windows-1252 characters. &#45; references a character common
+ # to Windows-1252 and Unicode, and &#9731; references a
+ # character only found in Unicode.
+ #
+ # All of these entities should be converted to Unicode
+ # characters.
+ markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
+ soup = self.soup(markup)
+ self.assertEquals(u"“Hello” -☃", soup.p.string)
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'