diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-07-28 16:58:23 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-07-28 16:58:23 -0400 |
commit | 37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (patch) | |
tree | 30826a9a744be8f2194c469484618a0326d0488e /bs4/testing.py | |
parent | 81f853622f808fba7cd89d02ec524abc8588f196 (diff) | |
download | beautifulsoup4-37e4159cb49d2f7c8fdafa0268adca5a1e2017e4.tar.gz |
Correctly handle invalid HTML numeric character entities like &#147;
which reference code points that are not Unicode code points. Note
that this is only fixed when Beautiful Soup is used with the
html.parser parser -- html5lib already worked and I couldn't fix it
with lxml. [bug=1782933]
Diffstat (limited to 'bs4/testing.py')
-rw-r--r-- | bs4/testing.py | 13 |
1 files changed, 13 insertions, 0 deletions
diff --git a/bs4/testing.py b/bs4/testing.py index bbcc271..745a9c4 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -1,3 +1,4 @@ +# encoding: utf-8 """Helper classes for tests.""" # Use of this source code is governed by a BSD-style license that can be @@ -326,6 +327,18 @@ Hello, world! u"<p>&bull; AT&T is in the s&p 500</p>", u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>" ) + + def test_entities_in_foreign_document_encoding(self): + # &#147; and &#148; are invalid numeric entities referencing + # Windows-1252 characters. &#45; references a character common + # to Windows-1252 and Unicode, and &#9731; references a + # character only found in Unicode. + # + # All of these entities should be converted to Unicode + # characters. + markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>" + soup = self.soup(markup) + self.assertEquals(u"“Hello” -☃", soup.p.string) def test_entities_in_attributes_converted_to_unicode(self): expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' |