diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-07-28 16:58:23 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-07-28 16:58:23 -0400 |
commit | 37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (patch) | |
tree | 30826a9a744be8f2194c469484618a0326d0488e | |
parent | 81f853622f808fba7cd89d02ec524abc8588f196 (diff) | |
download | beautifulsoup4-37e4159cb49d2f7c8fdafa0268adca5a1e2017e4.tar.gz |
Correctly handle invalid HTML numeric character entities like &#147;
which reference code points that are not Unicode code points. Note
that this is only fixed when Beautiful Soup is used with the
html.parser parser -- html5lib already worked and I couldn't fix it
with lxml. [bug=1782933]
-rw-r--r-- | NEWS.txt | 6 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 26 | ||||
-rw-r--r-- | bs4/testing.py | 13 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 2 |
5 files changed, 47 insertions, 6 deletions
@@ -12,6 +12,12 @@ * Fixed a problem where the html.parser tree builder interpreted a string like "&foo " as the character entity "&foo;" [bug=1728706] +* Correctly handle invalid HTML numeric character entities like &#147; + which reference code points that are not Unicode code points. Note + that this is only fixed when Beautiful Soup is used with the + html.parser parser -- html5lib already worked and I couldn't fix it + with lxml. [bug=1782933] + * Improved the warning given when no parser is specified. [bug=1780571] * Fixed code that was causing deprecation warnings in recent Python 3 diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index eff30ff..ee6c685 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -1,3 +1,4 @@ +# encoding: utf-8 """Use the HTMLParser library to parse HTML files that aren't too bad.""" # Use of this source code is governed by a BSD-style license that can be @@ -140,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser): else: real_name = int(name) - try: - data = unichr(real_name) - except (ValueError, OverflowError), e: - data = u"\N{REPLACEMENT CHARACTER}" - + data = None + if real_name < 256: + # HTML numeric entities are supposed to reference Unicode + # code points, but sometimes they reference code points in + # some other encoding (ahem, Windows-1252). E.g. &#147; + # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This + # code tries to detect this situation and compensate. 
+ for encoding in (self.soup.original_encoding, 'windows-1252'): + if not encoding: + continue + try: + data = bytearray([real_name]).decode(encoding) + except UnicodeDecodeError, e: + pass + if not data: + try: + data = unichr(real_name) + except (ValueError, OverflowError), e: + pass + data = data or u"\N{REPLACEMENT CHARACTER}" self.handle_data(data) def handle_entityref(self, name): diff --git a/bs4/testing.py b/bs4/testing.py index bbcc271..745a9c4 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -1,3 +1,4 @@ +# encoding: utf-8 """Helper classes for tests.""" # Use of this source code is governed by a BSD-style license that can be @@ -326,6 +327,18 @@ Hello, world! u"<p>&bull; AT&T is in the s&p 500</p>", u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>" ) + + def test_entities_in_foreign_document_encoding(self): + # &#147; and &#148; are invalid numeric entities referencing + # Windows-1252 characters. &#45; references a character common + # to Windows-1252 and Unicode, and &#9731; references a + # character only found in Unicode. + # + # All of these entities should be converted to Unicode + # characters. + markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>" + soup = self.soup(markup) + self.assertEquals(u"“Hello” -☃", soup.p.string) def test_entities_in_attributes_converted_to_unicode(self): expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index a05870b..23cbaef 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals( "<p>foo&#1000000000;bar</p>", "<p>foobar</p>") + def test_entities_in_original_document_encoding(self): + # We can't implement this case correctly because by the time we + # hear about markup like "&#147;", it's been (incorrectly) converted into + # a string like u'\x93' + pass + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this # test if an old version of lxml is installed. 
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index e5cc47e..e5dcfa7 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1206,7 +1206,7 @@ class TestElementObjects(SoupTest): tag = soup.bTag self.assertEqual(soup.b, tag) self.assertEqual( - '.bTag is deprecated, use .find("b") instead.', + '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")', str(w[0].message)) def test_has_attr(self): |