Correctly handle invalid HTML numeric character entities like &#147; which reference code points that are not Unicode code points. Note that this is only fixed when Beautiful Soup is used with the html.parser parser -- html5lib already worked and I couldn't fix it with lxml. [bug=1782933]
author: Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
commit: 37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (patch)
tree: 30826a9a744be8f2194c469484618a0326d0488e
parent: 81f853622f808fba7cd89d02ec524abc8588f196 (diff)
download: beautifulsoup4-37e4159cb49d2f7c8fdafa0268adca5a1e2017e4.tar.gz
5 files changed, 47 insertions, 6 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 1aa0a42..acdcc04 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -12,6 +12,12 @@
 * Fixed a problem where the html.parser tree builder interpreted
   a string like "&foo " as the character entity "&foo;"  [bug=1728706]
 
+* Correctly handle invalid HTML numeric character entities like &#147;
+  which reference code points that are not Unicode code points. Note
+  that this is only fixed when Beautiful Soup is used with the
+  html.parser parser -- html5lib already worked and I couldn't fix it
+  with lxml.  [bug=1782933]
+
 * Improved the warning given when no parser is specified. [bug=1780571]
 
 * Fixed code that was causing deprecation warnings in recent Python 3
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index eff30ff..ee6c685 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 """Use the HTMLParser library to parse HTML files that aren't too bad."""
 
 # Use of this source code is governed by a BSD-style license that can be
@@ -140,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         else:
             real_name = int(name)
 
-        try:
-            data = unichr(real_name)
-        except (ValueError, OverflowError), e:
-            data = u"\N{REPLACEMENT CHARACTER}"
-
+        data = None
+        if real_name < 256:
+            # HTML numeric entities are supposed to reference Unicode
+            # code points, but sometimes they reference code points in
+            # some other encoding (ahem, Windows-1252). E.g. &#147;
+            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
+            # code tries to detect this situation and compensate.
+            for encoding in (self.soup.original_encoding, 'windows-1252'):
+                if not encoding:
+                    continue
+                try:
+                    data = bytearray([real_name]).decode(encoding)
+                except UnicodeDecodeError, e:
+                    pass
+        if not data:
+            try:
+                data = unichr(real_name)
+            except (ValueError, OverflowError), e:
+                pass
+        data = data or u"\N{REPLACEMENT CHARACTER}"
         self.handle_data(data)
 
     def handle_entityref(self, name):
diff --git a/bs4/testing.py b/bs4/testing.py
index bbcc271..745a9c4 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 """Helper classes for tests."""
 
 # Use of this source code is governed by a BSD-style license that can be
@@ -326,6 +327,18 @@ Hello, world!
             u"<p>&bull; AT&T is in the s&p 500</p>",
             u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
         )
+
+    def test_entities_in_foreign_document_encoding(self):
+        # &#147; and &#148; are invalid numeric entities referencing
+        # Windows-1252 characters. &#45; references a character common
+        # to Windows-1252 and Unicode, and &#9731; references a
+        # character only found in Unicode.
+        #
+        # All of these entities should be converted to Unicode
+        # characters.
+        markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
+        soup = self.soup(markup)
+        self.assertEquals(u"“Hello” -☃", soup.p.string)
         
     def test_entities_in_attributes_converted_to_unicode(self):
         expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index a05870b..23cbaef 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertSoupEquals(
             "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
 
+    def test_entities_in_original_document_encoding(self):
+        # We can't implement this case correctly because by the time we
+        # hear about markup like "&#147;", it's been (incorrectly) converted into
+        # a string like u'\x93'
+        pass
+        
     # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
     # test if an old version of lxml is installed.
 
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index e5cc47e..e5dcfa7 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1206,7 +1206,7 @@ class TestElementObjects(SoupTest):
             tag = soup.bTag
         self.assertEqual(soup.b, tag)
         self.assertEqual(
-            '.bTag is deprecated, use .find("b") instead.',
+            '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
             str(w[0].message))
 
     def test_has_attr(self):
author	Leonard Richardson <leonardr@segfault.org>	2018-07-28 16:58:23 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2018-07-28 16:58:23 -0400
commit	37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (patch)
tree	30826a9a744be8f2194c469484618a0326d0488e
parent	81f853622f808fba7cd89d02ec524abc8588f196 (diff)
download	beautifulsoup4-37e4159cb49d2f7c8fdafa0268adca5a1e2017e4.tar.gz