From 37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Sat, 28 Jul 2018 16:58:23 -0400
Subject: Correctly handle invalid HTML numeric character entities like &#147; 
  which reference code points that are not Unicode code points. Note   that
 this is only fixed when Beautiful Soup is used with the   html.parser parser
 -- html5lib already worked and I couldn't fix it   with lxml.  [bug=1782933]

---
 NEWS.txt                   |  6 ++++++
 bs4/builder/_htmlparser.py | 26 +++++++++++++++++++++-----
 bs4/testing.py             | 13 +++++++++++++
 bs4/tests/test_lxml.py     |  6 ++++++
 bs4/tests/test_tree.py     |  2 +-
 5 files changed, 47 insertions(+), 6 deletions(-)
diff --git a/NEWS.txt b/NEWS.txt
index 1aa0a42..acdcc04 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -12,6 +12,12 @@
 * Fixed a problem where the html.parser tree builder interpreted
   a string like "&foo " as the character entity "&foo;"  [bug=1728706]
 
+* Correctly handle invalid HTML numeric character entities like &#147;
+  which reference code points that are not Unicode code points. Note
+  that this is only fixed when Beautiful Soup is used with the
+  html.parser parser -- html5lib already worked and I couldn't fix it
+  with lxml.  [bug=1782933]
+
 * Improved the warning given when no parser is specified. [bug=1780571]
 
 * Fixed code that was causing deprecation warnings in recent Python 3
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index eff30ff..ee6c685 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 """Use the HTMLParser library to parse HTML files that aren't too bad."""
 
 # Use of this source code is governed by a BSD-style license that can be
@@ -140,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         else:
             real_name = int(name)
 
-        try:
-            data = unichr(real_name)
-        except (ValueError, OverflowError), e:
-            data = u"\N{REPLACEMENT CHARACTER}"
-
+        data = None
+        if real_name < 256:
+            # HTML numeric entities are supposed to reference Unicode
+            # code points, but sometimes they reference code points in
+            # some other encoding (ahem, Windows-1252). E.g. &#147;
+            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
+            # code tries to detect this situation and compensate.
+            for encoding in (self.soup.original_encoding, 'windows-1252'):
+                if not encoding:
+                    continue
+                try:
+                    data = bytearray([real_name]).decode(encoding)
+                except UnicodeDecodeError, e:
+                    pass
+        if not data:
+            try:
+                data = unichr(real_name)
+            except (ValueError, OverflowError), e:
+                pass
+        data = data or u"\N{REPLACEMENT CHARACTER}"
         self.handle_data(data)
 
     def handle_entityref(self, name):
diff --git a/bs4/testing.py b/bs4/testing.py
index bbcc271..745a9c4 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 """Helper classes for tests."""
 
 # Use of this source code is governed by a BSD-style license that can be
@@ -326,6 +327,18 @@ Hello, world!
             u"<p>&bull; AT&T is in the s&p 500</p>",
             u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
         )
+
+    def test_entities_in_foreign_document_encoding(self):
+        # &#147; and &#148; are invalid numeric entities: 147 and 148
+        # are the Windows-1252 bytes for curly double quotes, not
+        # Unicode code points for them. &#45; references a character
+        # common to Windows-1252 and Unicode, and &#9731; references
+        # a character only found in Unicode.
+        #
+        # All of these entities should become Unicode characters.
+        markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
+        soup = self.soup(markup)
+        self.assertEquals(u"“Hello” -☃", soup.p.string)
         
     def test_entities_in_attributes_converted_to_unicode(self):
         expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index a05870b..23cbaef 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertSoupEquals(
             "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
 
+    def test_entities_in_foreign_document_encoding(self):
+        # Overrides the shared test of the same name: lxml can't pass
+        # it, because by the time we hear about markup like "&#147;",
+        # it's been (incorrectly) converted into a string like u'\x93'
+        pass
+        
     # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
     # test if an old version of lxml is installed.
 
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index e5cc47e..e5dcfa7 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1206,7 +1206,7 @@ class TestElementObjects(SoupTest):
             tag = soup.bTag
         self.assertEqual(soup.b, tag)
         self.assertEqual(
-            '.bTag is deprecated, use .find("b") instead.',
+            '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
             str(w[0].message))
 
     def test_has_attr(self):
-- 
cgit v1.2.1