summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
commit: 37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (patch)
tree: 30826a9a744be8f2194c469484618a0326d0488e
parent: 81f853622f808fba7cd89d02ec524abc8588f196 (diff)
download: beautifulsoup4-37e4159cb49d2f7c8fdafa0268adca5a1e2017e4.tar.gz
Correctly handle invalid HTML numeric character entities like &#147;
which reference code points that are not Unicode code points. Note that this is only fixed when Beautiful Soup is used with the html.parser parser -- html5lib already worked and I couldn't fix it with lxml. [bug=1782933]
-rw-r--r-- NEWS.txt 6
-rw-r--r-- bs4/builder/_htmlparser.py 26
-rw-r--r-- bs4/testing.py 13
-rw-r--r-- bs4/tests/test_lxml.py 6
-rw-r--r-- bs4/tests/test_tree.py 2
5 files changed, 47 insertions, 6 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 1aa0a42..acdcc04 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -12,6 +12,12 @@
* Fixed a problem where the html.parser tree builder interpreted
a string like "&foo " as the character entity "&foo;" [bug=1728706]
+* Correctly handle invalid HTML numeric character entities like &#147;
+ which reference code points that are not Unicode code points. Note
+ that this is only fixed when Beautiful Soup is used with the
+ html.parser parser -- html5lib already worked and I couldn't fix it
+ with lxml. [bug=1782933]
+
* Improved the warning given when no parser is specified. [bug=1780571]
* Fixed code that was causing deprecation warnings in recent Python 3
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index eff30ff..ee6c685 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be
@@ -140,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
else:
real_name = int(name)
- try:
- data = unichr(real_name)
- except (ValueError, OverflowError), e:
- data = u"\N{REPLACEMENT CHARACTER}"
-
+ data = None
+ if real_name < 256:
+ # HTML numeric entities are supposed to reference Unicode
+ # code points, but sometimes they reference code points in
+ # some other encoding (ahem, Windows-1252). E.g. &#147;
+ # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
+ # code tries to detect this situation and compensate.
+ for encoding in (self.soup.original_encoding, 'windows-1252'):
+ if not encoding:
+ continue
+ try:
+ data = bytearray([real_name]).decode(encoding)
+ except UnicodeDecodeError, e:
+ pass
+ if not data:
+ try:
+ data = unichr(real_name)
+ except (ValueError, OverflowError), e:
+ pass
+ data = data or u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
diff --git a/bs4/testing.py b/bs4/testing.py
index bbcc271..745a9c4 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
"""Helper classes for tests."""
# Use of this source code is governed by a BSD-style license that can be
@@ -326,6 +327,18 @@ Hello, world!
u"<p>&bull; AT&T is in the s&p 500</p>",
u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
)
+
+ def test_entities_in_foreign_document_encoding(self):
+ # &#147; and &#148; are invalid numeric entities referencing
+ # Windows-1252 characters. &#45; references a character common
+ # to Windows-1252 and Unicode, and &#9731; references a
+ # character only found in Unicode.
+ #
+ # All of these entities should be converted to Unicode
+ # characters.
+ markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
+ soup = self.soup(markup)
+ self.assertEquals(u"“Hello” -☃", soup.p.string)
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index a05870b..23cbaef 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
+ def test_entities_in_original_document_encoding(self):
+ # We can't implement this case correctly because by the time we
+ # hear about markup like "&#147;", it's been (incorrectly) converted into
+ # a string like u'\x93'
+ pass
+
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index e5cc47e..e5dcfa7 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1206,7 +1206,7 @@ class TestElementObjects(SoupTest):
tag = soup.bTag
self.assertEqual(soup.b, tag)
self.assertEqual(
- '.bTag is deprecated, use .find("b") instead.',
+ '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
str(w[0].message))
def test_has_attr(self):