Fixed a problem where the html.parser tree builder interpreted

a string like '&foo ' as the character entity '&foo;' [bug=1728706]
author: Leonard Richardson <leonardr@segfault.org> 2018-07-21 12:18:17 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-21 12:18:17 -0400
commit: 70210f7ddef2d4a6aa8cb090c6f520a294012417 (patch)
tree: ae2ff7d078620dd2886770d22a544f6128dd5723
parent: e8bc86a1e47d8c86a4279ddebdf38af69ffbf494 (diff)
download: beautifulsoup4-70210f7ddef2d4a6aa8cb090c6f520a294012417.tar.gz
3 files changed, 18 insertions, 2 deletions
diff --git a/NEWS.txt b/NEWS.txt
index bca2098..ecaccad 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -7,7 +7,10 @@
    the ones introduced at the top level. [bug=1718787]
 
 * Added a new formatter, "html5", which represents void elements
-  elements as "<element>" rather than "<element/>".  [bug=1716272]
+   as "<element>" rather than "<element/>".  [bug=1716272]
+
+* Fixed a problem where the html.parser tree builder interpreted
+  a string like "&foo " as the character entity "&foo;"  [bug=1728706]
 
 * Improved the warning given when no parser is specified. [bug=1780571]
 
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ef9fd1e..eff30ff 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -152,7 +152,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
         if character is not None:
             data = character
         else:
-            data = "&%s;" % name
+            # If this were XML, it would be ambiguous whether "&foo"
+            # was a character entity reference with a missing
+            # semicolon or the literal string "&foo". Since this is
+            # HTML, we have a complete list of all character entity references,
+            # and this one wasn't found, so assume it's the literal string "&foo".
+            data = "&%s" % name
         self.handle_data(data)
 
     def handle_comment(self, data):
diff --git a/bs4/testing.py b/bs4/testing.py
index 5b0eb8f..bbcc271 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -319,6 +319,14 @@ Hello, world!
     def test_angle_brackets_in_attribute_values_are_escaped(self):
         self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
 
+    def test_strings_resembling_character_entity_references(self):
+        # "&T" and "&p" look like incomplete character entities, but they are
+        # not.
+        self.assertSoupEquals(
+            u"<p>&bull; AT&T is in the s&p 500</p>",
+            u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
+        )
+        
     def test_entities_in_attributes_converted_to_unicode(self):
         expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
         self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
author	Leonard Richardson <leonardr@segfault.org>	2018-07-21 12:18:17 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2018-07-21 12:18:17 -0400
commit	70210f7ddef2d4a6aa8cb090c6f520a294012417 (patch)
tree	ae2ff7d078620dd2886770d22a544f6128dd5723
parent	e8bc86a1e47d8c86a4279ddebdf38af69ffbf494 (diff)
download	beautifulsoup4-70210f7ddef2d4a6aa8cb090c6f520a294012417.tar.gz