diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-07-21 12:18:17 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-07-21 12:18:17 -0400 |
commit | 70210f7ddef2d4a6aa8cb090c6f520a294012417 (patch) | |
tree | ae2ff7d078620dd2886770d22a544f6128dd5723 | |
parent | e8bc86a1e47d8c86a4279ddebdf38af69ffbf494 (diff) | |
download | beautifulsoup4-70210f7ddef2d4a6aa8cb090c6f520a294012417.tar.gz |
Fixed a problem where the html.parser tree builder interpreted
a string like '&foo ' as the character entity '&foo;' [bug=1728706]
-rw-r--r-- | NEWS.txt | 5 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 7 | ||||
-rw-r--r-- | bs4/testing.py | 8 |
3 files changed, 18 insertions, 2 deletions
@@ -7,7 +7,10 @@ the ones introduced at the top level. [bug=1718787] * Added a new formatter, "html5", which represents void elements - elements as "<element>" rather than "<element/>". [bug=1716272] + as "<element>" rather than "<element/>". [bug=1716272] + +* Fixed a problem where the html.parser tree builder interpreted + a string like "&foo " as the character entity "&foo;" [bug=1728706] * Improved the warning given when no parser is specified. [bug=1780571] diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index ef9fd1e..eff30ff 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -152,7 +152,12 @@ class BeautifulSoupHTMLParser(HTMLParser): if character is not None: data = character else: - data = "&%s;" % name + # If this were XML, it would be ambiguous whether "&foo" + # was an character entity reference with a missing + # semicolon or the literal string "&foo". Since this is + # HTML, we have a complete list of all character entity references, + # and this one wasn't found, so assume it's the literal string "&foo". + data = "&%s" % name self.handle_data(data) def handle_comment(self, data): diff --git a/bs4/testing.py b/bs4/testing.py index 5b0eb8f..bbcc271 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -319,6 +319,14 @@ Hello, world! def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>') + def test_strings_resembling_character_entity_references(self): + # "&T" and "&p" look like incomplete character entities, but they are + # not. + self.assertSoupEquals( + u"<p>&bull; AT&T is in the s&p 500</p>", + u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>" + ) + def test_entities_in_attributes_converted_to_unicode(self): expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect) |