diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-07-15 08:27:40 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-07-15 08:27:40 -0400 |
commit | b5ffba33327acaf51a2274f9e0a5305b5fb8bdf9 (patch) | |
tree | e627743f2434291a3cb12139c4ae45b8088d8592 | |
parent | b2836d45288e0de1474ecc555f6e3aac51f3168c (diff) | |
download | beautifulsoup4-b5ffba33327acaf51a2274f9e0a5305b5fb8bdf9.tar.gz |
Stop data loss when encountering an empty numeric entity, and
possibly in other cases. Thanks to tos.kamiya for the fix. [bug=1698503]
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 1 | ||||
-rw-r--r-- | bs4/testing.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 5 |
4 files changed, 10 insertions, 1 deletions
@@ -1,5 +1,8 @@
 = Unreleased
 
+* Stop data loss when encountering an empty numeric entity, and
+  possibly in other cases. Thanks to tos.kamiya for the fix. [bug=1698503]
+
 * Improved the warning given when no parser is specified. [bug=1780571]
 
 * Fixed code that was causing deprecation warnings in recent Python 3
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 71604c5..ef9fd1e 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -224,6 +224,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         parser.soup = self.soup
         try:
             parser.feed(markup)
+            parser.close()
         except HTMLParseError, e:
             warnings.warn(RuntimeWarning(
                 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
diff --git a/bs4/testing.py b/bs4/testing.py
index 6ba2506..9d42702 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -334,7 +334,7 @@ Hello, world!
         self.assertSoupEquals("�", expect)
         self.assertSoupEquals("�", expect)
         self.assertSoupEquals("�", expect)
-
+
     def test_multipart_strings(self):
         "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
         soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index c13d59f..0381c7d 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -34,6 +34,11 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
         self.assertSoupEquals('</br></br></br>', "")
 
+    def test_empty_element(self):
+        # This verifies that any buffered data present when the parser
+        # finishes working is handled.
+        self.assertSoupEquals("foo &# bar", "foo &# bar")
+
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):