diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-07-14 15:35:24 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-07-14 15:35:24 -0400 |
commit | 670ecfdeb3b315cf05777e6bd361539a21604907 (patch) | |
tree | a6e66959686d963952a4110deff53319974ef491 /bs4 | |
parent | d48fd72468023868ade770abe8ea824bff7df4cc (diff) | |
download | beautifulsoup4-670ecfdeb3b315cf05777e6bd361539a21604907.tar.gz |
Stopped HTMLParser from raising an exception in very rare cases of
bad markup. [bug=1708831]
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/builder/_htmlparser.py | 13 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 8 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 10 |
3 files changed, 28 insertions, 3 deletions
```diff
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 67890b3..71604c5 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -64,7 +64,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
         # order. It's a list of closing tags we've already handled and
         # will ignore, assuming they ever show up.
         self.already_closed_empty_element = []
-
+
+    def error(self, msg):
+        """In Python 3, HTMLParser subclasses must implement error(), although this
+        requirement doesn't appear to be documented.
+
+        In Python 2, HTMLParser implements error() as raising an exception.
+
+        In any event, this method is called only on very strange markup and our best strategy
+        is to pretend it didn't happen and keep going.
+        """
+        warnings.warn(msg)
+
     def handle_startendtag(self, name, attrs):
         # This is only called when the markup looks like
         # <tag/>.
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index d2ca287..3439271 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -5,9 +5,13 @@ __all__ = [
     'LXMLTreeBuilder',
     ]
 
+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError , e:
+    from collections import Callable
+
 from io import BytesIO
 from StringIO import StringIO
-import collections
 from lxml import etree
 from bs4.element import (
     Comment,
@@ -58,7 +62,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         # Use the default parser.
         parser = self.default_parser(encoding)
 
-        if isinstance(parser, collections.Callable):
+        if isinstance(parser, Callable):
             # Instantiate the parser with default arguments
             parser = parser(target=self, strip_cdata=False, encoding=encoding)
         return parser
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index d5cf025..c13d59f 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -5,6 +5,7 @@ from pdb import set_trace
 import pickle
 from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
 from bs4.builder import HTMLParserTreeBuilder
+from bs4.builder._htmlparser import BeautifulSoupHTMLParser
 
 class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
 
@@ -32,3 +33,12 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
     def test_redundant_empty_element_closing_tags(self):
         self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
         self.assertSoupEquals('</br></br></br>', "")
+
+
+class TestHTMLParserSubclass(SoupTest):
+    def test_error(self):
+        """Verify that our HTMLParser subclass implements error() in a way
+        that doesn't cause a crash.
+        """
+        parser = BeautifulSoupHTMLParser()
+        parser.error("don't crash")
```