Stopped HTMLParser from raising an exception in very rare cases of
bad markup. [bug=1708831]
author: Leonard Richardson <leonardr@segfault.org> 2018-07-14 15:35:24 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-14 15:35:24 -0400
commit: 670ecfdeb3b315cf05777e6bd361539a21604907 (patch)
tree: a6e66959686d963952a4110deff53319974ef491
parent: d48fd72468023868ade770abe8ea824bff7df4cc (diff)
download: beautifulsoup4-670ecfdeb3b315cf05777e6bd361539a21604907.tar.gz
4 files changed, 31 insertions, 3 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 2437e83..909de65 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -8,6 +8,9 @@
 * Fixed a Windows crash in diagnose() when checking whether a long
   markup string is a filename. [bug=1737121]
 
+* Stopped HTMLParser from raising an exception in very rare cases of
+  bad markup. [bug=1708831]
+
 = 4.6.0 (20170507) =
 
 * Added the `Tag.get_attribute_list` method, which acts like `Tag.get` for
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 67890b3..71604c5 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -64,7 +64,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
         # order. It's a list of closing tags we've already handled and
         # will ignore, assuming they ever show up.
         self.already_closed_empty_element = []
-    
+
+    def error(self, msg):
+        """In Python 3, HTMLParser subclasses must implement error(), although this
+        requirement doesn't appear to be documented.
+
+        In Python 2, HTMLParser implements error() as raising an exception.
+
+        In any event, this method is called only on very strange markup and our best strategy
+        is to pretend it didn't happen and keep going.
+        """
+        warnings.warn(msg)
+        
     def handle_startendtag(self, name, attrs):
         # This is only called when the markup looks like
         # <tag/>.
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index d2ca287..3439271 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -5,9 +5,13 @@ __all__ = [
     'LXMLTreeBuilder',
     ]
 
+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError , e:
+    from collections import Callable
+
 from io import BytesIO
 from StringIO import StringIO
-import collections
 from lxml import etree
 from bs4.element import (
     Comment,
@@ -58,7 +62,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         # Use the default parser.
         parser = self.default_parser(encoding)
 
-        if isinstance(parser, collections.Callable):
+        if isinstance(parser, Callable):
             # Instantiate the parser with default arguments
             parser = parser(target=self, strip_cdata=False, encoding=encoding)
         return parser
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index d5cf025..c13d59f 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -5,6 +5,7 @@ from pdb import set_trace
 import pickle
 from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
 from bs4.builder import HTMLParserTreeBuilder
+from bs4.builder._htmlparser import BeautifulSoupHTMLParser
 
 class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
 
@@ -32,3 +33,12 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
     def test_redundant_empty_element_closing_tags(self):
         self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
         self.assertSoupEquals('</br></br></br>', "")
+
+
+class TestHTMLParserSubclass(SoupTest):
+    def test_error(self):
+        """Verify that our HTMLParser subclass implements error() in a way
+        that doesn't cause a crash.
+        """
+        parser = BeautifulSoupHTMLParser()
+        parser.error("don't crash")
author	Leonard Richardson <leonardr@segfault.org>	2018-07-14 15:35:24 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2018-07-14 15:35:24 -0400
commit	670ecfdeb3b315cf05777e6bd361539a21604907 (patch)
tree	a6e66959686d963952a4110deff53319974ef491
parent	d48fd72468023868ade770abe8ea824bff7df4cc (diff)
download	beautifulsoup4-670ecfdeb3b315cf05777e6bd361539a21604907.tar.gz