Improved the handling of empty-element tags like <br> when using the
html.parser parser. [bug=1676935]
author: Leonard Richardson <leonardr@segfault.org> 2017-05-06 21:31:10 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2017-05-06 21:31:10 -0400
commit: e21aa406ab7c524692e2d462e1f30a4c37b1f0fc (patch)
tree: 409ea0066143454155fb4bd401c66558b9fcafc8
parent: a8a9224b7f97e882cc8ec712323d8b86631e42e9 (diff)
download: beautifulsoup4-e21aa406ab7c524692e2d462e1f30a4c37b1f0fc.tar.gz
4 files changed, 67 insertions, 6 deletions
diff --git a/NEWS.txt b/NEWS.txt
index daeec24..91fe13c 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -7,6 +7,9 @@
 * It's now possible to use a tag's namespace prefix when searching,
   e.g. soup.find('namespace:tag') [bug=1655332]
 
+* Improved the handling of empty-element tags like <br> when using the
+  html.parser parser. [bug=1676935]
+
 * HTML parsers treat all HTML4 and HTML5 empty element tags (aka void
   element tags) correctly. [bug=1656909]
 
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 823ca15..67890b3 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -52,7 +52,31 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'
 
 class BeautifulSoupHTMLParser(HTMLParser):
-    def handle_starttag(self, name, attrs):
+
+    def __init__(self, *args, **kwargs):
+        HTMLParser.__init__(self, *args, **kwargs)
+
+        # Keep a list of empty-element tags that were encountered
+        # without an explicit closing tag. If we encounter a closing tag
+        # of this type, we'll associate it with one of those entries.
+        #
+        # This isn't a stack because we don't care about the
+        # order. It's a list of closing tags we've already handled and
+        # will ignore, assuming they ever show up.
+        self.already_closed_empty_element = []
+    
+    def handle_startendtag(self, name, attrs):
+        # This is only called when the markup looks like
+        # <tag/>.
+
+        # is_startend() tells handle_starttag not to close the tag
+        # just because its name matches a known empty-element tag. We
+        # know that this is an empty-element tag and we want to call
+        # handle_endtag ourselves.
+        tag = self.handle_starttag(name, attrs, handle_empty_element=False)
+        self.handle_endtag(name)
+        
+    def handle_starttag(self, name, attrs, handle_empty_element=True):
         # XXX namespace
         attr_dict = {}
         for key, value in attrs:
@@ -62,10 +86,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
                 value = ''
             attr_dict[key] = value
             attrvalue = '""'
-        self.soup.handle_starttag(name, None, None, attr_dict)
-
-    def handle_endtag(self, name):
-        self.soup.handle_endtag(name)
+        #print "START", name
+        tag = self.soup.handle_starttag(name, None, None, attr_dict)
+        if tag and tag.is_empty_element and handle_empty_element:
+            # Unlike other parsers, html.parser doesn't send separate end tag
+            # events for empty-element tags. (It's handled in
+            # handle_startendtag, but only if the original markup looked like
+            # <tag/>.)
+            #
+            # So we need to call handle_endtag() ourselves. Since we
+            # know the start event is identical to the end event, we
+            # don't want handle_endtag() to cross off any previous end
+            # events for tags of this name.
+            self.handle_endtag(name, check_already_closed=False)
+
+            # But we might encounter an explicit closing tag for this tag
+            # later on. If so, we want to ignore it.
+            self.already_closed_empty_element.append(name)
+            
+    def handle_endtag(self, name, check_already_closed=True):
+        #print "END", name
+        if check_already_closed and name in self.already_closed_empty_element:
+            # This is a redundant end tag for an empty-element tag.
+            # We've already called handle_endtag() for it, so just
+            # check it off the list.
+            # print "ALREADY CLOSED", name
+            self.already_closed_empty_element.remove(name)
+        else:
+            self.soup.handle_endtag(name)
 
     def handle_data(self, data):
         self.soup.handle_data(data)
@@ -169,6 +217,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
             warnings.warn(RuntimeWarning(
                 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
             raise e
+        parser.already_closed_empty_element = []
 
 # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
 # 3.2.3 code. This ensures they don't treat markup like <p></p> as a
diff --git a/bs4/testing.py b/bs4/testing.py
index 9d89de7..40ccac6 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -342,6 +342,13 @@ Hello, world!
         self.assertEqual("p", soup.p.name)
         self.assertConnectedness(soup)
 
+    def test_empty_element_tags(self):
+        """Verify consistent handling of empty-element tags,
+        no matter how they come in through the markup.
+        """
+        self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
+        self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
+        
     def test_head_tag_between_head_and_body(self):
         "Prevent recurrence of a bug in the html5lib treebuilder."
         content = """<html><head></head>
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index b45e35f..d5cf025 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -29,4 +29,6 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         loaded = pickle.loads(dumped)
         self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
 
-
+    def test_redundant_empty_element_closing_tags(self):
+        self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
+        self.assertSoupEquals('</br></br></br>', "")
author	Leonard Richardson <leonardr@segfault.org>	2017-05-06 21:31:10 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2017-05-06 21:31:10 -0400
commit	e21aa406ab7c524692e2d462e1f30a4c37b1f0fc (patch)
tree	409ea0066143454155fb4bd401c66558b9fcafc8
parent	a8a9224b7f97e882cc8ec712323d8b86631e42e9 (diff)
download	beautifulsoup4-e21aa406ab7c524692e2d462e1f30a4c37b1f0fc.tar.gz