summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2017-05-06 21:31:10 -0400
committer Leonard Richardson <leonardr@segfault.org> 2017-05-06 21:31:10 -0400
commit e21aa406ab7c524692e2d462e1f30a4c37b1f0fc (patch)
tree 409ea0066143454155fb4bd401c66558b9fcafc8
parent a8a9224b7f97e882cc8ec712323d8b86631e42e9 (diff)
download beautifulsoup4-e21aa406ab7c524692e2d462e1f30a4c37b1f0fc.tar.gz
Improved the handling of empty-element tags like <br> when using the
html.parser parser. [bug=1676935]
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/builder/_htmlparser.py 59
-rw-r--r-- bs4/testing.py 7
-rw-r--r-- bs4/tests/test_htmlparser.py 4
4 files changed, 67 insertions, 6 deletions
diff --git a/NEWS.txt b/NEWS.txt
index daeec24..91fe13c 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -7,6 +7,9 @@
* It's now possible to use a tag's namespace prefix when searching,
e.g. soup.find('namespace:tag') [bug=1655332]
+* Improved the handling of empty-element tags like <br> when using the
+ html.parser parser. [bug=1676935]
+
* HTML parsers treat all HTML4 and HTML5 empty element tags (aka void
element tags) correctly. [bug=1656909]
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 823ca15..67890b3 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -52,7 +52,31 @@ from bs4.builder import (
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
- def handle_starttag(self, name, attrs):
+
+ def __init__(self, *args, **kwargs):
+ HTMLParser.__init__(self, *args, **kwargs)
+
+ # Keep a list of empty-element tags that were encountered
+ # without an explicit closing tag. If we encounter a closing tag
+ # of this type, we'll associate it with one of those entries.
+ #
+ # This isn't a stack because we don't care about the
+ # order. It's a list of closing tags we've already handled and
+ # will ignore, assuming they ever show up.
+ self.already_closed_empty_element = []
+
+ def handle_startendtag(self, name, attrs):
+ # This is only called when the markup looks like
+ # <tag/>.
+
+ # handle_empty_element=False tells handle_starttag not to close
+ # the tag just because its name matches a known empty-element
+ # tag. We know that this is an empty-element tag and we want to
+ # call handle_endtag ourselves.
+ tag = self.handle_starttag(name, attrs, handle_empty_element=False)
+ self.handle_endtag(name)
+
+ def handle_starttag(self, name, attrs, handle_empty_element=True):
# XXX namespace
attr_dict = {}
for key, value in attrs:
@@ -62,10 +86,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
value = ''
attr_dict[key] = value
attrvalue = '""'
- self.soup.handle_starttag(name, None, None, attr_dict)
-
- def handle_endtag(self, name):
- self.soup.handle_endtag(name)
+ #print "START", name
+ tag = self.soup.handle_starttag(name, None, None, attr_dict)
+ if tag and tag.is_empty_element and handle_empty_element:
+ # Unlike other parsers, html.parser doesn't send separate end tag
+ # events for empty-element tags. (It's handled in
+ # handle_startendtag, but only if the original markup looked like
+ # <tag/>.)
+ #
+ # So we need to call handle_endtag() ourselves. Since we
+ # know the start event is identical to the end event, we
+ # don't want handle_endtag() to cross off any previous end
+ # events for tags of this name.
+ self.handle_endtag(name, check_already_closed=False)
+
+ # But we might encounter an explicit closing tag for this tag
+ # later on. If so, we want to ignore it.
+ self.already_closed_empty_element.append(name)
+
+ def handle_endtag(self, name, check_already_closed=True):
+ #print "END", name
+ if check_already_closed and name in self.already_closed_empty_element:
+ # This is a redundant end tag for an empty-element tag.
+ # We've already called handle_endtag() for it, so just
+ # check it off the list.
+ # print "ALREADY CLOSED", name
+ self.already_closed_empty_element.remove(name)
+ else:
+ self.soup.handle_endtag(name)
def handle_data(self, data):
self.soup.handle_data(data)
@@ -169,6 +217,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
+ parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
diff --git a/bs4/testing.py b/bs4/testing.py
index 9d89de7..40ccac6 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -342,6 +342,13 @@ Hello, world!
self.assertEqual("p", soup.p.name)
self.assertConnectedness(soup)
+ def test_empty_element_tags(self):
+ """Verify consistent handling of empty-element tags,
+ no matter how they come in through the markup.
+ """
+ self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
+ self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
+
def test_head_tag_between_head_and_body(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
content = """<html><head></head>
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index b45e35f..d5cf025 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -29,4 +29,6 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
loaded = pickle.loads(dumped)
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
-
+ def test_redundant_empty_element_closing_tags(self):
+ self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
+ self.assertSoupEquals('</br></br></br>', "")