diff options
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 59 | ||||
-rw-r--r-- | bs4/testing.py | 7 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 4 |
4 files changed, 67 insertions, 6 deletions
@@ -7,6 +7,9 @@ * It's now possible to use a tag's namespace prefix when searching, e.g. soup.find('namespace:tag') [bug=1655332] +* Improved the handling of empty-element tags like <br> when using the + html.parser parser. [bug=1676935] + * HTML parsers treat all HTML4 and HTML5 empty element tags (aka void element tags) correctly. [bug=1656909] diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 823ca15..67890b3 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -52,7 +52,31 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' class BeautifulSoupHTMLParser(HTMLParser): - def handle_starttag(self, name, attrs): + + def __init__(self, *args, **kwargs): + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + def handle_startendtag(self, name, attrs): + # This is only called when the markup looks like + # <tag/>. + + # is_startend() tells handle_starttag not to close the tag + # just because its name matches a known empty-element tag. We + # know that this is an empty-element tag and we want to call + # handle_endtag ourselves. + tag = self.handle_starttag(name, attrs, handle_empty_element=False) + self.handle_endtag(name) + + def handle_starttag(self, name, attrs, handle_empty_element=True): # XXX namespace attr_dict = {} for key, value in attrs: @@ -62,10 +86,34 @@ class BeautifulSoupHTMLParser(HTMLParser): value = '' attr_dict[key] = value attrvalue = '""' - self.soup.handle_starttag(name, None, None, attr_dict) - - def handle_endtag(self, name): - self.soup.handle_endtag(name) + #print "START", name + tag = self.soup.handle_starttag(name, None, None, attr_dict) + if tag and tag.is_empty_element and handle_empty_element: + # Unlike other parsers, html.parser doesn't send separate end tag + # events for empty-element tags. (It's handled in + # handle_startendtag, but only if the original markup looked like + # <tag/>.) + # + # So we need to call handle_endtag() ourselves. Since we + # know the start event is identical to the end event, we + # don't want handle_endtag() to cross off any previous end + # events for tags of this name. + self.handle_endtag(name, check_already_closed=False) + + # But we might encounter an explicit closing tag for this tag + # later on. If so, we want to ignore it. + self.already_closed_empty_element.append(name) + + def handle_endtag(self, name, check_already_closed=True): + #print "END", name + if check_already_closed and name in self.already_closed_empty_element: + # This is a redundant end tag for an empty-element tag. + # We've already called handle_endtag() for it, so just + # check it off the list. + # print "ALREADY CLOSED", name + self.already_closed_empty_element.remove(name) + else: + self.soup.handle_endtag(name) def handle_data(self, data): self.soup.handle_data(data) @@ -169,6 +217,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e + parser.already_closed_empty_element = [] # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some # 3.2.3 code. This ensures they don't treat markup like <p></p> as a diff --git a/bs4/testing.py b/bs4/testing.py index 9d89de7..40ccac6 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -342,6 +342,13 @@ Hello, world! self.assertEqual("p", soup.p.name) self.assertConnectedness(soup) + def test_empty_element_tags(self): + """Verify consistent handling of empty-element tags, + no matter how they come in through the markup. + """ + self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>") + self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>") + def test_head_tag_between_head_and_body(self): "Prevent recurrence of a bug in the html5lib treebuilder." content = """<html><head></head> diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index b45e35f..d5cf025 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -29,4 +29,6 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): loaded = pickle.loads(dumped) self.assertTrue(isinstance(loaded.builder, type(tree.builder))) - + def test_redundant_empty_element_closing_tags(self): + self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>") + self.assertSoupEquals('</br></br></br>', "") |