diff options
author | Leonard Richardson <leonardr@segfault.org> | 2017-05-06 13:34:52 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2017-05-06 13:34:52 -0400 |
commit | c8221ac4a36666b775f2f77920b3a5071109d374 (patch) | |
tree | 776ddc4cc1f8981c1c868d12c334fae8142c8a5c | |
parent | f361aacefa877c8b431ace557b27898b3a12568d (diff) | |
download | beautifulsoup4-c8221ac4a36666b775f2f77920b3a5071109d374.tar.gz |
HTML parsers treat all HTML4 and HTML5 empty element tags (aka void element tags) correctly. [bug=1656909]
-rw-r--r-- | NEWS.txt | 7 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 9 | ||||
-rw-r--r-- | bs4/testing.py | 12 |
3 files changed, 24 insertions, 4 deletions
@@ -1,7 +1,10 @@
 = 4.5.4 (Unreleased) =
 
-* It's now possible to use a tag's namespace prefix when searching for
-  it, e.g. soup.find('namespace:tag') [bug=1655332]
+* It's now possible to use a tag's namespace prefix when searching,
+  e.g. soup.find('namespace:tag') [bug=1655332]
+
+* HTML parsers treat all HTML4 and HTML5 empty element tags (aka void
+  element tags) correctly. [bug=1656909]
 
 = 4.5.3 (20170102) =
 
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 601979b..fdb3362 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -232,8 +232,13 @@ class HTMLTreeBuilder(TreeBuilder):
     """
 
     preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
-    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
-                              'spacer', 'link', 'frame', 'base'])
+    empty_element_tags = set([
+        # These are from HTML5.
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+
+        # These are from HTML4, removed in HTML5.
+        'spacer', 'frame'
+    ])
 
     # The HTML standard defines these attributes as containing a
     # space-separated list of values, not a single value. That is,
diff --git a/bs4/testing.py b/bs4/testing.py
index 733cc29..9d89de7 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -69,6 +69,18 @@ class HTMLTreeBuilderSmokeTest(object):
     markup in these tests, there's not much room for interpretation.
     """
 
+    def test_empty_element_tags(self):
+        """Verify that all HTML4 and HTML5 empty element (aka void element) tags
+        are handled correctly.
+        """
+        for name in [
+                'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+                'spacer', 'frame'
+        ]:
+            soup = self.soup("")
+            new_tag = soup.new_tag(name)
+            self.assertEqual(True, new_tag.is_empty_element)
+
     def test_pickle_and_unpickle_identity(self):
         # Pickling a tree, then unpickling it, yields a tree identical
         # to the original.