summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2017-05-06 13:34:52 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2017-05-06 13:34:52 -0400
commit: c8221ac4a36666b775f2f77920b3a5071109d374 (patch)
tree: 776ddc4cc1f8981c1c868d12c334fae8142c8a5c
parent: f361aacefa877c8b431ace557b27898b3a12568d (diff)
download: beautifulsoup4-c8221ac4a36666b775f2f77920b3a5071109d374.tar.gz
HTML parsers treat all HTML4 and HTML5 empty element tags (aka void element tags) correctly. [bug=1656909]
-rw-r--r-- NEWS.txt 7
-rw-r--r-- bs4/builder/__init__.py 9
-rw-r--r-- bs4/testing.py 12
3 files changed, 24 insertions, 4 deletions
diff --git a/NEWS.txt b/NEWS.txt
index f885e4e..0fa568c 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,7 +1,10 @@
= 4.5.4 (Unreleased) =
-* It's now possible to use a tag's namespace prefix when searching for
- it, e.g. soup.find('namespace:tag') [bug=1655332]
+* It's now possible to use a tag's namespace prefix when searching,
+ e.g. soup.find('namespace:tag') [bug=1655332]
+
+* HTML parsers treat all HTML4 and HTML5 empty element tags (aka void
+ element tags) correctly. [bug=1656909]
= 4.5.3 (20170102) =
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 601979b..fdb3362 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -232,8 +232,13 @@ class HTMLTreeBuilder(TreeBuilder):
"""
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
- empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base'])
+ empty_element_tags = set([
+ # These are from HTML5.
+ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+
+ # These are from HTML4, removed in HTML5.
+ 'spacer', 'frame'
+ ])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
diff --git a/bs4/testing.py b/bs4/testing.py
index 733cc29..9d89de7 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -69,6 +69,18 @@ class HTMLTreeBuilderSmokeTest(object):
markup in these tests, there's not much room for interpretation.
"""
+ def test_empty_element_tags(self):
+ """Verify that all HTML4 and HTML5 empty element (aka void element) tags
+ are handled correctly.
+ """
+ for name in [
+ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+ 'spacer', 'frame'
+ ]:
+ soup = self.soup("")
+ new_tag = soup.new_tag(name)
+ self.assertEqual(True, new_tag.is_empty_element)
+
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.