summary refs log tree commit diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2018-07-18 22:22:19 -0400
committer Leonard Richardson <leonardr@segfault.org> 2018-07-18 22:22:19 -0400
commit 9e8cd98aa68fcd539c8039dc3ef1b09e8278a551 (patch)
tree f62072546f28fce4a10d4b4bc78cf63af351a218
parent 668626385ccad937dc62a2d6c98988ede64778ca (diff)
download beautifulsoup4-9e8cd98aa68fcd539c8039dc3ef1b09e8278a551.tar.gz
Fixed a bug where find_all() was not working when asked to find a
tag with a namespaced name in an XML document that was parsed as HTML. [bug=1723783]
-rw-r--r-- NEWS.txt 8
-rw-r--r-- bs4/element.py 15
-rw-r--r-- bs4/testing.py 12
3 files changed, 27 insertions, 8 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 47b2e70..bca2098 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -6,6 +6,9 @@
* Preserve XML namespaces introduced inside an XML document, not just
the ones introduced at the top level. [bug=1718787]
+* Added a new formatter, "html5", which represents void elements
+ as "<element>" rather than "<element/>". [bug=1716272]
+
* Improved the warning given when no parser is specified. [bug=1780571]
* Fixed code that was causing deprecation warnings in recent Python 3
@@ -17,8 +20,9 @@
* Stopped HTMLParser from raising an exception in very rare cases of
bad markup. [bug=1708831]
-* Added a new formatter, "html5", which represents void elements
- elements as "<element>" rather than "<element/>". [bug=1716272]
+* Fixed a bug where find_all() was not working when asked to find a
+ tag with a namespaced name in an XML document that was parsed as
+ HTML. [bug=1723783]
* You can get finer control over formatting by subclassing
bs4.element.Formatter and passing a Formatter instance into (e.g.)
diff --git a/bs4/element.py b/bs4/element.py
index 911b9bc..f010833 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -589,14 +589,21 @@ class PageElement(object):
elif isinstance(name, basestring):
# Optimization to find all tags with a given name.
if name.count(':') == 1:
- # This is a name with a prefix.
- prefix, name = name.split(':', 1)
+ # This is a name with a prefix. If this is a namespace-aware document,
+ # we need to match the local name against tag.name. If not,
+ # we need to match the fully-qualified name against tag.name.
+ prefix, local_name = name.split(':', 1)
else:
prefix = None
+ local_name = name
result = (element for element in generator
if isinstance(element, Tag)
- and element.name == name
- and (prefix is None or element.prefix == prefix)
+ and (
+ element.name == name
+ ) or (
+ element.name == local_name
+ and (prefix is None or element.prefix == prefix)
+ )
)
return ResultSet(strainer, result)
results = ResultSet(strainer)
diff --git a/bs4/testing.py b/bs4/testing.py
index 641663c..5b0eb8f 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -150,6 +150,14 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
+ def test_namespaced_html(self):
+ """When a namespaced XML document is parsed as HTML it should
+ be treated as HTML with weird tag names.
+ """
+ markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
+ soup = self.soup(markup)
+ self.assertEqual(2, len(soup.find_all("ns1:foo")))
+
def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that
# process_markup correctly sets processing_instruction_class
@@ -625,14 +633,14 @@ class XMLTreeBuilderSmokeTest(object):
soup.encode("utf-8"), markup)
def test_nested_namespaces(self):
- doc = """<?xml version="1.0" encoding="utf-8"?>
+ doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<parent xmlns="http://ns1/">
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
</child>
</parent>"""
- soup = BeautifulSoup(doc, "lxml-xml")
+ soup = self.soup(doc)
self.assertEqual(doc, soup.encode())
def test_formatter_processes_script_tag_for_xml_documents(self):