Fixed a bug where find_all() was not working when asked to find a
tag with a namespaced name in an XML document that was parsed as
HTML. [bug=1723783]
author: Leonard Richardson <leonardr@segfault.org> 2018-07-18 22:22:19 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-18 22:22:19 -0400
commit: 9e8cd98aa68fcd539c8039dc3ef1b09e8278a551 (patch)
tree: f62072546f28fce4a10d4b4bc78cf63af351a218
parent: 668626385ccad937dc62a2d6c98988ede64778ca (diff)
download: beautifulsoup4-9e8cd98aa68fcd539c8039dc3ef1b09e8278a551.tar.gz
3 files changed, 27 insertions, 8 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 47b2e70..bca2098 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -6,6 +6,9 @@
 * Preserve XML namespaces introduced inside an XML document, not just
    the ones introduced at the top level. [bug=1718787]
 
+* Added a new formatter, "html5", which represents void elements
+  as "<element>" rather than "<element/>".  [bug=1716272]
+
 * Improved the warning given when no parser is specified. [bug=1780571]
 
 * Fixed code that was causing deprecation warnings in recent Python 3
@@ -17,8 +20,9 @@
 * Stopped HTMLParser from raising an exception in very rare cases of
   bad markup. [bug=1708831]
 
-* Added a new formatter, "html5", which represents void elements
-  elements as "<element>" rather than "<element/>".  [bug=1716272]
+* Fixed a bug where find_all() was not working when asked to find a
+  tag with a namespaced name in an XML document that was parsed as
+  HTML. [bug=1723783]
 
 * You can get finer control over formatting by subclassing
   bs4.element.Formatter and passing a Formatter instance into (e.g.)
diff --git a/bs4/element.py b/bs4/element.py
index 911b9bc..f010833 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -589,14 +589,21 @@ class PageElement(object):
             elif isinstance(name, basestring):
                 # Optimization to find all tags with a given name.
                 if name.count(':') == 1:
-                    # This is a name with a prefix.
-                    prefix, name = name.split(':', 1)
+                    # This is a name with a prefix. If this is a namespace-aware document,
+                    # we need to match the local name against tag.name. If not,
+                    # we need to match the fully-qualified name against tag.name.
+                    prefix, local_name = name.split(':', 1)
                 else:
                     prefix = None
+                    local_name = name
                 result = (element for element in generator
                           if isinstance(element, Tag)
-                            and element.name == name
-                          and (prefix is None or element.prefix == prefix)
+                          and (
+                              element.name == name
+                          ) or (
+                              element.name == local_name
+                              and (prefix is None or element.prefix == prefix)
+                          )
                 )
                 return ResultSet(strainer, result)
         results = ResultSet(strainer)
diff --git a/bs4/testing.py b/bs4/testing.py
index 641663c..5b0eb8f 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -150,6 +150,14 @@ class HTMLTreeBuilderSmokeTest(object):
             soup.encode("utf-8").replace(b"\n", b""),
             markup.replace(b"\n", b""))
 
+    def test_namespaced_html(self):
+        """When a namespaced XML document is parsed as HTML it should
+        be treated as HTML with weird tag names.
+        """
+        markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
+        soup = self.soup(markup)
+        self.assertEqual(2, len(soup.find_all("ns1:foo")))
+        
     def test_processing_instruction(self):
         # We test both Unicode and bytestring to verify that
         # process_markup correctly sets processing_instruction_class
@@ -625,14 +633,14 @@ class XMLTreeBuilderSmokeTest(object):
             soup.encode("utf-8"), markup)
 
     def test_nested_namespaces(self):
-        doc = """<?xml version="1.0" encoding="utf-8"?>
+        doc = b"""<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
 <parent xmlns="http://ns1/">
 <child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
 <grandchild ns3:attr="value" xmlns="http://ns4/"/>
 </child>
 </parent>"""
-        soup = BeautifulSoup(doc, "lxml-xml")
+        soup = self.soup(doc)
         self.assertEqual(doc, soup.encode())
         
     def test_formatter_processes_script_tag_for_xml_documents(self):
author	Leonard Richardson <leonardr@segfault.org>	2018-07-18 22:22:19 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2018-07-18 22:22:19 -0400
commit	9e8cd98aa68fcd539c8039dc3ef1b09e8278a551 (patch)
tree	f62072546f28fce4a10d4b4bc78cf63af351a218
parent	668626385ccad937dc62a2d6c98988ede64778ca (diff)
download	beautifulsoup4-9e8cd98aa68fcd539c8039dc3ef1b09e8278a551.tar.gz