diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-07-18 22:22:19 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-07-18 22:22:19 -0400 |
commit | 9e8cd98aa68fcd539c8039dc3ef1b09e8278a551 (patch) | |
tree | f62072546f28fce4a10d4b4bc78cf63af351a218 | |
parent | 668626385ccad937dc62a2d6c98988ede64778ca (diff) | |
download | beautifulsoup4-9e8cd98aa68fcd539c8039dc3ef1b09e8278a551.tar.gz |
Fixed a bug where find_all() was not working when asked to find a
tag with a namespaced name in an XML document that was parsed as
HTML. [bug=1723783]
-rw-r--r-- | NEWS.txt | 8 | ||||
-rw-r--r-- | bs4/element.py | 15 | ||||
-rw-r--r-- | bs4/testing.py | 12 |
3 files changed, 27 insertions, 8 deletions
@@ -6,6 +6,9 @@ * Preserve XML namespaces introduced inside an XML document, not just the ones introduced at the top level. [bug=1718787] +* Added a new formatter, "html5", which represents void elements + elements as "<element>" rather than "<element/>". [bug=1716272] + * Improved the warning given when no parser is specified. [bug=1780571] * Fixed code that was causing deprecation warnings in recent Python 3 @@ -17,8 +20,9 @@ * Stopped HTMLParser from raising an exception in very rare cases of bad markup. [bug=1708831] -* Added a new formatter, "html5", which represents void elements - elements as "<element>" rather than "<element/>". [bug=1716272] +* Fixed a bug where find_all() was not working when asked to find a + tag with a namespaced name in an XML document that was parsed as + HTML. [bug=1723783] * You can get finer control over formatting by subclassing bs4.element.Formatter and passing a Formatter instance into (e.g.) diff --git a/bs4/element.py b/bs4/element.py index 911b9bc..f010833 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -589,14 +589,21 @@ class PageElement(object): elif isinstance(name, basestring): # Optimization to find all tags with a given name. if name.count(':') == 1: - # This is a name with a prefix. - prefix, name = name.split(':', 1) + # This is a name with a prefix. If this is a namespace-aware document, + # we need to match the local name against tag.name. If not, + # we need to match the fully-qualified name against tag.name. + prefix, local_name = name.split(':', 1) else: prefix = None + local_name = name result = (element for element in generator if isinstance(element, Tag) - and element.name == name - and (prefix is None or element.prefix == prefix) + and ( + element.name == name + ) or ( + element.name == local_name + and (prefix is None or element.prefix == prefix) + ) ) return ResultSet(strainer, result) results = ResultSet(strainer) diff --git a/bs4/testing.py b/bs4/testing.py index 641663c..5b0eb8f 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -150,6 +150,14 @@ class HTMLTreeBuilderSmokeTest(object): soup.encode("utf-8").replace(b"\n", b""), markup.replace(b"\n", b"")) + def test_namespaced_html(self): + """When a namespaced XML document is parsed as HTML it should + be treated as HTML with weird tag names. + """ + markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>""" + soup = self.soup(markup) + self.assertEqual(2, len(soup.find_all("ns1:foo"))) + def test_processing_instruction(self): # We test both Unicode and bytestring to verify that # process_markup correctly sets processing_instruction_class @@ -625,14 +633,14 @@ class XMLTreeBuilderSmokeTest(object): soup.encode("utf-8"), markup) def test_nested_namespaces(self): - doc = """<?xml version="1.0" encoding="utf-8"?> + doc = b"""<?xml version="1.0" encoding="utf-8"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> <parent xmlns="http://ns1/"> <child xmlns="http://ns2/" xmlns:ns3="http://ns3/"> <grandchild ns3:attr="value" xmlns="http://ns4/"/> </child> </parent>""" - soup = BeautifulSoup(doc, "lxml-xml") + soup = self.soup(doc) self.assertEqual(doc, soup.encode()) def test_formatter_processes_script_tag_for_xml_documents(self): |