It's now possible to use a tag's namespace prefix when searching,

e.g. soup.find('namespace:tag') [bug=1655332]
author: Leonard Richardson <leonardr@segfault.org> 2017-05-06 13:23:18 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2017-05-06 13:23:18 -0400
commit: f361aacefa877c8b431ace557b27898b3a12568d (patch)
tree: 658ed832e9cbcb38f1af14b056d834188ea9d492
parent: 5b224088cefb7d4e55339a1e238d233b97e256d4 (diff)
download: beautifulsoup4-f361aacefa877c8b431ace557b27898b3a12568d.tar.gz
4 files changed, 82 insertions, 10 deletions
diff --git a/NEWS.txt b/NEWS.txt
index e88dd68..f885e4e 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,8 @@
+= 4.5.4 (Unreleased) =
+
+* It's now possible to use a tag's namespace prefix when searching for
+  it, e.g. soup.find('namespace:tag') [bug=1655332]
+
 = 4.5.3 (20170102) =
 
 * Fixed foster parenting when html5lib is the tree builder. Thanks to
diff --git a/bs4/element.py b/bs4/element.py
index 9284d11..115ab24 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -535,9 +535,16 @@ class PageElement(object):
                 return ResultSet(strainer, result)
             elif isinstance(name, basestring):
                 # Optimization to find all tags with a given name.
+                if name.count(':') == 1:
+                    # This is a name with a prefix.
+                    prefix, name = name.split(':', 1)
+                else:
+                    prefix = None
                 result = (element for element in generator
                           if isinstance(element, Tag)
-                            and element.name == name)
+                            and element.name == name
+                          and (prefix is None or element.prefix == prefix)
+                )
                 return ResultSet(strainer, result)
         results = ResultSet(strainer)
         while True:
@@ -1698,7 +1705,7 @@ class SoupStrainer(object):
                 "I don't know how to match against a %s" % markup.__class__)
         return found
 
-    def _matches(self, markup, match_against):
+    def _matches(self, markup, match_against, already_tried=None):
         # print u"Matching %s against %s" % (markup, match_against)
         result = False
         if isinstance(markup, list) or isinstance(markup, tuple):
@@ -1713,7 +1720,7 @@ class SoupStrainer(object):
             if self._matches(' '.join(markup), match_against):
                 return True
             return False
-
+        
         if match_against is True:
             # True matches any non-None value.
             return markup is not None
@@ -1723,6 +1730,7 @@ class SoupStrainer(object):
 
         # Custom callables take the tag as an argument, but all
         # other ways of matching match the tag name as a string.
+        original_markup = markup
         if isinstance(markup, Tag):
             markup = markup.name
 
@@ -1733,18 +1741,51 @@ class SoupStrainer(object):
             # None matches None, False, an empty string, an empty list, and so on.
             return not match_against
 
-        if isinstance(match_against, unicode):
+        if (hasattr(match_against, '__iter__')
+            and not isinstance(match_against, basestring)):
+            # We're asked to match against an iterable of items.
+            # The markup must match at least one item in the
+            # iterable. We'll try each one in turn.
+            #
+            # To avoid infinite recursion we need to keep track of
+            # items we've already seen.
+            if not already_tried:
+                already_tried = set()
+            for item in match_against:
+                if item.__hash__:
+                    key = item
+                else:
+                    key = id(item)
+                if key in already_tried:
+                    continue
+                else:
+                    already_tried.add(key)
+                    if self._matches(original_markup, item, already_tried):
+                        return True
+            else:
+                return False
+        
+        # Beyond this point we might need to run the test twice: once against
+        # the tag's name and once against its prefixed name.
+        match = False
+        
+        if not match and isinstance(match_against, unicode):
             # Exact string match
-            return markup == match_against
+            match = markup == match_against
 
-        if hasattr(match_against, 'match'):
+        if not match and hasattr(match_against, 'search'):
             # Regexp match
             return match_against.search(markup)
 
-        if hasattr(match_against, '__iter__'):
-            # The markup must be an exact match against something
-            # in the iterable.
-            return markup in match_against
+        if (not match
+            and isinstance(original_markup, Tag)
+            and original_markup.prefix):
+            # Try the whole thing again with the prefixed tag name.
+            return self._matches(
+                original_markup.prefix + ':' + original_markup.name, match_against
+            )
+
+        return match
 
 
 class ResultSet(list):
diff --git a/bs4/testing.py b/bs4/testing.py
index 3a6ed42..733cc29 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -669,6 +669,30 @@ class XMLTreeBuilderSmokeTest(object):
         soup = self.soup(markup)
         self.assertEqual(unicode(soup.foo), markup)
 
+    def test_find_by_prefixed_name(self):
+        doc = """<?xml version="1.0" encoding="utf-8"?>
+<Document xmlns="http://example.com/ns0"
+    xmlns:ns1="http://example.com/ns1"
+    xmlns:ns2="http://example.com/ns2"
+    <ns1:tag>foo</ns1:tag>
+    <ns1:tag>bar</ns1:tag>
+    <ns2:tag key="value">baz</ns2:tag>
+</Document>
+"""
+        soup = self.soup(doc)
+
+        # There are three <tag> tags.
+        self.assertEqual(3, len(soup.find_all('tag')))
+
+        # But two of them are ns1:tag and one of them is ns2:tag.
+        self.assertEqual(2, len(soup.find_all('ns1:tag')))
+        self.assertEqual(1, len(soup.find_all('ns2:tag')))
+        
+        self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
+        self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
+        
+
+        
 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
     """Smoke test for a tree builder that supports HTML5."""
 
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index a4fe0b1..354473a 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1,3 +1,4 @@
+
 # -*- coding: utf-8 -*-
 """Tests for Beautiful Soup's tree traversal methods.
 
@@ -234,6 +235,7 @@ class TestFindAllByName(TreeTest):
         self.assertEqual('1', r3.string)
         self.assertEqual('3', r4.string)
 
+        
 class TestFindAllByAttribute(TreeTest):
 
     def test_find_all_by_attribute_name(self):
author	Leonard Richardson <leonardr@segfault.org>	2017-05-06 13:23:18 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2017-05-06 13:23:18 -0400
commit	f361aacefa877c8b431ace557b27898b3a12568d (patch)
tree	658ed832e9cbcb38f1af14b056d834188ea9d492
parent	5b224088cefb7d4e55339a1e238d233b97e256d4 (diff)
download	beautifulsoup4-f361aacefa877c8b431ace557b27898b3a12568d.tar.gz