From f361aacefa877c8b431ace557b27898b3a12568d Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 6 May 2017 13:23:18 -0400 Subject: It's now possible to use a tag's namespace prefix when searching, e.g. soup.find('namespace:tag') [bug=1655332] --- NEWS.txt | 5 +++++ bs4/element.py | 61 +++++++++++++++++++++++++++++++++++++++++--------- bs4/testing.py | 24 ++++++++++++++++++++ bs4/tests/test_tree.py | 2 ++ 4 files changed, 82 insertions(+), 10 deletions(-) diff --git a/NEWS.txt b/NEWS.txt index e88dd68..f885e4e 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,3 +1,8 @@ += 4.5.4 (Unreleased) = + +* It's now possible to use a tag's namespace prefix when searching for + it, e.g. soup.find('namespace:tag') [bug=1655332] + = 4.5.3 (20170102) = * Fixed foster parenting when html5lib is the tree builder. Thanks to diff --git a/bs4/element.py b/bs4/element.py index 9284d11..115ab24 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -535,9 +535,16 @@ class PageElement(object): return ResultSet(strainer, result) elif isinstance(name, basestring): # Optimization to find all tags with a given name. + if name.count(':') == 1: + # This is a name with a prefix. + prefix, name = name.split(':', 1) + else: + prefix = None result = (element for element in generator if isinstance(element, Tag) - and element.name == name) + and element.name == name + and (prefix is None or element.prefix == prefix) + ) return ResultSet(strainer, result) results = ResultSet(strainer) while True: @@ -1698,7 +1705,7 @@ class SoupStrainer(object): "I don't know how to match against a %s" % markup.__class__) return found - def _matches(self, markup, match_against): + def _matches(self, markup, match_against, already_tried=None): # print u"Matching %s against %s" % (markup, match_against) result = False if isinstance(markup, list) or isinstance(markup, tuple): @@ -1713,7 +1720,7 @@ class SoupStrainer(object): if self._matches(' '.join(markup), match_against): return True return False - + if match_against is True: # True matches any non-None value. return markup is not None @@ -1723,6 +1730,7 @@ class SoupStrainer(object): # Custom callables take the tag as an argument, but all # other ways of matching match the tag name as a string. + original_markup = markup if isinstance(markup, Tag): markup = markup.name @@ -1733,18 +1741,51 @@ class SoupStrainer(object): # None matches None, False, an empty string, an empty list, and so on. return not match_against - if isinstance(match_against, unicode): + if (hasattr(match_against, '__iter__') + and not isinstance(match_against, basestring)): + # We're asked to match against an iterable of items. + # The markup must be match at least one item in the + # iterable. We'll try each one in turn. + # + # To avoid infinite recursion we need to keep track of + # items we've already seen. + if not already_tried: + already_tried = set() + for item in match_against: + if item.__hash__: + key = item + else: + key = id(item) + if key in already_tried: + continue + else: + already_tried.add(key) + if self._matches(original_markup, item, already_tried): + return True + else: + return False + + # Beyond this point we might need to run the test twice: once against + # the tag's name and once against its prefixed name. + match = False + + if not match and isinstance(match_against, unicode): # Exact string match - return markup == match_against + match = markup == match_against - if hasattr(match_against, 'match'): + if not match and hasattr(match_against, 'search'): # Regexp match return match_against.search(markup) - if hasattr(match_against, '__iter__'): - # The markup must be an exact match against something - # in the iterable. - return markup in match_against + if (not match + and isinstance(original_markup, Tag) + and original_markup.prefix): + # Try the whole thing again with the prefixed tag name. + return self._matches( + original_markup.prefix + ':' + original_markup.name, match_against + ) + + return match class ResultSet(list): diff --git a/bs4/testing.py b/bs4/testing.py index 3a6ed42..733cc29 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -669,6 +669,30 @@ class XMLTreeBuilderSmokeTest(object): soup = self.soup(markup) self.assertEqual(unicode(soup.foo), markup) + def test_find_by_prefixed_name(self): + doc = """ +foo + bar + baz + +""" + soup = self.soup(doc) + + # There are three tags. + self.assertEqual(3, len(soup.find_all('tag'))) + + # But two of them are ns1:tag and one of them is ns2:tag. + self.assertEqual(2, len(soup.find_all('ns1:tag'))) + self.assertEqual(1, len(soup.find_all('ns2:tag'))) + + self.assertEqual(1, len(soup.find_all('ns2:tag', key='value'))) + self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag']))) + + + class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index a4fe0b1..354473a 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1,3 +1,4 @@ + # -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. @@ -234,6 +235,7 @@ class TestFindAllByName(TreeTest): self.assertEqual('1', r3.string) self.assertEqual('3', r4.string) + class TestFindAllByAttribute(TreeTest): def test_find_all_by_attribute_name(self): -- cgit v1.2.1