summary refs log tree commit diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2017-05-06 13:23:18 -0400
committer Leonard Richardson <leonardr@segfault.org> 2017-05-06 13:23:18 -0400
commit f361aacefa877c8b431ace557b27898b3a12568d (patch)
tree 658ed832e9cbcb38f1af14b056d834188ea9d492
parent 5b224088cefb7d4e55339a1e238d233b97e256d4 (diff)
download beautifulsoup4-f361aacefa877c8b431ace557b27898b3a12568d.tar.gz
It's now possible to use a tag's namespace prefix when searching,
e.g. soup.find('namespace:tag') [bug=1655332]
-rw-r--r-- NEWS.txt 5
-rw-r--r-- bs4/element.py 61
-rw-r--r-- bs4/testing.py 24
-rw-r--r-- bs4/tests/test_tree.py 2
4 files changed, 82 insertions, 10 deletions
diff --git a/NEWS.txt b/NEWS.txt
index e88dd68..f885e4e 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,8 @@
+= 4.5.4 (Unreleased) =
+
+* It's now possible to use a tag's namespace prefix when searching for
+ it, e.g. soup.find('namespace:tag') [bug=1655332]
+
= 4.5.3 (20170102) =
* Fixed foster parenting when html5lib is the tree builder. Thanks to
diff --git a/bs4/element.py b/bs4/element.py
index 9284d11..115ab24 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -535,9 +535,16 @@ class PageElement(object):
return ResultSet(strainer, result)
elif isinstance(name, basestring):
# Optimization to find all tags with a given name.
+ if name.count(':') == 1:
+ # This is a name with a prefix.
+ prefix, name = name.split(':', 1)
+ else:
+ prefix = None
result = (element for element in generator
if isinstance(element, Tag)
- and element.name == name)
+ and element.name == name
+ and (prefix is None or element.prefix == prefix)
+ )
return ResultSet(strainer, result)
results = ResultSet(strainer)
while True:
@@ -1698,7 +1705,7 @@ class SoupStrainer(object):
"I don't know how to match against a %s" % markup.__class__)
return found
- def _matches(self, markup, match_against):
+ def _matches(self, markup, match_against, already_tried=None):
# print u"Matching %s against %s" % (markup, match_against)
result = False
if isinstance(markup, list) or isinstance(markup, tuple):
@@ -1713,7 +1720,7 @@ class SoupStrainer(object):
if self._matches(' '.join(markup), match_against):
return True
return False
-
+
if match_against is True:
# True matches any non-None value.
return markup is not None
@@ -1723,6 +1730,7 @@ class SoupStrainer(object):
# Custom callables take the tag as an argument, but all
# other ways of matching match the tag name as a string.
+ original_markup = markup
if isinstance(markup, Tag):
markup = markup.name
@@ -1733,18 +1741,51 @@ class SoupStrainer(object):
# None matches None, False, an empty string, an empty list, and so on.
return not match_against
- if isinstance(match_against, unicode):
+ if (hasattr(match_against, '__iter__')
+ and not isinstance(match_against, basestring)):
+ # We're asked to match against an iterable of items.
+ # The markup must match at least one item in the
+ # iterable. We'll try each one in turn.
+ #
+ # To avoid infinite recursion we need to keep track of
+ # items we've already seen.
+ if not already_tried:
+ already_tried = set()
+ for item in match_against:
+ if item.__hash__:
+ key = item
+ else:
+ key = id(item)
+ if key in already_tried:
+ continue
+ else:
+ already_tried.add(key)
+ if self._matches(original_markup, item, already_tried):
+ return True
+ else:
+ return False
+
+ # Beyond this point we might need to run the test twice: once against
+ # the tag's name and once against its prefixed name.
+ match = False
+
+ if not match and isinstance(match_against, unicode):
# Exact string match
- return markup == match_against
+ match = markup == match_against
- if hasattr(match_against, 'match'):
+ if not match and hasattr(match_against, 'search'):
# Regexp match
return match_against.search(markup)
- if hasattr(match_against, '__iter__'):
- # The markup must be an exact match against something
- # in the iterable.
- return markup in match_against
+ if (not match
+ and isinstance(original_markup, Tag)
+ and original_markup.prefix):
+ # Try the whole thing again with the prefixed tag name.
+ return self._matches(
+ original_markup.prefix + ':' + original_markup.name, match_against
+ )
+
+ return match
class ResultSet(list):
diff --git a/bs4/testing.py b/bs4/testing.py
index 3a6ed42..733cc29 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -669,6 +669,30 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
+ def test_find_by_prefixed_name(self):
+ doc = """<?xml version="1.0" encoding="utf-8"?>
+<Document xmlns="http://example.com/ns0"
+ xmlns:ns1="http://example.com/ns1"
+ xmlns:ns2="http://example.com/ns2"
+ <ns1:tag>foo</ns1:tag>
+ <ns1:tag>bar</ns1:tag>
+ <ns2:tag key="value">baz</ns2:tag>
+</Document>
+"""
+ soup = self.soup(doc)
+
+ # There are three <tag> tags.
+ self.assertEqual(3, len(soup.find_all('tag')))
+
+ # But two of them are ns1:tag and one of them is ns2:tag.
+ self.assertEqual(2, len(soup.find_all('ns1:tag')))
+ self.assertEqual(1, len(soup.find_all('ns2:tag')))
+
+ self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
+ self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
+
+
+
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index a4fe0b1..354473a 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1,3 +1,4 @@
+
# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.
@@ -234,6 +235,7 @@ class TestFindAllByName(TreeTest):
self.assertEqual('1', r3.string)
self.assertEqual('3', r4.string)
+
class TestFindAllByAttribute(TreeTest):
def test_find_all_by_attribute_name(self):