diff options
-rw-r--r-- | NEWS.txt | 6 | ||||
-rw-r--r-- | bs4/element.py | 11 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 4 | ||||
-rw-r--r-- | doc/source/index.rst | 14 |
4 files changed, 28 insertions, 7 deletions
@@ -1,4 +1,8 @@ -= 4.5.4 (Unreleased) = += 4.6.0 (Unreleased) = + +* Added the `Tag.string_attr` method, which acts like `Tag.get` for + getting the value of an attribute, but which joins attribute + multi-values into a single string value. [bug=1678589] * It's now possible to use a tag's namespace prefix when searching, e.g. soup.find('namespace:tag') [bug=1655332] diff --git a/bs4/element.py b/bs4/element.py index 115ab24..5462592 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -131,8 +131,8 @@ class PageElement(object): # to methods like encode() and prettify(): # # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "minimal" - Bare ampersands and angle brackets are converted to + # are converted to those entities on output. + # "minimal" - Bare ampersands and angle brackets are converted to # XML entities: &amp; &lt; &gt; # None - The null formatter. Unicode characters are never # converted to entities. This is not recommended, but it's @@ -992,6 +992,13 @@ class Tag(PageElement): attribute.""" return self.attrs.get(key, default) + def string_attr(self, key, default=None): + """The same as get(), but converts lists of values to strings.""" + value = self.get(key, default) + if isinstance(value, list): + value = " ".join(value) + return value + def has_attr(self, key): return key in self.attrs diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 354473a..f57255d 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1286,6 +1286,10 @@ class TestCDAtaListAttributes(SoupTest): soup = self.soup("<a class='foo\tbar'>") self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode()) + def test_attribute_values_joined_into_string_through_string_attr(self): + soup = self.soup("<a class='foo\tbar'>") + self.assertEqual(b'foo bar', soup.a.string_attr('class')) + def test_accept_charset(self): soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">') self.assertEqual(['ISO-8859-1', 
'UTF-8'], soup.form['accept-charset']) diff --git a/doc/source/index.rst b/doc/source/index.rst index 56aa7fe..0d13e0a 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -402,13 +402,13 @@ one CSS class). Others include ``rel``, ``rev``, ``accept-charset``, ``headers``, and ``accesskey``. Beautiful Soup presents the value(s) of a multi-valued attribute as a list:: - css_soup = BeautifulSoup('<p class="body strikeout"></p>') - css_soup.p['class'] - # ["body", "strikeout"] - css_soup = BeautifulSoup('<p class="body"></p>') css_soup.p['class'] # ["body"] + + css_soup = BeautifulSoup('<p class="body strikeout"></p>') + css_soup.p['class'] + # ["body", "strikeout"] If an attribute `looks` like it has more than one value, but it's not a multi-valued attribute as defined by any version of the HTML @@ -428,6 +428,12 @@ consolidated:: print(rel_soup.p) # <p>Back to the <a rel="index contents">homepage</a></p> +You can use ``string_attr`` to get the value of any attribute as a +string, whether or not it's a multi-valued attribute:: + + css_soup.p.string_attr('class') + # "body strikeout" + If you parse a document as XML, there are no multi-valued attributes:: xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml') |