summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com>, 2012-04-16 10:06:26 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com>, 2012-04-16 10:06:26 -0400
commit: cbac8023487be12c16995c8dc4f72917413e742c (patch)
tree: 09c0d5342162e3738ab8f2648b4ff115a4c5612a
parent: 7d5a6d8da6382bc4822593beea2b95116f1a59eb (diff)
download: beautifulsoup4-cbac8023487be12c16995c8dc4f72917413e742c.tar.gz
Attribute values are now run through the provided output formatter. Previously they were always run through the 'minimal' formatter. [bug=980237]
-rw-r--r--  NEWS.txt                6
-rw-r--r--  TODO.txt                2
-rw-r--r--  bs4/dammit.py          70
-rw-r--r--  bs4/element.py         27
-rw-r--r--  bs4/tests/test_tree.py 19
-rw-r--r--  doc/source/index.rst   22
6 files changed, 97 insertions, 49 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 7db893f..ddceae3 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -5,6 +5,12 @@
* Fixed a bug with the string setter that moved a string around the
tree instead of copying it. [bug=983050]
+* Attribute values are now run through the provided output formatter.
+ Previously they were always run through the 'minimal' formatter. In
+ the future I may make it possible to specify different formatters
+ for attribute values and strings, but for now, consistent behavior
+ is better than inconsistent behavior. [bug=980237]
+
* Added the missing renderContents method from Beautiful Soup 3. Also
added an encode_contents() method to go along with decode_contents().
diff --git a/TODO.txt b/TODO.txt
index 5ad7e93..8112a5e 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -2,7 +2,7 @@ Optimizations
-------------
The html5lib tree builder doesn't use the standard tree-building API,
-which worries me.
+which worries me and has resulted in a number of bugs.
markup_attr_map can be optimized since it's always a map now.
diff --git a/bs4/dammit.py b/bs4/dammit.py
index a35c213..65fd43d 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -81,58 +81,62 @@ class EntitySubstitution(object):
return "&%s;" % entity
@classmethod
- def substitute_xml(cls, value, make_quoted_attribute=False):
- """Substitute XML entities for special XML characters.
+ def quoted_attribute_value(self, value):
+ """Make a value into a quoted XML attribute, possibly escaping it.
- :param value: A string to be substituted. The less-than sign will
- become &lt;, the greater-than sign will become &gt;, and any
- ampersands that are not part of an entity defition will
- become &amp;.
-
- :param make_quoted_attribute: If True, then the string will be
- quoted, as befits an attribute value.
-
- Ordinarily, the string will be quoted using double quotes.
+ Most strings will be quoted using double quotes.
Bob's Bar -> "Bob's Bar"
- If the string contains double quotes, it will be quoted using
+ If a string contains double quotes, it will be quoted using
single quotes.
Welcome to "my bar" -> 'Welcome to "my bar"'
- If the string contains both single and double quotes, the
+ If a string contains both single and double quotes, the
double quotes will be escaped, and the string will be quoted
using double quotes.
Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
"""
- if make_quoted_attribute:
- quote_with = '"'
- if '"' in value:
- if "'" in value:
- # The string contains both single and double
- # quotes. Turn the double quotes into
- # entities. We quote the double quotes rather than
- # the single quotes because the entity name is
- # "&quot;" whether this is HTML or XML. If we
- # quoted the single quotes, we'd have to decide
- # between &apos; and &squot;.
- replace_with = "&quot;"
- value = value.replace('"', replace_with)
- else:
- # There are double quotes but no single quotes.
- # We can use single quotes to quote the attribute.
- quote_with = "'"
+ quote_with = '"'
+ if '"' in value:
+ if "'" in value:
+ # The string contains both single and double
+ # quotes. Turn the double quotes into
+ # entities. We quote the double quotes rather than
+ # the single quotes because the entity name is
+ # "&quot;" whether this is HTML or XML. If we
+ # quoted the single quotes, we'd have to decide
+ # between &apos; and &squot;.
+ replace_with = "&quot;"
+ value = value.replace('"', replace_with)
+ else:
+ # There are double quotes but no single quotes.
+ # We can use single quotes to quote the attribute.
+ quote_with = "'"
+ return quote_with + value + quote_with
+
+ @classmethod
+ def substitute_xml(cls, value, make_quoted_attribute=False):
+ """Substitute XML entities for special XML characters.
+ :param value: A string to be substituted. The less-than sign will
+ become &lt;, the greater-than sign will become &gt;, and any
+ ampersands that are not part of an entity defition will
+ become &amp;.
+
+ :param make_quoted_attribute: If True, then the string will be
+ quoted, as befits an attribute value.
+ """
# Escape angle brackets, and ampersands that aren't part of
# entities.
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
+
if make_quoted_attribute:
- return quote_with + value + quote_with
- else:
- return value
+ value = cls.quoted_attribute_value(value)
+ return value
@classmethod
def substitute_html(cls, s):
diff --git a/bs4/element.py b/bs4/element.py
index 496f2ad..684da38 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -57,6 +57,18 @@ class PageElement(object):
None : None
}
+ @classmethod
+ def format_string(self, s, formatter='minimal'):
+ """Format the given string using the given formatter."""
+ if not callable(formatter):
+ formatter = self.FORMATTERS.get(
+ formatter, EntitySubstitution.substitute_xml)
+ if formatter is None:
+ output = s
+ else:
+ output = formatter(s)
+ return output
+
def setup(self, parent=None, previous_element=None):
"""Sets up the initial relations between this element and
other elements."""
@@ -617,14 +629,7 @@ class NavigableString(unicode, PageElement):
self.__class__.__name__, attr))
def output_ready(self, formatter="minimal"):
- if not callable(formatter):
- formatter = self.FORMATTERS.get(
- formatter, EntitySubstitution.substitute_xml)
- if formatter is None:
- output = self
- else:
- output = formatter(self)
-
+ output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX
@@ -950,8 +955,10 @@ class Tag(PageElement):
and '%SOUP-ENCODING%' in val):
val = self.substitute_encoding(val, eventual_encoding)
- decoded = (str(key) + '='
- + EntitySubstitution.substitute_xml(val, True))
+ text = self.format_string(val, formatter)
+ decoded = (
+ str(key) + '='
+ + EntitySubstitution.quoted_attribute_value(text))
attrs.append(decoded)
close = ''
closeTag = ''
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index f4fe451..661decb 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1219,6 +1219,23 @@ class TestSubstitutions(SoupTest):
decoded,
self.document_for(u"<b><FOO></b><b>BAR</b>"))
+ def test_formatter_is_run_on_attribute_values(self):
+ markup = u'<a href="http://a.com?a=b&c=é">e</a>'
+ soup = self.soup(markup)
+ a = soup.a
+
+ expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'
+
+ self.assertEqual(expect_minimal, a.decode())
+ self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
+
+ expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+ self.assertEqual(expect_html, a.decode(formatter="html"))
+
+ self.assertEqual(markup, a.decode(formatter=None))
+ expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
+ self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
+
def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("<html><body>foo</body></html>")
pretty = soup.prettify(formatter = lambda x: x.upper())
@@ -1309,7 +1326,7 @@ class TestEncoding(SoupTest):
def test_encode_contents(self):
html = u"<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
- self.assertEquals(
+ self.assertEqual(
u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
encoding="utf8"))
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 1ebcb5c..d4dabb1 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1996,6 +1996,10 @@ invalid HTML or XML::
soup.p
# <p>The law firm of Dewey, Cheatem, &amp; Howe</p>
+ soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
+ soup.a
+ # <a href="http://example.com/?foo=val1&amp;bar=val2">A link</a>
+
You can change this behavior by providing a value for the
``formatter`` argument to ``prettify()``, ``encode()``, or
``decode()``. Beautiful Soup recognizes four possible values for
@@ -2029,7 +2033,7 @@ Unicode characters to HTML entities whenever possible::
If you pass in ``formatter=None``, Beautiful Soup will not modify
strings at all on output. This is the fastest option, but it may lead
-to Beautiful Soup generating invalid HTML/XML, as in this example::
+to Beautiful Soup generating invalid HTML/XML, as in these examples::
print(soup.prettify(formatter=None))
# <html>
@@ -2040,11 +2044,16 @@ to Beautiful Soup generating invalid HTML/XML, as in this example::
# </body>
# </html>
+ link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
+ print(link_soup.a.encode(formatter=None))
+ # <a href="http://example.com/?foo=val1&bar=val2">A link</a>
+
Finally, if you pass in a function for ``formatter``, Beautiful Soup
-will call that function once for every string in the document. You can
-do whatever you want in this function. Here's a formatter that
-converts strings to uppercase and does absolutely nothing else::
+will call that function once for every string and attribute value in
+the document. You can do whatever you want in this function. Here's a
+formatter that converts strings to uppercase and does absolutely
+nothing else::
def uppercase(str):
return str.upper()
@@ -2058,6 +2067,11 @@ converts strings to uppercase and does absolutely nothing else::
# </body>
# </html>
+ print(link_soup.a.prettify(formatter=uppercase))
+ # <a href="HTTP://EXAMPLE.COM/?FOO=VAL1&BAR=VAL2">
+ # A LINK
+ # </a>
+
If you're writing your own function, you should know about the
``EntitySubstitution`` class in the ``bs4.dammit`` module. This class
implements Beautiful Soup's standard formatters as class methods: the