Attribute values are now run through the provided output formatter. Previously they were always run through the 'minimal' formatter. [bug=980237]

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-16 10:06:26 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-16 10:06:26 -0400
commit: cbac8023487be12c16995c8dc4f72917413e742c (patch)
tree: 09c0d5342162e3738ab8f2648b4ff115a4c5612a
parent: 7d5a6d8da6382bc4822593beea2b95116f1a59eb (diff)
download: beautifulsoup4-cbac8023487be12c16995c8dc4f72917413e742c.tar.gz
6 files changed, 97 insertions, 49 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 7db893f..ddceae3 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -5,6 +5,12 @@
 * Fixed a bug with the string setter that moved a string around the
   tree instead of copying it. [bug=983050]
 
+* Attribute values are now run through the provided output formatter.
+  Previously they were always run through the 'minimal' formatter. In
+  the future I may make it possible to specify different formatters
+  for attribute values and strings, but for now, consistent behavior
+  is better than inconsistent behavior. [bug=980237]
+
 * Added the missing renderContents method from Beautiful Soup 3. Also
   added an encode_contents() method to go along with decode_contents().
 
diff --git a/TODO.txt b/TODO.txt
index 5ad7e93..8112a5e 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -2,7 +2,7 @@ Optimizations
 -------------
 
 The html5lib tree builder doesn't use the standard tree-building API,
-which worries me.
+which worries me and has resulted in a number of bugs.
 
 markup_attr_map can be optimized since it's always a map now.
 
diff --git a/bs4/dammit.py b/bs4/dammit.py
index a35c213..65fd43d 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -81,58 +81,62 @@ class EntitySubstitution(object):
         return "&%s;" % entity
 
     @classmethod
-    def substitute_xml(cls, value, make_quoted_attribute=False):
-        """Substitute XML entities for special XML characters.
+    def quoted_attribute_value(self, value):
+        """Make a value into a quoted XML attribute, possibly escaping it.
 
-        :param value: A string to be substituted. The less-than sign will
-          become &lt;, the greater-than sign will become &gt;, and any
-          ampersands that are not part of an entity defition will
-          become &amp;.
-
-        :param make_quoted_attribute: If True, then the string will be
-         quoted, as befits an attribute value.
-
-         Ordinarily, the string will be quoted using double quotes.
+         Most strings will be quoted using double quotes.
 
           Bob's Bar -> "Bob's Bar"
 
-         If the string contains double quotes, it will be quoted using
+         If a string contains double quotes, it will be quoted using
          single quotes.
 
           Welcome to "my bar" -> 'Welcome to "my bar"'
 
-         If the string contains both single and double quotes, the
+         If a string contains both single and double quotes, the
          double quotes will be escaped, and the string will be quoted
          using double quotes.
 
           Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
         """
-        if make_quoted_attribute:
-            quote_with = '"'
-            if '"' in value:
-                if "'" in value:
-                    # The string contains both single and double
-                    # quotes.  Turn the double quotes into
-                    # entities. We quote the double quotes rather than
-                    # the single quotes because the entity name is
-                    # "&quot;" whether this is HTML or XML.  If we
-                    # quoted the single quotes, we'd have to decide
-                    # between &apos; and &squot;.
-                    replace_with = "&quot;"
-                    value = value.replace('"', replace_with)
-                else:
-                    # There are double quotes but no single quotes.
-                    # We can use single quotes to quote the attribute.
-                    quote_with = "'"
+        quote_with = '"'
+        if '"' in value:
+            if "'" in value:
+                # The string contains both single and double
+                # quotes.  Turn the double quotes into
+                # entities. We quote the double quotes rather than
+                # the single quotes because the entity name is
+                # "&quot;" whether this is HTML or XML.  If we
+                # quoted the single quotes, we'd have to decide
+                # between &apos; and &squot;.
+                replace_with = "&quot;"
+                value = value.replace('"', replace_with)
+            else:
+                # There are double quotes but no single quotes.
+                # We can use single quotes to quote the attribute.
+                quote_with = "'"
+        return quote_with + value + quote_with
+
+    @classmethod
+    def substitute_xml(cls, value, make_quoted_attribute=False):
+        """Substitute XML entities for special XML characters.
 
+        :param value: A string to be substituted. The less-than sign will
+          become &lt;, the greater-than sign will become &gt;, and any
+          ampersands that are not part of an entity definition will
+          become &amp;.
+
+        :param make_quoted_attribute: If True, then the string will be
+         quoted, as befits an attribute value.
+        """
         # Escape angle brackets, and ampersands that aren't part of
         # entities.
         value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
             cls._substitute_xml_entity, value)
+
         if make_quoted_attribute:
-            return quote_with + value + quote_with
-        else:
-            return value
+            value = cls.quoted_attribute_value(value)
+        return value
 
     @classmethod
     def substitute_html(cls, s):
diff --git a/bs4/element.py b/bs4/element.py
index 496f2ad..684da38 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -57,6 +57,18 @@ class PageElement(object):
         None : None
         }
 
+    @classmethod
+    def format_string(self, s, formatter='minimal'):
+        """Format the given string using the given formatter."""
+        if not callable(formatter):
+            formatter = self.FORMATTERS.get(
+                formatter, EntitySubstitution.substitute_xml)
+        if formatter is None:
+            output = s
+        else:
+            output = formatter(s)
+        return output
+
     def setup(self, parent=None, previous_element=None):
         """Sets up the initial relations between this element and
         other elements."""
@@ -617,14 +629,7 @@ class NavigableString(unicode, PageElement):
                     self.__class__.__name__, attr))
 
     def output_ready(self, formatter="minimal"):
-        if not callable(formatter):
-            formatter = self.FORMATTERS.get(
-                formatter, EntitySubstitution.substitute_xml)
-        if formatter is None:
-            output = self
-        else:
-            output = formatter(self)
-
+        output = self.format_string(self, formatter)
         return self.PREFIX + output + self.SUFFIX
 
 
@@ -950,8 +955,10 @@ class Tag(PageElement):
                         and '%SOUP-ENCODING%' in val):
                         val = self.substitute_encoding(val, eventual_encoding)
 
-                    decoded = (str(key) + '='
-                               + EntitySubstitution.substitute_xml(val, True))
+                    text = self.format_string(val, formatter)
+                    decoded = (
+                        str(key) + '='
+                        + EntitySubstitution.quoted_attribute_value(text))
                 attrs.append(decoded)
         close = ''
         closeTag = ''
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index f4fe451..661decb 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1219,6 +1219,23 @@ class TestSubstitutions(SoupTest):
             decoded,
             self.document_for(u"<b><FOO></b><b>BAR</b>"))
 
+    def test_formatter_is_run_on_attribute_values(self):
+        markup = u'<a href="http://a.com?a=b&c=é">e</a>'
+        soup = self.soup(markup)
+        a = soup.a
+
+        expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'
+
+        self.assertEqual(expect_minimal, a.decode())
+        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
+
+        expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+        self.assertEqual(expect_html, a.decode(formatter="html"))
+
+        self.assertEqual(markup, a.decode(formatter=None))
+        expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
+        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
+
     def test_prettify_accepts_formatter(self):
         soup = BeautifulSoup("<html><body>foo</body></html>")
         pretty = soup.prettify(formatter = lambda x: x.upper())
@@ -1309,7 +1326,7 @@ class TestEncoding(SoupTest):
     def test_encode_contents(self):
         html = u"<b>\N{SNOWMAN}</b>"
         soup = self.soup(html)
-        self.assertEquals(
+        self.assertEqual(
             u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
                 encoding="utf8"))
 
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 1ebcb5c..d4dabb1 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1996,6 +1996,10 @@ invalid HTML or XML::
  soup.p
  # <p>The law firm of Dewey, Cheatem, &amp; Howe</p>
 
+ soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
+ soup.a
+ # <a href="http://example.com/?foo=val1&amp;bar=val2">A link</a>
+
 You can change this behavior by providing a value for the
 ``formatter`` argument to ``prettify()``, ``encode()``, or
 ``decode()``. Beautiful Soup recognizes four possible values for
@@ -2029,7 +2033,7 @@ Unicode characters to HTML entities whenever possible::
 
 If you pass in ``formatter=None``, Beautiful Soup will not modify
 strings at all on output. This is the fastest option, but it may lead
-to Beautiful Soup generating invalid HTML/XML, as in this example::
+to Beautiful Soup generating invalid HTML/XML, as in these examples::
 
  print(soup.prettify(formatter=None))
  # <html>
@@ -2040,11 +2044,16 @@ to Beautiful Soup generating invalid HTML/XML, as in this example::
  #  </body>
  # </html>
 
+ link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
+ print(link_soup.a.encode(formatter=None))
+ # <a href="http://example.com/?foo=val1&bar=val2">A link</a>
+
 
 Finally, if you pass in a function for ``formatter``, Beautiful Soup
-will call that function once for every string in the document. You can
-do whatever you want in this function. Here's a formatter that
-converts strings to uppercase and does absolutely nothing else::
+will call that function once for every string and attribute value in
+the document. You can do whatever you want in this function. Here's a
+formatter that converts strings to uppercase and does absolutely
+nothing else::
 
  def uppercase(str):
      return str.upper()
@@ -2058,6 +2067,11 @@ converts strings to uppercase and does absolutely nothing else::
  #  </body>
  # </html>
 
+ print(link_soup.a.prettify(formatter=uppercase))
+ # <a href="HTTP://EXAMPLE.COM/?FOO=VAL1&BAR=VAL2">
+ #  A LINK
+ # </a>
+
 If you're writing your own function, you should know about the
 ``EntitySubstitution`` class in the ``bs4.dammit`` module. This class
 implements Beautiful Soup's standard formatters as class methods: the
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-04-16 10:06:26 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-04-16 10:06:26 -0400
commit	cbac8023487be12c16995c8dc4f72917413e742c (patch)
tree	09c0d5342162e3738ab8f2648b4ff115a4c5612a
parent	7d5a6d8da6382bc4822593beea2b95116f1a59eb (diff)
download	beautifulsoup4-cbac8023487be12c16995c8dc4f72917413e742c.tar.gz