diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-16 10:06:26 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-16 10:06:26 -0400 |
commit | cbac8023487be12c16995c8dc4f72917413e742c (patch) | |
tree | 09c0d5342162e3738ab8f2648b4ff115a4c5612a | |
parent | 7d5a6d8da6382bc4822593beea2b95116f1a59eb (diff) | |
download | beautifulsoup4-cbac8023487be12c16995c8dc4f72917413e742c.tar.gz |
Attribute values are now run through the provided output formatter. Previously they were always run through the 'minimal' formatter. [bug=980237]
-rw-r--r-- | NEWS.txt | 6 | ||||
-rw-r--r-- | TODO.txt | 2 | ||||
-rw-r--r-- | bs4/dammit.py | 70 | ||||
-rw-r--r-- | bs4/element.py | 27 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 19 | ||||
-rw-r--r-- | doc/source/index.rst | 22 |
6 files changed, 97 insertions, 49 deletions
@@ -5,6 +5,12 @@ * Fixed a bug with the string setter that moved a string around the tree instead of copying it. [bug=983050] +* Attribute values are now run through the provided output formatter. + Previously they were always run through the 'minimal' formatter. In + the future I may make it possible to specify different formatters + for attribute values and strings, but for now, consistent behavior + is better than inconsistent behavior. [bug=980237] + * Added the missing renderContents method from Beautiful Soup 3. Also added an encode_contents() method to go along with decode_contents(). @@ -2,7 +2,7 @@ Optimizations ------------- The html5lib tree builder doesn't use the standard tree-building API, -which worries me. +which worries me and has resulted in a number of bugs. markup_attr_map can be optimized since it's always a map now. diff --git a/bs4/dammit.py b/bs4/dammit.py index a35c213..65fd43d 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -81,58 +81,62 @@ class EntitySubstitution(object): return "&%s;" % entity @classmethod - def substitute_xml(cls, value, make_quoted_attribute=False): - """Substitute XML entities for special XML characters. + def quoted_attribute_value(self, value): + """Make a value into a quoted XML attribute, possibly escaping it. - :param value: A string to be substituted. The less-than sign will - become &lt;, the greater-than sign will become &gt;, and any - ampersands that are not part of an entity defition will - become &amp;. - - :param make_quoted_attribute: If True, then the string will be - quoted, as befits an attribute value. - - Ordinarily, the string will be quoted using double quotes. + Most strings will be quoted using double quotes. Bob's Bar -> "Bob's Bar" - If the string contains double quotes, it will be quoted using + If a string contains double quotes, it will be quoted using single quotes. 
Welcome to "my bar" -> 'Welcome to "my bar"' - If the string contains both single and double quotes, the + If a string contains both single and double quotes, the double quotes will be escaped, and the string will be quoted using double quotes. Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot; """ - if make_quoted_attribute: - quote_with = '"' - if '"' in value: - if "'" in value: - # The string contains both single and double - # quotes. Turn the double quotes into - # entities. We quote the double quotes rather than - # the single quotes because the entity name is - # "&quot;" whether this is HTML or XML. If we - # quoted the single quotes, we'd have to decide - # between &apos; and &squot;. - replace_with = "&quot;" - value = value.replace('"', replace_with) - else: - # There are double quotes but no single quotes. - # We can use single quotes to quote the attribute. - quote_with = "'" + quote_with = '"' + if '"' in value: + if "'" in value: + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # "&quot;" whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between &apos; and &squot;. + replace_with = "&quot;" + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. + # We can use single quotes to quote the attribute. + quote_with = "'" + return quote_with + value + quote_with + + @classmethod + def substitute_xml(cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + :param value: A string to be substituted. The less-than sign will + become &lt;, the greater-than sign will become &gt;, and any + ampersands that are not part of an entity defition will + become &amp;. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. 
+ """ # Escape angle brackets, and ampersands that aren't part of # entities. value = cls.BARE_AMPERSAND_OR_BRACKET.sub( cls._substitute_xml_entity, value) + if make_quoted_attribute: - return quote_with + value + quote_with - else: - return value + value = cls.quoted_attribute_value(value) + return value @classmethod def substitute_html(cls, s): diff --git a/bs4/element.py b/bs4/element.py index 496f2ad..684da38 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -57,6 +57,18 @@ class PageElement(object): None : None } + @classmethod + def format_string(self, s, formatter='minimal'): + """Format the given string using the given formatter.""" + if not callable(formatter): + formatter = self.FORMATTERS.get( + formatter, EntitySubstitution.substitute_xml) + if formatter is None: + output = s + else: + output = formatter(s) + return output + def setup(self, parent=None, previous_element=None): """Sets up the initial relations between this element and other elements.""" @@ -617,14 +629,7 @@ class NavigableString(unicode, PageElement): self.__class__.__name__, attr)) def output_ready(self, formatter="minimal"): - if not callable(formatter): - formatter = self.FORMATTERS.get( - formatter, EntitySubstitution.substitute_xml) - if formatter is None: - output = self - else: - output = formatter(self) - + output = self.format_string(self, formatter) return self.PREFIX + output + self.SUFFIX @@ -950,8 +955,10 @@ class Tag(PageElement): and '%SOUP-ENCODING%' in val): val = self.substitute_encoding(val, eventual_encoding) - decoded = (str(key) + '=' - + EntitySubstitution.substitute_xml(val, True)) + text = self.format_string(val, formatter) + decoded = ( + str(key) + '=' + + EntitySubstitution.quoted_attribute_value(text)) attrs.append(decoded) close = '' closeTag = '' diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index f4fe451..661decb 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1219,6 +1219,23 @@ class TestSubstitutions(SoupTest): 
decoded, self.document_for(u"<b><FOO></b><b>BAR</b>")) + def test_formatter_is_run_on_attribute_values(self): + markup = u'<a href="http://a.com?a=b&c=é">e</a>' + soup = self.soup(markup) + a = soup.a + + expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>' + + self.assertEqual(expect_minimal, a.decode()) + self.assertEqual(expect_minimal, a.decode(formatter="minimal")) + + expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>' + self.assertEqual(expect_html, a.decode(formatter="html")) + + self.assertEqual(markup, a.decode(formatter=None)) + expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' + self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) + def test_prettify_accepts_formatter(self): soup = BeautifulSoup("<html><body>foo</body></html>") pretty = soup.prettify(formatter = lambda x: x.upper()) @@ -1309,7 +1326,7 @@ class TestEncoding(SoupTest): def test_encode_contents(self): html = u"<b>\N{SNOWMAN}</b>" soup = self.soup(html) - self.assertEquals( + self.assertEqual( u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( encoding="utf8")) diff --git a/doc/source/index.rst b/doc/source/index.rst index 1ebcb5c..d4dabb1 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1996,6 +1996,10 @@ invalid HTML or XML:: soup.p # <p>The law firm of Dewey, Cheatem, &amp; Howe</p> + soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>') + soup.a + # <a href="http://example.com/?foo=val1&amp;bar=val2">A link</a> + You can change this behavior by providing a value for the ``formatter`` argument to ``prettify()``, ``encode()``, or ``decode()``. Beautiful Soup recognizes four possible values for @@ -2029,7 +2033,7 @@ Unicode characters to HTML entities whenever possible:: If you pass in ``formatter=None``, Beautiful Soup will not modify strings at all on output. 
This is the fastest option, but it may lead -to Beautiful Soup generating invalid HTML/XML, as in this example:: +to Beautiful Soup generating invalid HTML/XML, as in these examples:: print(soup.prettify(formatter=None)) # <html> @@ -2040,11 +2044,16 @@ to Beautiful Soup generating invalid HTML/XML, as in this example:: # </body> # </html> + link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>') + print(link_soup.a.encode(formatter=None)) + # <a href="http://example.com/?foo=val1&bar=val2">A link</a> + Finally, if you pass in a function for ``formatter``, Beautiful Soup -will call that function once for every string in the document. You can -do whatever you want in this function. Here's a formatter that -converts strings to uppercase and does absolutely nothing else:: +will call that function once for every string and attribute value in +the document. You can do whatever you want in this function. Here's a +formatter that converts strings to uppercase and does absolutely +nothing else:: def uppercase(str): return str.upper() @@ -2058,6 +2067,11 @@ converts strings to uppercase and does absolutely nothing else:: # </body> # </html> + print(link_soup.a.prettify(formatter=uppercase)) + # <a href="HTTP://EXAMPLE.COM/?FOO=VAL1&BAR=VAL2"> + # A LINK + # </a> + If you're writing your own function, you should know about the ``EntitySubstitution`` class in the ``bs4.dammit`` module. This class implements Beautiful Soup's standard formatters as class methods: the |