diff options
-rw-r--r-- | NEWS.txt | 10 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 23 | ||||
-rw-r--r-- | bs4/element.py | 87 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 14 | ||||
-rw-r--r-- | doc/source/index.rst | 14 |
5 files changed, 95 insertions, 53 deletions
@@ -14,10 +14,12 @@ * Stopped HTMLParser from raising an exception in very rare cases of bad markup. [bug=1708831] -* It's possible for a TreeBuilder subclass to specify that void - elements should be represented as "<element>" rather than - "<element/>", by setting TreeBuilder.void_element_close_prefix to - the empty string. [bug=1716272] +* Added a new formatter, "html5", which represents void + elements as "<element>" rather than "<element/>". [bug=1716272] + +* You can get finer control over formatting by subclassing + bs4.element.Formatter and passing a Formatter instance into (e.g.) + encode(). [bug=1716272] = 4.6.0 (20170507) = diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index d7166bb..21454e6 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -93,14 +93,6 @@ class TreeBuilder(object): preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. - void_tags = None # There are no void tags. - - # This string goes just before the end of the start tag for an - # void element. - # - # Leave this alone and you'll get tags like "<br/>". Change it to the - # empty string and you'll get tags like "<br>". - void_element_close_prefix = '/' # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. @@ -133,17 +125,6 @@ class TreeBuilder(object): if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags - - def is_void(self, tag_name): - """Must a tag with this name be a void tag? - - A void tag cannot have contents and is presented with neither - a a closing tag or a closing slash, e.g.: - <link href="foo"> - """ - if self.void_tags is None: - return False - return tag_name in self.void_tags def feed(self, markup): raise NotImplementedError() @@ -255,8 +236,8 @@ class HTMLTreeBuilder(TreeBuilder): # These are from HTML5. 
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', - # These are from HTML4, removed in HTML5. - 'spacer', 'frame' + # These are from earlier versions of HTML and are removed in HTML5. + 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' ]) # The HTML standard defines these attributes as containing a diff --git a/bs4/element.py b/bs4/element.py index 181f135..911b9bc 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -126,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): return cls._substitute_if_appropriate( ns, EntitySubstitution.substitute_xml) +class Formatter(object): + """Contains information about how to format a parse tree.""" + + # By default, represent void elements as <tag/> rather than <tag> + void_element_close_prefix = '/' + + def substitute_entities(self, *args, **kwargs): + """Transform certain characters into named entities.""" + raise NotImplementedError() + +class HTMLFormatter(Formatter): + """The default HTML formatter.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) + +class MinimalHTMLFormatter(Formatter): + """A minimal HTML formatter.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs) + +class HTML5Formatter(HTMLFormatter): + """An HTML formatter that omits the slash in a void tag.""" + void_element_close_prefix = None + +class XMLFormatter(Formatter): + """Substitute only the essential XML entities.""" + def substitute(self, *args, **kwargs): + return EntitySubstitution.substitute_xml(*args, **kwargs) + +class HTMLXMLFormatter(Formatter): + """Format XML using HTML rules.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) + + class PageElement(object): """Contains the navigational information for some part of the page 
(either a tag or a piece of text)""" @@ -134,40 +169,49 @@ class PageElement(object): # to methods like encode() and prettify(): # # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "minimal" - Bare ampersands and angle brackets are converted to + # are converted to those entities on output. + # "html5" - The same as "html", but empty void tags are represented as + # <tag> rather than <tag/> + # "minimal" - Bare ampersands and angle brackets are converted to # XML entities: & < > # None - The null formatter. Unicode characters are never # converted to entities. This is not recommended, but it's # faster than "minimal". - # A function - This function will be called on every string that + # A callable function - it will be called on every string that needs to undergo entity substitution. + # A Formatter instance - Formatter.substitute(string) will be called on every string that # needs to undergo entity substitution. # - # In an HTML document, the default "html" and "minimal" functions - # will leave the contents of <script> and <style> tags alone. For - # an XML document, all tags will be given the same treatment. + # In an HTML document, the default "html", "html5", and "minimal" + # functions will leave the contents of <script> and <style> tags + # alone. For an XML document, all tags will be given the same + # treatment. 
HTML_FORMATTERS = { - "html" : HTMLAwareEntitySubstitution.substitute_html, - "minimal" : HTMLAwareEntitySubstitution.substitute_xml, + "html" : HTMLFormatter(), + "html5" : HTML5Formatter(), + "minimal" : MinimalHTMLFormatter(), None : None } XML_FORMATTERS = { - "html" : EntitySubstitution.substitute_html, - "minimal" : EntitySubstitution.substitute_xml, + "html" : HTMLXMLFormatter(), + "minimal" : XMLFormatter(), None : None } def format_string(self, s, formatter='minimal'): """Format the given string using the given formatter.""" - if not callable(formatter): + if isinstance(formatter, basestring): formatter = self._formatter_for_name(formatter) if formatter is None: output = s else: - output = formatter(s) + if callable(formatter): + # Backwards compatibility -- you used to pass in a formatting method. + output = formatter(s) + else: + output = formatter.substitute(s) return output @property @@ -197,11 +241,9 @@ class PageElement(object): def _formatter_for_name(self, name): "Look up a formatter function based on its name and the tree." if self._is_xml: - return self.XML_FORMATTERS.get( - name, EntitySubstitution.substitute_xml) + return self.XML_FORMATTERS.get(name, XMLFormatter()) else: - return self.HTML_FORMATTERS.get( - name, HTMLAwareEntitySubstitution.substitute_xml) + return self.HTML_FORMATTERS.get(name, HTMLFormatter()) def setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None): @@ -871,10 +913,8 @@ class Tag(PageElement): if builder is not None: builder.set_up_substitutions(self) self.can_be_empty_element = builder.can_be_empty_element(name) - self.void_element_close_prefix = builder.void_element_close_prefix or "" else: self.can_be_empty_element = False - self.void_element_close_prefix = '/' parserClass = _alias("parser_class") # BS3 @@ -1142,11 +1182,10 @@ class Tag(PageElement): encoding. """ - # First off, turn a string formatter into a function. 
This + # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. - if not callable(formatter): + if not isinstance(formatter, Formatter) and not callable(formatter): formatter = self._formatter_for_name(formatter) - attrs = [] if self.attrs: for key, val in sorted(self.attrs.items()): @@ -1175,7 +1214,7 @@ class Tag(PageElement): prefix = self.prefix + ":" if self.is_empty_element: - close = self.void_element_close_prefix + close = formatter.void_element_close_prefix or '' else: closeTag = '</%s%s>' % (prefix, self.name) @@ -1246,9 +1285,9 @@ class Tag(PageElement): :param formatter: The output formatter responsible for converting entities to Unicode characters. """ - # First off, turn a string formatter into a function. This + # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. - if not callable(formatter): + if not isinstance(formatter, Formatter) and not callable(formatter): formatter = self._formatter_for_name(formatter) pretty_print = (indent_level is not None) diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index e75cf1d..e8903e3 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1439,13 +1439,21 @@ class TestSubstitutions(SoupTest): u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) def test_formatter_html(self): - markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + markup = u"<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" soup = self.soup(markup) decoded = soup.decode(formatter="html") self.assertEqual( decoded, - self.document_for("<b><<Sacré bleu!>></b>")) + self.document_for("<br/><b><<Sacré bleu!>></b>")) + def test_formatter_html5(self): + markup = u"<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html5") + self.assertEqual( + decoded, + 
self.document_for("<br><b><<Sacré bleu!>></b>")) + def test_formatter_minimal(self): markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" soup = self.soup(markup) @@ -1518,7 +1526,7 @@ class TestSubstitutions(SoupTest): u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', soup.div.prettify()) - def test_prettify_accepts_formatter(self): + def test_prettify_accepts_formatter_function(self): soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') pretty = soup.prettify(formatter = lambda x: x.upper()) self.assertTrue("FOO" in pretty) diff --git a/doc/source/index.rst b/doc/source/index.rst index e1b73aa..cc816a0 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2145,7 +2145,7 @@ invalid HTML or XML:: You can change this behavior by providing a value for the ``formatter`` argument to ``prettify()``, ``encode()``, or -``decode()``. Beautiful Soup recognizes four possible values for +``decode()``. Beautiful Soup recognizes six possible values for ``formatter``. The default is ``formatter="minimal"``. Strings will only be processed @@ -2174,6 +2174,18 @@ Unicode characters to HTML entities whenever possible:: # </body> # </html> + If you pass in ``formatter="html5"``, it's the same as +``formatter="html"``, but Beautiful Soup will +omit the closing slash in HTML void tags like "br":: + + soup = BeautifulSoup("<br>") + + print(soup.encode(formatter="html")) + # <html><body><br/></body></html> + + print(soup.encode(formatter="html5")) + # <html><body><br></body></html> + If you pass in ``formatter=None``, Beautiful Soup will not modify strings at all on output. This is the fastest option, but it may lead to Beautiful Soup generating invalid HTML/XML, as in these examples:: |