Introduced the Formatter system. [bug=1716272].

author: Leonard Richardson <leonardr@segfault.org> 2018-07-15 19:50:15 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-15 19:50:15 -0400
commit: 999a1ad671036ccbb4704d402dff624083fbee90 (patch)
tree: dbbedfcbb0590ccab3098f52c0c5f6ec25991d25
parent: db0ef1662efba41a111861d652a248385f7baac9 (diff)
download: beautifulsoup4-999a1ad671036ccbb4704d402dff624083fbee90.tar.gz
5 files changed, 95 insertions, 53 deletions
diff --git a/NEWS.txt b/NEWS.txt
index e22e88f..45a6952 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -14,10 +14,12 @@
 * Stopped HTMLParser from raising an exception in very rare cases of
   bad markup. [bug=1708831]
 
-* It's possible for a TreeBuilder subclass to specify that void
-  elements should be represented as "<element>" rather than
-  "<element/>", by setting TreeBuilder.void_element_close_prefix to
-  the empty string. [bug=1716272]
+* Added a new formatter, "html5", which represents void elements
+  as "<element>" rather than "<element/>".  [bug=1716272]
+
+* You can get finer control over formatting by subclassing
+  bs4.element.Formatter and passing a Formatter instance into (e.g.)
+  encode(). [bug=1716272]
 
 = 4.6.0 (20170507) =
 
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index d7166bb..21454e6 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -93,14 +93,6 @@ class TreeBuilder(object):
     preserve_whitespace_tags = set()
     empty_element_tags = None # A tag will be considered an empty-element
                               # tag when and only when it has no contents.
-    void_tags = None # There are no void tags.
-
-    # This string goes just before the end of the start tag for an
-    # void element.
-    #
-    # Leave this alone and you'll get tags like "<br/>". Change it to the
-    # empty string and you'll get tags like "<br>".
-    void_element_close_prefix = '/'
     
     # A value for these tag/attribute combinations is a space- or
     # comma-separated list of CDATA, rather than a single CDATA.
@@ -133,17 +125,6 @@ class TreeBuilder(object):
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags
-
-    def is_void(self, tag_name):
-        """Must a tag with this name be a void tag?
-
-        A void tag cannot have contents and is presented with neither
-        a a closing tag or a closing slash, e.g.:
-            <link href="foo">
-        """
-        if self.void_tags is None:
-            return False
-        return tag_name in self.void_tags
         
     def feed(self, markup):
         raise NotImplementedError()
@@ -255,8 +236,8 @@ class HTMLTreeBuilder(TreeBuilder):
         # These are from HTML5.
         'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
         
-        # These are from HTML4, removed in HTML5.
-        'spacer', 'frame'
+        # These are from earlier versions of HTML and are removed in HTML5.
+        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
     ])
     
     # The HTML standard defines these attributes as containing a
diff --git a/bs4/element.py b/bs4/element.py
index 181f135..911b9bc 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -126,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
         return cls._substitute_if_appropriate(
             ns, EntitySubstitution.substitute_xml)
 
+class Formatter(object):
+    """Contains information about how to format a parse tree."""
+    
+    # By default, represent void elements as <tag/> rather than <tag>
+    void_element_close_prefix = '/'
+
+    def substitute_entities(self, *args, **kwargs):
+        """Transform certain characters into named entities."""
+        raise NotImplementedError()
+
+class HTMLFormatter(Formatter):
+    """The default HTML formatter."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+class MinimalHTMLFormatter(Formatter):
+    """A minimal HTML formatter."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
+    
+class HTML5Formatter(HTMLFormatter):
+    """An HTML formatter that omits the slash in a void tag."""
+    void_element_close_prefix = None
+
+class XMLFormatter(Formatter):
+    """Substitute only the essential XML entities."""
+    def substitute(self, *args, **kwargs):
+        return EntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTMLXMLFormatter(Formatter):
+    """Format XML using HTML rules."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+    
 class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
@@ -134,40 +169,49 @@ class PageElement(object):
     # to methods like encode() and prettify():
     #
     # "html" - All Unicode characters with corresponding HTML entities
-    #   are converted to those entities on output. 
-   # "minimal" - Bare ampersands and angle brackets are converted to
+    #   are converted to those entities on output.
+    # "html5" - The same as "html", but empty void tags are represented as
+    #   <tag> rather than <tag/>
+    # "minimal" - Bare ampersands and angle brackets are converted to
     #   XML entities: &amp; &lt; &gt;
     # None - The null formatter. Unicode characters are never
     #   converted to entities.  This is not recommended, but it's
     #   faster than "minimal".
-    # A function - This function will be called on every string that
+    # A callable function - it will be called on every string that needs to undergo entity substitution.
+    # A Formatter instance - Formatter.substitute(string) will be called on every string that
     #  needs to undergo entity substitution.
     #
 
-    # In an HTML document, the default "html" and "minimal" functions
-    # will leave the contents of <script> and <style> tags alone. For
-    # an XML document, all tags will be given the same treatment.
+    # In an HTML document, the default "html", "html5", and "minimal"
+    # functions will leave the contents of <script> and <style> tags
+    # alone. For an XML document, all tags will be given the same
+    # treatment.
 
     HTML_FORMATTERS = {
-        "html" : HTMLAwareEntitySubstitution.substitute_html,
-        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
+        "html" : HTMLFormatter(),
+        "html5" : HTML5Formatter(),
+        "minimal" : MinimalHTMLFormatter(),
         None : None
         }
 
     XML_FORMATTERS = {
-        "html" : EntitySubstitution.substitute_html,
-        "minimal" : EntitySubstitution.substitute_xml,
+        "html" : HTMLXMLFormatter(),
+        "minimal" : XMLFormatter(),
         None : None
         }
 
     def format_string(self, s, formatter='minimal'):
         """Format the given string using the given formatter."""
-        if not callable(formatter):
+        if isinstance(formatter, basestring):
             formatter = self._formatter_for_name(formatter)
         if formatter is None:
             output = s
         else:
-            output = formatter(s)
+            if callable(formatter):
+                # Backwards compatibility -- you used to pass in a formatting method.
+                output = formatter(s)
+            else:
+                output = formatter.substitute(s)
         return output
 
     @property
@@ -197,11 +241,9 @@ class PageElement(object):
     def _formatter_for_name(self, name):
         "Look up a formatter function based on its name and the tree."
         if self._is_xml:
-            return self.XML_FORMATTERS.get(
-                name, EntitySubstitution.substitute_xml)
+            return self.XML_FORMATTERS.get(name, XMLFormatter())
         else:
-            return self.HTML_FORMATTERS.get(
-                name, HTMLAwareEntitySubstitution.substitute_xml)
+            return self.HTML_FORMATTERS.get(name, HTMLFormatter())
 
     def setup(self, parent=None, previous_element=None, next_element=None,
               previous_sibling=None, next_sibling=None):
@@ -871,10 +913,8 @@ class Tag(PageElement):
         if builder is not None:
             builder.set_up_substitutions(self)
             self.can_be_empty_element = builder.can_be_empty_element(name)
-            self.void_element_close_prefix = builder.void_element_close_prefix or ""
         else:
             self.can_be_empty_element = False
-            self.void_element_close_prefix = '/'
             
     parserClass = _alias("parser_class")  # BS3
 
@@ -1142,11 +1182,10 @@ class Tag(PageElement):
            encoding.
         """
 
-        # First off, turn a string formatter into a function. This
+        # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not isinstance(formatter, Formatter) and not callable(formatter):
             formatter = self._formatter_for_name(formatter)
-
         attrs = []
         if self.attrs:
             for key, val in sorted(self.attrs.items()):
@@ -1175,7 +1214,7 @@ class Tag(PageElement):
             prefix = self.prefix + ":"
 
         if self.is_empty_element:
-            close = self.void_element_close_prefix
+            close = formatter.void_element_close_prefix or ''
         else:
             closeTag = '</%s%s>' % (prefix, self.name)
 
@@ -1246,9 +1285,9 @@ class Tag(PageElement):
         :param formatter: The output formatter responsible for converting
            entities to Unicode characters.
         """
-        # First off, turn a string formatter into a function. This
+        # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not isinstance(formatter, Formatter) and not callable(formatter):
             formatter = self._formatter_for_name(formatter)
 
         pretty_print = (indent_level is not None)
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index e75cf1d..e8903e3 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1439,13 +1439,21 @@ class TestSubstitutions(SoupTest):
                 u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
 
     def test_formatter_html(self):
-        markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        markup = u"<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
         soup = self.soup(markup)
         decoded = soup.decode(formatter="html")
         self.assertEqual(
             decoded,
-            self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+            self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
 
+    def test_formatter_html5(self):
+        markup = u"<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="html5")
+        self.assertEqual(
+            decoded,
+            self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+        
     def test_formatter_minimal(self):
         markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
         soup = self.soup(markup)
@@ -1518,7 +1526,7 @@ class TestSubstitutions(SoupTest):
             u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>',
             soup.div.prettify())
 
-    def test_prettify_accepts_formatter(self):
+    def test_prettify_accepts_formatter_function(self):
         soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
         pretty = soup.prettify(formatter = lambda x: x.upper())
         self.assertTrue("FOO" in pretty)
diff --git a/doc/source/index.rst b/doc/source/index.rst
index e1b73aa..cc816a0 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2145,7 +2145,7 @@ invalid HTML or XML::
 
 You can change this behavior by providing a value for the
 ``formatter`` argument to ``prettify()``, ``encode()``, or
-``decode()``. Beautiful Soup recognizes four possible values for
+``decode()``. Beautiful Soup recognizes six possible values for
 ``formatter``.
 
 The default is ``formatter="minimal"``. Strings will only be processed
@@ -2174,6 +2174,18 @@ Unicode characters to HTML entities whenever possible::
  #  </body>
  # </html>
 
+If you pass in ``formatter="html5"``, it's the same as
+``formatter="html"``, but Beautiful Soup will
+omit the closing slash in HTML void tags like "br"::
+
+ soup = BeautifulSoup("<br>")
+ 
+ print(soup.encode(formatter="html"))
+ # <html><body><br/></body></html>
+ 
+ print(soup.encode(formatter="html5"))
+ # <html><body><br></body></html>
+ 
 If you pass in ``formatter=None``, Beautiful Soup will not modify
 strings at all on output. This is the fastest option, but it may lead
 to Beautiful Soup generating invalid HTML/XML, as in these examples::
author	Leonard Richardson <leonardr@segfault.org>	2018-07-15 19:50:15 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2018-07-15 19:50:15 -0400
commit	999a1ad671036ccbb4704d402dff624083fbee90 (patch)
tree	dbbedfcbb0590ccab3098f52c0c5f6ec25991d25
parent	db0ef1662efba41a111861d652a248385f7baac9 (diff)
download	beautifulsoup4-999a1ad671036ccbb4704d402dff624083fbee90.tar.gz