1 files changed, 81 insertions, 29 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 9ef75f8..911b9bc 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -2,7 +2,10 @@
 # found in the LICENSE file.
 __license__ = "MIT"
 
-import collections
+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError , e:
+    from collections import Callable
 import re
 import shlex
 import sys
@@ -12,7 +15,7 @@ from bs4.dammit import EntitySubstitution
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 PY3K = (sys.version_info[0] > 2)
 
-whitespace_re = re.compile("\s+")
+whitespace_re = re.compile(r"\s+")
 
 def _alias(attr):
     """Alias one attribute name to another for backward compatibility"""
@@ -69,7 +72,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
     The value of the 'content' attribute will be one of these objects.
     """
 
-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
 
     def __new__(cls, original_value):
         match = cls.CHARSET_RE.search(original_value)
@@ -123,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
         return cls._substitute_if_appropriate(
             ns, EntitySubstitution.substitute_xml)
 
+class Formatter(object):
+    """Contains information about how to format a parse tree."""
+    
+    # By default, represent void elements as <tag/> rather than <tag>
+    void_element_close_prefix = '/'
+
+    def substitute_entities(self, *args, **kwargs):
+        """Transform certain characters into named entities."""
+        raise NotImplementedError()
+
+class HTMLFormatter(Formatter):
+    """The default HTML formatter."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+class MinimalHTMLFormatter(Formatter):
+    """A minimal HTML formatter."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
+    
+class HTML5Formatter(HTMLFormatter):
+    """An HTML formatter that omits the slash in a void tag."""
+    void_element_close_prefix = None
+
+class XMLFormatter(Formatter):
+    """Substitute only the essential XML entities."""
+    def substitute(self, *args, **kwargs):
+        return EntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTMLXMLFormatter(Formatter):
+    """Format XML using HTML rules."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+    
 class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
@@ -131,40 +169,49 @@ class PageElement(object):
     # to methods like encode() and prettify():
     #
     # "html" - All Unicode characters with corresponding HTML entities
-    #   are converted to those entities on output. 
-   # "minimal" - Bare ampersands and angle brackets are converted to
+    #   are converted to those entities on output.
+    # "html5" - The same as "html", but empty void tags are represented as
+    #   <tag> rather than <tag/>
+    # "minimal" - Bare ampersands and angle brackets are converted to
     #   XML entities: &amp; &lt; &gt;
     # None - The null formatter. Unicode characters are never
     #   converted to entities.  This is not recommended, but it's
     #   faster than "minimal".
-    # A function - This function will be called on every string that
+    # A callable function - it will be called on every string that needs to undergo entity substitution.
+    # A Formatter instance - Formatter.substitute(string) will be called on every string that
     #  needs to undergo entity substitution.
     #
 
-    # In an HTML document, the default "html" and "minimal" functions
-    # will leave the contents of <script> and <style> tags alone. For
-    # an XML document, all tags will be given the same treatment.
+    # In an HTML document, the default "html", "html5", and "minimal"
+    # functions will leave the contents of <script> and <style> tags
+    # alone. For an XML document, all tags will be given the same
+    # treatment.
 
     HTML_FORMATTERS = {
-        "html" : HTMLAwareEntitySubstitution.substitute_html,
-        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
+        "html" : HTMLFormatter(),
+        "html5" : HTML5Formatter(),
+        "minimal" : MinimalHTMLFormatter(),
         None : None
         }
 
     XML_FORMATTERS = {
-        "html" : EntitySubstitution.substitute_html,
-        "minimal" : EntitySubstitution.substitute_xml,
+        "html" : HTMLXMLFormatter(),
+        "minimal" : XMLFormatter(),
         None : None
         }
 
     def format_string(self, s, formatter='minimal'):
         """Format the given string using the given formatter."""
-        if not callable(formatter):
+        if isinstance(formatter, basestring):
             formatter = self._formatter_for_name(formatter)
         if formatter is None:
             output = s
         else:
-            output = formatter(s)
+            if callable(formatter):
+                # Backwards compatibility -- you used to pass in a formatting method.
+                output = formatter(s)
+            else:
+                output = formatter.substitute(s)
         return output
 
     @property
@@ -194,11 +241,9 @@ class PageElement(object):
     def _formatter_for_name(self, name):
         "Look up a formatter function based on its name and the tree."
         if self._is_xml:
-            return self.XML_FORMATTERS.get(
-                name, EntitySubstitution.substitute_xml)
+            return self.XML_FORMATTERS.get(name, XMLFormatter())
         else:
-            return self.HTML_FORMATTERS.get(
-                name, HTMLAwareEntitySubstitution.substitute_xml)
+            return self.HTML_FORMATTERS.get(name, HTMLFormatter())
 
     def setup(self, parent=None, previous_element=None, next_element=None,
               previous_sibling=None, next_sibling=None):
@@ -316,6 +361,14 @@ class PageElement(object):
             and not isinstance(new_child, NavigableString)):
             new_child = NavigableString(new_child)
 
+        from bs4 import BeautifulSoup
+        if isinstance(new_child, BeautifulSoup):
+            # We don't want to end up with a situation where one BeautifulSoup
+            # object contains another. Insert the children one at a time.
+            for subchild in list(new_child.contents):
+                self.insert(position, subchild)
+                position += 1
+            return
         position = min(position, len(self.contents))
         if hasattr(new_child, 'parent') and new_child.parent is not None:
             # We're 'inserting' an element that's already one
@@ -862,7 +915,7 @@ class Tag(PageElement):
             self.can_be_empty_element = builder.can_be_empty_element(name)
         else:
             self.can_be_empty_element = False
-
+            
     parserClass = _alias("parser_class")  # BS3
 
     def __copy__(self):
@@ -1129,11 +1182,10 @@ class Tag(PageElement):
            encoding.
         """
 
-        # First off, turn a string formatter into a function. This
+        # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not isinstance(formatter, Formatter) and not callable(formatter):
             formatter = self._formatter_for_name(formatter)
-
         attrs = []
         if self.attrs:
             for key, val in sorted(self.attrs.items()):
@@ -1162,7 +1214,7 @@ class Tag(PageElement):
             prefix = self.prefix + ":"
 
         if self.is_empty_element:
-            close = '/'
+            close = formatter.void_element_close_prefix or ''
         else:
             closeTag = '</%s%s>' % (prefix, self.name)
 
@@ -1233,9 +1285,9 @@ class Tag(PageElement):
         :param formatter: The output formatter responsible for converting
            entities to Unicode characters.
         """
-        # First off, turn a string formatter into a function. This
+        # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not isinstance(formatter, Formatter) and not callable(formatter):
             formatter = self._formatter_for_name(formatter)
 
         pretty_print = (indent_level is not None)
@@ -1418,7 +1470,7 @@ class Tag(PageElement):
                 if tag_name == '':
                     raise ValueError(
                         "A pseudo-class must be prefixed with a tag name.")
-                pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+                pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
                 found = []
                 if pseudo_attributes is None:
                     pseudo_type = pseudo
@@ -1652,7 +1704,7 @@ class SoupStrainer(object):
             markup = markup_name
             markup_attrs = markup
         call_function_with_tag_data = (
-            isinstance(self.name, collections.Callable)
+            isinstance(self.name, Callable)
             and not isinstance(markup_name, Tag))
 
         if ((not self.name)
@@ -1732,7 +1784,7 @@ class SoupStrainer(object):
             # True matches any non-None value.
             return markup is not None
 
-        if isinstance(match_against, collections.Callable):
+        if isinstance(match_against, Callable):
             return match_against(markup)
 
         # Custom callables take the tag as an argument, but all