summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2018-07-15 19:50:15 -0400
committerLeonard Richardson <leonardr@segfault.org>2018-07-15 19:50:15 -0400
commit999a1ad671036ccbb4704d402dff624083fbee90 (patch)
treedbbedfcbb0590ccab3098f52c0c5f6ec25991d25
parentdb0ef1662efba41a111861d652a248385f7baac9 (diff)
downloadbeautifulsoup4-999a1ad671036ccbb4704d402dff624083fbee90.tar.gz
Introduced the Formatter system. [bug=1716272].
-rw-r--r--NEWS.txt10
-rw-r--r--bs4/builder/__init__.py23
-rw-r--r--bs4/element.py87
-rw-r--r--bs4/tests/test_tree.py14
-rw-r--r--doc/source/index.rst14
5 files changed, 95 insertions, 53 deletions
diff --git a/NEWS.txt b/NEWS.txt
index e22e88f..45a6952 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -14,10 +14,12 @@
* Stopped HTMLParser from raising an exception in very rare cases of
bad markup. [bug=1708831]
-* It's possible for a TreeBuilder subclass to specify that void
- elements should be represented as "<element>" rather than
- "<element/>", by setting TreeBuilder.void_element_close_prefix to
- the empty string. [bug=1716272]
+* Added a new formatter, "html5", which represents void
+ elements as "<element>" rather than "<element/>". [bug=1716272]
+
+* You can get finer control over formatting by subclassing
+ bs4.element.Formatter and passing a Formatter instance into (e.g.)
+ encode(). [bug=1716272]
= 4.6.0 (20170507) =
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index d7166bb..21454e6 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -93,14 +93,6 @@ class TreeBuilder(object):
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
- void_tags = None # There are no void tags.
-
- # This string goes just before the end of the start tag for an
- # void element.
- #
- # Leave this alone and you'll get tags like "<br/>". Change it to the
- # empty string and you'll get tags like "<br>".
- void_element_close_prefix = '/'
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
@@ -133,17 +125,6 @@ class TreeBuilder(object):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
-
- def is_void(self, tag_name):
- """Must a tag with this name be a void tag?
-
- A void tag cannot have contents and is presented with neither
- a a closing tag or a closing slash, e.g.:
- <link href="foo">
- """
- if self.void_tags is None:
- return False
- return tag_name in self.void_tags
def feed(self, markup):
raise NotImplementedError()
@@ -255,8 +236,8 @@ class HTMLTreeBuilder(TreeBuilder):
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
- # These are from HTML4, removed in HTML5.
- 'spacer', 'frame'
+ # These are from earlier versions of HTML and are removed in HTML5.
+ 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
])
# The HTML standard defines these attributes as containing a
diff --git a/bs4/element.py b/bs4/element.py
index 181f135..911b9bc 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -126,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_xml)
+class Formatter(object):
+ """Contains information about how to format a parse tree."""
+
+ # By default, represent void elements as <tag/> rather than <tag>
+ void_element_close_prefix = '/'
+
+ def substitute_entities(self, *args, **kwargs):
+ """Transform certain characters into named entities."""
+ raise NotImplementedError()
+
+class HTMLFormatter(Formatter):
+ """The default HTML formatter."""
+ def substitute(self, *args, **kwargs):
+ return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+class MinimalHTMLFormatter(Formatter):
+ """A minimal HTML formatter."""
+ def substitute(self, *args, **kwargs):
+ return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTML5Formatter(HTMLFormatter):
+ """An HTML formatter that omits the slash in a void tag."""
+ void_element_close_prefix = None
+
+class XMLFormatter(Formatter):
+ """Substitute only the essential XML entities."""
+ def substitute(self, *args, **kwargs):
+ return EntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTMLXMLFormatter(Formatter):
+ """Format XML using HTML rules."""
+ def substitute(self, *args, **kwargs):
+ return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -134,40 +169,49 @@ class PageElement(object):
# to methods like encode() and prettify():
#
# "html" - All Unicode characters with corresponding HTML entities
- # are converted to those entities on output.
- # "minimal" - Bare ampersands and angle brackets are converted to
+ # are converted to those entities on output.
+ # "html5" - The same as "html", but empty void tags are represented as
+ # <tag> rather than <tag/>
+ # "minimal" - Bare ampersands and angle brackets are converted to
# XML entities: &amp; &lt; &gt;
# None - The null formatter. Unicode characters are never
# converted to entities. This is not recommended, but it's
# faster than "minimal".
- # A function - This function will be called on every string that
+ # A callable function - it will be called on every string that needs to undergo entity substitution.
+ # A Formatter instance - Formatter.substitute(string) will be called on every string that
# needs to undergo entity substitution.
#
- # In an HTML document, the default "html" and "minimal" functions
- # will leave the contents of <script> and <style> tags alone. For
- # an XML document, all tags will be given the same treatment.
+ # In an HTML document, the default "html", "html5", and "minimal"
+ # functions will leave the contents of <script> and <style> tags
+ # alone. For an XML document, all tags will be given the same
+ # treatment.
HTML_FORMATTERS = {
- "html" : HTMLAwareEntitySubstitution.substitute_html,
- "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
+ "html" : HTMLFormatter(),
+ "html5" : HTML5Formatter(),
+ "minimal" : MinimalHTMLFormatter(),
None : None
}
XML_FORMATTERS = {
- "html" : EntitySubstitution.substitute_html,
- "minimal" : EntitySubstitution.substitute_xml,
+ "html" : HTMLXMLFormatter(),
+ "minimal" : XMLFormatter(),
None : None
}
def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter."""
- if not callable(formatter):
+ if isinstance(formatter, basestring):
formatter = self._formatter_for_name(formatter)
if formatter is None:
output = s
else:
- output = formatter(s)
+ if callable(formatter):
+ # Backwards compatibility -- you used to pass in a formatting method.
+ output = formatter(s)
+ else:
+ output = formatter.substitute(s)
return output
@property
@@ -197,11 +241,9 @@ class PageElement(object):
def _formatter_for_name(self, name):
"Look up a formatter function based on its name and the tree."
if self._is_xml:
- return self.XML_FORMATTERS.get(
- name, EntitySubstitution.substitute_xml)
+ return self.XML_FORMATTERS.get(name, XMLFormatter())
else:
- return self.HTML_FORMATTERS.get(
- name, HTMLAwareEntitySubstitution.substitute_xml)
+ return self.HTML_FORMATTERS.get(name, HTMLFormatter())
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
@@ -871,10 +913,8 @@ class Tag(PageElement):
if builder is not None:
builder.set_up_substitutions(self)
self.can_be_empty_element = builder.can_be_empty_element(name)
- self.void_element_close_prefix = builder.void_element_close_prefix or ""
else:
self.can_be_empty_element = False
- self.void_element_close_prefix = '/'
parserClass = _alias("parser_class") # BS3
@@ -1142,11 +1182,10 @@ class Tag(PageElement):
encoding.
"""
- # First off, turn a string formatter into a function. This
+ # First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
- if not callable(formatter):
+ if not isinstance(formatter, Formatter) and not callable(formatter):
formatter = self._formatter_for_name(formatter)
-
attrs = []
if self.attrs:
for key, val in sorted(self.attrs.items()):
@@ -1175,7 +1214,7 @@ class Tag(PageElement):
prefix = self.prefix + ":"
if self.is_empty_element:
- close = self.void_element_close_prefix
+ close = formatter.void_element_close_prefix or ''
else:
closeTag = '</%s%s>' % (prefix, self.name)
@@ -1246,9 +1285,9 @@ class Tag(PageElement):
:param formatter: The output formatter responsible for converting
entities to Unicode characters.
"""
- # First off, turn a string formatter into a function. This
+ # First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
- if not callable(formatter):
+ if not isinstance(formatter, Formatter) and not callable(formatter):
formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None)
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index e75cf1d..e8903e3 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1439,13 +1439,21 @@ class TestSubstitutions(SoupTest):
u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_html(self):
- markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ markup = u"<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="html")
self.assertEqual(
decoded,
- self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+ self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+ def test_formatter_html5(self):
+ markup = u"<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ soup = self.soup(markup)
+ decoded = soup.decode(formatter="html5")
+ self.assertEqual(
+ decoded,
+ self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+
def test_formatter_minimal(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
@@ -1518,7 +1526,7 @@ class TestSubstitutions(SoupTest):
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
soup.div.prettify())
- def test_prettify_accepts_formatter(self):
+ def test_prettify_accepts_formatter_function(self):
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
pretty = soup.prettify(formatter = lambda x: x.upper())
self.assertTrue("FOO" in pretty)
diff --git a/doc/source/index.rst b/doc/source/index.rst
index e1b73aa..cc816a0 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2145,7 +2145,7 @@ invalid HTML or XML::
You can change this behavior by providing a value for the
``formatter`` argument to ``prettify()``, ``encode()``, or
-``decode()``. Beautiful Soup recognizes four possible values for
+``decode()``. Beautiful Soup recognizes six possible values for
``formatter``.
The default is ``formatter="minimal"``. Strings will only be processed
@@ -2174,6 +2174,18 @@ Unicode characters to HTML entities whenever possible::
# </body>
# </html>
+ If you pass in ``formatter="html5"``, it's the same as
+``formatter="html"``, but Beautiful Soup will
+omit the closing slash in HTML void tags like "br"::
+
+ soup = BeautifulSoup("<br>")
+
+ print(soup.encode(formatter="html"))
+ # <html><body><br/></body></html>
+
+ print(soup.encode(formatter="html5"))
+ # <html><body><br></body></html>
+
If you pass in ``formatter=None``, Beautiful Soup will not modify
strings at all on output. This is the fastest option, but it may lead
to Beautiful Soup generating invalid HTML/XML, as in these examples::