From 999a1ad671036ccbb4704d402dff624083fbee90 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 15 Jul 2018 19:50:15 -0400 Subject: Introduced the Formatter system. [bug=1716272]. --- NEWS.txt | 10 +++--- bs4/builder/__init__.py | 23 ++----------- bs4/element.py | 87 +++++++++++++++++++++++++++++++++++-------------- bs4/tests/test_tree.py | 14 ++++++-- doc/source/index.rst | 14 +++++++- 5 files changed, 95 insertions(+), 53 deletions(-) diff --git a/NEWS.txt b/NEWS.txt index e22e88f..45a6952 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -14,10 +14,12 @@ * Stopped HTMLParser from raising an exception in very rare cases of bad markup. [bug=1708831] -* It's possible for a TreeBuilder subclass to specify that void - elements should be represented as "" rather than - "", by setting TreeBuilder.void_element_close_prefix to - the empty string. [bug=1716272] +* Added a new formatter, "html5", which represents void elements + elements as "" rather than "". [bug=1716272] + +* You can get finer control over formatting by subclassing + bs4.element.Formatter and passing a Formatter instance into (e.g.) + encode(). [bug=1716272] = 4.6.0 (20170507) = diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index d7166bb..21454e6 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -93,14 +93,6 @@ class TreeBuilder(object): preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. - void_tags = None # There are no void tags. - - # This string goes just before the end of the start tag for an - # void element. - # - # Leave this alone and you'll get tags like "
". Change it to the - # empty string and you'll get tags like "
". - void_element_close_prefix = '/' # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. @@ -133,17 +125,6 @@ class TreeBuilder(object): if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags - - def is_void(self, tag_name): - """Must a tag with this name be a void tag? - - A void tag cannot have contents and is presented with neither - a a closing tag or a closing slash, e.g.: - - """ - if self.void_tags is None: - return False - return tag_name in self.void_tags def feed(self, markup): raise NotImplementedError() @@ -255,8 +236,8 @@ class HTMLTreeBuilder(TreeBuilder): # These are from HTML5. 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', - # These are from HTML4, removed in HTML5. - 'spacer', 'frame' + # These are from earlier versions of HTML and are removed in HTML5. + 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' ]) # The HTML standard defines these attributes as containing a diff --git a/bs4/element.py b/bs4/element.py index 181f135..911b9bc 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -126,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): return cls._substitute_if_appropriate( ns, EntitySubstitution.substitute_xml) +class Formatter(object): + """Contains information about how to format a parse tree.""" + + # By default, represent void elements as rather than + void_element_close_prefix = '/' + + def substitute_entities(self, *args, **kwargs): + """Transform certain characters into named entities.""" + raise NotImplementedError() + +class HTMLFormatter(Formatter): + """The default HTML formatter.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) + +class MinimalHTMLFormatter(Formatter): + """A minimal HTML formatter.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs) + +class HTML5Formatter(HTMLFormatter): + """An HTML formatter that omits the slash in a void tag.""" + void_element_close_prefix = None + +class XMLFormatter(Formatter): + """Substitute only the essential XML entities.""" + def substitute(self, *args, **kwargs): + return EntitySubstitution.substitute_xml(*args, **kwargs) + +class HTMLXMLFormatter(Formatter): + """Format XML using HTML rules.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) + + class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -134,40 +169,49 @@ class PageElement(object): # to methods like encode() and prettify(): # # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "minimal" - Bare ampersands and angle brackets are converted to + # are converted to those entities on output. + # "html5" - The same as "html", but empty void tags are represented as + # rather than + # "minimal" - Bare ampersands and angle brackets are converted to # XML entities: & < > # None - The null formatter. Unicode characters are never # converted to entities. This is not recommended, but it's # faster than "minimal". - # A function - This function will be called on every string that + # A callable function - it will be called on every string that needs to undergo entity substitution. + # A Formatter instance - Formatter.substitute(string) will be called on every string that # needs to undergo entity substitution. # - # In an HTML document, the default "html" and "minimal" functions - # will leave the contents of