diff options
-rw-r--r-- | NEWS.txt | 25 | ||||
-rw-r--r-- | bs4/__init__.py | 80 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 12 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 14 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 8 | ||||
-rw-r--r-- | bs4/dammit.py | 6 | ||||
-rw-r--r-- | bs4/diagnose.py | 20 | ||||
-rw-r--r-- | bs4/element.py | 110 | ||||
-rw-r--r-- | bs4/testing.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 15 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 36 | ||||
-rw-r--r-- | doc/source/index.rst | 14 | ||||
-rw-r--r-- | prepare-release.sh | 1 | ||||
-rw-r--r-- | setup.py | 2 |
14 files changed, 276 insertions, 69 deletions
@@ -1,4 +1,27 @@ -= 4.6.0 (Unreleased) = += Unreleased + +* Stop data loss when encountering an empty numeric entity, and + possibly in other cases. Thanks to tos.kamiya for the fix. [bug=1698503] + +* Improved the warning given when no parser is specified. [bug=1780571] + +* Fixed code that was causing deprecation warnings in recent Python 3 + versions. Includes a patch from Ville Skyttä. [bug=1778909] [bug=1689496] + +* Fixed a Windows crash in diagnose() when checking whether a long + markup string is a filename. [bug=1737121] + +* Stopped HTMLParser from raising an exception in very rare cases of + bad markup. [bug=1708831] + +* Added a new formatter, "html5", which represents void + elements as "<element>" rather than "<element/>". [bug=1716272] + +* You can get finer control over formatting by subclassing + bs4.element.Formatter and passing a Formatter instance into (e.g.) + encode(). [bug=1716272] + += 4.6.0 (20170507) = * Added the `Tag.get_attribute_list` method, which acts like `Tag.get` for getting the value of an attribute, but which always returns a list, diff --git a/bs4/__init__.py b/bs4/__init__.py index c984ef6..329ef53 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -21,7 +21,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ # found in the LICENSE file. __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.5.3" +__version__ = "4.6.0" __copyright__ = "Copyright (c) 2004-2017 Leonard Richardson" __license__ = "MIT" @@ -29,6 +29,7 @@ __all__ = ['BeautifulSoup'] import os import re +import sys import traceback import warnings @@ -82,14 +83,46 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). 
This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n" + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, **kwargs): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser.""" + """Constructor. + + :param markup: A string or a file-like object representing + markup to be parsed. + + :param features: Desirable features of the parser to be used. This + may be the name of a specific parser ("lxml", "lxml-xml", + "html.parser", or "html5lib") or it may be the type of markup + to be used ("html", "html5", "xml"). It's recommended that you + name a specific parser, so that Beautiful Soup gives you the + same results across platforms and virtual environments. + + :param builder: A specific TreeBuilder to use instead of looking one + up based on `features`. You shouldn't need to use this. + + :param parse_only: A SoupStrainer. Only parts of the document + matching the SoupStrainer will be considered. 
This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. + + :param from_encoding: A string indicating the encoding of the + document to be parsed. Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. + + :param exclude_encodings: A list of strings indicating + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. + + :param kwargs: For backwards compatibility purposes, the + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4 and there's no need to actually pass keyword + arguments into the constructor. + """ if 'convertEntities' in kwargs: warnings.warn( @@ -171,14 +204,35 @@ class BeautifulSoup(Tag): else: markup_type = "HTML" - caller = traceback.extract_stack()[0] - filename = caller[0] - line_number = caller[1] - warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( - filename=filename, - line_number=line_number, - parser=builder.NAME, - markup_type=markup_type)) + # This code adapted from warnings.py so that we get the same line + # of code as our warnings.warn() call gets, even if the answer is wrong + # (as it may be in a multithreading situation). + caller = None + try: + caller = sys._getframe(1) + except ValueError: + pass + if caller: + globals = caller.f_globals + line_number = caller.f_lineno + else: + globals = sys.__dict__ + line_number= 1 + filename = globals.get('__file__') + if filename: + fnl = filename.lower() + if fnl.endswith((".pyc", ".pyo")): + filename = filename[:-1] + if filename: + # If there is no filename at all, the user is most likely in a REPL, + # and the warning is not necessary. 
+ values = dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type + ) + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) self.builder = builder self.is_xml = builder.is_xml diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index fdb3362..21454e6 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -93,7 +93,7 @@ class TreeBuilder(object): preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. - + # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. cdata_list_attributes = {} @@ -125,7 +125,7 @@ class TreeBuilder(object): if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags - + def feed(self, markup): raise NotImplementedError() @@ -235,11 +235,11 @@ class HTMLTreeBuilder(TreeBuilder): empty_element_tags = set([ # These are from HTML5. 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', - - # These are from HTML4, removed in HTML5. - 'spacer', 'frame' + + # These are from earlier versions of HTML and are removed in HTML5. + 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' ]) - + # The HTML standard defines these attributes as containing a # space-separated list of values, not a single value. That is, # class="foo bar" means that the 'class' attribute has two values, diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 67890b3..ef9fd1e 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -64,7 +64,18 @@ class BeautifulSoupHTMLParser(HTMLParser): # order. It's a list of closing tags we've already handled and # will ignore, assuming they ever show up. 
self.already_closed_empty_element = [] - + + def error(self, msg): + """In Python 3, HTMLParser subclasses must implement error(), although this + requirement doesn't appear to be documented. + + In Python 2, HTMLParser implements error() as raising an exception. + + In any event, this method is called only on very strange markup and our best strategy + is to pretend it didn't happen and keep going. + """ + warnings.warn(msg) + def handle_startendtag(self, name, attrs): # This is only called when the markup looks like # <tag/>. @@ -213,6 +224,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) + parser.close() except HTMLParseError, e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index d2ca287..3439271 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -5,9 +5,13 @@ __all__ = [ 'LXMLTreeBuilder', ] +try: + from collections.abc import Callable # Python 3.6 +except ImportError , e: + from collections import Callable + from io import BytesIO from StringIO import StringIO -import collections from lxml import etree from bs4.element import ( Comment, @@ -58,7 +62,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): # Use the default parser. 
parser = self.default_parser(encoding) - if isinstance(parser, collections.Callable): + if isinstance(parser, Callable): # Instantiate the parser with default arguments parser = parser(target=self, strip_cdata=False, encoding=encoding) return parser diff --git a/bs4/dammit.py b/bs4/dammit.py index 7965565..be46b39 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -46,9 +46,9 @@ except ImportError: pass xml_encoding_re = re.compile( - '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) + '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I) html_meta_re = re.compile( - '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) + '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) class EntitySubstitution(object): @@ -82,7 +82,7 @@ class EntitySubstitution(object): } BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")") AMPERSAND_OR_BRACKET = re.compile("([<>&])") diff --git a/bs4/diagnose.py b/bs4/diagnose.py index 8768332..7a28c09 100644 --- a/bs4/diagnose.py +++ b/bs4/diagnose.py @@ -37,7 +37,7 @@ def diagnose(data): name) if 'lxml' in basic_parsers: - basic_parsers.append(["lxml", "xml"]) + basic_parsers.append("lxml-xml") try: from lxml import etree print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) @@ -56,21 +56,27 @@ def diagnose(data): if hasattr(data, 'read'): data = data.read() - elif os.path.exists(data): - print '"%s" looks like a filename. Reading data from the file.' % data - with open(data) as fp: - data = fp.read() elif data.startswith("http:") or data.startswith("https:"): print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." return - print + else: + try: + if os.path.exists(data): + print '"%s" looks like a filename. Reading data from the file.' 
% data + with open(data) as fp: + data = fp.read() + except ValueError: + # This can happen on some platforms when the 'filename' is + # too long. Assume it's data and not a filename. + pass + print for parser in basic_parsers: print "Trying to parse your markup with %s" % parser success = False try: - soup = BeautifulSoup(data, parser) + soup = BeautifulSoup(data, features=parser) success = True except Exception, e: print "%s could not parse the markup." % parser diff --git a/bs4/element.py b/bs4/element.py index 9ef75f8..911b9bc 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -2,7 +2,10 @@ # found in the LICENSE file. __license__ = "MIT" -import collections +try: + from collections.abc import Callable # Python 3.6 +except ImportError , e: + from collections import Callable import re import shlex import sys @@ -12,7 +15,7 @@ from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) -whitespace_re = re.compile("\s+") +whitespace_re = re.compile(r"\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @@ -69,7 +72,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): The value of the 'content' attribute will be one of these objects. 
""" - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) def __new__(cls, original_value): match = cls.CHARSET_RE.search(original_value) @@ -123,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): return cls._substitute_if_appropriate( ns, EntitySubstitution.substitute_xml) +class Formatter(object): + """Contains information about how to format a parse tree.""" + + # By default, represent void elements as <tag/> rather than <tag> + void_element_close_prefix = '/' + + def substitute_entities(self, *args, **kwargs): + """Transform certain characters into named entities.""" + raise NotImplementedError() + +class HTMLFormatter(Formatter): + """The default HTML formatter.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) + +class MinimalHTMLFormatter(Formatter): + """A minimal HTML formatter.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs) + +class HTML5Formatter(HTMLFormatter): + """An HTML formatter that omits the slash in a void tag.""" + void_element_close_prefix = None + +class XMLFormatter(Formatter): + """Substitute only the essential XML entities.""" + def substitute(self, *args, **kwargs): + return EntitySubstitution.substitute_xml(*args, **kwargs) + +class HTMLXMLFormatter(Formatter): + """Format XML using HTML rules.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) + + class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -131,40 +169,49 @@ class PageElement(object): # to methods like encode() and prettify(): # # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. 
- # "minimal" - Bare ampersands and angle brackets are converted to + # are converted to those entities on output. + # "html5" - The same as "html", but empty void tags are represented as + # <tag> rather than <tag/> + # "minimal" - Bare ampersands and angle brackets are converted to # XML entities: & < > # None - The null formatter. Unicode characters are never # converted to entities. This is not recommended, but it's # faster than "minimal". - # A function - This function will be called on every string that + # A callable function - it will be called on every string that needs to undergo entity substitution. + # A Formatter instance - Formatter.substitute(string) will be called on every string that # needs to undergo entity substitution. # - # In an HTML document, the default "html" and "minimal" functions - # will leave the contents of <script> and <style> tags alone. For - # an XML document, all tags will be given the same treatment. + # In an HTML document, the default "html", "html5", and "minimal" + # functions will leave the contents of <script> and <style> tags + # alone. For an XML document, all tags will be given the same + # treatment. HTML_FORMATTERS = { - "html" : HTMLAwareEntitySubstitution.substitute_html, - "minimal" : HTMLAwareEntitySubstitution.substitute_xml, + "html" : HTMLFormatter(), + "html5" : HTML5Formatter(), + "minimal" : MinimalHTMLFormatter(), None : None } XML_FORMATTERS = { - "html" : EntitySubstitution.substitute_html, - "minimal" : EntitySubstitution.substitute_xml, + "html" : HTMLXMLFormatter(), + "minimal" : XMLFormatter(), None : None } def format_string(self, s, formatter='minimal'): """Format the given string using the given formatter.""" - if not callable(formatter): + if isinstance(formatter, basestring): formatter = self._formatter_for_name(formatter) if formatter is None: output = s else: - output = formatter(s) + if callable(formatter): + # Backwards compatibility -- you used to pass in a formatting method. 
+ output = formatter(s) + else: + output = formatter.substitute(s) return output @property @@ -194,11 +241,9 @@ class PageElement(object): def _formatter_for_name(self, name): "Look up a formatter function based on its name and the tree." if self._is_xml: - return self.XML_FORMATTERS.get( - name, EntitySubstitution.substitute_xml) + return self.XML_FORMATTERS.get(name, XMLFormatter()) else: - return self.HTML_FORMATTERS.get( - name, HTMLAwareEntitySubstitution.substitute_xml) + return self.HTML_FORMATTERS.get(name, HTMLFormatter()) def setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None): @@ -316,6 +361,14 @@ class PageElement(object): and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) + from bs4 import BeautifulSoup + if isinstance(new_child, BeautifulSoup): + # We don't want to end up with a situation where one BeautifulSoup + # object contains another. Insert the children one at a time. + for subchild in list(new_child.contents): + self.insert(position, subchild) + position += 1 + return position = min(position, len(self.contents)) if hasattr(new_child, 'parent') and new_child.parent is not None: # We're 'inserting' an element that's already one @@ -862,7 +915,7 @@ class Tag(PageElement): self.can_be_empty_element = builder.can_be_empty_element(name) else: self.can_be_empty_element = False - + parserClass = _alias("parser_class") # BS3 def __copy__(self): @@ -1129,11 +1182,10 @@ class Tag(PageElement): encoding. """ - # First off, turn a string formatter into a function. This + # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. 
- if not callable(formatter): + if not isinstance(formatter, Formatter) and not callable(formatter): formatter = self._formatter_for_name(formatter) - attrs = [] if self.attrs: for key, val in sorted(self.attrs.items()): @@ -1162,7 +1214,7 @@ class Tag(PageElement): prefix = self.prefix + ":" if self.is_empty_element: - close = '/' + close = formatter.void_element_close_prefix or '' else: closeTag = '</%s%s>' % (prefix, self.name) @@ -1233,9 +1285,9 @@ class Tag(PageElement): :param formatter: The output formatter responsible for converting entities to Unicode characters. """ - # First off, turn a string formatter into a function. This + # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. - if not callable(formatter): + if not isinstance(formatter, Formatter) and not callable(formatter): formatter = self._formatter_for_name(formatter) pretty_print = (indent_level is not None) @@ -1418,7 +1470,7 @@ class Tag(PageElement): if tag_name == '': raise ValueError( "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) + pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) found = [] if pseudo_attributes is None: pseudo_type = pseudo @@ -1652,7 +1704,7 @@ class SoupStrainer(object): markup = markup_name markup_attrs = markup call_function_with_tag_data = ( - isinstance(self.name, collections.Callable) + isinstance(self.name, Callable) and not isinstance(markup_name, Tag)) if ((not self.name) @@ -1732,7 +1784,7 @@ class SoupStrainer(object): # True matches any non-None value. 
return markup is not None - if isinstance(match_against, collections.Callable): + if isinstance(match_against, Callable): return match_against(markup) # Custom callables take the tag as an argument, but all diff --git a/bs4/testing.py b/bs4/testing.py index 6ba2506..9d42702 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -334,7 +334,7 @@ Hello, world! self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) - + def test_multipart_strings(self): "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index d5cf025..0381c7d 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -5,6 +5,7 @@ from pdb import set_trace import pickle from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest from bs4.builder import HTMLParserTreeBuilder +from bs4.builder._htmlparser import BeautifulSoupHTMLParser class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): @@ -32,3 +33,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): def test_redundant_empty_element_closing_tags(self): self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>") self.assertSoupEquals('</br></br></br>', "") + + def test_empty_element(self): + # This verifies that any buffered data present when the parser + # finishes working is handled. + self.assertSoupEquals("foo &# bar", "foo &# bar") + + +class TestHTMLParserSubclass(SoupTest): + def test_error(self): + """Verify that our HTMLParser subclass implements error() in a way + that doesn't cause a crash. 
+ """ + parser = BeautifulSoupHTMLParser() + parser.error("don't crash") diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index c0e7c40..e8903e3 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -605,7 +605,7 @@ class SiblingTest(TreeTest): </html>''' # All that whitespace looks good but makes the tests more # difficult. Get rid of it. - markup = re.compile("\n\s*").sub("", markup) + markup = re.compile(r"\n\s*").sub("", markup) self.tree = self.soup(markup) @@ -821,6 +821,26 @@ class TestTreeModification(SoupTest): soup = self.soup(text) self.assertRaises(ValueError, soup.a.insert, 0, soup.a) + def test_insert_beautifulsoup_object_inserts_children(self): + """Inserting one BeautifulSoup object into another actually inserts all + of its children -- you'll never combine BeautifulSoup objects. + """ + soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>") + + text = "<p>p2</p><p>p3</p>" + to_insert = self.soup(text) + soup.insert(1, to_insert) + + for i in soup.descendants: + assert not isinstance(i, BeautifulSoup) + + p1, p2, p3, p4 = list(soup.children) + self.assertEquals("And now, a word:", p1.string) + self.assertEquals("p2", p2.string) + self.assertEquals("p3", p3.string) + self.assertEquals("And we're back.", p4.string) + + def test_replace_with_maintains_next_element_throughout(self): soup = self.soup('<p><a>one</a><b>three</b></p>') a = soup.a @@ -1419,13 +1439,21 @@ class TestSubstitutions(SoupTest): u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) def test_formatter_html(self): - markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + markup = u"<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" soup = self.soup(markup) decoded = soup.decode(formatter="html") self.assertEqual( decoded, - self.document_for("<b><<Sacré bleu!>></b>")) + self.document_for("<br/><b><<Sacré bleu!>></b>")) + def test_formatter_html5(self): + markup = u"<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} 
bleu!>></b>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html5") + self.assertEqual( + decoded, + self.document_for("<br><b><<Sacré bleu!>></b>")) + def test_formatter_minimal(self): markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" soup = self.soup(markup) @@ -1498,7 +1526,7 @@ class TestSubstitutions(SoupTest): u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', soup.div.prettify()) - def test_prettify_accepts_formatter(self): + def test_prettify_accepts_formatter_function(self): soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') pretty = soup.prettify(formatter = lambda x: x.upper()) self.assertTrue("FOO" in pretty) diff --git a/doc/source/index.rst b/doc/source/index.rst index 9269385..8b2822d 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2145,7 +2145,7 @@ invalid HTML or XML:: You can change this behavior by providing a value for the ``formatter`` argument to ``prettify()``, ``encode()``, or -``decode()``. Beautiful Soup recognizes four possible values for +``decode()``. Beautiful Soup recognizes six possible values for ``formatter``. The default is ``formatter="minimal"``. Strings will only be processed @@ -2174,6 +2174,18 @@ Unicode characters to HTML entities whenever possible:: # </body> # </html> + If you pass in ``formatter="html5"``, it's the same as +``formatter="html"``, but Beautiful Soup will +omit the closing slash in HTML void tags like "br":: + + soup = BeautifulSoup("<br>") + + print(soup.encode(formatter="html")) + # <html><body><br/></body></html> + + print(soup.encode(formatter="html5")) + # <html><body><br></body></html> + If you pass in ``formatter=None``, Beautiful Soup will not modify strings at all on output. 
This is the fastest option, but it may lead to Beautiful Soup generating invalid HTML/XML, as in these examples:: diff --git a/prepare-release.sh b/prepare-release.sh index d88ff1e..c278b67 100644 --- a/prepare-release.sh +++ b/prepare-release.sh @@ -55,6 +55,7 @@ source ../py2-install-test-virtualenv/bin/activate python setup.py install echo "EXPECT HTML ON LINE BELOW" (cd .. && python -c "from bs4 import _s; print(_s('<a>foo', 'html.parser'))") +echo # That should print '<a>foo</a>' deactivate rm -rf ../py2-install-test-virtualenv @@ -5,7 +5,7 @@ from setuptools import ( setup( name="beautifulsoup4", - version = "4.5.3", + version = "4.6.0", author="Leonard Richardson", author_email='leonardr@segfault.org', url="http://www.crummy.com/software/BeautifulSoup/bs4/", |