diff options
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- | bs4/element.py | 110 |
1 files changed, 81 insertions, 29 deletions
diff --git a/bs4/element.py b/bs4/element.py index 9ef75f8..911b9bc 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -2,7 +2,10 @@ # found in the LICENSE file. __license__ = "MIT" -import collections +try: + from collections.abc import Callable # Python 3.6 +except ImportError , e: + from collections import Callable import re import shlex import sys @@ -12,7 +15,7 @@ from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) -whitespace_re = re.compile("\s+") +whitespace_re = re.compile(r"\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @@ -69,7 +72,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): The value of the 'content' attribute will be one of these objects. """ - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) def __new__(cls, original_value): match = cls.CHARSET_RE.search(original_value) @@ -123,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): return cls._substitute_if_appropriate( ns, EntitySubstitution.substitute_xml) +class Formatter(object): + """Contains information about how to format a parse tree.""" + + # By default, represent void elements as <tag/> rather than <tag> + void_element_close_prefix = '/' + + def substitute_entities(self, *args, **kwargs): + """Transform certain characters into named entities.""" + raise NotImplementedError() + +class HTMLFormatter(Formatter): + """The default HTML formatter.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) + +class MinimalHTMLFormatter(Formatter): + """A minimal HTML formatter.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs) + +class HTML5Formatter(HTMLFormatter): + """An HTML formatter that omits the slash in a void tag.""" + void_element_close_prefix = None + +class XMLFormatter(Formatter): + """Substitute only the essential XML entities.""" + def substitute(self, *args, **kwargs): + return EntitySubstitution.substitute_xml(*args, **kwargs) + +class HTMLXMLFormatter(Formatter): + """Format XML using HTML rules.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) + + class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -131,40 +169,49 @@ class PageElement(object): # to methods like encode() and prettify(): # # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "minimal" - Bare ampersands and angle brackets are converted to + # are converted to those entities on output. + # "html5" - The same as "html", but empty void tags are represented as + # <tag> rather than <tag/> + # "minimal" - Bare ampersands and angle brackets are converted to # XML entities: & < > # None - The null formatter. Unicode characters are never # converted to entities. This is not recommended, but it's # faster than "minimal". - # A function - This function will be called on every string that + # A callable function - it will be called on every string that needs to undergo entity substitution. + # A Formatter instance - Formatter.substitute(string) will be called on every string that # needs to undergo entity substitution. # - # In an HTML document, the default "html" and "minimal" functions - # will leave the contents of <script> and <style> tags alone. For - # an XML document, all tags will be given the same treatment. + # In an HTML document, the default "html", "html5", and "minimal" + # functions will leave the contents of <script> and <style> tags + # alone. For an XML document, all tags will be given the same + # treatment. HTML_FORMATTERS = { - "html" : HTMLAwareEntitySubstitution.substitute_html, - "minimal" : HTMLAwareEntitySubstitution.substitute_xml, + "html" : HTMLFormatter(), + "html5" : HTML5Formatter(), + "minimal" : MinimalHTMLFormatter(), None : None } XML_FORMATTERS = { - "html" : EntitySubstitution.substitute_html, - "minimal" : EntitySubstitution.substitute_xml, + "html" : HTMLXMLFormatter(), + "minimal" : XMLFormatter(), None : None } def format_string(self, s, formatter='minimal'): """Format the given string using the given formatter.""" - if not callable(formatter): + if isinstance(formatter, basestring): formatter = self._formatter_for_name(formatter) if formatter is None: output = s else: - output = formatter(s) + if callable(formatter): + # Backwards compatibility -- you used to pass in a formatting method. + output = formatter(s) + else: + output = formatter.substitute(s) return output @property @@ -194,11 +241,9 @@ class PageElement(object): def _formatter_for_name(self, name): "Look up a formatter function based on its name and the tree." if self._is_xml: - return self.XML_FORMATTERS.get( - name, EntitySubstitution.substitute_xml) + return self.XML_FORMATTERS.get(name, XMLFormatter()) else: - return self.HTML_FORMATTERS.get( - name, HTMLAwareEntitySubstitution.substitute_xml) + return self.HTML_FORMATTERS.get(name, HTMLFormatter()) def setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None): @@ -316,6 +361,14 @@ class PageElement(object): and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) + from bs4 import BeautifulSoup + if isinstance(new_child, BeautifulSoup): + # We don't want to end up with a situation where one BeautifulSoup + # object contains another. Insert the children one at a time. + for subchild in list(new_child.contents): + self.insert(position, subchild) + position += 1 + return position = min(position, len(self.contents)) if hasattr(new_child, 'parent') and new_child.parent is not None: # We're 'inserting' an element that's already one @@ -862,7 +915,7 @@ class Tag(PageElement): self.can_be_empty_element = builder.can_be_empty_element(name) else: self.can_be_empty_element = False - + parserClass = _alias("parser_class") # BS3 def __copy__(self): @@ -1129,11 +1182,10 @@ class Tag(PageElement): encoding. """ - # First off, turn a string formatter into a function. This + # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. - if not callable(formatter): + if not isinstance(formatter, Formatter) and not callable(formatter): formatter = self._formatter_for_name(formatter) - attrs = [] if self.attrs: for key, val in sorted(self.attrs.items()): @@ -1162,7 +1214,7 @@ class Tag(PageElement): prefix = self.prefix + ":" if self.is_empty_element: - close = '/' + close = formatter.void_element_close_prefix or '' else: closeTag = '</%s%s>' % (prefix, self.name) @@ -1233,9 +1285,9 @@ class Tag(PageElement): :param formatter: The output formatter responsible for converting entities to Unicode characters. """ - # First off, turn a string formatter into a function. This + # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. - if not callable(formatter): + if not isinstance(formatter, Formatter) and not callable(formatter): formatter = self._formatter_for_name(formatter) pretty_print = (indent_level is not None) @@ -1418,7 +1470,7 @@ class Tag(PageElement): if tag_name == '': raise ValueError( "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) + pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) found = [] if pseudo_attributes is None: pseudo_type = pseudo @@ -1652,7 +1704,7 @@ class SoupStrainer(object): markup = markup_name markup_attrs = markup call_function_with_tag_data = ( - isinstance(self.name, collections.Callable) + isinstance(self.name, Callable) and not isinstance(markup_name, Tag)) if ((not self.name) @@ -1732,7 +1784,7 @@ class SoupStrainer(object): # True matches any non-None value. return markup is not None - if isinstance(match_against, collections.Callable): + if isinstance(match_against, Callable): return match_against(markup) # Custom callables take the tag as an argument, but all |