summaryrefslogtreecommitdiff
path: root/bs4/element.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/element.py')
-rw-r--r--bs4/element.py110
1 files changed, 81 insertions, 29 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 9ef75f8..911b9bc 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -2,7 +2,10 @@
# found in the LICENSE file.
__license__ = "MIT"
-import collections
+try:
+ from collections.abc import Callable # Python 3.6
+except ImportError , e:
+ from collections import Callable
import re
import shlex
import sys
@@ -12,7 +15,7 @@ from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
-whitespace_re = re.compile("\s+")
+whitespace_re = re.compile(r"\s+")
def _alias(attr):
"""Alias one attribute name to another for backward compatibility"""
@@ -69,7 +72,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
The value of the 'content' attribute will be one of these objects.
"""
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+ CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
def __new__(cls, original_value):
match = cls.CHARSET_RE.search(original_value)
@@ -123,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_xml)
+class Formatter(object):
+ """Contains information about how to format a parse tree."""
+
+ # By default, represent void elements as <tag/> rather than <tag>
+ void_element_close_prefix = '/'
+
+ def substitute_entities(self, *args, **kwargs):
+ """Transform certain characters into named entities."""
+ raise NotImplementedError()
+
+class HTMLFormatter(Formatter):
+ """The default HTML formatter."""
+ def substitute(self, *args, **kwargs):
+ return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+class MinimalHTMLFormatter(Formatter):
+ """A minimal HTML formatter."""
+ def substitute(self, *args, **kwargs):
+ return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTML5Formatter(HTMLFormatter):
+ """An HTML formatter that omits the slash in a void tag."""
+ void_element_close_prefix = None
+
+class XMLFormatter(Formatter):
+ """Substitute only the essential XML entities."""
+ def substitute(self, *args, **kwargs):
+ return EntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTMLXMLFormatter(Formatter):
+ """Format XML using HTML rules."""
+ def substitute(self, *args, **kwargs):
+ return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -131,40 +169,49 @@ class PageElement(object):
# to methods like encode() and prettify():
#
# "html" - All Unicode characters with corresponding HTML entities
- # are converted to those entities on output.
- # "minimal" - Bare ampersands and angle brackets are converted to
+ # are converted to those entities on output.
+ # "html5" - The same as "html", but empty void tags are represented as
+ # <tag> rather than <tag/>
+ # "minimal" - Bare ampersands and angle brackets are converted to
# XML entities: &amp; &lt; &gt;
# None - The null formatter. Unicode characters are never
# converted to entities. This is not recommended, but it's
# faster than "minimal".
- # A function - This function will be called on every string that
+ # A callable function - it will be called on every string that needs to undergo entity substitution.
+ # A Formatter instance - Formatter.substitute(string) will be called on every string that
# needs to undergo entity substitution.
#
- # In an HTML document, the default "html" and "minimal" functions
- # will leave the contents of <script> and <style> tags alone. For
- # an XML document, all tags will be given the same treatment.
+ # In an HTML document, the default "html", "html5", and "minimal"
+ # functions will leave the contents of <script> and <style> tags
+ # alone. For an XML document, all tags will be given the same
+ # treatment.
HTML_FORMATTERS = {
- "html" : HTMLAwareEntitySubstitution.substitute_html,
- "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
+ "html" : HTMLFormatter(),
+ "html5" : HTML5Formatter(),
+ "minimal" : MinimalHTMLFormatter(),
None : None
}
XML_FORMATTERS = {
- "html" : EntitySubstitution.substitute_html,
- "minimal" : EntitySubstitution.substitute_xml,
+ "html" : HTMLXMLFormatter(),
+ "minimal" : XMLFormatter(),
None : None
}
def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter."""
- if not callable(formatter):
+ if isinstance(formatter, basestring):
formatter = self._formatter_for_name(formatter)
if formatter is None:
output = s
else:
- output = formatter(s)
+ if callable(formatter):
+ # Backwards compatibility -- you used to pass in a formatting method.
+ output = formatter(s)
+ else:
+ output = formatter.substitute(s)
return output
@property
@@ -194,11 +241,9 @@ class PageElement(object):
def _formatter_for_name(self, name):
"Look up a formatter function based on its name and the tree."
if self._is_xml:
- return self.XML_FORMATTERS.get(
- name, EntitySubstitution.substitute_xml)
+ return self.XML_FORMATTERS.get(name, XMLFormatter())
else:
- return self.HTML_FORMATTERS.get(
- name, HTMLAwareEntitySubstitution.substitute_xml)
+ return self.HTML_FORMATTERS.get(name, HTMLFormatter())
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
@@ -316,6 +361,14 @@ class PageElement(object):
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
+ from bs4 import BeautifulSoup
+ if isinstance(new_child, BeautifulSoup):
+ # We don't want to end up with a situation where one BeautifulSoup
+ # object contains another. Insert the children one at a time.
+ for subchild in list(new_child.contents):
+ self.insert(position, subchild)
+ position += 1
+ return
position = min(position, len(self.contents))
if hasattr(new_child, 'parent') and new_child.parent is not None:
# We're 'inserting' an element that's already one
@@ -862,7 +915,7 @@ class Tag(PageElement):
self.can_be_empty_element = builder.can_be_empty_element(name)
else:
self.can_be_empty_element = False
-
+
parserClass = _alias("parser_class") # BS3
def __copy__(self):
@@ -1129,11 +1182,10 @@ class Tag(PageElement):
encoding.
"""
- # First off, turn a string formatter into a function. This
+ # First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
- if not callable(formatter):
+ if not isinstance(formatter, Formatter) and not callable(formatter):
formatter = self._formatter_for_name(formatter)
-
attrs = []
if self.attrs:
for key, val in sorted(self.attrs.items()):
@@ -1162,7 +1214,7 @@ class Tag(PageElement):
prefix = self.prefix + ":"
if self.is_empty_element:
- close = '/'
+ close = formatter.void_element_close_prefix or ''
else:
closeTag = '</%s%s>' % (prefix, self.name)
@@ -1233,9 +1285,9 @@ class Tag(PageElement):
:param formatter: The output formatter responsible for converting
entities to Unicode characters.
"""
- # First off, turn a string formatter into a function. This
+ # First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
- if not callable(formatter):
+ if not isinstance(formatter, Formatter) and not callable(formatter):
formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None)
@@ -1418,7 +1470,7 @@ class Tag(PageElement):
if tag_name == '':
raise ValueError(
"A pseudo-class must be prefixed with a tag name.")
- pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+ pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = []
if pseudo_attributes is None:
pseudo_type = pseudo
@@ -1652,7 +1704,7 @@ class SoupStrainer(object):
markup = markup_name
markup_attrs = markup
call_function_with_tag_data = (
- isinstance(self.name, collections.Callable)
+ isinstance(self.name, Callable)
and not isinstance(markup_name, Tag))
if ((not self.name)
@@ -1732,7 +1784,7 @@ class SoupStrainer(object):
# True matches any non-None value.
return markup is not None
- if isinstance(match_against, collections.Callable):
+ if isinstance(match_against, Callable):
return match_against(markup)
# Custom callables take the tag as an argument, but all