author    Leonard Richardson <leonard.richardson@canonical.com>  2012-02-07 23:40:06 -0500
committer Leonard Richardson <leonard.richardson@canonical.com>  2012-02-07 23:40:06 -0500
commit    bd9711feb69ce4791c5de44b2f67f54be194071a (patch)
tree      f6795cbc8b2c415e57a1dadcd3e72a6dc5b687b1 /bs4/doc
parent    c12086f610fe914db122489bc5b268d705297fc6 (diff)
download  beautifulsoup4-bd9711feb69ce4791c5de44b2f67f54be194071a.tar.gz
Package the docs with the code.
Diffstat (limited to 'bs4/doc')
-rw-r--r--  bs4/doc/Makefile          |  130
-rw-r--r--  bs4/doc/__init__.py       |    1
-rw-r--r--  bs4/doc/source/6.1.jpg    |  bin 0 -> 22619 bytes
-rw-r--r--  bs4/doc/source/conf.py    |  256
-rw-r--r--  bs4/doc/source/index.rst  | 2427
5 files changed, 2814 insertions, 0 deletions
diff --git a/bs4/doc/Makefile b/bs4/doc/Makefile
new file mode 100644
index 0000000..8c833d2
--- /dev/null
+++ b/bs4/doc/Makefile
@@ -0,0 +1,130 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ -rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ make -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/bs4/doc/__init__.py b/bs4/doc/__init__.py
new file mode 100644
index 0000000..a2920fe
--- /dev/null
+++ b/bs4/doc/__init__.py
@@ -0,0 +1 @@
+"""Executable documentation about beautifulsoup."""
diff --git a/bs4/doc/source/6.1.jpg b/bs4/doc/source/6.1.jpg
new file mode 100644
index 0000000..97014f0
--- /dev/null
+++ b/bs4/doc/source/6.1.jpg
Binary files differ
diff --git a/bs4/doc/source/conf.py b/bs4/doc/source/conf.py
new file mode 100644
index 0000000..56c0939
--- /dev/null
+++ b/bs4/doc/source/conf.py
@@ -0,0 +1,256 @@
+# -*- coding: utf-8 -*-
+#
+# Beautiful Soup documentation build configuration file, created by
+# sphinx-quickstart on Thu Jan 26 11:22:55 2012.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = []
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Beautiful Soup'
+copyright = u'2012, Leonard Richardson'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '4'
+# The full version, including alpha/beta/rc tags.
+release = '4.0.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = []
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'BeautifulSoupdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+# The paper size ('letter' or 'a4').
+#latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+#latex_font_size = '10pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+ ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation',
+ u'Leonard Richardson', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Additional stuff for the LaTeX preamble.
+#latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'beautifulsoup', u'Beautiful Soup Documentation',
+ [u'Leonard Richardson'], 1)
+]
+
+
+# -- Options for Epub output ---------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = u'Beautiful Soup'
+epub_author = u'Leonard Richardson'
+epub_publisher = u'Leonard Richardson'
+epub_copyright = u'2012, Leonard Richardson'
+
+# The language of the text. It defaults to the language option
+# or en if the language is not set.
+#epub_language = ''
+
+# The scheme of the identifier. Typical schemes are ISBN or URL.
+#epub_scheme = ''
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#epub_identifier = ''
+
+# A unique identification for the text.
+#epub_uid = ''
+
+# HTML files that should be inserted before the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#epub_pre_files = []
+
+# HTML files that should be inserted after the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#epub_post_files = []
+
+# A list of files that should not be packed into the epub file.
+#epub_exclude_files = []
+
+# The depth of the table of contents in toc.ncx.
+#epub_tocdepth = 3
+
+# Allow duplicate toc entries.
+#epub_tocdup = True
diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst
new file mode 100644
index 0000000..ad1dbf7
--- /dev/null
+++ b/bs4/doc/source/index.rst
@@ -0,0 +1,2427 @@
+Beautiful Soup Documentation
+============================
+
+.. image:: 6.1.jpg
+ :align: right
+ :alt: "The Fish-Footman began by producing from under his arm a great letter, nearly as large as himself."
+
+`Beautiful Soup <http://www.crummy.com/software/BeautifulSoup/>`_ is a
+Python library for pulling data out of HTML and XML files. It works
+with your favorite parser to provide idiomatic ways of navigating,
+searching, and modifying the parse tree. It commonly saves programmers
+hours or days of work.
+
+These instructions illustrate all major features of Beautiful Soup 4,
+with examples. I show you what the library is good for, how it works,
+how to use it, how to make it do what you want, and what to do when it
+violates your expectations.
+
+The examples in this documentation should work the same way in Python
+2.7 and Python 3.2.
+
+You might be looking for the documentation for `Beautiful Soup 3
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_. If
+you want to learn about the differences between Beautiful Soup 3 and
+Beautiful Soup 4, see `Porting code to BS4`_.
+
+Getting help
+------------
+
+If you have questions about Beautiful Soup, or run into problems,
+`send mail to the discussion group
+<http://groups.google.com/group/beautifulsoup/>`_.
+
+Quick Start
+===========
+
+Here's an HTML document I'll be using as an example throughout this
+document. It's part of a story from `Alice in Wonderland`::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+Running the "three sisters" document through Beautiful Soup gives us a
+``BeautifulSoup`` object, which represents the document as a nested
+data structure::
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(html_doc)
+
+ print(soup.prettify())
+ # <html>
+ # <head>
+ # <title>
+ # The Dormouse's story
+ # </title>
+ # </head>
+ # <body>
+ # <p class="title">
+ # <b>
+ # The Dormouse's story
+ # </b>
+ # </p>
+ # <p class="story">
+ # Once upon a time there were three little sisters; and their names were
+ # <a class="sister" href="http://example.com/elsie" id="link1">
+ # Elsie
+ # </a>
+ # ,
+ # <a class="sister" href="http://example.com/lacie" id="link2">
+ # Lacie
+ # </a>
+ # and
+ # <a class="sister" href="http://example.com/tillie" id="link2">
+ # Tillie
+ # </a>
+ # ; and they lived at the bottom of a well.
+ # </p>
+ # <p class="story">
+ # ...
+ # </p>
+ # </body>
+ # </html>
+
+Here are some simple ways to navigate that data structure::
+
+ soup.title
+ # <title>The Dormouse's story</title>
+
+ soup.title.name
+ # u'title'
+
+ soup.title.string
+ # u'The Dormouse's story'
+
+ soup.title.parent.name
+ # u'head'
+
+ soup.p
+ # <p class="title"><b>The Dormouse's story</b></p>
+
+ soup.p['class']
+ # u'title'
+
+ soup.a
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ soup.find_all('a')
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ soup.find(id="link3")
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+One common task is extracting all the URLs found within a page's <a> tags::
+
+ for link in soup.find_all('a'):
+     print(link.get('href'))
+ # http://example.com/elsie
+ # http://example.com/lacie
+ # http://example.com/tillie
+
+Another common task is extracting all the text from a page::
+
+ print(soup.get_text())
+ # The Dormouse's story
+ #
+ # The Dormouse's story
+ #
+ # Once upon a time there were three little sisters; and their names were
+ # Elsie,
+ # Lacie and
+ # Tillie;
+ # and they lived at the bottom of a well.
+ #
+ # ...
+
+Does this look like what you need? If so, read on.
+
+Installing Beautiful Soup
+=========================
+
+Beautiful Soup 4 is published through PyPI, so you can install it with
+``easy_install`` or ``pip``. The package name is ``beautifulsoup4``,
+and the same package works on Python 2 and Python 3.
+
+:kbd:`$ easy_install beautifulsoup4`
+
+:kbd:`$ pip install beautifulsoup4`
+
+(The ``BeautifulSoup`` package is probably `not` what you want. That's
+the previous major release, `Beautiful Soup 3`_. Lots of software uses
+BS3, so it's still available, but if you're writing new code you
+should install ``beautifulsoup4``.)
+
+You can also `download the Beautiful Soup 4 source tarball
+<http://www.crummy.com/software/BeautifulSoup/download/4.x/>`_ and
+install it with ``setup.py``. The license for Beautiful Soup allows
+you to package the entire library with your application, so you can
+copy the ``bs4`` directory into your application's codebase.
+
+I use Python 2.7 and Python 3.2 to develop Beautiful Soup, but it
+should work with other recent versions.
+
+.. _parser-installation:
+
+Be sure to install a good parser!
+---------------------------------
+
+Beautiful Soup uses a plugin system that supports a number of popular
+Python parsers. If no third-party parsers are installed, Beautiful
+Soup uses the HTML parser that comes with Python. In recent releases
+of Python (2.7.2 and 3.2.2), this parser works pretty well at handling
+bad HTML. In older releases, it's not so good.
+
+Even if you're using a recent release of Python, I recommend you
+install the `lxml parser <http://lxml.de/>`_ if possible. It's much
+faster than Python's built-in parser. It works with both Python 2 and
+Python 3, and it parses HTML and XML very well. Beautiful Soup will
+detect that you have lxml installed, and use it instead of Python's
+built-in parser.
+
+Depending on your setup, you might install lxml with one of these commands:
+
+:kbd:`$ apt-get install python-lxml`
+
+:kbd:`$ easy_install lxml`
+
+:kbd:`$ pip install lxml`
+
+If you're using Python 2, another alternative is the pure-Python
+`html5lib parser <http://code.google.com/p/html5lib/>`_, which parses
+HTML the way a web browser does. Depending on your setup, you might
+install html5lib with one of these commands:
+
+:kbd:`$ apt-get install python-html5lib`
+
+:kbd:`$ easy_install html5lib`
+
+:kbd:`$ pip install html5lib`
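+
+If you're not sure which of these parsers actually got installed, a
+quick check (just plain Python, nothing specific to Beautiful Soup) is
+to try importing each one::
+
+ for module_name in ["lxml", "html5lib"]:
+     try:
+         __import__(module_name)
+         print("%s is available" % module_name)
+     except ImportError:
+         print("%s is not installed" % module_name)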
+
+Making the soup
+===============
+
+To parse a document, pass it into the ``BeautifulSoup``
+constructor. You can pass in a string or an open filehandle::
+
+ from bs4 import BeautifulSoup
+
+ soup = BeautifulSoup(open("index.html"))
+
+ soup = BeautifulSoup("<html>data</html>")
+
+First, the document is converted to Unicode, and HTML entities are
+converted to Unicode characters::
+
+ BeautifulSoup("Sacr&eacute; bleu!")
+ # <html><head></head><body>Sacré bleu!</body></html>
+
+Beautiful Soup then parses the document using the best available
+parser. It will use an HTML parser unless you specifically tell it to
+use an XML parser. (See `Choosing a parser`_.)
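+
+For example, here's a minimal sketch (assuming you've installed lxml)
+that tells Beautiful Soup to parse a document as XML, by passing a
+second argument into the constructor::
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup("<document><content/></document>", "xml")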
+
+Kinds of objects
+================
+
+Beautiful Soup transforms a complex HTML document into a complex tree
+of Python objects. But you'll only ever have to deal with about four
+`kinds` of objects.
+
+.. _Tag:
+
+``Tag``
+-------
+
+A ``Tag`` object corresponds to an XML or HTML tag in the original document::
+
+ soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
+ tag = soup.b
+ type(tag)
+ # <class 'bs4.element.Tag'>
+
+Tags have a lot of attributes and methods, and I'll cover most of them
+in `Navigating the tree`_ and `Searching the tree`_. For now, the most
+important features of a tag are its name and attributes.
+
+Name
+^^^^
+
+Every tag has a name, accessible as ``.name``::
+
+ tag.name
+ # u'b'
+
+If you change a tag's name, the change will be reflected in any HTML
+markup generated by Beautiful Soup::
+
+ tag.name = "blockquote"
+ tag
+ # <blockquote class="boldest">Extremely bold</blockquote>
+
+Attributes
+^^^^^^^^^^
+
+A tag may have any number of attributes. The tag ``<b
+class="boldest">`` has an attribute "class" whose value is
+"boldest". You can access a tag's attributes by treating the tag like
+a dictionary::
+
+ tag['class']
+ # u'boldest'
+
+You can access that dictionary directly as ``.attrs``::
+
+ tag.attrs
+ # {u'class': u'boldest'}
+
+You can add, remove, and modify a tag's attributes. Again, this is
+done by treating the tag as a dictionary::
+
+ tag['class'] = 'verybold'
+ tag['id'] = 1
+ tag
+ # <blockquote class="verybold" id="1">Extremely bold</blockquote>
+
+ del tag['class']
+ del tag['id']
+ tag
+ # <blockquote>Extremely bold</blockquote>
+
+``NavigableString``
+-------------------
+
+A string corresponds to a bit of text within a tag. Beautiful Soup
+defines the ``NavigableString`` class to contain these bits of text::
+
+ tag.string
+ # u'Extremely bold'
+ type(tag.string)
+ # <class 'bs4.element.NavigableString'>
+
+A ``NavigableString`` is just like a Python Unicode string, except
+that it also supports some of the features described in `Navigating
+the tree`_ and `Searching the tree`_. You can convert a
+``NavigableString`` to a Unicode string with ``unicode()``::
+
+ unicode_string = unicode(tag.string)
+ unicode_string
+ # u'Extremely bold'
+ type(unicode_string)
+ # <type 'unicode'>
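+
+(If you're running the examples under Python 3, where there is no
+``unicode()`` built-in, the equivalent conversion is ``str()``)::
+
+ unicode_string = str(tag.string)  # Python 3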
+
+You can't edit a string in place, but you can replace one string with
+another, using :ref:`replace_with`::
+
+ tag.string.replace_with("No longer bold")
+ tag
+ # <blockquote>No longer bold</blockquote>
+
+``NavigableString`` supports most of the features described in
+`Navigating the tree`_ and `Searching the tree`_, but not all of
+them. In particular, since a string can't contain anything (the way a
+tag may contain a string or another tag), strings don't support the
+``.contents`` or ``.string`` attributes, or the ``find()`` method.
+
+``BeautifulSoup``
+-----------------
+
+The ``BeautifulSoup`` object itself represents the document as a
+whole. For most purposes, you can treat it as a :ref:`Tag`
+object. This means it supports most of the methods described in
+`Navigating the tree`_ and `Searching the tree`_.
+
+Since the ``BeautifulSoup`` object doesn't correspond to an actual
+HTML or XML tag, it has no name and no attributes. But sometimes it's
+useful to look at its ``.name``, so it's been given the special
+``.name`` "[document]"::
+
+ soup.name
+ # u'[document]'
+
+Comments and other special strings
+----------------------------------
+
+``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost
+everything you'll see in an HTML or XML file, but there are a few
+leftover bits. The only one you'll probably ever need to worry about
+is the comment::
+
+ markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
+ soup = BeautifulSoup(markup)
+ comment = soup.b.string
+ type(comment)
+ # <class 'bs4.element.Comment'>
+
+The ``Comment`` object is just a special type of ``NavigableString``::
+
+ comment
+ # u'Hey, buddy. Want to buy a used parser?'
+
+But when it appears as part of an HTML document, a ``Comment`` is
+displayed with special formatting::
+
+ print(soup.b.prettify())
+ # <b>
+ # <!--Hey, buddy. Want to buy a used parser?-->
+ # </b>
+
+Beautiful Soup defines classes for anything else that might show up in
+an XML document: ``CData``, ``ProcessingInstruction``,
+``Declaration``, and ``Doctype``. Just like ``Comment``, these classes
+are subclasses of ``NavigableString`` that add something extra to the
+string. Here's an example that replaces the comment with a CDATA
+block::
+
+ from bs4 import CData
+ cdata = CData("A CDATA block")
+ comment.replace_with(cdata)
+
+ print(soup.b.prettify())
+ # <b>
+ # <![CDATA[A CDATA block]]>
+ # </b>
+
+
+Navigating the tree
+===================
+
+Here's the "Three sisters" HTML document again::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(html_doc)
+
+I'll use this as an example to show you how to move from one part of
+a document to another.
+
+Going down
+----------
+
+Tags may contain strings and other tags. These elements are the tag's
+`children`. Beautiful Soup provides a lot of different attributes for
+navigating and iterating over a tag's children.
+
+Note that Beautiful Soup strings don't support any of these
+attributes, because a string can't have children.
+
+Navigating using tag names
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The simplest way to navigate the parse tree is to say the name of the
+tag you want. If you want the <head> tag, just say ``soup.head``::
+
+ soup.head
+ # <head><title>The Dormouse's story</title></head>
+
+ soup.title
+ # <title>The Dormouse's story</title>
+
+You can use this trick again and again to zoom in on a certain part
+of the parse tree. This code gets the first <b> tag beneath the <body> tag::
+
+ soup.body.b
+ # <b>The Dormouse's story</b>
+
+Using a tag name as an attribute will give you only the `first` tag by that
+name::
+
+ soup.a
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+If you need to get `all` the <a> tags, or anything more complicated
+than the first tag with a certain name, you'll need to use one of the
+methods described in `Searching the tree`_, such as `find_all()`::
+
+ soup.find_all('a')
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+``.contents`` and ``.children``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A tag's children are available in a list called ``.contents``::
+
+ head_tag = soup.head
+ head_tag
+ # <head><title>The Dormouse's story</title></head>
+
+ head_tag.contents
+ # [<title>The Dormouse's story</title>]
+
+ title_tag = head_tag.contents[0]
+ title_tag
+ # <title>The Dormouse's story</title>
+ title_tag.contents
+ # [u'The Dormouse's story']
+
+The ``BeautifulSoup`` object itself has children. In this case, the
+<html> tag is the child of the ``BeautifulSoup`` object::
+
+ len(soup.contents)
+ # 1
+ soup.contents[0].name
+ # u'html'
+
+A string does not have ``.contents``, because it can't contain
+anything::
+
+ text = title_tag.contents[0]
+ text.contents
+ # AttributeError: 'NavigableString' object has no attribute 'contents'
+
+Instead of getting them as a list, you can iterate over a tag's
+children using the ``.children`` generator::
+
+ for child in title_tag.children:
+     print(child)
+ # The Dormouse's story
+
+``.descendants``
+^^^^^^^^^^^^^^^^
+
+The ``.contents`` and ``.children`` attributes only consider a tag's
+`direct` children. For instance, the <head> tag has a single direct
+child--the <title> tag::
+
+ head_tag.contents
+ # [<title>The Dormouse's story</title>]
+
+But the <title> tag itself has a child: the string "The Dormouse's
+story". There's a sense in which that string is also a child of the
+<head> tag. The ``.descendants`` attribute lets you iterate over `all`
+of a tag's children, recursively: its direct children, the children of
+its direct children, and so on::
+
+ for child in head_tag.descendants:
+     print(child)
+ # <title>The Dormouse's story</title>
+ # The Dormouse's story
+
+The <head> tag has only one child, but it has two descendants: the
+<title> tag and the <title> tag's child. The ``BeautifulSoup`` object
+only has one direct child (the <html> tag), but it has a whole lot of
+descendants::
+
+ len(list(soup.children))
+ # 1
+ len(list(soup.descendants))
+ # 25
+
+.. _.string:
+
+``.string``
+^^^^^^^^^^^
+
+If a tag has only one child, and that child is a string, the string is
+made available as ``.string``::
+
+ title_tag.string
+ # u'The Dormouse's story'
+
+If a tag's only child is another tag, and `that` tag has a
+``.string``, then the parent tag is considered to have the same
+``.string`` as its child::
+
+ head_tag.contents
+ # [<title>The Dormouse's story</title>]
+
+ head_tag.string
+ # u'The Dormouse's story'
+
+If a tag contains more than one thing, then it's not clear what
+``.string`` should refer to, so ``.string`` is defined to be
+``None``::
+
+ print(soup.html.string)
+ # None
+
+.. _string-generators:
+
+``.strings`` and ``.stripped_strings``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If there's more than one thing inside a tag, you can still look at
+just the strings. Use the ``.strings`` generator::
+
+ for string in soup.strings:
+     print(repr(string))
+ # u"The Dormouse's story"
+ # u'\n\n'
+ # u"The Dormouse's story"
+ # u'\n\n'
+ # u'Once upon a time there were three little sisters; and their names were\n'
+ # u'Elsie'
+ # u',\n'
+ # u'Lacie'
+ # u' and\n'
+ # u'Tillie'
+ # u';\nand they lived at the bottom of a well.'
+ # u'\n\n'
+ # u'...'
+ # u'\n'
+
+These strings tend to have a lot of extra whitespace, which you can
+remove by using the ``.stripped_strings`` generator instead::
+
+ for string in soup.stripped_strings:
+     print(repr(string))
+ # u"The Dormouse's story"
+ # u"The Dormouse's story"
+ # u'Once upon a time there were three little sisters; and their names were'
+ # u'Elsie'
+ # u','
+ # u'Lacie'
+ # u'and'
+ # u'Tillie'
+ # u';\nand they lived at the bottom of a well.'
+ # u'...'
+
+Here, strings consisting entirely of whitespace are ignored, and
+whitespace at the beginning and end of strings is removed.
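+
+Because ``.stripped_strings`` is an ordinary generator, you can feed
+it into everyday Python code. Here's a small sketch that joins the
+stripped fragments into a single string::
+
+ " ".join(soup.stripped_strings)
+ # u"The Dormouse's story The Dormouse's story Once upon a time ..."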
+
+Going up
+--------
+
+Continuing the "family tree" analogy, every tag and every string has a
+`parent`: the tag that contains it.
+
+.. _.parent:
+
+``.parent``
+^^^^^^^^^^^
+
+You can access an element's parent with the ``.parent`` attribute. In
+the example "three sisters" document, the <head> tag is the parent
+of the <title> tag::
+
+ title_tag = soup.title
+ title_tag
+ # <title>The Dormouse's story</title>
+ title_tag.parent
+ # <head><title>The Dormouse's story</title></head>
+
+The title string itself has a parent: the <title> tag that contains
+it::
+
+ title_tag.string.parent
+ # <title>The Dormouse's story</title>
+
+The parent of a top-level tag like <html> is the ``BeautifulSoup`` object
+itself::
+
+ html_tag = soup.html
+ type(html_tag.parent)
+ # <class 'bs4.BeautifulSoup'>
+
+And the ``.parent`` of a ``BeautifulSoup`` object is defined as None::
+
+ print(soup.parent)
+ # None
+
+.. _.parents:
+
+``.parents``
+^^^^^^^^^^^^
+
+You can iterate over all of an element's parents with
+``.parents``. This example uses ``.parents`` to travel from an <a> tag
+buried deep within the document, to the very top of the document::
+
+ link = soup.a
+ link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+ for parent in link.parents:
+     if parent is None:
+         print(parent)
+     else:
+         print(parent.name)
+ # p
+ # body
+ # html
+ # [document]
+ # None
+
+Going sideways
+--------------
+
+Consider a simple document like this::
+
+ sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>")
+ print(sibling_soup.prettify())
+ # <html>
+ # <body>
+ # <a>
+ # <b>
+ # text1
+ # </b>
+ # <c>
+ # text2
+ # </c>
+ # </a>
+ # </body>
+ # </html>
+
+The <b> tag and the <c> tag are at the same level: they're both direct
+children of the same tag. We call them `siblings`. When a document is
+pretty-printed, siblings show up at the same indentation level. You
+can also use this relationship in the code you write.
+
+``.next_sibling`` and ``.previous_sibling``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can use ``.next_sibling`` and ``.previous_sibling`` to navigate
+between page elements that are on the same level of the parse tree::
+
+ sibling_soup.b.next_sibling
+ # <c>text2</c>
+
+ sibling_soup.c.previous_sibling
+ # <b>text1</b>
+
+The <b> tag has a ``.next_sibling``, but no ``.previous_sibling``,
+because there's nothing before the <b> tag `on the same level of the
+tree`. For the same reason, the <c> tag has a ``.previous_sibling``
+but no ``.next_sibling``::
+
+ print(sibling_soup.b.previous_sibling)
+ # None
+ print(sibling_soup.c.next_sibling)
+ # None
+
+The strings "text1" and "text2" are `not` siblings, because they don't
+have the same parent::
+
+ sibling_soup.b.string
+ # u'text1'
+
+ print(sibling_soup.b.string.next_sibling)
+ # None
+
+In real documents, the ``.next_sibling`` or ``.previous_sibling`` of a
+tag will usually be a string containing whitespace. Going back to the
+"three sisters" document::
+
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
+
+You might think that the ``.next_sibling`` of the first <a> tag would
+be the second <a> tag. But actually, it's a string: the comma and
+newline that separate the first <a> tag from the second::
+
+ link = soup.a
+ link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ link.next_sibling
+ # u',\n'
+
+The second <a> tag is actually the ``.next_sibling`` of the comma::
+
+ link.next_sibling.next_sibling
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+
+.. _sibling-generators:
+
+``.next_siblings`` and ``.previous_siblings``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can iterate over a tag's siblings with ``.next_siblings`` or
+``.previous_siblings``::
+
+ for sibling in soup.a.next_siblings:
+     print(repr(sibling))
+ # u',\n'
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+ # u' and\n'
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+ # u'; and they lived at the bottom of a well.'
+ # None
+
+ for sibling in soup.find(id="link3").previous_siblings:
+     print(repr(sibling))
+ # u' and\n'
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
+ # u',\n'
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+ # u'Once upon a time there were three little sisters; and their names were\n'
+ # None
+
+Going back and forth
+--------------------
+
+Take a look at the beginning of the "three sisters" document::
+
+ <html><head><title>The Dormouse's story</title></head>
+ <p class="title"><b>The Dormouse's story</b></p>
+
+An HTML parser takes this string of characters and turns it into a
+series of events: "open an <html> tag", "open a <head> tag", "open a
+<title> tag", "add a string", "close the <title> tag", "open a <p>
+tag", and so on. Beautiful Soup offers tools for reconstructing the
+initial parse of the document.
+
+.. _element-generators:
+
+``.next_element`` and ``.previous_element``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``.next_element`` attribute of a string or tag points to whatever
+was parsed immediately afterwards. It might be the same as
+``.next_sibling``, but it's usually drastically different.
+
+Here's the final <a> tag in the "three sisters" document. Its
+``.next_sibling`` is a string: the conclusion of the sentence that was
+interrupted by the start of the <a> tag::
+
+ last_a_tag = soup.find("a", id="link3")
+ last_a_tag
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+ last_a_tag.next_sibling
+ # u'; and they lived at the bottom of a well.'
+
+But the ``.next_element`` of that <a> tag, the thing that was parsed
+immediately after the <a> tag, is `not` the rest of that sentence:
+it's the word "Tillie"::
+
+ last_a_tag.next_element
+ # u'Tillie'
+
+That's because in the original markup, the word "Tillie" appeared
+before that semicolon. The parser encountered an <a> tag, then the
+word "Tillie", then the closing </a> tag, then the semicolon and rest of
+the sentence. The semicolon is on the same level as the <a> tag, but the
+word "Tillie" was encountered first.
+
+The ``.previous_element`` attribute is the exact opposite of
+``.next_element``. It points to whatever element was parsed
+immediately before this one::
+
+ last_a_tag.previous_element
+ # u' and\n'
+ last_a_tag.previous_element.next_element
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+``.next_elements`` and ``.previous_elements``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You should get the idea by now. You can use these iterators to move
+forward or backward in the document as it was parsed::
+
+ for element in last_a_tag.next_elements:
+     print(repr(element))
+ # u'Tillie'
+ # u';\nand they lived at the bottom of a well.'
+ # u'\n\n'
+ # <p class="story">...</p>
+ # u'...'
+ # u'\n'
+ # None
+
+Searching the tree
+==================
+
+Beautiful Soup defines a lot of methods for searching the parse tree,
+but they're all very similar. I'm going to spend a lot of time explaining
+the two most popular methods: ``find()`` and ``find_all()``. The other
+methods take almost exactly the same arguments, so I'll just cover
+them briefly.
+
+Once again, I'll be using the "three sisters" document as an example::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(html_doc)
+
+By passing in a filter to a method like ``find_all()``, you can
+isolate whatever parts of the document you're interested in.
+
+Kinds of filters
+----------------
+
+Before talking in detail about ``find_all()`` and similar methods, I
+want to show examples of different filters you can pass into these
+methods. These filters show up again and again, throughout the
+search API. You can use them to filter based on a tag's name,
+on its attributes, on the text of a string, or on some combination of
+these.
+
+.. _a string:
+
+A string
+^^^^^^^^
+
+The simplest filter is a string. Pass a string to a search method and
+Beautiful Soup will perform a match against that exact string. This
+code finds all the <b> tags in the document::
+
+ soup.find_all('b')
+ # [<b>The Dormouse's story</b>]
+
+.. _a regular expression:
+
+A regular expression
+^^^^^^^^^^^^^^^^^^^^
+
+If you pass in a regular expression object, Beautiful Soup will filter
+against that regular expression. This code finds all the tags whose
+names start with the letter "b"; in this case, the <body> tag and the
+<b> tag::
+
+ import re
+ for tag in soup.find_all(re.compile("b.*")):
+     print(tag.name)
+ # body
+ # b
+
+.. _a list:
+
+A list
+^^^^^^
+
+If you pass in a list, Beautiful Soup will allow a string match
+against `any` item in that list. This code finds all the <a> tags
+`and` all the <b> tags::
+
+ soup.find_all(["a", "b"])
+ # [<b>The Dormouse's story</b>,
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+.. _the value True:
+
+``True``
+^^^^^^^^
+
+The value ``True`` matches everything it can. This code finds `all`
+the tags in the document, but none of the text strings::
+
+ for tag in soup.find_all(True):
+     print(tag.name)
+ # html
+ # head
+ # title
+ # body
+ # p
+ # b
+ # p
+ # a
+ # a
+ # a
+ # p
+
+.. _a function:
+
+A function
+^^^^^^^^^^
+
+If none of the other matches work for you, define a function that
+takes an element as its only argument. The function should return
+``True`` if the argument matches, and ``False`` otherwise.
+
+Here's a function that returns ``True`` if a tag defines the "class"
+attribute but doesn't define the "id" attribute::
+
+ def has_class_but_no_id(tag):
+     return tag.has_key('class') and not tag.has_key('id')
+
+Pass this function into ``find_all()`` and you'll pick up all the <p>
+tags::
+
+ soup.find_all(has_class_but_no_id)
+ # [<p class="title"><b>The Dormouse's story</b></p>,
+ # <p class="story">Once upon a time there were...</p>,
+ # <p class="story">...</p>]
+
+This function only picks up the <p> tags. It doesn't pick up the <a>
+tags, because those tags define both "class" and "id". It doesn't pick
+up tags like <html> and <title>, because those tags don't define
+"class".
+
+Here's a function that returns ``True`` if a tag is surrounded by
+string objects::
+
+ from bs4 import NavigableString
+ def surrounded_by_strings(tag):
+     return (isinstance(tag.next_element, NavigableString)
+             and isinstance(tag.previous_element, NavigableString))
+
+ for tag in soup.find_all(surrounded_by_strings):
+     print(tag.name)
+ # p
+ # a
+ # a
+ # a
+ # p
+
+Now we're ready to look at the search methods in detail.
+
+``find_all()``
+--------------
+
+Signature: find_all(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive
+<recursive>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+The ``find_all()`` method looks through a tag's descendants and
+retrieves `all` descendants that match your filters. I gave several
+examples in `Kinds of filters`_, but here are a few more::
+
+ soup.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+ soup.find_all("p", "title")
+ # [<p class="title"><b>The Dormouse's story</b></p>]
+
+ soup.find_all("a")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ soup.find_all(id="link2")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+ import re
+ soup.find(text=re.compile("sisters"))
+ # u'Once upon a time there were three little sisters; and their names were\n'
+
+Some of these should look familiar, but others are new. What does it
+mean to pass in a value for ``text``, or ``id``? Why does
+``find_all("p", "title")`` find a <p> tag with the CSS class "title"?
+Let's look at the arguments to ``find_all()``.
+
+.. _name:
+
+The ``name`` argument
+^^^^^^^^^^^^^^^^^^^^^
+
+Pass in a value for ``name`` and you'll tell Beautiful Soup to only
+consider tags with certain names. Text strings will be ignored, as
+will tags whose names don't match.
+
+This is the simplest usage::
+
+ soup.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+Recall from `Kinds of filters`_ that the value to ``name`` can be `a
+string`_, `a regular expression`_, `a list`_, `a function`_, or `the value
+True`_.
+
+.. _kwargs:
+
+The keyword arguments
+^^^^^^^^^^^^^^^^^^^^^
+
+Any argument that's not recognized will be turned into a filter on tag
+attributes. If you pass in a value for an argument called ``id``,
+Beautiful Soup will filter against the tag's 'id' attribute::
+
+ soup.find_all(id='link2')
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+If you pass in a value for ``href``, Beautiful Soup will filter
+against the tag's 'href' attribute::
+
+ soup.find_all(href=re.compile("elsie"))
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+You can filter an attribute based on `a string`_, `a regular
+expression`_, `a list`_, `a function`_, or `the value True`_.
+
+This code finds all tags that have an ``id`` attribute, regardless of
+what the value is::
+
+ soup.find_all(id=True)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+You can filter multiple attributes at once by passing in more than one
+keyword argument::
+
+ soup.find_all(href=re.compile("elsie"), id='link1')
+ # [<a class="sister" href="http://example.com/elsie" id="link1">three</a>]
+
+.. _attrs:
+
+``attrs`` and searching by CSS class
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Instead of using keyword arguments, you can filter tags based on their
+attributes by passing a dictionary in for ``attrs``. These two lines of
+code are equivalent::
+
+ soup.find_all(href=re.compile("elsie"), id='link1')
+ soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'})
+
+The ``attrs`` argument would be a pretty obscure feature were it not for
+one thing: CSS. It's very useful to search for a tag that has a
+certain CSS class, but the name of the CSS attribute, "class", is also a
+Python reserved word.
+
+You can use ``attrs`` to search by CSS class::
+
+ soup.find_all("a", { "class" : "sister" })
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+But that's a lot of code for such a common operation. Instead of a
+dictionary, you can pass a string for ``attrs``. The string will be
+used to restrict the CSS class::
+
+ soup.find_all("a", "sister")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+.. _text:
+
+The ``text`` argument
+^^^^^^^^^^^^^^^^^^^^^
+
+With ``text`` you can search for strings instead of tags. As with
+``name`` and the keyword arguments, you can pass in `a string`_, `a
+regular expression`_, `a list`_, `a function`_, or `the value True`_.
+Here are some examples::
+
+ soup.find_all(text="Elsie")
+ # [u'Elsie']
+
+ soup.find_all(text=["Tillie", "Elsie", "Lacie"])
+ # [u'Elsie', u'Lacie', u'Tillie']
+
+ soup.find_all(text=re.compile("Dormouse"))
+ [u"The Dormouse's story", u"The Dormouse's story"]
+
+ def is_the_only_string_within_a_tag(s):
+ """Return True if this string is the only child of its parent tag."""
+ return (s == s.parent.string)
+
+ soup.find_all(text=is_the_only_string_within_a_tag)
+ # [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...']
+
+.. _limit:
+
+The ``limit`` argument
+^^^^^^^^^^^^^^^^^^^^^^
+
+``find_all()`` returns all the tags and strings that match your
+filters. This can take a while if the document is large. If you don't
+need `all` the results, you can pass in a number for ``limit``. This
+works just like the LIMIT keyword in SQL. It tells Beautiful Soup to
+stop gathering results after it's found a certain number.
+
+There are three links in the "three sisters" document, but this code
+only finds the first two::
+
+ soup.find_all("a", limit=2)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+.. _recursive:
+
+The ``recursive`` argument
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you call ``mytag.find_all()``, Beautiful Soup will examine all the
+descendants of ``mytag``: its children, its children's children, and
+so on. If you only want Beautiful Soup to consider direct children,
+you can pass in ``recursive=False``. See the difference here::
+
+ soup.html.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+ soup.html.find_all("title", recursive=False)
+ # []
+
+Here's that part of the document::
+
+ <html>
+ <head>
+ <title>
+ The Dormouse's story
+ </title>
+ </head>
+ ...
+
+The <title> tag is beneath the <html> tag, but it's not `directly`
+beneath the <html> tag: the <head> tag is in the way. Beautiful Soup
+finds the <title> tag when it's allowed to look at all descendants of
+the <html> tag, but when ``recursive=False`` restricts it to the
+<html> tag's immediate children, it finds nothing.
+
+Beautiful Soup offers a lot of tree-searching methods (covered below),
+and they mostly take the same arguments as ``find_all()``: ``name``,
+``attrs``, ``text``, ``limit``, and the keyword arguments. But the
+``recursive`` argument is different: ``find_all()`` and ``find()`` are
+the only methods that support it. Passing ``recursive=False`` into a
+method like ``find_parents()`` wouldn't be very useful.
+
+Calling a tag is like calling ``find_all()``
+--------------------------------------------
+
+Because ``find_all()`` is the most popular method in the Beautiful
+Soup search API, you can use a shortcut for it. If you treat the
+``BeautifulSoup`` object or a ``Tag`` object as though it were a
+function, then it's the same as calling ``find_all()`` on that
+object. These two lines of code are equivalent::
+
+ soup.find_all("a")
+ soup("a")
+
+These two lines are also equivalent::
+
+ soup.title.find_all(text=True)
+ soup.title(text=True)
+
+``find()``
+----------
+
+Signature: find(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive
+<recursive>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`)
+
+The ``find_all()`` method scans the entire document looking for
+results, but sometimes you only want to find one result. If you know a
+document only has one <body> tag, it's a waste of time to scan the
+entire document looking for more. Rather than passing in ``limit=1``
+every time you call ``find_all``, you can use the ``find()``
+method. These two lines of code are `nearly` equivalent::
+
+ soup.find_all('title', limit=1)
+ # [<title>The Dormouse's story</title>]
+
+ soup.find('title')
+ # <title>The Dormouse's story</title>
+
+The only difference is that ``find_all()`` returns a list containing
+the single result, and ``find()`` just returns the result.
+
+If ``find_all()`` can't find anything, it returns an empty list. If
+``find()`` can't find anything, it returns ``None``::
+
+ print(soup.find("nosuchtag"))
+ # None
+
+Remember the ``soup.head.title`` trick from `Navigating using tag
+names`_? That trick works by repeatedly calling ``find()``::
+
+ soup.head.title
+ # <title>The Dormouse's story</title>
+
+ soup.find("head").find("title")
+ # <title>The Dormouse's story</title>
+
+``find_parents()`` and ``find_parent()``
+----------------------------------------
+
+Signature: find_parents(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Signature: find_parent(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`)
+
+I spent a lot of time above covering ``find_all()`` and
+``find()``. The Beautiful Soup API defines ten other methods for
+searching the tree, but don't be afraid. Five of these methods are
+basically the same as ``find_all()``, and the other five are basically
+the same as ``find()``. The only differences are in what parts of the
+tree they search.
+
+First let's consider ``find_parents()`` and
+``find_parent()``. Remember that ``find_all()`` and ``find()`` work
+their way down the tree, looking at a tag's descendants. These methods
+do the opposite: they work their way `up` the tree, looking at a tag's
+(or a string's) parents. Let's try them out, starting from a string
+buried deep in the "three daughters" document::
+
+ a_string = soup.find(text="Lacie")
+ a_string
+ # u'Lacie'
+
+ a_string.find_parents("a")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+ a_string.find_parent("p")
+ # <p class="story">Once upon a time there were three little sisters; and their names were
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
+ # and they lived at the bottom of a well.</p>
+
+ a_string.find_parents("p", class="title")
+ # []
+
+One of the three <a> tags is the direct parent of the string in
+question, so our search finds it. One of the three <p> tags is an
+indirect parent of the string, and our search finds that as
+well. There's a <p> tag with the CSS class "title" `somewhere` in the
+document, but it's not one of this string's parents, so we can't find
+it with ``find_parents()``.
+
+You may have made the connection between ``find_parent()`` and
+``find_parents()``, and the `.parent`_ and `.parents`_ attributes
+mentioned earlier. The connection is very strong. These search methods
+actually use ``.parents`` to iterate over all the parents, and check
+each one against the provided filter to see if it matches.
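+
+In other words, you can picture ``find_parents("a")`` as roughly
+equivalent to this sketch (not the actual implementation)::
+
+ [parent for parent in a_string.parents if parent.name == "a"]
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]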
+
+``find_next_siblings()`` and ``find_next_sibling()``
+----------------------------------------------------
+
+Signature: find_next_siblings(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Signature: find_next_sibling(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`)
+
+These methods use :ref:`.next_siblings <sibling-generators>` to
+iterate over the rest of an element's siblings in the tree. The
+``find_next_siblings()`` method returns all the siblings that match,
+and ``find_next_sibling()`` only returns the first one::
+
+ first_link = soup.a
+ first_link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ first_link.find_next_siblings("a")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ first_story_paragraph = soup.find("p", "story")
+ first_story_paragraph.find_next_sibling("p")
+ # <p class="story">...</p>
+
+``find_previous_siblings()`` and ``find_previous_sibling()``
+------------------------------------------------------------
+
+Signature: find_previous_siblings(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Signature: find_previous_sibling(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`)
+
+These methods use :ref:`.previous_siblings <sibling-generators>` to iterate over an element's
+siblings that precede it in the tree. The ``find_previous_siblings()``
+method returns all the siblings that match, and
+``find_previous_sibling()`` only returns the first one::
+
+ last_link = soup.find("a", id="link3")
+ last_link
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+ last_link.find_previous_siblings("a")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+ first_story_paragraph = soup.find("p", "story")
+ first_story_paragraph.find_previous_sibling("p")
+ # <p class="title"><b>The Dormouse's story</b></p>
+
+
+``find_all_next()`` and ``find_next()``
+---------------------------------------
+
+Signature: find_all_next(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Signature: find_next(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`)
+
+These methods use :ref:`.next_elements <element-generators>` to
+iterate over the tags and strings that come after an element in the
+document. The ``find_all_next()`` method returns all matches, and
+``find_next()`` only returns the first match::
+
+ first_link = soup.a
+ first_link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ first_link.find_all_next(text=True)
+ # [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
+ # u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n']
+
+ first_link.find_next("p")
+ # <p class="story">...</p>
+
+In the first example, the string "Elsie" showed up, even though it was
+contained within the <a> tag we started from. In the second example,
+the last <p> tag in the document showed up, even though it's not in
+the same part of the tree as the <a> tag we started from. For these
+methods, all that matters is that an element match the filter and
+show up later in the document than the starting element.
+
+``find_all_previous()`` and ``find_previous()``
+-----------------------------------------------
+
+Signature: find_all_previous(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Signature: find_previous(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`)
+
+These methods use :ref:`.previous_elements <element-generators>` to
+iterate over the tags and strings that come before an element in the
+document. The ``find_all_previous()`` method returns all matches, and
+``find_previous()`` only returns the first match::
+
+ first_link = soup.a
+ first_link
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+
+ first_link.find_all_previous("p")
+ # [<p class="story">Once upon a time there were three little sisters; ...</p>,
+ # <p class="title"><b>The Dormouse's story</b></p>]
+
+ first_link.find_previous("title")
+ # <title>The Dormouse's story</title>
+
+The call to ``find_all_previous("p")`` found the first paragraph in
+the document (the one with class="title"), but it also found the
+second paragraph, the <p> tag that contains the <a> tag we started
+with. This shouldn't be too surprising: we're looking at all the tags
+that show up earlier in the document than the one we started with. A
+<p> tag that contains an <a> tag must have shown up earlier in the
+document.
+
+Modifying the tree
+==================
+
+Beautiful Soup's main strength is in searching the parse tree, but you
+can also modify the tree and write your changes as a new HTML or XML
+document.
+
+Changing tag names and attributes
+---------------------------------
+
+I covered this earlier, in `Attributes`_, but it bears repeating. You
+can rename a tag, change the values of its attributes, add new
+attributes, and delete attributes::
+
+ soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
+ tag = soup.b
+
+ tag.name = "blockquote"
+ tag['class'] = 'verybold'
+ tag['id'] = 1
+ tag
+ # <blockquote class="verybold" id="1">Extremely bold</blockquote>
+
+ del tag['class']
+ del tag['id']
+ tag
+ # <blockquote>Extremely bold</blockquote>
+
+
+Modifying ``.string``
+---------------------
+
+If you set a tag's ``.string`` attribute, the tag's contents are
+replaced with the string you give::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+
+ tag = soup.a
+ tag.string = "New link text."
+ tag
+ # <a href="http://example.com/">New link text.</a>
+
+Be careful: if the tag contained other tags, they and all their
+contents will be destroyed.
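+
+You can see this in the example above: the <i> tag that used to be
+inside the <a> tag is gone::
+
+ print(soup.find("i"))
+ # None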
+
+``append()``
+------------
+
+You can add to a tag's contents with ``Tag.append()``. It works just
+like calling ``.append()`` on a Python list::
+
+ soup = BeautifulSoup("<a>Foo</a>")
+ soup.a.append("Bar")
+
+ soup
+ # <html><head></head><body><a>FooBar</a></body></html>
+ soup.a.contents
+ # [u'Foo', u'Bar']
+
+``BeautifulSoup.new_string()`` and ``.new_tag()``
+-------------------------------------------------
+
+If you need to add a string to a document, no problem--you can pass a
+Python string in to ``append()``, or you can call the factory method
+``BeautifulSoup.new_string()``::
+
+ soup = BeautifulSoup("<b></b>")
+ tag = soup.b
+ tag.append("Hello")
+ new_string = soup.new_string(" there")
+ tag.append(new_string)
+ tag
+ # <b>Hello there</b>
+ tag.contents
+ # [u'Hello', u' there']
+
+What if you need to create a whole new tag? The best solution is to
+call the factory method ``BeautifulSoup.new_tag()``::
+
+ soup = BeautifulSoup("<b></b>")
+ original_tag = soup.b
+
+ new_tag = soup.new_tag("a", href="http://www.example.com")
+ original_tag.append(new_tag)
+ original_tag
+ # <b><a href="http://www.example.com"></a></b>
+
+ new_tag.string = "Link text."
+ original_tag
+ # <b><a href="http://www.example.com">Link text.</a></b>
+
+Only the first argument, the tag name, is required.
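+
+For instance, this hypothetical call creates an empty <p> tag with no
+attributes::
+
+ another_tag = soup.new_tag("p")
+ another_tag
+ # <p></p>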
+
+``insert()``
+------------
+
+``Tag.insert()`` is just like ``Tag.append()``, except the new element
+doesn't necessarily go at the end of its parent's
+``.contents``. It'll be inserted at whatever numeric position you
+say. It works just like ``.insert()`` on a Python list::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ tag = soup.a
+
+ tag.insert(1, "but did not endorse ")
+ tag
+ # <a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>
+ tag.contents
+ # [u'I linked to ', u'but did not endorse ', <i>example.com</i>]
+
+``move_before()`` and ``move_after()``
+------------------------------------------
+
+The ``move_before()`` method moves a tag or string so that it
+immediately precedes something else in the parse tree::
+
+ soup = BeautifulSoup("<b>stop</b>")
+ tag = soup.new_tag("i")
+ tag.string = "Don't"
+ tag.move_before(soup.b.string)
+ soup.b
+ # <b><i>Don't</i>stop</b>
+
+The ``move_after()`` method moves a tag or string so that it
+immediately follows something else in the parse tree::
+
+ soup.new_string(" ever ").move_after(soup.b.i)
+ soup.b
+ # <b><i>Don't</i> ever stop</b>
+ soup.b.contents
+ # [<i>Don't</i>, u' ever ', u'stop']
+
+``clear()``
+-----------
+
+``Tag.clear()`` removes the contents of a tag::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ tag = soup.a
+
+ tag.clear()
+ tag
+ # <a href="http://example.com/"></a>
+
+``extract()``
+-------------
+
+``PageElement.extract()`` removes a tag or string from the tree. It
+returns the tag or string that was extracted::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ i_tag = soup.i.extract()
+
+ a_tag
+ # <a href="http://example.com/">I linked to</a>
+
+ i_tag
+ # <i>example.com</i>
+
+ print(i_tag.parent)
+ # None
+
+At this point you effectively have two parse trees: one rooted at the
+``BeautifulSoup`` object you used to parse the document, and one rooted
+at the tag that was extracted. You can go on to call ``extract`` on
+a child of the element you extracted::
+
+ my_string = i_tag.string.extract()
+ my_string
+ # u'example.com'
+
+ print(my_string.parent)
+ # None
+ i_tag
+ # <i></i>
+
+
+``decompose()``
+---------------
+
+``Tag.decompose()`` removes a tag from the tree, then `completely
+destroys it and its contents`::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ soup.i.decompose()
+
+ a_tag
+ # <a href="http://example.com/">I linked to</a>
+
+
+.. _replace_with:
+
+``replace_with()``
+------------------
+
+``PageElement.replace_with()`` removes a tag or string from the tree,
+and replaces it with the tag or string of your choice::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ new_tag = soup.new_tag("b")
+ new_tag.string = "example.net"
+ a_tag.i.replace_with(new_tag)
+
+ a_tag
+ # <a href="http://example.com/">I linked to <b>example.net</b></a>
+
+``replace_with()`` returns the tag or string that was replaced, so
+that you can examine it or add it back to another part of the tree.
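+
+Continuing the example, something like this should work (``new_b_tag``
+and ``old_tag`` are hypothetical names)::
+
+ new_b_tag = soup.new_tag("b")
+ new_b_tag.string = "example.org"
+ old_tag = a_tag.b.replace_with(new_b_tag)
+ old_tag
+ # <b>example.net</b>
+ a_tag
+ # <a href="http://example.com/">I linked to <b>example.org</b></a>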
+
+``replace_with_children()``
+---------------------------
+
+``Tag.replace_with_children()`` replaces a tag with whatever's inside
+that tag. It's good for stripping out markup::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ a_tag.i.replace_with_children()
+ a_tag
+ # <a href="http://example.com/">I linked to example.com</a>
+
+Like ``replace_with()``, ``replace_with_children()`` returns the tag
+that was replaced.
+
+Output
+======
+
+Pretty-printing
+---------------
+
+The ``prettify()`` method will turn a Beautiful Soup parse tree into a
+nicely formatted bytestring, with each HTML/XML tag on its own line::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ soup.prettify()
+ # '<html>\n <head>\n </head>\n <body>\n <a href="http://example.com/">\n...'
+
+ print(soup.prettify())
+ # <html>
+ # <head>
+ # </head>
+ # <body>
+ # <a href="http://example.com/">
+ # I linked to
+ # <i>
+ # example.com
+ # </i>
+ # </a>
+ # </body>
+ # </html>
+
+You can call ``prettify()`` on the top-level ``BeautifulSoup`` object,
+or on any of its ``Tag`` objects::
+
+ print(soup.a.prettify())
+ # <a href="http://example.com/">
+ # I linked to
+ # <i>
+ # example.com
+ # </i>
+ # </a>
+
+Non-pretty printing
+-------------------
+
+If you just want a string, with no fancy formatting, you can call
+``unicode()`` or ``str()`` on a ``BeautifulSoup`` object, or a ``Tag``
+within it::
+
+ str(soup)
+ # '<html><head></head><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>'
+
+ unicode(soup.a)
+ # u'<a href="http://example.com/">I linked to <i>example.com</i></a>'
+
+The ``str()`` function returns a string encoded in UTF-8. See
+`Encodings`_ for other options.
+
+You can also call ``encode()`` to get a bytestring, and ``decode()``
+to get Unicode.
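+
+For example, on the soup from above::
+
+ soup.a.encode("utf-8")
+ # '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+
+ soup.a.decode()
+ # u'<a href="http://example.com/">I linked to <i>example.com</i></a>'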
+
+Output formatters
+-----------------
+
+If you give Beautiful Soup a document that contains HTML entities like
+"&ldquo;", they'll be converted to Unicode characters::
+
+ soup = BeautifulSoup("&ldquo;Hello,&rdquo; he said.")
+ unicode(soup)
+ # u'<html><head></head><body>\u201cHello,\u201d he said.</body></html>'
+
+If you then convert the document to a string, the Unicode characters
+will be encoded as UTF-8. You won't get the HTML entities back::
+
+ str(soup)
+ # '<html><head></head><body>\xe2\x80\x9cHello,\xe2\x80\x9d he said.</body></html>'
+
+By default, the only characters that are escaped upon output are bare
+ampersands and angle brackets. These get turned into "&amp;", "&lt;",
+and "&gt;", so that Beautiful Soup doesn't inadvertently generate
+invalid HTML or XML::
+
+ soup = BeautifulSoup("<p>The law firm of Dewey, Cheatem, & Howe</p>")
+ soup.p
+ # <p>The law firm of Dewey, Cheatem, &amp; Howe</p>
+
+You can change this behavior by providing a value for the
+``formatter`` argument to ``prettify()``, ``encode()``, or
+``decode()``. Beautiful Soup recognizes four possible values for
+``formatter``.
+
+The default is ``formatter="minimal"``. Strings will only be processed
+enough to ensure that Beautiful Soup generates valid HTML/XML::
+
+ french = "<p>Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;</p>"
+ soup = BeautifulSoup(french)
+ print(soup.prettify(formatter="minimal"))
+ # <html>
+ # <body>
+ # <p>
+ # Il a dit &lt;&lt;Sacré bleu!&gt;&gt;
+ # </p>
+ # </body>
+ # </html>
+
+``formatter="html"`` will convert Unicode characters to HTML entities
+whenever possible::
+
+ print(soup.prettify(formatter="html"))
+ # <html>
+ # <body>
+ # <p>
+ # Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
+ # </p>
+ # </body>
+ # </html>
+
+If you pass in ``formatter=None``, Beautiful Soup will not modify
+strings at all on output. This is the fastest option, but it may lead
+to Beautiful Soup generating invalid HTML/XML, as in this example::
+
+ print(soup.prettify(formatter=None))
+ # <html>
+ # <body>
+ # <p>
+ # Il a dit <<Sacré bleu!>>
+ # </p>
+ # </body>
+ # </html>
+
+
+Finally, if you pass in a function for ``formatter``, Beautiful Soup
+will call that function once for every string in the document. You can
+do whatever you want in this function. Here's a formatter that
+converts strings to uppercase and does absolutely nothing else::
+
+ def uppercase(str):
+ return str.upper()
+
+ print(soup.prettify(formatter=uppercase))
+ # <html>
+ # <body>
+ # <p>
+ # IL A DIT <<SACRÉ BLEU!>>
+ # </p>
+ # </body>
+ # </html>
+
+If you're writing your own function, you should know about the
+``EntitySubstitution`` class in the ``bs4.dammit`` module. This class
+implements Beautiful Soup's standard formatters as class methods: the
+"html" formatter is ``EntitySubstitution.substitute_html``, and the
+"minimal" formatter is ``EntitySubstitution.substitute_xml``. You can
+use these functions to simulate ``formatter=html`` or
+``formatter==minimal`` but and then do something in addition.
+
+Here's an example that converts strings to uppercase `and` replaces
+Unicode characters with HTML entities whenever possible::
+
+ from bs4.dammit import EntitySubstitution
+ def uppercase_and_substitute_html_entities(str):
+ return EntitySubstitution.substitute_html(str.upper())
+
+ print(soup.prettify(formatter=uppercase_and_substitute_html_entities))
+ # <html>
+ # <body>
+ # <p>
+ # IL A DIT &lt;&lt;SACR&Eacute; BLEU!&gt;&gt;
+ # </p>
+ # </body>
+ # </html>
+
+``get_text()``
+--------------
+
+If you only want the text part of a document or tag, you can use the
+``get_text()`` method. It returns all the text in a document or
+beneath a tag, as a single Unicode string::
+
+ markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
+ soup = BeautifulSoup(markup)
+
+ soup.get_text()
+ # u'\nI linked to example.com\n'
+ soup.i.get_text()
+ # u'example.com'
+
+You can specify a string to be used to join the bits of text
+together::
+
+ soup.get_text("|")
+ # u'\nI linked to |example.com|\n'
+
+You can tell Beautiful Soup to strip whitespace from the beginning and
+end of each bit of text::
+
+ soup.get_text("|", strip=True)
+ # u'I linked to|example.com'
+
+But at that point you might want to use the :ref:`.stripped_strings <string-generators>`
+generator instead, and process the text yourself::
+
+ [text for text in soup.stripped_strings]
+ # [u'I linked to', u'example.com']
+
+Choosing a parser
+=================
+
+If you just need to parse some HTML, you can dump the markup into the
+``BeautifulSoup`` constructor, and it'll probably be fine. Beautiful
+Soup will pick a parser for you and parse the data. But there are a
+few additional arguments you can pass in to the constructor to change
+which parser is used.
+
+The first argument to the ``BeautifulSoup`` constructor is a string or
+an open filehandle--the markup you want parsed. The second argument is
+`how` you'd like the markup parsed.
+
+If you don't specify anything, you'll get the best HTML parser that's
+installed. Beautiful Soup ranks lxml's parser as being the best, then
+html5lib's, then Python's built-in parser. You can override this by
+specifying one of the following:
+
+* What type of markup you want to parse. Currently supported are
+ "html", "xml", and "html5".
+
+* The name of the parser library you want to use. Currently supported
+ options are "lxml", "html5lib", and "html.parser" (Python's
+ built-in HTML parser).
+
+Some examples::
+
+ BeautifulSoup(markup, "lxml")
+ BeautifulSoup(markup, "xml")
+ BeautifulSoup(markup, "html5")
+
+You can specify a list of the parser features you want, instead of
+just one. Right now this is mostly useful for distinguishing between
+lxml's HTML parser and its XML parser::
+
+ BeautifulSoup(markup, ["html", "lxml"])
+ BeautifulSoup(markup, ["xml", "lxml"])
+
+If you don't have an appropriate parser installed, Beautiful Soup will
+ignore your request and pick a different parser. For instance, right
+now the only supported XML parser is lxml, so if you don't have lxml
+installed, asking for an XML parser won't give you one, and asking for
+"lxml" won't work either.
+
+Why would you use one parser over another? Because different parsers
+will create different parse trees from the same document. The biggest
+differences are between HTML parsers and XML parsers. Here's a short
+document, parsed as HTML::
+
+ BeautifulSoup("<a><b /></a>")
+ # <html><head></head><body><a><b></b></a></body></html>
+
+Since an empty <b /> tag is not valid HTML, the parser turns it into a
+<b></b> tag pair.
+
+Here's the same document parsed as XML (running this requires that you
+have lxml installed). Note that the empty <b /> tag is left alone, and
+that the document is given an XML declaration instead of being put
+into an <html> tag::
+
+ BeautifulSoup("<a><b /></a>", "xml")
+ # <?xml version="1.0" encoding="utf-8"?>
+ # <a><b /></a>
+
+There are also differences between HTML parsers. If you give Beautiful
+Soup a perfectly-formed HTML document, these differences won't
+matter. One parser may be faster than another, but they'll all give
+you a data structure that looks exactly like the original HTML
+document.
+
+But if the document is not perfectly-formed, different parsers will
+give different results. Here's a short, invalid document parsed using
+lxml's HTML parser. Note that the dangling </p> tag is simply
+ignored::
+
+ BeautifulSoup("<a></p>", "lxml")
+ # <html><body><a></a></body></html>
+
+Here's the same document parsed using html5lib::
+
+ BeautifulSoup("<a></p>", "html5lib")
+ # <html><head></head><body><a><p></p></a></body></html>
+
+Instead of ignoring the dangling </p> tag, html5lib pairs it with an
+opening <p> tag. This parser also adds an empty <head> tag to the
+document.
+
+Here's the same document parsed with Python's built-in HTML
+parser::
+
+ BeautifulSoup("<a></p>", "html.parser")
+ # <a></a>
+
+Like lxml, this parser ignores the closing </p> tag. Unlike
+html5lib, this parser makes no attempt to create a well-formed HTML
+document by adding a <body> tag. Unlike lxml, it doesn't even bother
+to add an <html> tag.
+
+Since the document "<a></p>" is invalid, none of these techniques is
+the "correct" way to handle it. The html5lib parser uses techniques
+that are part of the HTML5 standard, so it has the best claim on being
+the "correct" way, but all three techniques are leigtimate.
+
+Differences between parsers can affect your script. If you're planning
+on distributing your script to other people, you might want to specify
+in the ``BeautifulSoup`` constructor which parser you used during
+development. That will reduce the chances that your users parse a
+document differently from the way you parse it.
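+
+That is, instead of relying on whatever happens to be installed, name
+a parser explicitly::
+
+ soup = BeautifulSoup(markup, "lxml")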
+
+
+Encodings
+=========
+
+Any HTML or XML document is written in a specific encoding like ASCII
+or UTF-8. But when you load that document into Beautiful Soup, you'll
+discover it's been converted to Unicode::
+
+ markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
+ soup = BeautifulSoup(markup)
+ soup.h1
+ # <h1>Sacré bleu!</h1>
+ soup.h1.string
+ # u'Sacr\xe9 bleu!'
+
+It's not magic. (That sure would be nice.) Beautiful Soup uses a
+sub-library called `Unicode, Dammit`_ to detect a document's encoding
+and convert it to Unicode. The autodetected encoding is available as
+the ``.original_encoding`` attribute of the ``BeautifulSoup`` object::
+
+ soup.original_encoding
+ # 'utf-8'
+
+Unicode, Dammit guesses correctly most of the time, but sometimes it
+makes mistakes. Sometimes it guesses correctly, but only after a
+byte-by-byte search of the document that takes a very long time. If
+you happen to know a document's encoding ahead of time, you can avoid
+mistakes and delays by passing it to the ``BeautifulSoup`` constructor
+as ``from_encoding``.
+
+Here's a document written in ISO-8859-8. The document is so short that
+Unicode, Dammit can't get a good lock on it, and misidentifies it as
+ISO-8859-7::
+
+ markup = b"<h1>\xed\xe5\xec\xf9</h1>"
+ soup = BeautifulSoup(markup)
+ soup.h1
+ # <h1>νεμω</h1>
+ soup.original_encoding
+ # 'ISO-8859-7'
+
+We can fix this by passing in the correct ``from_encoding``::
+
+ soup = BeautifulSoup(markup, from_encoding="iso-8859-8")
+ soup.h1
+ # <h1>םולש</h1>
+ soup.original_encoding
+ # 'iso8859-8'
+
+Output encoding
+---------------
+
+When you write out a document from Beautiful Soup, you get a UTF-8
+document, even if the document wasn't in UTF-8 to begin with. Here's a
+document written in the Latin-1 encoding::
+
+ markup = b'''
+ <html>
+ <head>
+ <meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
+ </head>
+ <body>
+ <p>Sacr\xe9 bleu!</p>
+ </body>
+ </html>
+ '''
+
+ soup = BeautifulSoup(markup)
+ print(soup.prettify())
+ # <html>
+ # <head>
+ # <meta content="text/html; charset=utf-8" http-equiv="Content-type" />
+ # </head>
+ # <body>
+ # <p>
+ # Sacré bleu!
+ # </p>
+ # </body>
+ # </html>
+
+Note that the <meta> tag has been rewritten to reflect the fact that
+the document is now in UTF-8.
+
+If you don't want UTF-8, you can pass an encoding into ``prettify()``::
+
+ print(soup.prettify("latin-1"))
+ # <html>
+ # <head>
+ # <meta content="text/html; charset=latin-1" http-equiv="Content-type" />
+ # ...
+
+You can also call encode() on the ``BeautifulSoup`` object, or any
+element in the soup, just as if it were a Python string::
+
+ soup.p.encode("latin-1")
+ # '<p>Sacr\xe9 bleu!</p>'
+
+ soup.p.encode("utf-8")
+ # '<p>Sacr\xc3\xa9 bleu!</p>'
+
+Unicode, Dammit
+---------------
+
+You can use Unicode, Dammit without using Beautiful Soup. It's useful
+whenever you have data in an unknown encoding and you just want it to
+become Unicode::
+
+ from bs4 import UnicodeDammit
+ dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
+ print(dammit.unicode_markup)
+ # Sacré bleu!
+ dammit.original_encoding
+ # 'utf-8'
+
+The more data you give Unicode, Dammit, the more accurately it will
+guess. If you have your own suspicions as to what the encoding might
+be, you can pass them in as a list::
+
+ dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
+ print(dammit.unicode_markup)
+ # Sacré bleu!
+ dammit.original_encoding
+ # 'latin-1'
+
+Unicode, Dammit has one special feature that Beautiful Soup doesn't
+use. You can use it to convert Microsoft smart quotes to HTML or XML
+entities::
+
+ markup = b"<p>I just \x93love\x94 Microsoft Word</p>"
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
+ # u'<p>I just &ldquo;love&rdquo; Microsoft Word</p>'
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup
+ # u'<p>I just &#x201C;love&#x201D; Microsoft Word</p>'
+
+You might find this feature useful, but Beautiful Soup doesn't use
+it. Beautiful Soup prefers the default behavior, which is to convert
+Microsoft smart quotes to Unicode characters along with everything
+else::
+
+ UnicodeDammit(markup, ["windows-1252"]).unicode_markup
+ # u'<p>I just \u201clove\u201d Microsoft Word</p>'
+
+Parsing only part of a document
+===============================
+
+Let's say you want to use Beautiful Soup to look at a document's <a>
+tags. It's a waste of time and memory to parse the entire document and
+then go over it again looking for <a> tags. It would be much faster to
+ignore everything that wasn't an <a> tag in the first place. The
+``SoupStrainer`` class allows you to choose which parts of an incoming
+document are parsed. You just create a ``SoupStrainer`` and pass it in
+to the ``BeautifulSoup`` constructor as the ``parse_only`` argument.
+
+(Note that *this feature won't work if you're using the html5lib
+parser*. If you use html5lib, the whole document will be parsed, no
+matter what. In the examples below, I'll be forcing Beautiful Soup to
+use Python's built-in parser.)
+
+``SoupStrainer``
+----------------
+
+The ``SoupStrainer`` class takes the same arguments as a typical
+method from `Searching the tree`_: :ref:`name <name>`, :ref:`attrs
+<attrs>`, :ref:`text <text>`, and :ref:`**kwargs <kwargs>`. Here are
+three ``SoupStrainer`` objects::
+
+ from bs4 import SoupStrainer
+
+ only_a_tags = SoupStrainer("a")
+
+ only_tags_with_id_link2 = SoupStrainer(id="link2")
+
+ def is_short_string(string):
+ return len(string) < 10
+
+ only_short_strings = SoupStrainer(text=is_short_string)
+
+I'm going to bring back the "three sisters" document one more time,
+and we'll see what the document looks like when it's parsed with these
+three ``SoupStrainer`` objects::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())
+ # <a class="sister" href="http://example.com/elsie" id="link1">
+ # Elsie
+ # </a>
+ # <a class="sister" href="http://example.com/lacie" id="link2">
+ # Lacie
+ # </a>
+ # <a class="sister" href="http://example.com/tillie" id="link3">
+ # Tillie
+ # </a>
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())
+ # <a class="sister" href="http://example.com/lacie" id="link2">
+ # Lacie
+ # </a>
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
+ # Elsie
+ # ,
+ # Lacie
+ # and
+ # Tillie
+ # ...
+ #
+
+You can also pass a ``SoupStrainer`` into any of the methods covered
+in `Searching the tree`_. This probably isn't terribly useful, but I
+thought I'd mention it::
+
+ soup = BeautifulSoup(html_doc)
+ soup.find_all(only_short_strings)
+ # [u'\n\n', u'\n\n', u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
+ # u'\n\n', u'...', u'\n']
+
+Troubleshooting
+===============
+
+Parsing XML
+-----------
+
+By default, Beautiful Soup parses documents as HTML. To parse a
+document as XML, pass in "xml" as the second argument to the
+``BeautifulSoup`` constructor::
+
+ soup = BeautifulSoup(markup, "xml")
+
+You'll need to :ref:`have lxml installed <parser-installation>`.
+
+Improving Performance
+---------------------
+
+Beautiful Soup will never be as fast as the parsers it sits on top
+of. If response time is critical, if you're paying for computer time
+by the hour, or if there's any other reason why computer time is more
+valuable than programmer time, you should forget about Beautiful Soup
+and work directly atop `lxml <http://lxml.de/>`_.
+
+That said, there are things you can do to speed up Beautiful Soup. If
+you're not using lxml as the underlying parser, my advice is to
+:ref:`start <parser-installation>`. Beautiful Soup parses documents
+significantly faster using lxml than using html.parser or html5lib.
+
+Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by
+doing a byte-by-byte examination of the file. This slows Beautiful
+Soup to a crawl. My tests indicate that this only happened on 2.x
+versions of Python, and that it happened most often with documents
+using Russian or Chinese encodings. If this is happening to you, you
+can fix it by using Python 3 for your script. Or, if you happen to
+know a document's encoding, you can pass it into the
+``BeautifulSoup`` constructor as ``from_encoding``.
+
+`Parsing only part of a document`_ won't save you much time parsing
+the document, but it can save a lot of memory, and it'll make
+`searching` the document much faster.
+
+Beautiful Soup 3
+================
+
+Beautiful Soup 3.2.0 is the final release of the Beautiful Soup 3
+series, the old version of the library. It's currently the version
+packaged with all major Linux distributions:
+
+:kbd:`$ apt-get install python-beautifulsoup`
+
+It's also published through PyPI as ``BeautifulSoup``:
+
+:kbd:`$ easy_install BeautifulSoup`
+
+:kbd:`$ pip install BeautifulSoup`
+
+You can also `download a tarball of Beautiful Soup 3.2.0
+<http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz>`_.
+
+If you ran ``easy_install beautifulsoup`` or ``easy_install
+BeautifulSoup``, but your code doesn't work, you installed Beautiful
+Soup 3 by mistake. You need to run ``easy_install beautifulsoup4``.
+
+`The documentation for Beautiful Soup 3 is archived online
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_. If
+your first language is Chinese, it might be easier for you to read
+`the Chinese translation of the Beautiful Soup 3 documentation
+<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html>`_,
+then read this document to find out about the changes made in
+Beautiful Soup 4.
+
+Porting code to BS4
+-------------------
+
+Most code written against Beautiful Soup 3 will work against Beautiful
+Soup 4 with one simple change. All you should have to do is change the
+package name from ``BeautifulSoup`` to ``bs4``. So this::
+
+ from BeautifulSoup import BeautifulSoup
+
+becomes this::
+
+ from bs4 import BeautifulSoup
+
+* If you get the ``ImportError`` "No module named BeautifulSoup", your
+ problem is that you're trying to run Beautiful Soup 3 code, but you
+ only have Beautiful Soup 4 installed.
+
+* If you get the ``ImportError`` "No module named bs4", your problem
+ is that you're trying to run Beautiful Soup 4 code, but you only
+ have Beautiful Soup 3 installed.
+
+Although BS4 is mostly backwards-compatible with BS3, most of its
+methods have been deprecated and given new names for `PEP 8 compliance
+<http://www.python.org/dev/peps/pep-0008/>`_. There are numerous other
+renames and changes, and a few of them break backwards compatibility.
+
+Here's what you'll need to know to convert your BS3 code and habits to BS4:
+
+You need a parser
+^^^^^^^^^^^^^^^^^
+
+Beautiful Soup 3 used Python's ``SGMLParser``, which was deprecated
+and removed in Python 3.0. Beautiful Soup 4 uses
+``html.parser`` by default, but you can plug in lxml or html5lib and
+use that instead. Until ``html.parser`` is improved to handle
+real-world HTML better, that's what I recommend you do. See `Be sure
+to install a good parser!`_
+
+Method names
+^^^^^^^^^^^^
+
+* ``replaceWith`` -> ``replace_with``
+* ``replaceWithChildren`` -> ``replace_with_children``
+* ``findAll`` -> ``find_all``
+* ``findAllNext`` -> ``find_all_next``
+* ``findAllPrevious`` -> ``find_all_previous``
+* ``findNext`` -> ``find_next``
+* ``findNextSibling`` -> ``find_next_sibling``
+* ``findNextSiblings`` -> ``find_next_siblings``
+* ``findParent`` -> ``find_parent``
+* ``findParents`` -> ``find_parents``
+* ``findPrevious`` -> ``find_previous``
+* ``findPreviousSibling`` -> ``find_previous_sibling``
+* ``findPreviousSiblings`` -> ``find_previous_siblings``
+* ``nextSibling`` -> ``next_sibling``
+* ``previousSibling`` -> ``previous_sibling``
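+
+The old names are kept around as deprecated aliases, but the new names
+are preferred. A hypothetical conversion::
+
+ soup.findAll("a")   # deprecated BS3 name
+ soup.find_all("a")  # preferred BS4 name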
+
+Some arguments to the Beautiful Soup constructor were renamed for the
+same reasons:
+
+* ``BeautifulSoup(parseOnlyThese=...)`` -> ``BeautifulSoup(parse_only=...)``
+* ``BeautifulSoup(fromEncoding=...)`` -> ``BeautifulSoup(from_encoding=...)``
+
+I renamed one method for compatibility with Python 3:
+
+* ``Tag.has_key()`` -> ``Tag.has_attr()``
+
+I renamed one attribute to use more accurate terminology:
+
+* ``Tag.isSelfClosing`` -> ``Tag.is_empty_element``
+
+I renamed three attributes to avoid using words that have special
+meaning to Python. Unlike the others, these changes are *not backwards
+compatible.* If you used these attributes in BS3, your code will break
+on BS4 until you change them.
+
+* ``UnicodeDammit.unicode`` -> ``UnicodeDammit.unicode_markup``
+* ``Tag.next`` -> ``Tag.next_element``
+* ``Tag.previous`` -> ``Tag.previous_element``
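+
+So hypothetical BS3 code like this::
+
+ tag.next
+
+becomes this::
+
+ tag.next_element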
+
+Generators
+^^^^^^^^^^
+
+I gave the generators PEP 8-compliant names, and transformed them into
+properties:
+
+* ``childGenerator()`` -> ``children``
+* ``nextGenerator()`` -> ``next_elements``
+* ``nextSiblingGenerator()`` -> ``next_siblings``
+* ``previousGenerator()`` -> ``previous_elements``
+* ``previousSiblingGenerator()`` -> ``previous_siblings``
+* ``recursiveChildGenerator()`` -> ``descendants``
+* ``parentGenerator()`` -> ``parents``
+
+So instead of this::
+
+ for parent in tag.parentGenerator():
+ ...
+
+You can write this::
+
+ for parent in tag.parents:
+ ...
+
+(But the old code will still work.)
+
+Some of the generators used to yield ``None`` after they were done, and
+then stop. That was a bug. Now the generators just stop.
+
+There are two new generators, :ref:`.strings and
+.stripped_strings <string-generators>`. ``.strings`` yields
+NavigableString objects, and ``.stripped_strings`` yields Python
+strings that have had whitespace stripped.
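+
+Roughly, the difference looks like this::
+
+ soup = BeautifulSoup("<p> one </p><p> two </p>")
+ [s for s in soup.strings]
+ # [u' one ', u' two ']
+ [s for s in soup.stripped_strings]
+ # [u'one', u'two']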
+
+XML
+^^^
+
+There is no longer a ``BeautifulStoneSoup`` class for parsing XML. To
+parse XML you pass in "xml" as the second argument to the
+``BeautifulSoup`` constructor. For the same reason, the
+``BeautifulSoup`` constructor no longer recognizes the ``isHTML``
+argument.
+
+Beautiful Soup's handling of empty-element XML tags has been
+improved. Previously when you parsed XML you had to explicitly say
+which tags were considered empty-element tags. The ``selfClosingTags``
+argument to the constructor is no longer recognized. Instead,
+Beautiful Soup considers any empty tag to be an empty-element tag. If
+you add a child to an empty-element tag, it stops being an
+empty-element tag.
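+
+For example (this assumes lxml is installed, since it's the only
+supported XML parser)::
+
+ soup = BeautifulSoup("<root><b /></root>", "xml")
+ soup.b.is_empty_element
+ # True
+ soup.b.append("some text")
+ soup.b.is_empty_element
+ # False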
+
+Entities
+^^^^^^^^
+
+An incoming HTML or XML entity is always converted into the
+corresponding Unicode character. Beautiful Soup 3 had a number of
+overlapping ways of dealing with entities, which have been
+removed. The ``BeautifulSoup`` constructor no longer recognizes the
+``smartQuotesTo`` or ``convertEntities`` arguments. (`Unicode,
+Dammit`_ still has ``smart_quotes_to``, but its default is now to turn
+smart quotes into Unicode.)
+
+If you want to turn those Unicode characters back into HTML entities
+on output, rather than turning them into UTF-8 characters, you need to
+use ``.encode``, as described in `Output formatters`_. This
+may change before the final release.
+
+Miscellaneous
+^^^^^^^^^^^^^
+
+:ref:`Tag.string <.string>` now operates recursively. If tag A
+contains a single tag B and nothing else, then A.string is the same as
+B.string. (Previously, it was None.)
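+
+For example::
+
+ soup = BeautifulSoup("<a><b>text</b></a>")
+ soup.a.string
+ # u'text'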
+
+The ``BeautifulSoup`` constructor no longer recognizes the
+``markupMassage`` argument. It's now the parser's responsibility to
+handle markup correctly.
+
+The rarely-used alternate parser classes like
+``ICantBelieveItsBeautifulSoup`` and ``BeautifulSOAP`` have been
+removed. It's now the parser's decision how to handle ambiguous
+markup.