diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-07 23:40:06 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-07 23:40:06 -0500 |
commit | bd9711feb69ce4791c5de44b2f67f54be194071a (patch) | |
tree | f6795cbc8b2c415e57a1dadcd3e72a6dc5b687b1 /bs4/doc | |
parent | c12086f610fe914db122489bc5b268d705297fc6 (diff) | |
download | beautifulsoup4-bd9711feb69ce4791c5de44b2f67f54be194071a.tar.gz |
Package the docs with the code.
Diffstat (limited to 'bs4/doc')
-rw-r--r-- | bs4/doc/Makefile | 130 | ||||
-rw-r--r-- | bs4/doc/__init__.py | 1 | ||||
-rw-r--r-- | bs4/doc/source/6.1.jpg | bin | 0 -> 22619 bytes | |||
-rw-r--r-- | bs4/doc/source/conf.py | 256 | ||||
-rw-r--r-- | bs4/doc/source/index.rst | 2427 |
5 files changed, 2814 insertions, 0 deletions
diff --git a/bs4/doc/Makefile b/bs4/doc/Makefile new file mode 100644 index 0000000..8c833d2 --- /dev/null +++ b/bs4/doc/Makefile @@ -0,0 +1,130 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
+ +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." 
+ make -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/bs4/doc/__init__.py b/bs4/doc/__init__.py new file mode 100644 index 0000000..a2920fe --- /dev/null +++ b/bs4/doc/__init__.py @@ -0,0 +1 @@ +"""Executable documentation about beautifulsoup.""" diff --git a/bs4/doc/source/6.1.jpg b/bs4/doc/source/6.1.jpg Binary files differnew file mode 100644 index 0000000..97014f0 --- /dev/null +++ b/bs4/doc/source/6.1.jpg diff --git a/bs4/doc/source/conf.py b/bs4/doc/source/conf.py new file mode 100644 index 0000000..56c0939 --- /dev/null +++ b/bs4/doc/source/conf.py @@ -0,0 +1,256 @@ +# -*- coding: utf-8 -*- +# +# Beautiful Soup documentation build configuration file, created by +# sphinx-quickstart on Thu Jan 26 11:22:55 2012. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. 
+ +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Beautiful Soup' +copyright = u'2012, Leonard Richardson' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '4' +# The full version, including alpha/beta/rc tags. +release = '4.0.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all documents. 
+#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'BeautifulSoupdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). 
+latex_documents = [ + ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation', + u'Leonard Richardson', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'beautifulsoup', u'Beautiful Soup Documentation', + [u'Leonard Richardson'], 1) +] + + +# -- Options for Epub output --------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = u'Beautiful Soup' +epub_author = u'Leonard Richardson' +epub_publisher = u'Leonard Richardson' +epub_copyright = u'2012, Leonard Richardson' + +# The language of the text. It defaults to the language option +# or en if the language is not set. +#epub_language = '' + +# The scheme of the identifier. Typical schemes are ISBN or URL. +#epub_scheme = '' + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +#epub_identifier = '' + +# A unique identification for the text. +#epub_uid = '' + +# HTML files that should be inserted before the pages created by sphinx. +# The format is a list of tuples containing the path and title. +#epub_pre_files = [] + +# HTML files shat should be inserted after the pages created by sphinx. 
+# The format is a list of tuples containing the path and title. +#epub_post_files = [] + +# A list of files that should not be packed into the epub file. +#epub_exclude_files = [] + +# The depth of the table of contents in toc.ncx. +#epub_tocdepth = 3 + +# Allow duplicate toc entries. +#epub_tocdup = True diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst new file mode 100644 index 0000000..ad1dbf7 --- /dev/null +++ b/bs4/doc/source/index.rst @@ -0,0 +1,2427 @@ +Beautiful Soup Documentation +============================ + +.. image:: 6.1.jpg + :align: right + :alt: "The Fish-Footman began by producing from under his arm a great letter, nearly as large as himself." + +`Beautiful Soup <http://www.crummy.com/software/BeautifulSoup/>`_ is a +Python library for pulling data out of HTML and XML files. It works +with your favorite parser to provide idiomatic ways of navigating, +searching, and modifying the parse tree. It commonly saves programmers +hours or days of work. + +These instructions illustrate all major features of Beautiful Soup 4, +with examples. I show you what the library is good for, how it works, +how to use it, how to make it do what you want, and what to do when it +violates your expectations. + +The examples in this documentation should work the same way in Python +2.7 and Python 3.2. + +You might be looking for the documentation for `Beautiful Soup 3 +<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_. If +you want to learn about the differences between Beautiful Soup 3 and +Beautiful Soup 4, see `Porting code to BS4`_. + +Getting help +------------ + +If you have questions about Beautiful Soup, or run into problems, +`send mail to the discussion group +<http://groups.google.com/group/beautifulsoup/>`_. + +Quick Start +=========== + +Here's an HTML document I'll be using as an example throughout this +document. 
It's part of a story from `Alice in Wonderland`:: + + html_doc = """ + <html><head><title>The Dormouse's story</title></head> + + <p class="title"><b>The Dormouse's story</b></p> + + <p class="story">Once upon a time there were three little sisters; and their names were + <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, + <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and + <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; + and they lived at the bottom of a well.</p> + + <p class="story">...</p> + """ + +Running the "three sisters" document through Beautiful Soup gives us a +``BeautifulSoup`` object, which represents the document as a nested +data structure:: + + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_doc) + + print(soup.prettify()) + # <html> + # <head> + # <title> + # The Dormouse's story + # </title> + # </head> + # <body> + # <p class="title"> + # <b> + # The Dormouse's story + # </b> + # </p> + # <p class="story"> + # Once upon a time there were three little sisters; and their names were + # <a class="sister" href="http://example.com/elsie" id="link1"> + # Elsie + # </a> + # , + # <a class="sister" href="http://example.com/lacie" id="link2"> + # Lacie + # </a> + # and + # <a class="sister" href="http://example.com/tillie" id="link2"> + # Tillie + # </a> + # ; and they lived at the bottom of a well. + # </p> + # <p class="story"> + # ... 
+ # </p> + # </body> + # </html> + +Here are some simple ways to navigate that data structure:: + + soup.title + # <title>The Dormouse's story</title> + + soup.title.name + # u'title' + + soup.title.string + # u'The Dormouse's story' + + soup.title.parent.name + # u'head' + + soup.p + # <p class="title"><b>The Dormouse's story</b></p> + + soup.p['class'] + # u'title' + + soup.a + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + + soup.find_all('a') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + soup.find(id="link3") + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> + +One common task is extracting all the URLs found within a page's <a> tags:: + + for link in soup.find_all('a'): + print(link.get('href')) + # http://example.com/elsie + # http://example.com/lacie + # http://example.com/tillie + +Another common task is extracting all the text from a page:: + + print(soup.get_text()) + # The Dormouse's story + # + # The Dormouse's story + # + # Once upon a time there were three little sisters; and their names were + # Elsie, + # Lacie and + # Tillie; + # and they lived at the bottom of a well. + # + # ... + +Does this look like what you need? If so, read on. + +Installing Beautiful Soup +========================= + +Beautiful Soup 4 is published through PyPi, so you can install it with +``easy_install`` or ``pip``. The package name is ``beautifulsoup4``, +and the same package works on Python 2 and Python 3. + +:kbd:`$ easy_install beautifulsoup4` +:kbd:`$ pip install beautifulsoup4` + +(The ``BeautifulSoup`` package is probably `not` what you want. That's +the previous major release, `Beautiful Soup 3`_. Lots of software uses +BS3, so it's still available, but if you're writing new code you +should install ``beautifulsoup4``.) 
+ +You can also `download the Beautiful Soup 4 source tarball +<http://www.crummy.com/software/BeautifulSoup/download/4.x/>`_ and +install it with ``setup.py``. The license for Beautiful Soup allows +you to package the entire library with your application, allowing you +to copy the ``bs4`` directory into your application's codebase. + +I use Python 2.7 and Python 3.2 to develop Beautiful Soup, but it +should work with other recent versions. + +.. _parser-installation: + +Be sure to install a good parser! +--------------------------------- + +Beautiful Soup uses a plugin system that supports a number of popular +Python parsers. If no third-party parsers are installed, Beautiful +Soup uses the HTML parser that comes with Python. In recent releases +of Python (2.7.2 and 3.2.2), this parser works pretty well at handling +bad HTML. In older releases, it's not so good. + +Even if you're using a recent release of Python, I recommend you +install the `lxml parser <http://lxml.de/>`_ if possible. It's much +faster than Python's built-in parser. It works with both Python 2 and +Python 3, and it parses HTML and XML very well. Beautiful Soup will +detect that you have lxml installed, and use it instead of Python's +built-in parser. + +Depending on your setup, you might install lxml with one of these commands: + +:kbd:`$ apt-get install python-lxml` + +:kbd:`$ easy_install lxml` + +:kbd:`$ pip install lxml` + +If you're using Python 2, another alternative is the pure-Python +`html5lib parser <http://code.google.com/p/html5lib/>`_, which parses +HTML the way a web browser does. Depending on your setup, you might +install html5lib with one of these commands: + +:kbd:`$ apt-get install python-html5lib` + +:kbd:`$ easy_install html5lib` + +:kbd:`$ pip install html5lib` + +Making the soup +=============== + +To parse a document, pass it into the ``BeautifulSoup`` +constructor. 
You can pass in a string or an open filehandle:: + + from bs4 import BeautifulSoup + + soup = BeautifulSoup(open("index.html")) + + soup = BeautifulSoup("<html>data</html>") + +First, the document is converted to Unicode, and HTML entities are +converted to Unicode characters:: + + BeautifulSoup("Sacré bleu!") + <html><head></head><body>Sacré bleu!</body></html> + +Beautiful Soup then parses the document using the best available +parser. It will use an HTML parser unless you specifically tell it to +use an XML parser. (See `Choosing a parser`_.) + +Kinds of objects +================ + +Beautiful Soup transforms a complex HTML document into a complex tree +of Python objects. But you'll only ever have to deal with about four +`kinds` of objects. + +.. _Tag: + +``Tag`` +------- + +A ``Tag`` object corresponds to an XML or HTML tag in the original document:: + + soup = BeautifulSoup('<b class="boldest">Extremely bold</b>') + tag = soup.b + type(tag) + # <class 'bs4.element.Tag'> + +Tags have a lot of attributes and methods, and I'll cover most of them +in `Navigating the tree`_ and `Searching the tree`_. For now, the most +important features of a tag are its name and attributes. + +Name +^^^^ + +Every tag has a name, accessible as ``.name``:: + + tag.name + # u'b' + +If you change a tag's name, the change will be reflected in any HTML +markup generated by Beautiful Soup:: + + tag.name = "blockquote" + tag + # <blockquote class="boldest">Extremely bold</blockquote> + +Attributes +^^^^^^^^^^ + +A tag may have any number of attributes. The tag ``<b +class="boldest">`` has an attribute "class" whose value is +"boldest". You can access a tag's attributes by treating the tag like +a dictionary:: + + tag['class'] + # u'boldest' + +You can access that dictionary directly as ``.attrs``:: + + tag.attrs + # {u'class': u'boldest'} + +You can add, remove, and modify a tag's attributes. 
Again, this is +done by treating the tag as a dictionary:: + + tag['class'] = 'verybold' + tag['id'] = 1 + tag + # <blockquote class="verybold" id="1">Extremely bold</blockquote> + + del tag['class'] + del tag['id'] + tag + # <blockquote>Extremely bold</blockquote> + +``NavigableString`` +------------------- + +A string corresponds to a bit of text within a tag. Beautiful Soup +defines the ``NavigableString`` class to contain these bits of text:: + + tag.string + # u'Extremely bold' + type(tag.string) + # <class 'bs4.element.NavigableString'> + +A ``NavigableString`` is just like a Python Unicode string, except +that it also supports some of the features described in `Navigating +the tree`_ and `Searching the tree`_. You can convert a +``NavigableString`` to a Unicode string with ``unicode()``:: + + unicode_string = unicode(tag.string) + unicode_string + # u'Extremely bold' + type(unicode_string) + # <type 'unicode'> + +You can't edit a string in place, but you can replace one string with +another, using :ref:`replace_with`:: + + tag.string.replace_with("No longer bold") + tag + # <blockquote>No longer bold</blockquote> + +``NavigableString`` supports most of the features described in +`Navigating the tree`_ and `Searching the tree`_, but not all of +them. In particular, since a string can't contain anything (the way a +tag may contain a string or another tag), strings don't support the +``.contents`` or ``.string`` attributes, or the `find()` method. + +``BeautifulSoup`` +----------------- + +The ``BeautifulSoup`` object itself represents the document as a +whole. For most purposes, you can treat it as a :ref:`Tag` +object. This means it supports most of the methods described in +`Navigating the tree`_ and `Searching the tree`_. + +Since the ``BeautifulSoup`` object doesn't correspond to an actual +HTML or XML tag, it has no name and no attributes. 
But sometimes it's +useful to look at its ``.name``, so it's been given the special +``.name`` "[document]":: + + soup.name + # u'[document]' + +Comments and other special strings +---------------------------------- + +``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost +everything you'll see in an HTML or XML file, but there are a few +leftover bits. The only one you'll probably ever need to worry about +is the comment:: + + markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>" + soup = BeautifulSoup(markup) + comment = soup.b.string + type(comment) + # <class 'bs4.element.Comment'> + +The ``Comment`` object is just a special type of ``NavigableString``:: + + comment + # u'Hey, buddy. Want to buy a used parser' + +But when it appears as part of an HTML document, a ``Comment`` is +displayed with special formatting:: + + print(soup.b.prettify()) + # <b> + # <!--Hey, buddy. Want to buy a used parser?--> + # </b> + +Beautiful Soup defines classes for anything else that might show up in +an XML document: ``CData``, ``ProcessingInstruction``, +``Declaration``, and ``Doctype``. Just like ``Comment``, these classes +are subclasses of ``NavigableString`` that add something extra to the +string. 
Here's an example that replaces the comment with a CDATA +block:: + + from bs4 import CData + cdata = CData("A CDATA block") + comment.replace_with(cdata) + + print(soup.b.prettify()) + # <b> + # <![CDATA[A CDATA block]]> + # </b> + + +Navigating the tree +=================== + +Here's the "Three sisters" HTML document again:: + + html_doc = """ + <html><head><title>The Dormouse's story</title></head> + + <p class="title"><b>The Dormouse's story</b></p> + + <p class="story">Once upon a time there were three little sisters; and their names were + <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, + <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and + <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; + and they lived at the bottom of a well.</p> + + <p class="story">...</p> + """ + + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_doc) + +I'll use this as an example to show you how to move from one part of +a document to another. + +Going down +---------- + +Tags may contain strings and other tags. These elements are the tag's +`children`. Beautiful Soup provides a lot of different attributes for +navigating and iterating over a tag's children. + +Note that Beautiful Soup strings don't support any of these +attributes, because a string can't have children. + +Navigating using tag names +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The simplest way to navigate the parse tree is to say the name of the +tag you want. If you want the <head> tag, just say ``soup.head``:: + + soup.head + # <head><title>The Dormouse's story</title></head> + + soup.title + # <title>The Dormouse's story</title> + +You can do use this trick again and again to zoom in on a certain part +of the parse tree. 
This code gets the first <b> tag beneath the <body> tag:: + + soup.body.b + # <b>The Dormouse's story</b> + +Using a tag name as an attribute will give you only the `first` tag by that +name:: + + soup.a + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + +If you need to get `all` the <a> tags, or anything more complicated +than the first tag with a certain name, you'll need to use one of the +methods described in `Searching the tree`_, such as `find_all()`:: + + soup.find_all('a') + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +``.contents`` and ``.children`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A tag's children are available in a list called ``.contents``:: + + head_tag = soup.head + head_tag + # <head><title>The Dormouse's story</title></head> + + head_tag.contents + [<title>The Dormouse's story</title>] + + title_tag = head_tag.contents[0] + title_tag + # <title>The Dormouse's story</title> + title_tag.contents + # [u'The Dormouse's story'] + +The ``BeautifulSoup`` object itself has children. In this case, the +<html> tag is the child of the ``BeautifulSoup`` object.:: + + len(soup.contents) + # 1 + soup.contents[0].name + # u'html' + +A string does not have ``.contents``, because it can't contain +anything:: + + text = title_tag.contents[0] + text.contents + # AttributeError: 'NavigableString' object has no attribute 'contents' + +Instead of getting them as a list, you can iterate over a tag's +children using the ``.children`` generator:: + + for child in title_tag.children: + print(child) + # The Dormouse's story + +``.descendants`` +^^^^^^^^^^^^^^^^ + +The ``.contents`` and ``.children`` attributes only consider a tag's +`direct` children. 
For instance, the <head> tag has a single direct +child--the <title> tag:: + + head_tag.contents + # [<title>The Dormouse's story</title>] + +But the <title> tag itself has a child: the string "The Dormouse's +story". There's a sense in which that string is also a child of the +<head> tag. The ``.descendants`` attribute lets you iterate over `all` +of a tag's children, recursively: its direct children, the children of +its direct children, and so on:: + + for child in head_tag.descendants: + print(child) + # <title>The Dormouse's story</title> + # The Dormouse's story + +The <head> tag has only one child, but it has two descendants: the +<title> tag and the <title> tag's child. The ``BeautifulSoup`` object +only has one direct child (the <html> tag), but it has a whole lot of +descendants:: + + len(list(soup.children)) + # 1 + len(list(soup.descendants)) + # 25 + +.. _.string: + +``.string`` +^^^^^^^^^^^ + +If a tag has only one child, and that child is a string, the string is +made available as ``.string``:: + + title_tag.string + # u'The Dormouse's story' + +If a tag's only child is another tag, and `that` tag has a +``.string``, then the parent tag is considered to have the same +``.string`` as its child:: + + head_tag.contents + # [<title>The Dormouse's story</title>] + + head_tag.string + # u'The Dormouse's story' + +If a tag contains more than one thing, then it's not clear what +``.string`` should refer to, so ``.string`` is defined to be +``None``:: + + print(soup.html.string) + # None + +.. _string-generators: + +``.strings`` and ``stripped_strings`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If there's more than one thing inside a tag, you can still look at +just the strings. 
Use the ``.strings`` generator:: + + for string in soup.strings: + print(repr(string)) + # u"The Dormouse's story" + # u'\n\n' + # u"The Dormouse's story" + # u'\n\n' + # u'Once upon a time there were three little sisters; and their names were\n' + # u'Elsie' + # u',\n' + # u'Lacie' + # u' and\n' + # u'Tillie' + # u';\nand they lived at the bottom of a well.' + # u'\n\n' + # u'...' + # u'\n' + +These strings tend to have a lot of extra whitespace, which you can +remove by using the ``.stripped_strings`` generator instead:: + + for string in soup.stripped_strings: + print(repr(string)) + # u"The Dormouse's story" + # u"The Dormouse's story" + # u'Once upon a time there were three little sisters; and their names were' + # u'Elsie' + # u',' + # u'Lacie' + # u'and' + # u'Tillie' + # u';\nand they lived at the bottom of a well.' + # u'...' + +Here, strings consisting entirely of whitespace are ignored, and +whitespace at the beginning and end of strings is removed. + +Going up +-------- + +Continuing the "family tree" analogy, every tag and every string has a +`parent`: the tag that contains it. + +.. _.parent: + +``.parent`` +^^^^^^^^^^^ + +You can access an element's parent with the ``.parent`` attribute. In +the example "three sisters" document, the <head> tag is the parent +of the <title> tag:: + + title_tag = soup.title + title_tag + # <title>The Dormouse's story</title> + title_tag.parent + # <head><title>The Dormouse's story</title></head> + +The title string itself has a parent: the <title> tag that contains +it:: + + title_tag.string.parent + # <title>The Dormouse's story</title> + +The parent of a top-level tag like <html> is the ``BeautifulSoup`` object +itself:: + + html_tag = soup.html + type(html_tag.parent) + # <class 'bs4.BeautifulSoup'> + +And the ``.parent`` of a ``BeautifulSoup`` object is defined as None:: + + print(soup.parent) + # None + +.. 
_.parents: + +``.parents`` +^^^^^^^^^^^^ + +You can iterate over all of an element's parents with +``.parents``. This example uses ``.parents`` to travel from an <a> tag +buried deep within the document, to the very top of the document:: + + link = soup.a + link + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + for parent in link.parents: + if parent is None: + print(parent) + else: + print(parent.name) + # p + # body + # html + # [document] + # None + +Going sideways +-------------- + +Consider a simple document like this:: + + sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>") + print(sibling_soup.prettify()) + # <html> + # <body> + # <a> + # <b> + # text1 + # </b> + # <c> + # text2 + # </c> + # </a> + # </body> + # </html> + +The <b> tag and the <c> tag are at the same level: they're both direct +children of the same tag. We call them `siblings`. When a document is +pretty-printed, siblings show up at the same indentation level. You +can also use this relationship in the code you write. + +``.next_sibling`` and ``.previous_sibling`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can use ``.next_sibling`` and ``.previous_sibling`` to navigate +between page elements that are on the same level of the parse tree:: + + sibling_soup.b.next_sibling + # <c>text2</c> + + sibling_soup.c.previous_sibling + # <b>text1</b> + +The <b> tag has a ``.next_sibling``, but no ``.previous_sibling``, +because there's nothing before the <b> tag `on the same level of the +tree`. 
For the same reason, the <c> tag has a ``.previous_sibling`` +but no ``.next_sibling``:: + + print(sibling_soup.b.previous_sibling) + # None + print(sibling_soup.c.next_sibling) + # None + +The strings "text1" and "text2" are `not` siblings, because they don't +have the same parent:: + + sibling_soup.b.string + # u'text1' + + print(sibling_soup.b.string.next_sibling) + # None + +In real documents, the ``.next_sibling`` or ``.previous_sibling`` of a +tag will usually be a string containing whitespace. Going back to the +"three sisters" document:: + + <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a> + <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> + <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a> + +You might think that the ``.next_sibling`` of the first <a> tag would +be the second <a> tag. But actually, it's a string: the comma and +newline that separate the first <a> tag from the second:: + + link = soup.a + link + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + + link.next_sibling + # u',\n' + +The second <a> tag is actually the ``.next_sibling`` of the comma:: + + link.next_sibling.next_sibling + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> + +.. _sibling-generators: + +``.next_siblings`` and ``.previous_siblings`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can iterate over a tag's siblings with ``.next_siblings`` or +``.previous_siblings``:: + + for sibling in soup.a.next_siblings: + print(repr(sibling)) + # u',\n' + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> + # u' and\n' + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> + # u'; and they lived at the bottom of a well.' 
+ # None + + for sibling in soup.find(id="link3").previous_siblings: + print(repr(sibling)) + # ' and\n' + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> + # u',\n' + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + # u'Once upon a time there were three little sisters; and their names were\n' + # None + +Going back and forth +-------------------- + +Take a look at the beginning of the "three sisters" document:: + + <html><head><title>The Dormouse's story</title></head> + <p class="title"><b>The Dormouse's story</b></p> + +An HTML parser takes this string of characters and turns it into a +series of events: "open an <html> tag", "open a <head> tag", "open a +<title> tag", "add a string", "close the <title> tag", "open a <p> +tag", and so on. Beautiful Soup offers tools for reconstructing the +initial parse of the document. + +.. _element-generators: + +``.next_element`` and ``.previous_element`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``.next_element`` attribute of a string or tag points to whatever +was parsed immediately afterwards. It might be the same as +``.next_sibling``, but it's usually drastically different. + +Here's the final <a> tag in the "three sisters" document. Its +``.next_sibling`` is a string: the conclusion of the sentence that was +interrupted by the start of the <a> tag.:: + + last_a_tag = soup.find("a", id="link3") + last_a_tag + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> + + last_a_tag.next_sibling + # '; and they lived at the bottom of a well.' + +But the ``.next_element`` of that <a> tag, the thing that was parsed +immediately after the <a> tag, is `not` the rest of that sentence: +it's the word "Tillie":: + + last_a_tag.next_element + # u'Tillie' + +That's because in the original markup, the word "Tillie" appeared +before that semicolon. 
The parser encountered an <a> tag, then the
+word "Tillie", then the closing </a> tag, then the semicolon and rest of
+the sentence. The semicolon is on the same level as the <a> tag, but the
+word "Tillie" was encountered first.
+
+The ``.previous_element`` attribute is the exact opposite of
+``.next_element``. It points to whatever element was parsed
+immediately before this one::
+
+ last_a_tag.previous_element
+ # u' and\n'
+ last_a_tag.previous_element.next_element
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
+
+``.next_elements`` and ``.previous_elements``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You should get the idea by now. You can use these iterators to move
+forward or backward in the document as it was parsed::
+
+ for element in last_a_tag.next_elements:
+ print(repr(element))
+ # u'Tillie'
+ # u';\nand they lived at the bottom of a well.'
+ # u'\n\n'
+ # <p class="story">...</p>
+ # u'...'
+ # u'\n'
+ # None
+
+Searching the tree
+==================
+
+Beautiful Soup defines a lot of methods for searching the parse tree,
+but they're all very similar. I'm going to spend a lot of time explaining
+the two most popular methods: ``find()`` and ``find_all()``. The other
+methods take almost exactly the same arguments, so I'll just cover
+them briefly.
+
+Once again, I'll be using the "three sisters" document as an example::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(html_doc)
+
+By passing in a filter to an argument like ``find_all()``, you can
+isolate whatever parts of the document you're interested in.
+
+Kinds of filters
+----------------
+
+Before talking in detail about ``find_all()`` and similar methods, I
+want to show examples of different filters you can pass into these
+methods. These filters show up again and again, throughout the
+search API. You can use them to filter based on a tag's name,
+on its attributes, on the text of a string, or on some combination of
+these.
+
+.. _a string:
+
+A string
+^^^^^^^^
+
+The simplest filter is a string. Pass a string to a search method and
+Beautiful Soup will perform a match against that exact string. This
+code finds all the <b> tags in the document::
+
+ soup.find_all('b')
+ # [<b>The Dormouse's story</b>]
+
+.. _a regular expression:
+
+A regular expression
+^^^^^^^^^^^^^^^^^^^^
+
+If you pass in a regular expression object, Beautiful Soup will filter
+against that regular expression. This code finds all the tags whose
+names start with the letter "b"; in this case, the <body> tag and the
+<b> tag::
+
+ import re
+ for tag in soup.find_all(re.compile("b.*")):
+ print(tag.name)
+ # body
+ # b
+
+.. _a list:
+
+A list
+^^^^^^
+
+If you pass in a list, Beautiful Soup will allow a string match
+against `any` item in that list.
This code finds all the <a> tags
+`and` all the <b> tags::
+
+ soup.find_all(["a", "b"])
+ # [<b>The Dormouse's story</b>,
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+.. _the value True:
+
+``True``
+^^^^^^^^
+
+The value ``True`` matches everything it can. This code finds `all`
+the tags in the document, but none of the text strings::
+
+ for tag in soup.find_all(True):
+ print(tag.name)
+ # html
+ # head
+ # title
+ # body
+ # p
+ # b
+ # p
+ # a
+ # a
+ # a
+ # p
+
+.. _a function:
+
+A function
+^^^^^^^^^^
+
+If none of the other matches work for you, define a function that
+takes an element as its only argument. The function should return
+``True`` if the argument matches, and ``False`` otherwise.
+
+Here's a function that returns ``True`` if a tag defines the "class"
+attribute but doesn't define the "id" attribute::
+
+ def has_class_but_no_id(tag):
+ return tag.has_key('class') and not tag.has_key('id')
+
+Pass this function into ``find_all()`` and you'll pick up all the <p>
+tags::
+
+ soup.find_all(has_class_but_no_id)
+ # [<p class="title"><b>The Dormouse's story</b></p>,
+ # <p class="story">Once upon a time there were...</p>,
+ # <p class="story">...</p>]
+
+This function only picks up the <p> tags. It doesn't pick up the <a>
+tags, because those tags define both "class" and "id". It doesn't pick
+up tags like <html> and <title>, because those tags don't define
+"class".
+
+Here's a function that returns ``True`` if a tag is surrounded by
+string objects::
+
+ from bs4 import NavigableString
+ def surrounded_by_strings(tag):
+ return (isinstance(tag.next_element, NavigableString)
+ and isinstance(tag.previous_element, NavigableString))
+
+ for tag in soup.find_all(surrounded_by_strings):
+ print(tag.name)
+ # p
+ # a
+ # a
+ # a
+ # p
+
+Now we're ready to look at the search methods in detail.
+
+``find_all()``
+--------------
+
+Signature: find_all(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive
+<recursive>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+The ``find_all()`` method looks through a tag's descendants and
+retrieves `all` descendants that match your filters. I gave several
+examples in `Kinds of filters`_, but here are a few more::
+
+ soup.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+ soup.find_all("p", "title")
+ # [<p class="title"><b>The Dormouse's story</b></p>]
+
+ soup.find_all("a")
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+ soup.find_all(id="link2")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+ import re
+ soup.find(text=re.compile("sisters"))
+ # u'Once upon a time there were three little sisters; and their names were\n'
+
+Some of these should look familiar, but others are new. What does it
+mean to pass in a value for ``text``, or ``id``? Why does
+``find_all("p", "title")`` find a <p> tag with the CSS class "title"?
+Let's look at the arguments to ``find_all()``.
+
+.. _name:
+
+The ``name`` argument
+^^^^^^^^^^^^^^^^^^^^^
+
+Pass in a value for ``name`` and you'll tell Beautiful Soup to only
+consider tags with certain names. Text strings will be ignored, as
+will tags whose names don't match.
+
+This is the simplest usage::
+
+ soup.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+Recall from `Kinds of filters`_ that the value to ``name`` can be `a
+string`_, `a regular expression`_, `a list`_, `a function`_, or `the value
+True`_.
+
+.. _kwargs:
+
+The keyword arguments
+^^^^^^^^^^^^^^^^^^^^^
+
+Any argument that's not recognized will be turned into a filter on tag
+attributes. If you pass in a value for an argument called ``id``,
+Beautiful Soup will filter against the tag's 'id' attribute::
+
+ soup.find_all(id='link2')
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+If you pass in a value for ``href``, Beautiful Soup will filter
+against the tag's 'href' attribute::
+
+ soup.find_all(href=re.compile("elsie"))
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+You can filter an attribute based on `a string`_, `a regular
+expression`_, `a list`_, `a function`_, or `the value True`_.
+
+This code finds all tags that have an ``id`` attribute, regardless of
+what the value is::
+
+ soup.find_all(id=True)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
+
+You can filter multiple attributes at once by passing in more than one
+keyword argument::
+
+ soup.find_all(href=re.compile("elsie"), id='link1')
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
+
+.. _attrs:
+
+``attrs`` and searching by CSS class
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Instead of using keyword arguments, you can filter tags based on their
+attributes by passing a dictionary in for ``attrs``.
These two lines of +code are equivalent:: + + soup.find_all(href=re.compile("elsie"), id='link1') + soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'}) + +The ``attrs`` argument would be a pretty obscure feature were it not for +one thing: CSS. It's very useful to search for a tag that has a +certain CSS class, but the name of the CSS attribute, "class", is also a +Python reserved word. + +You can use ``attrs`` to search by CSS class:: + + soup.find_all("a", { "class" : "sister" }) + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +But that's a lot of code for such a common operation. Instead, you can +pass a string for `attrs` instead of a dictionary. The string will be +used to restrict the CSS class:: + + soup.find_all("a", "sister") + # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, + # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + +.. _text: + +The ``text`` argument +^^^^^^^^^^^^^^^^^^^^^ + +With ``text`` you can search for strings instead of tags. As with +``name`` and the keyword arguments, you can pass in `a string`_, `a +regular expression`_, `a list`_, `a function`_, or `the value True`_. 
+
+Here are some examples::
+
+ soup.find_all(text="Elsie")
+ # [u'Elsie']
+
+ soup.find_all(text=["Tillie", "Elsie", "Lacie"])
+ # [u'Elsie', u'Lacie', u'Tillie']
+
+ soup.find_all(text=re.compile("Dormouse"))
+ # [u"The Dormouse's story", u"The Dormouse's story"]
+
+ def is_the_only_string_within_a_tag(s):
+ """Return True if this string is the only child of its parent tag."""
+ return (s == s.parent.string)
+
+ soup.find_all(text=is_the_only_string_within_a_tag)
+ # [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...']
+
+.. _limit:
+
+The ``limit`` argument
+^^^^^^^^^^^^^^^^^^^^^^
+
+``find_all()`` returns all the tags and strings that match your
+filters. This can take a while if the document is large. If you don't
+need `all` the results, you can pass in a number for ``limit``. This
+works just like the LIMIT keyword in SQL. It tells Beautiful Soup to
+stop gathering results after it's found a certain number.
+
+There are three links in the "three sisters" document, but this code
+only finds the first two::
+
+ soup.find_all("a", limit=2)
+ # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+.. _recursive:
+
+The ``recursive`` argument
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you call ``mytag.find_all()``, Beautiful Soup will examine all the
+descendants of ``mytag``: its children, its children's children, and
+so on. If you only want Beautiful Soup to consider direct children,
+you can pass in ``recursive=False``. See the difference here::
+
+ soup.html.find_all("title")
+ # [<title>The Dormouse's story</title>]
+
+ soup.html.find_all("title", recursive=False)
+ # []
+
+Here's that part of the document::
+
+ <html>
+ <head>
+ <title>
+ The Dormouse's story
+ </title>
+ </head>
+ ...
+
+The <title> tag is beneath the <html> tag, but it's not `directly`
+beneath the <html> tag: the <head> tag is in the way.
Beautiful Soup +finds the <title> tag when it's allowed to look at all descendants of +the <html> tag, but when ``recursive=False`` restricts it to the +<html> tag's immediate children, it finds nothing. + +Beautiful Soup offers a lot of tree-searching methods (covered below), +and they mostly take the same arguments as ``find_all()``: ``name``, +``attrs``, ``text``, ``limit``, and the keyword arguments. But the +``recursive`` argument is different: ``find_all()`` and ``find()`` are +the only methods that support it. Passing ``recursive=False`` into a +method like ``find_parents()`` wouldn't be very useful. + +Calling a tag is like calling ``find_all()`` +-------------------------------------------- + +Because ``find_all()`` is the most popular method in the Beautiful +Soup search API, you can use a shortcut for it. If you treat the +``BeautifulSoup`` object or a ``Tag`` object as though it were a +function, then it's the same as calling ``find_all()`` on that +object. These two lines of code are equivalent:: + + soup.find_all("a") + soup("a") + +These two lines are also equivalent:: + + soup.title.find_all(text=True) + soup.title(text=True) + +``find()`` +---------- + +Signature: find(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive +<recursive>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`) + +The ``find_all()`` method scans the entire document looking for +results, but sometimes you only want to find one result. If you know a +document only has one <body> tag, it's a waste of time to scan the +entire document looking for more. Rather than passing in ``limit=1`` +every time you call ``find_all``, you can use the ``find()`` +method. These two lines of code are `nearly` equivalent:: + + soup.find_all('title', limit=1) + # [<title>The Dormouse's story</title>] + + soup.find('title') + # <title>The Dormouse's story</title> + +The only difference is that ``find_all()`` returns a list containing +the single result, and ``find()`` just returns the result. 
+ +If ``find_all()`` can't find anything, it returns an empty list. If +``find()`` can't find anything, it returns ``None``:: + + print(soup.find("nosuchtag")) + # None + +Remember the ``soup.head.title`` trick from `Navigating using tag +names`_? That trick works by repeatedly calling ``find()``:: + + soup.head.title + # <title>The Dormouse's story</title> + + soup.find("head").find("title") + # <title>The Dormouse's story</title> + +``find_parents()`` and ``find_parent()`` +---------------------------------------- + +Signature: find_parents(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`) + +Signature: find_parent(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`) + +I spent a lot of time above covering ``find_all()`` and +``find()``. The Beautiful Soup API defines ten other methods for +searching the tree, but don't be afraid. Five of these methods are +basically the same as ``find_all()``, and the other five are basically +the same as ``find()``. The only differences are in what parts of the +tree they search. + +First let's consider ``find_parents()`` and +``find_parent()``. Remember that ``find_all()`` and ``find()`` work +their way down the tree, looking at tag's descendants. These methods +do the opposite: they work their way `up` the tree, looking at a tag's +(or a string's) parents. 
Let's try them out, starting from a string
+buried deep in the "three sisters" document::
+
+ a_string = soup.find(text="Lacie")
+ a_string
+ # u'Lacie'
+
+ a_string.find_parents("a")
+ # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
+
+ a_string.find_parent("p")
+ # <p class="story">Once upon a time there were three little sisters; and their names were
+ # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
+ # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
+ # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
+ # and they lived at the bottom of a well.</p>
+
+ a_string.find_parents("p", "title")
+ # []
+
+One of the three <a> tags is the direct parent of the string in
+question, so our search finds it. One of the three <p> tags is an
+indirect parent of the string, and our search finds that as
+well. There's a <p> tag with the CSS class "title" `somewhere` in the
+document, but it's not one of this string's parents, so we can't find
+it with ``find_parents()``.
+
+You may have made the connection between ``find_parent()`` and
+``find_parents()``, and the `.parent`_ and `.parents`_ attributes
+mentioned earlier. The connection is very strong. These search methods
+actually use ``.parents`` to iterate over all the parents, and check
+each one against the provided filter to see if it matches.
+
+``find_next_siblings()`` and ``find_next_sibling()``
+----------------------------------------------------
+
+Signature: find_next_siblings(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`)
+
+Signature: find_next_sibling(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`)
+
+These methods use :ref:`.next_siblings <sibling-generators>` to
+iterate over the rest of an element's siblings in the tree.
The +``find_next_siblings()`` method returns all the siblings that match, +and ``find_next_sibling()`` only returns the first one:: + + first_link = soup.a + first_link + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + + first_link.find_next_siblings("a") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] + + first_story_paragraph = soup.find("p", "story") + first_story_paragraph.find_next_sibling("p") + # <p class="story">...</p> + +``find_previous_siblings()`` and ``find_previous_sibling()`` +------------------------------------------------------------ + +Signature: find_previous_siblings(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`) + +Signature: find_previous_sibling(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`) + +These methods use :ref:`.previous_siblings <sibling-generators>` to iterate over an element's +siblings that precede it in the tree. 
The ``find_previous_siblings()`` +method returns all the siblings that match, and +``find_previous_sibling()`` only returns the first one:: + + last_link = soup.find("a", id="link3") + last_link + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> + + last_link.find_previous_siblings("a") + # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] + + first_story_paragraph = soup.find("p", "story") + first_story_paragraph.find_previous_sibling("p") + # <p class="title"><b>The Dormouse's story</b></p> + + +``find_all_next()`` and ``find_next()`` +--------------------------------------- + +Signature: find_all_next(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`) + +Signature: find_next(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`) + +These methods use :ref:`.next_elements <element-generators>` to +iterate over whatever tags and strings that come after it in the +document. The ``find_all_next()`` method returns all matches, and +``find_next()`` only returns the first match:: + + first_link = soup.a + first_link + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + + first_link.find_all_next(text=True) + # [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie', + # u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n'] + + first_link.find_next("p") + # <p class="story">...</p> + +In the first example, the string "Elsie" showed up, even though it was +contained within the <a> tag we started from. In the second example, +the last <p> tag in the document showed up, even though it's not in +the same part of the tree as the <a> tag we started from. For these +methods, all that matters is that an element match the filter, and +show up later in the document than the starting element. 
+ +``find_all_previous()`` and ``find_previous()`` +----------------------------------------------- + +Signature: find_all_previous(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`) + +Signature: find_previous(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`) + +These methods use :ref:`.previous_elements <element-generators>` to +iterate over the tags and strings that came before it in the +document. The ``find_all_previous()`` method returns all matches, and +``find_previous()`` only returns the first match:: + + first_link = soup.a + first_link + # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> + + first_link.find_all_previous("p") + # [<p class="story">Once upon a time there were three little sisters; ...</p>, + # <p class="title"><b>The Dormouse's story</b></p>] + + first_link.find_previous("title") + # <title>The Dormouse's story</title> + +The call to ``find_all_previous("p")`` found the first paragraph in +the document (the one with class="title"), but it also finds the +second paragraph, the <p> tag that contains the <a> tag we started +with. This shouldn't be too surprising: we're looking at all the tags +that show up earlier in the document than the one we started with. A +<p> tag that contains an <a> tag must have shown up earlier in the +document. + +Modifying the tree +================== + +Beautiful Soup's main strength is in searching the parse tree, but you +can also modify the tree and write your changes as a new HTML or XML +document. + +Changing tag names and attributes +--------------------------------- + +I covered this earlier, in `Attributes`_, but it bears repeating. 
You +can rename a tag, change the values of its attributes, add new +attributes, and delete attributes:: + + soup = BeautifulSoup('<b class="boldest">Extremely bold</b>') + tag = soup.b + + tag.name = "blockquote" + tag['class'] = 'verybold' + tag['id'] = 1 + tag + # <blockquote class="verybold" id="1">Extremely bold</blockquote> + + del tag['class'] + del tag['id'] + tag + # <blockquote>Extremely bold</blockquote> + + +Modifying ``.string`` +--------------------- + +If you set a tag's ``.string`` attribute, the tag's contents are +replaced with the string you give:: + + markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' + soup = BeautifulSoup(markup) + + tag = soup.a + tag.string = "New link text." + tag + # <a href="http://example.com/">New link text.</a> + +Be careful: if the tag contained other tags, they and all their +contents will be destroyed. + +``append()`` +------------ + +You can add to a tag's contents with ``Tag.append()``. It works just +like calling ``.append()`` on a Python list:: + + soup = BeautifulSoup("<a>Foo</a>") + soup.a.append("Bar") + + soup + # <html><head></head><body><a>FooBar</a></body></html> + soup.a.contents + # [u'Foo', u'Bar'] + +``BeautifulSoup.new_string()`` and ``.new_tag()`` +------------------------------------------------- + +If you need to add a string to a document, no problem--you can pass a +Python string in to ``append()``, or you can call the factory method +``BeautifulSoup.new_string()``:: + + soup = BeautifulSoup("<b></b>") + tag = soup.b + tag.append("Hello") + new_string = soup.new_string(" there") + tag.append(new_string) + tag + # <b>Hello there.</b> + tag.contents + # [u'Hello', u' there'] + +What if you need to create a whole new tag? 
The best solution is to +call the factory method ``BeautifulSoup.new_tag()``:: + + soup = BeautifulSoup("<b></b>") + original_tag = soup.b + + new_tag = soup.new_tag("a", href="http://www.example.com") + original_tag.append(new_tag) + original_tag + # <b><a href="http://www.example.com"></a></b> + + new_tag.string = "Link text." + original_tag + # <b><a href="http://www.example.com">Link text.</a></b> + +Only the first argument, the tag name, is required. + +``insert()`` +------------ + +``Tag.insert()`` is just like ``Tag.append()``, except the new element +doesn't necessarily go at the end of its parent's +``... contents``. It'll be inserted at whatever numeric position you +say. It works just like ``.insert()`` on a Python list:: + + markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' + soup = BeautifulSoup(markup) + tag = soup.a + + tag.insert(1, "but did not endorse ") + tag + # <a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a> + tag.contents + # [u'I linked to ', u'but did not endorse', <i>example.com</i>] + +``move_before()`` and ``move_after()`` +------------------------------------------ + +The ``move_before()`` method moves a tag or string so that it +immediately precedes something else in the parse tree:: + + soup = BeautifulSoup("<b>stop</b>") + tag = soup.new_tag("i") + tag.string = "Don't" + tag.move_before(soup.b.string) + soup.b + # <b><i>Don't</i>stop</b> + +The ``move_after()`` method moves a tag or string so that it +immediately follows something else in the parse tree:: + + soup.new_string(" ever ").move_after(soup.b.i) + soup.b + # <b><i>Don't</i> ever stop</b> + soup.b.contents + # [<i>Don't</i>, u' ever ', u'stop'] + +``clear()`` +----------- + +``Tag.clear()`` removes the contents of a tag:: + + markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' + soup = BeautifulSoup(markup) + tag = soup.a + + tag.clear() + tag + # <a href="http://example.com/"></a> + 
+
+``extract()``
+-------------
+
+``PageElement.extract()`` removes a tag or string from the tree. It
+returns the tag or string that was extracted::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ i_tag = soup.i.extract()
+
+ a_tag
+ # <a href="http://example.com/">I linked to</a>
+
+ i_tag
+ # <i>example.com</i>
+
+ print(i_tag.parent)
+ # None
+
+At this point you effectively have two parse trees: one rooted at the
+``BeautifulSoup`` object you used to parse the document, and one rooted
+at the tag that was extracted. You can go on to call ``extract`` on
+a child of the element you extracted::
+
+ my_string = i_tag.string.extract()
+ my_string
+ # u'example.com'
+
+ print(my_string.parent)
+ # None
+ i_tag
+ # <i></i>
+
+
+``decompose()``
+---------------
+
+``Tag.decompose()`` removes a tag from the tree, then `completely
+destroys it and its contents`::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ soup.i.decompose()
+
+ a_tag
+ # <a href="http://example.com/">I linked to</a>
+
+
+.. _replace_with:
+
+``replace_with()``
+------------------
+
+``PageElement.replace_with()`` removes a tag or string from the tree,
+and replaces it with the tag or string of your choice::
+
+ markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
+ soup = BeautifulSoup(markup)
+ a_tag = soup.a
+
+ new_tag = soup.new_tag("b")
+ new_tag.string = "example.net"
+ a_tag.i.replace_with(new_tag)
+
+ a_tag
+ # <a href="http://example.com/">I linked to <b>example.net</b></a>
+
+``replace_with()`` returns the tag or string that was replaced, so
+that you can examine it or add it back to another part of the tree.
+
+``replace_with_children()``
+---------------------------
+
+``Tag.replace_with_children()`` replaces a tag with whatever's inside
+that tag.
It's good for stripping out markup:: + + markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' + soup = BeautifulSoup(markup) + a_tag = soup.a + + a_tag.i.replace_with_children() + a_tag + # <a href="http://example.com/">I linked to example.com</a> + +Like ``replace_with()``, ``replace_with_children()`` returns the tag +that was replaced. + +Output +====== + +Pretty-printing +--------------- + +The ``prettify()`` method will turn a Beautiful Soup parse tree into a +nicely formatted bytestring, with each HTML/XML tag on its own line:: + + markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' + soup = BeautifulSoup(markup) + soup.prettify() + # '<html>\n <head>\n </head>\n <body>\n <a href="http://example.com/">\n...' + + print(soup.prettify()) + # <html> + # <head> + # </head> + # <body> + # <a href="http://example.com/"> + # I linked to + # <i> + # example.com + # </i> + # </a> + # </body> + # </html> + +You can call ``prettify()`` on the top-level ``BeautifulSoup`` object, +or on any of its ``Tag`` objects:: + + print(soup.a.prettify()) + # <a href="http://example.com/"> + # I linked to + # <i> + # example.com + # </i> + # </a> + +Non-pretty printing +------------------- + +If you just want a string, with no fancy formatting, you can call +``unicode()`` or ``str()`` on a ``BeautifulSoup`` object, or a ``Tag`` +within it:: + + str(soup) + # '<html><head></head><body><a href="http://example.com/">I linked to <i>example.com</i></a></body></html>' + + unicode(soup.a) + # u'<a href="http://example.com/">I linked to <i>example.com</i></a>' + +The ``str()`` function returns a string encoded in UTF-8. See +`Encodings`_ for other options. + +You can also call ``encode()`` to get a bytestring, and ``decode()`` +to get Unicode. 
+
+Output formatters
+-----------------
+
+If you give Beautiful Soup a document that contains HTML entities like
+"&ldquo;", they'll be converted to Unicode characters::
+
+ soup = BeautifulSoup("“Hello,” he said.")
+ unicode(soup)
+ # u'<html><head></head><body>\u201cHello,\u201d he said.</body></html>'
+
+If you then convert the document to a string, the Unicode characters
+will be encoded as UTF-8. You won't get the HTML entities back:
+
+ str(soup)
+ # '<html><head></head><body>\xe2\x80\x9cHello,\xe2\x80\x9d he said.</body></html>'
+
+By default, the only characters that are escaped upon output are bare
+ampersands and angle brackets. These get turned into "&", "<",
+and ">", so that Beautiful Soup doesn't inadvertently generate
+invalid HTML or XML::
+
+ soup = BeautifulSoup("<p>The law firm of Dewey, Cheatem, & Howe</p>")
+ soup.p
+ # <p>The law firm of Dewey, Cheatem, & Howe</p>
+
+You can change this behavior by providing a value for the
+``formatter`` argument to ``prettify()``, ``encode()``, or
+``decode()``. Beautiful Soup recognizes four possible values for
+``formatter``.
+
+The default is ``formatter="minimal"``. Strings will only be processed
+enough to ensure that Beautiful Soup generates valid HTML/XML::
+
+ french = "<p>Il a dit <<Sacré bleu!>></p>"
+ soup = BeautifulSoup(french)
+ print(soup.prettify(formatter="minimal"))
+ # <html>
+ # <body>
+ # <p>
+ # Il a dit <<Sacré bleu!>>
+ # </p>
+ # </body>
+ # </html>
+
+``formatter="html"`` will convert Unicode characters to HTML entities
+whenever possible::
+
+ print(soup.prettify(formatter="html"))
+ # <html>
+ # <body>
+ # <p>
+ # Il a dit <<Sacré bleu!>>
+ # </p>
+ # </body>
+ # </html>
+
+If you pass in ``formatter=None``, Beautiful Soup will not modify
+strings at all on output.
This is the fastest option, but it may lead
+to Beautiful Soup generating invalid HTML/XML, as in this example::
+
+ print(soup.prettify(formatter=None))
+ # <html>
+ # <body>
+ # <p>
+ # Il a dit <<Sacré bleu!>>
+ # </p>
+ # </body>
+ # </html>
+
+
+Finally, if you pass in a function for ``formatter``, Beautiful Soup
+will call that function once for every string in the document. You can
+do whatever you want in this function. Here's a formatter that
+converts strings to uppercase and does absolutely nothing else::
+
+ def uppercase(str):
+ return str.upper()
+
+ print(soup.prettify(formatter=uppercase))
+ # <html>
+ # <body>
+ # <p>
+ # IL A DIT <<SACRÉ BLEU!>>
+ # </p>
+ # </body>
+ # </html>
+
+If you're writing your own function, you should know about the
+``EntitySubstitution`` class in the ``bs4.dammit`` module. This class
+implements Beautiful Soup's standard formatters as class methods: the
+"html" formatter is ``EntitySubstitution.substitute_html``, and the
+"minimal" formatter is ``EntitySubstitution.substitute_xml``. You can
+use these functions to simulate ``formatter="html"`` or
+``formatter="minimal"``, and then do something in addition.
+
+Here's an example that converts strings to uppercase, `and` replaces
+Unicode characters with HTML entities whenever possible::
+
+ from bs4.dammit import EntitySubstitution
+ def uppercase_and_substitute_html_entities(str):
+ return EntitySubstitution.substitute_html(str.upper())
+
+ print(soup.prettify(formatter=uppercase_and_substitute_html_entities))
+ # <html>
+ # <body>
+ # <p>
+ # IL A DIT <<SACRÉ BLEU!>>
+ # </p>
+ # </body>
+ # </html>
+
+``get_text()``
+--------------
+
+If you only want the text part of a document or tag, you can use the
+``get_text()`` method.
It returns all the text in a document or
+beneath a tag, as a single Unicode string::
+
+ markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
+ soup = BeautifulSoup(markup)
+
+ soup.get_text()
+ u'\nI linked to example.com\n'
+ soup.i.get_text()
+ u'example.com'
+
+You can specify a string to be used to join the bits of text
+together::
+
+ soup.get_text("|")
+ u'\nI linked to |example.com|\n'
+
+You can tell Beautiful Soup to strip whitespace from the beginning and
+end of each bit of text::
+
+ soup.get_text("|", strip=True)
+ u'I linked to|example.com'
+
+But at that point you might want to use the :ref:`.stripped_strings <string-generators>`
+generator instead, and process the text yourself::
+
+ [text for text in soup.stripped_strings]
+ # [u'I linked to', u'example.com']
+
+Choosing a parser
+=================
+
+If you just need to parse some HTML, you can dump the markup into the
+``BeautifulSoup`` constructor, and it'll probably be fine. Beautiful
+Soup will pick a parser for you and parse the data. But there are a
+few additional arguments you can pass in to the constructor to change
+which parser is used.
+
+The first argument to the ``BeautifulSoup`` constructor is a string or
+an open filehandle--the markup you want parsed. The second argument is
+`how` you'd like the markup parsed.
+
+If you don't specify anything, you'll get the best HTML parser that's
+installed. Beautiful Soup ranks lxml's parser as being the best, then
+html5lib's, then Python's built-in parser. You can override this by
+specifying one of the following:
+
+* What type of markup you want to parse. Currently supported are
+ "html", "xml", and "html5".
+
+* The name of the parser library you want to use. Currently supported
+ options are "lxml", "html5lib", and "html.parser" (Python's
+ built-in HTML parser).
+ +Some examples:: + + BeautifulSoup(markup, "lxml") + BeautifulSoup(markup, "xml") + BeautifulSoup(markup, "html5") + +You can specify a list of the parser features you want, instead of +just one. Right now this is mostly useful for distinguishing between +lxml's HTML parser and its XML parser:: + + BeautifulSoup(markup, ["html", "lxml"]) + BeautifulSoup(markup, ["xml", "lxml"]) + +If you don't have an appropriate parser installed, Beautiful Soup will +ignore your request and pick a different parser. For instance, right +now the only supported XML parser is lxml, so if you don't have lxml +installed, asking for an XML parser won't give you one, and asking for +"lxml" won't work either. + +Why would you use one parser over another? Because different parsers +will create different parse trees from the same document. The biggest +differences are between HTML parsers and XML parsers. Here's a short +document, parsed as HTML:: + + BeautifulSoup("<a><b /></a>") + # <html><head></head><body><a><b></b></a></body></html> + +Since an empty <b /> tag is not valid HTML, the parser turns it into a +<b></b> tag pair. + +Here's the same document parsed as XML (running this requires that you +have lxml installed). Note that the empty <b /> tag is left alone, and +that the document is given an XML declaration instead of being put +into an <html> tag.:: + + BeautifulSoup("<a><b /></a>", "xml") + # <?xml version="1.0" encoding="utf-8"> + # <a><b /></a> + +There are also differences between HTML parsers. If you give Beautiful +Soup a perfectly-formed HTML document, these differences won't +matter. One parser may be faster than another, but they'll all give +you a data structure that looks exactly like the original HTML +document. + +But if the document is not perfectly-formed, different parsers will +give different results. Here's a short, invalid document parsed using +lxml's HTML parser. 
Note that the dangling </p> tag is simply
+ignored::
+
+ BeautifulSoup("<a></p>", "lxml")
+ # <html><body><a></a></body></html>
+
+Here's the same document parsed using html5lib::
+
+ BeautifulSoup("<a></p>", "html5lib")
+ # <html><head></head><body><a><p></p></a></body></html>
+
+Instead of ignoring the dangling </p> tag, html5lib pairs it with an
+opening <p> tag. This parser also adds an empty <head> tag to the
+document.
+
+Here's the same document parsed with Python's built-in HTML
+parser::
+
+ BeautifulSoup("<a></p>", "html.parser")
+ # <a></a>
+
+Like html5lib, this parser ignores the closing </p> tag. Unlike
+html5lib, this parser makes no attempt to create a well-formed HTML
+document by adding a <body> tag. Unlike lxml, it doesn't even bother
+to add an <html> tag.
+
+Since the document "<a></p>" is invalid, none of these techniques is
+the "correct" way to handle it. The html5lib parser uses techniques
+that are part of the HTML5 standard, so it has the best claim on being
+the "correct" way, but all three techniques are legitimate.
+
+Differences between parsers can affect your script. If you're planning
+on distributing your script to other people, you might want to specify
+in the ``BeautifulSoup`` constructor which parser you used during
+development. That will reduce the chances that your users parse a
+document differently from the way you parse it.
+
+
+Encodings
+=========
+
+Any HTML or XML document is written in a specific encoding like ASCII
+or UTF-8. But when you load that document into Beautiful Soup, you'll
+discover it's been converted to Unicode::
+
+ markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
+ soup = BeautifulSoup(markup)
+ soup.h1
+ # <h1>Sacré bleu!</h1>
+ soup.h1.string
+ # u'Sacr\xe9 bleu!'
+
+It's not magic. (That sure would be nice.) Beautiful Soup uses a
+sub-library called `Unicode, Dammit`_ to detect a document's encoding
+and convert it to Unicode.
The autodetected encoding is available as +the ``.original_encoding`` attribute of the ``BeautifulSoup`` object:: + + soup.original_encoding + 'utf-8' + +Unicode, Dammit guesses correctly most of the time, but sometimes it +makes mistakes. Sometimes it guesses correctly, but only after a +byte-by-byte search of the document that takes a very long time. If +you happen to know a document's encoding ahead of time, you can avoid +mistakes and delays by passing it to the ``BeautifulSoup`` constructor +as ``from_encoding``. + +Here's a document written in ISO-8859-8. The document is so short that +Unicode, Dammit can't get a good lock on it, and misidentifies it as +ISO-8859-7:: + + markup = b"<h1>\xed\xe5\xec\xf9</h1>" + soup = BeautifulSoup(markup) + soup.h1 + <h1>νεμω</h1> + soup.original_encoding + 'ISO-8859-7' + +We can fix this by passing in the correct ``from_encoding``:: + + soup = BeautifulSoup(markup, from_encoding="iso-8859-8") + soup.h1 + <h1>םולש</h1> + soup.original_encoding + 'iso8859-8' + +Output encoding +--------------- + +When you write out a document from Beautiful Soup, you get a UTF-8 +document, even if the document wasn't in UTF-8 to begin with. Here's a +document written in the Latin-1 encoding:: + + markup = b''' + <html> + <head> + <meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" /> + </head> + <body> + <p>Sacr\xe9 bleu!</p> + </body> + </html> + ''' + + soup = BeautifulSoup(markup) + print(soup.prettify()) + # <html> + # <head> + # <meta content="text/html; charset=utf-8" http-equiv="Content-type" /> + # </head> + # <body> + # <p> + # Sacré bleu! + # </p> + # </body> + # </html> + +Note that the <meta> tag has been rewritten to reflect the fact that +the document is now in UTF-8. + +If you don't want UTF-8, you can pass an encoding into ``prettify()``:: + + print(soup.prettify("latin-1")) + # <html> + # <head> + # <meta content="text/html; charset=latin-1" http-equiv="Content-type" /> + # ... 
+
+You can also call encode() on the ``BeautifulSoup`` object, or any
+element in the soup, just as if it were a Python string::
+
+ soup.p.encode("latin-1")
+ # '<p>Sacr\xe9 bleu!</p>'
+
+ soup.p.encode("utf-8")
+ # '<p>Sacr\xc3\xa9 bleu!</p>'
+
+Unicode, Dammit
+---------------
+
+You can use Unicode, Dammit without using Beautiful Soup. It's useful
+whenever you have data in an unknown encoding and you just want it to
+become Unicode::
+
+ from bs4 import UnicodeDammit
+ dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
+ print(dammit.unicode_markup)
+ # Sacré bleu!
+ dammit.original_encoding
+ # 'utf-8'
+
+The more data you give Unicode, Dammit, the more accurately it will
+guess. If you have your own suspicions as to what the encoding might
+be, you can pass them in as a list::
+
+ dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
+ print(dammit.unicode_markup)
+ # Sacré bleu!
+ dammit.original_encoding
+ # 'latin-1'
+
+Unicode, Dammit has one special feature that Beautiful Soup doesn't
+use. You can use it to convert Microsoft smart quotes to HTML or XML
+entities::
+
+ markup = b"<p>I just \x93love\x94 Microsoft Word</p>"
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
+ # u'<p>I just “love” Microsoft Word</p>'
+
+ UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup
+ # u'<p>I just “love” Microsoft Word</p>'
+
+You might find this feature useful, but Beautiful Soup doesn't use
+it. Beautiful Soup prefers the default behavior, which is to convert
+Microsoft smart quotes to Unicode characters along with everything
+else::
+
+ UnicodeDammit(markup, ["windows-1252"]).unicode_markup
+ # u'<p>I just \u201clove\u201d Microsoft Word</p>'
+
+Parsing only part of a document
+===============================
+
+Let's say you want to use Beautiful Soup to look at a document's <a>
+tags. It's a waste of time and memory to parse the entire document and
+then go over it again looking for <a> tags.
It would be much faster to
+ignore everything that wasn't an <a> tag in the first place. The
+``SoupStrainer`` class allows you to choose which parts of an incoming
+document are parsed. You just create a ``SoupStrainer`` and pass it in
+to the ``BeautifulSoup`` constructor as the ``parse_only`` argument.
+
+(Note that *this feature won't work if you're using the html5lib
+parser*. If you use html5lib, the whole document will be parsed, no
+matter what. In the examples below, I'll be forcing Beautiful Soup to
+use Python's built-in parser.)
+
+``SoupStrainer``
+----------------
+
+The ``SoupStrainer`` class takes the same arguments as a typical
+method from `Searching the tree`_: :ref:`name <name>`, :ref:`attrs
+<attrs>`, :ref:`text <text>`, and :ref:`**kwargs <kwargs>`. Here are
+three ``SoupStrainer`` objects::
+
+ from bs4 import SoupStrainer
+
+ only_a_tags = SoupStrainer("a")
+
+ only_tags_with_id_link2 = SoupStrainer(id="link2")
+
+ def is_short_string(string):
+ return len(string) < 10
+
+ only_short_strings = SoupStrainer(text=is_short_string)
+
+I'm going to bring back the "three sisters" document one more time,
+and we'll see what the document looks like when it's parsed with these
+three ``SoupStrainer`` objects::
+
+ html_doc = """
+ <html><head><title>The Dormouse's story</title></head>
+
+ <p class="title"><b>The Dormouse's story</b></p>
+
+ <p class="story">Once upon a time there were three little sisters; and their names were
+ <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
+ <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
+ <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
+ and they lived at the bottom of a well.</p>
+
+ <p class="story">...</p>
+ """
+
+ print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())
+ # <a class="sister" href="http://example.com/elsie" id="link1">
+ # Elsie
+ # </a>
+ # <a class="sister" href="http://example.com/lacie"
id="link2"> + # Lacie + # </a> + # <a class="sister" href="http://example.com/tillie" id="link3"> + # Tillie + # </a> + + print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify()) + # <a class="sister" href="http://example.com/lacie" id="link2"> + # Lacie + # </a> + + print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify()) + # Elsie + # , + # Lacie + # and + # Tillie + # ... + # + +You can also pass a ``SoupStrainer`` into any of the methods covered +in `Searching the tree`_. This probably isn't terribly useful, but I +thought I'd mention it:: + + soup = BeautifulSoup(html_doc) + soup.find_all(only_short_strings) + # [u'\n\n', u'\n\n', u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie', + # u'\n\n', u'...', u'\n'] + +Troubleshooting +=============== + +Parsing XML +----------- + +By default, Beautiful Soup parses documents as HTML. To parse a +document as XML, pass in "xml" as the second argument to the +``BeautifulSoup`` constructor:: + + soup = BeautifulSoup(markup, "xml") + +You'll need to :ref:`have lxml installed <parser-installation>`. + +Improving Performance +--------------------- + +Beautiful Soup will never be as fast as the parsers it sits on top +of. If response time is critical, if you're paying for computer time +by the hour, or if there's any other reason why computer time is more +valuable than programmer time, you should forget about Beautiful Soup +and work directly atop `lxml <http://lxml.de/>`_. + +That said, there are things you can do to speed up Beautiful Soup. If +you're not using lxml as the underlying parser, my advice is to +:ref:`start <parser-installation>`. Beautiful Soup parses documents +significantly faster using lxml than using html.parser or html5lib. + +Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by +doing a byte-by-byte examination of the file. This slows Beautiful +Soup to a crawl. 
My tests indicate that this only happened on 2.x +versions of Python, and that it happened most often with documents +using Russian or Chinese encodings. If this is happening to you, you +can fix it by using Python 3 for your script. Or, if you happen to +know a document's encoding, you can pass it into the +``BeautifulSoup`` constructor as ``from_encoding``. + +`Parsing only part of a document`_ won't save you much time parsing +the document, but it can save a lot of memory, and it'll make +`searching` the document much faster. + +Beautiful Soup 3 +================ + +Beautiful Soup 3.2.0 is the old version, the last release of the +Beautiful Soup 3 series. It's currently the version packaged with all +major Linux distributions:: + +:kbd:`$ apt-get install python-beautifulsoup` + +It's also published through PyPi as `BeautifulSoup`.:: + +:kbd:`$ easy_install BeautifulSoup` +:kbd:`$ pip install BeautifulSoup` + +You can also `download a tarball of Beautiful Soup 3.2.0 +<http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz>`_. + +If you ran ``easy_install beautifulsoup`` or ``easy_install +BeautifulSoup``, but your code doesn't work, you installed Beautiful +Soup 3 by mistake. You need to run ``easy_install beautifulsoup4``. + +`The documentation for Beautiful Soup 3 is archived online +<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_. If +your first language is Chinese, it might be easier for you to read +`the Chinese translation of the Beautiful Soup 3 documentation +<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html>`_, +then read this document to find out about the changes made in +Beautiful Soup 4. + +Porting code to BS4 +------------------- + +Most code written against Beautiful Soup 3 will work against Beautiful +Soup 4 with one simple change. All you should have to do is change the +package name from ``BeautifulSoup`` to ``bs4``. 
So this:: + + from BeautifulSoup import BeautifulSoup + +becomes this:: + + from bs4 import BeautifulSoup + +* If you get the ``ImportError`` "No module named BeautifulSoup", your + problem is that you're trying to run Beautiful Soup 3 code, but you + only have Beautiful Soup 4 installed. + +* If you get the ``ImportError`` "No module named bs4", your problem + is that you're trying to run Beautiful Soup 4 code, but you only + have Beautiful Soup 3 installed. + +Although BS4 is mostly backwards-compatible with BS3, most of its +methods have been deprecated and given new names for `PEP 8 compliance +<http://www.python.org/dev/peps/pep-0008/>`_. There are numerous other +renames and changes, and a few of them break backwards compatibility. + +Here's what you'll need to know to convert your BS3 code and habits to BS4: + +You need a parser +^^^^^^^^^^^^^^^^^ + +Beautiful Soup 3 used Python's ``SGMLParser``, a module that was +deprecated and removed in Python 3.0. Beautiful Soup 4 uses +``html.parser`` by default, but you can plug in lxml or html5lib and +use that instead. Until ``html.parser`` is improved to handle +real-world HTML better, that's what I recommend you do. 
See `Be sure +to install a good parser!`_ + +Method names +^^^^^^^^^^^^ + +* ``replaceWith`` -> ``replace_with`` +* ``replaceWithChildren`` -> ``replace_with_children`` +* ``findAll`` -> ``find_all`` +* ``findAllNext`` -> ``find_all_next`` +* ``findAllPrevious`` -> ``find_all_previous`` +* ``findNext`` -> ``find_next`` +* ``findNextSibling`` -> ``find_next_sibling`` +* ``findNextSiblings`` -> ``find_next_siblings`` +* ``findParent`` -> ``find_parent`` +* ``findParents`` -> ``find_parents`` +* ``findPrevious`` -> ``find_previous`` +* ``findPreviousSibling`` -> ``find_previous_sibling`` +* ``findPreviousSiblings`` -> ``find_previous_siblings`` +* ``nextSibling`` -> ``next_sibling`` +* ``previousSibling`` -> ``previous_sibling`` + +Some arguments to the Beautiful Soup constructor were renamed for the +same reasons: + +* ``BeautifulSoup(parseOnlyThese=...)`` -> ``BeautifulSoup(parse_only=...)`` +* ``BeautifulSoup(fromEncoding=...)`` -> ``BeautifulSoup(from_encoding=...)`` + +I renamed one method for compatibility with Python 3: + +* ``Tag.has_key()`` -> ``Tag.has_attr()`` + +I renamed one attribute to use more accurate terminology: + +* ``Tag.isSelfClosing`` -> ``Tag.is_empty_element`` + +I renamed three attributes to avoid using words that have special +meaning to Python. Unlike the others, these changes are *not backwards +compatible.* If you used these attributes in BS3, your code will break +on BS4 until you change them. 
+ +* ``UnicodeDammit.unicode`` -> ``UnicodeDammit.unicode_markup`` +* ``Tag.next`` -> ``Tag.next_element`` +* ``Tag.previous`` -> ``Tag.previous_element`` + +Generators +^^^^^^^^^^ + +I gave the generators PEP 8-compliant names, and transformed them into +properties: + +* ``childGenerator()`` -> ``children`` +* ``nextGenerator()`` -> ``next_elements`` +* ``nextSiblingGenerator()`` -> ``next_siblings`` +* ``previousGenerator()`` -> ``previous_elements`` +* ``previousSiblingGenerator()`` -> ``previous_siblings`` +* ``recursiveChildGenerator()`` -> ``descendants`` +* ``parentGenerator()`` -> ``parents`` + +So instead of this:: + + for parent in tag.parentGenerator(): + ... + +You can write this:: + + for parent in tag.parents: + ... + +(But the old code will still work.) + +Some of the generators used to yield ``None`` after they were done, and +then stop. That was a bug. Now the generators just stop. + +There are two new generators, :ref:`.strings and +.stripped_strings <string-generators>`. ``.strings`` yields +NavigableString objects, and ``.stripped_strings`` yields Python +strings that have had whitespace stripped. + +XML +^^^ + +There is no longer a ``BeautifulStoneSoup`` class for parsing XML. To +parse XML you pass in "xml" as the second argument to the +``BeautifulSoup`` constructor. For the same reason, the +``BeautifulSoup`` constructor no longer recognizes the ``isHTML`` +argument. + +Beautiful Soup's handling of empty-element XML tags has been +improved. Previously when you parsed XML you had to explicitly say +which tags were considered empty-element tags. The ``selfClosingTags`` +argument to the constructor is no longer recognized. Instead, +Beautiful Soup considers any empty tag to be an empty-element tag. If +you add a child to an empty-element tag, it stops being an +empty-element tag. + +Entities +^^^^^^^^ + +An incoming HTML or XML entity is always converted into the +corresponding Unicode character. 
Beautiful Soup 3 had a number of +overlapping ways of dealing with entities, which have been +removed. The ``BeautifulSoup`` constructor no longer recognizes the +``smartQuotesTo`` or ``convertEntities`` arguments. (`Unicode, +Dammit`_ still has ``smart_quotes_to``, but its default is now to turn +smart quotes into Unicode.) + +If you want to turn those Unicode characters back into HTML entities +on output, rather than turning them into UTF-8 characters, you need to +use ``.encode``, as described in `Substituting HTML entities`. This +may change before the final release. + +Miscellaneous +^^^^^^^^^^^^^ + +:ref:`Tag.string <.string>` now operates recursively. If tag A +contains a single tag B and nothing else, then A.string is the same as +B.string. (Previously, it was None.) + +The ``BeautifulSoup`` constructor no longer recognizes the +`markupMassage` argument. It's now the parser's responsibility to +handle markup correctly. + +The rarely-used alternate parser classes like +``ICantBelieveItsBeautifulSoup`` and ``BeautifulSOAP`` have been +removed. It's now the parser's decision how to handle ambiguous +markup. |