-rw-r--r-- | AUTHORS.txt                        |   2
-rw-r--r-- | LICENSE (renamed from COPYING.txt) |   2
-rw-r--r-- | NEWS.txt                           |  73
-rw-r--r-- | bs4/__init__.py                    | 103
-rw-r--r-- | bs4/builder/__init__.py            |   6
-rw-r--r-- | bs4/builder/_html5lib.py           | 138
-rw-r--r-- | bs4/builder/_htmlparser.py         |   3
-rw-r--r-- | bs4/builder/_lxml.py               |  22
-rw-r--r-- | bs4/dammit.py                      |  10
-rw-r--r-- | bs4/diagnose.py                    |   5
-rw-r--r-- | bs4/element.py                     | 102
-rw-r--r-- | bs4/testing.py                     |  38
-rw-r--r-- | bs4/tests/test_html5lib.py         |  32
-rw-r--r-- | bs4/tests/test_soup.py             |  40
-rw-r--r-- | bs4/tests/test_tree.py             |  45
-rw-r--r-- | doc/source/index.rst               |  49
-rw-r--r-- | prepare-release.sh                 |  66
-rw-r--r-- | setup.py                           |   4
18 files changed, 582 insertions, 158 deletions
diff --git a/AUTHORS.txt b/AUTHORS.txt index 2ac8fcc..ea6f785 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -16,7 +16,7 @@ support CSS selectors. Sam Ruby helped with a lot of edge cases. -Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his +Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his work in solving the nestable tags conundrum. An incomplete list of people have contributed patches to Beautiful @@ -1,6 +1,6 @@ Beautiful Soup is made available under the MIT license: - Copyright (c) 2004-2015 Leonard Richardson + Copyright (c) 2004-2016 Leonard Richardson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -1,3 +1,70 @@ += Unreleased = + +* Fixed foster parenting when html5lib is the tree builder. Thanks to + Geoffrey Sneddon for a patch and test. + +* Fixed yet another problem that caused the html5lib tree builder to + create a disconnected parse tree. [bug=1629825] + += 4.5.1 (20160802) = + +* Fixed a crash when passing Unicode markup that contained a + processing instruction into the lxml HTML parser on Python + 3. [bug=1608048] + += 4.5.0 (20160719) = + +* Beautiful Soup is no longer compatible with Python 2.6. This + actually happened a few releases ago, but it's now official. + +* Beautiful Soup will now work with versions of html5lib greater than + 0.99999999. [bug=1603299] + +* If a search against each individual value of a multi-valued + attribute fails, the search will be run one final time against the + complete attribute value considered as a single string. That is, if + a tag has class="foo bar" and neither "foo" nor "bar" matches, but + "foo bar" does, the tag is now considered a match. + + This happened in previous versions, but only when the value being + searched for was a string. Now it also works when that value is + a regular expression, a list of strings, etc. [bug=1476868] + +* Fixed a bug that deranged the tree when a whitespace element was + reparented into a tag that contained an identical whitespace + element. [bug=1505351] + +* Added support for CSS selector values that contain quoted spaces, + such as tag[style="display: foo"]. [bug=1540588] + +* Corrected handling of XML processing instructions. [bug=1504393] + +* Corrected an encoding error that happened when a BeautifulSoup + object was copied. [bug=1554439] + +* The contents of <textarea> tags will no longer be modified when the + tree is prettified. [bug=1555829] + +* When a BeautifulSoup object is pickled but its tree builder cannot + be pickled, its .builder attribute is set to None instead of being + destroyed. This avoids a performance problem once the object is + unpickled. [bug=1523629] + +* Specify the file and line number when warning about a + BeautifulSoup object being instantiated without a parser being + specified. [bug=1574647] + +* The `limit` argument to `select()` now works correctly, though it's + not implemented very efficiently. [bug=1520530] + +* Fixed a Python 3 ByteWarning when a URL was passed in as though it + were markup. Thanks to James Salter for a patch and + test. [bug=1533762] + +* We don't run the check for a filename passed in as markup if the + 'filename' contains a less-than character; the less-than character + indicates it's most likely a very small document. 
[bug=1577864] + = 4.4.1 (20150928) = * Fixed a bug that deranged the tree when part of it was @@ -455,7 +522,7 @@ Bug fixes: * Renamed Tag.nsprefix to Tag.prefix, for consistency with NamespacedAttribute. -* Fixed a test failure that occured on Python 3.x when chardet was +* Fixed a test failure that occurred on Python 3.x when chardet was installed. * Made prettify() return Unicode by default, so it will look nice on @@ -489,7 +556,7 @@ Bug fixes: * Restored compatibility with Python 2.6. -* The install process no longer installs docs or auxillary text files. +* The install process no longer installs docs or auxiliary text files. * It's now possible to deepcopy a BeautifulSoup object created with Python's built-in HTML parser. @@ -728,7 +795,7 @@ Added an import that makes BS work in Python 2.3. Fixed a UnicodeDecodeError when unpickling documents that contain non-ASCII characters. -Fixed a TypeError that occured in some circumstances when a tag +Fixed a TypeError that occurred in some circumstances when a tag contained no text. Jump through hoops to avoid the use of chardet, which can be extremely diff --git a/bs4/__init__.py b/bs4/__init__.py index d35f765..aa818ae 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -5,26 +5,31 @@ http://www.crummy.com/software/BeautifulSoup/ Beautiful Soup uses a pluggable XML or HTML parser to parse a (possibly invalid) document into a tree representation. Beautiful Soup -provides provides methods and Pythonic idioms that make it easy to -navigate, search, and modify the parse tree. +provides methods and Pythonic idioms that make it easy to navigate, +search, and modify the parse tree. -Beautiful Soup works with Python 2.6 and up. It works better if lxml +Beautiful Soup works with Python 2.7 and up. It works better if lxml and/or html5lib is installed. For more than you ever wanted to know about Beautiful Soup, see the documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ + """ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.4.0" -__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" +__version__ = "4.5.1" +__copyright__ = "Copyright (c) 2004-2016 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] import os import re +import traceback import warnings from .builder import builder_registry, ParserRejectedMarkup @@ -77,7 +82,7 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. 
To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, @@ -137,6 +142,10 @@ class BeautifulSoup(Tag): from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") + if from_encoding and isinstance(markup, unicode): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + if len(kwargs) > 0: arg = kwargs.keys().pop() raise TypeError( @@ -161,19 +170,29 @@ class BeautifulSoup(Tag): markup_type = "XML" else: markup_type = "HTML" + + caller = traceback.extract_stack()[0] + filename = caller[0] + line_number = caller[1] warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( + filename=filename, + line_number=line_number, parser=builder.NAME, markup_type=markup_type)) self.builder = builder self.is_xml = builder.is_xml + self.known_xml = self.is_xml self.builder.soup = self self.parse_only = parse_only if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() - elif len(markup) <= 256: + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) + or (isinstance(markup, unicode) and not u'<' in markup) + ): # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, @@ -195,16 +214,10 @@ class BeautifulSoup(Tag): if isinstance(markup, unicode): markup = markup.encode("utf8") warnings.warn( - '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) - if markup[:5] == "http:" or markup[:6] == "https:": - # TODO: This is ugly but I couldn't get it to work in - # Python 3 otherwise. - if ((isinstance(markup, bytes) and not b' ' in markup) - or (isinstance(markup, unicode) and not u' ' in markup)): - if isinstance(markup, unicode): - markup = markup.encode("utf8") - warnings.warn( - '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) + '"%s" looks like a filename, not markup. You should' + 'probably open this file and pass the filehandle into' + 'Beautiful Soup.' % markup) + self._check_markup_is_url(markup) for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( @@ -223,15 +236,52 @@ class BeautifulSoup(Tag): self.builder.soup = None def __copy__(self): - return type(self)(self.encode(), builder=self.builder) + copy = type(self)( + self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' + ) + + # Although we encoded the tree to UTF-8, that may not have + # been the encoding of the original markup. Set the copy's + # .original_encoding to reflect the original object's + # .original_encoding. + copy.original_encoding = self.original_encoding + return copy def __getstate__(self): # Frequently a tree builder can't be pickled. d = dict(self.__dict__) if 'builder' in d and not self.builder.picklable: - del d['builder'] + d['builder'] = None return d + @staticmethod + def _check_markup_is_url(markup): + """ + Check if markup looks like it's actually a url and raise a warning + if so. Markup can be unicode or str (py2) / bytes (py3). 
+ """ + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") + elif isinstance(markup, unicode): + space = u' ' + cant_start_with = (u"http:", u"https:") + else: + return + + if any(markup.startswith(prefix) for prefix in cant_start_with): + if not space in markup: + if isinstance(markup, bytes): + decoded_markup = markup.decode('utf-8', 'replace') + else: + decoded_markup = markup + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an' + ' HTTP client. You should probably use an HTTP client like' + ' requests to get the document behind the URL, and feed' + ' that document to Beautiful Soup.' % decoded_markup + ) + def _feed(self): # Convert the document to Unicode. self.builder.reset() @@ -335,7 +385,18 @@ class BeautifulSoup(Tag): if parent.next_sibling: # This node is being inserted into an element that has # already been parsed. Deal with any dangling references. - index = parent.contents.index(o) + index = len(parent.contents)-1 + while index >= 0: + if parent.contents[index] is o: + break + index -= 1 + else: + raise ValueError( + "Error building tree: supposedly %r was inserted " + "into %r after the fact, but I don't see it!" % ( + o, parent + ) + ) if index == 0: previous_element = parent previous_sibling = None @@ -387,7 +448,7 @@ class BeautifulSoup(Tag): """Push a start tag on to the stack. If this method returns None, the tag was rejected by the - SoupStrainer. You should proceed as if the tag had not occured + SoupStrainer. You should proceed as if the tag had not occurred in the document. For instance, if this was a self-closing tag, don't call handle_endtag. """ diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index f8fce56..601979b 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -1,9 +1,13 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + from collections import defaultdict import itertools import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, + HTMLAwareEntitySubstitution, whitespace_re ) @@ -227,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder): Such as which tags are empty-element tags. """ - preserve_whitespace_tags = set(['pre', 'textarea']) + preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 8725a65..5f54893 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -1,9 +1,12 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+ __all__ = [ 'HTML5TreeBuilder', ] -from pdb import set_trace import warnings +import re from bs4.builder import ( PERMISSIVE, HTML, @@ -15,7 +18,10 @@ from bs4.element import ( whitespace_re, ) import html5lib -from html5lib.constants import namespaces +from html5lib.constants import ( + namespaces, + prefixes, + ) from bs4.element import ( Comment, Doctype, @@ -23,6 +29,15 @@ from bs4.element import ( Tag, ) +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError, e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" @@ -47,7 +62,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder): if self.soup.parse_only is not None: warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") parser = html5lib.HTMLParser(tree=self.create_treebuilder) - doc = parser.parse(markup, encoding=self.user_specified_encoding) + + extra_kwargs = dict() + if not isinstance(markup, unicode): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. if isinstance(markup, unicode): @@ -55,11 +77,17 @@ class HTML5TreeBuilder(HTMLTreeBuilder): # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: - doc.original_encoding = parser.tokenizer.stream.charEncoding[0] + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, basestring): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. 
+ original_encoding = original_encoding.name + doc.original_encoding = original_encoding def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( - self.soup, namespaceHTMLElements) + namespaceHTMLElements, self.soup) return self.underlying_builder def test_fragment_to_document(self, fragment): @@ -67,10 +95,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder): return u'<html><head></head><body>%s</body></html>' % fragment -class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): - def __init__(self, soup, namespaceHTMLElements): - self.soup = soup + def __init__(self, namespaceHTMLElements, soup=None): + if soup: + self.soup = soup + else: + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) def documentClass(self): @@ -93,7 +125,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): return TextNode(Comment(data), self.soup) def fragmentClass(self): - self.soup = BeautifulSoup("") + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None) @@ -105,7 +138,57 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): return self.soup def getFragment(self): - return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + return treebuilder_base.TreeBuilder.getFragment(self).element + + def testSerializer(self, element): + from bs4 import BeautifulSoup + rv = [] + doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') + + def serializeElement(element, indent=0): + if isinstance(element, BeautifulSoup): + pass + if isinstance(element, Doctype): + m = doctype_re.match(element) + if m: + name = m.group(1) + if m.lastindex > 1: + publicId = m.group(2) or "" + systemId = m.group(3) or m.group(4) or "" + rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" % + (' ' * indent, name, publicId, systemId)) + else: + rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name)) + else: + rv.append("|%s<!DOCTYPE >" % (' ' * indent,)) + elif isinstance(element, Comment): + rv.append("|%s<!-- %s -->" % (' ' * indent, element)) + elif isinstance(element, NavigableString): + rv.append("|%s\"%s\"" % (' ' * indent, element)) + else: + if element.namespace: + name = "%s %s" % (prefixes[element.namespace], + element.name) + else: + name = element.name + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] + for name, value in element.attrs.items(): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): + value = " ".join(value) + attributes.append((name, value)) + + for name, value in sorted(attributes): + rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) + indent += 2 + for child in element.children: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) class AttrList(object): def __init__(self, element): @@ -137,9 +220,9 @@ class AttrList(object): return name in list(self.attrs.keys()) -class Element(html5lib.treebuilders._base.Node): +class Element(treebuilder_base.Node): def __init__(self, element, soup, namespace): - html5lib.treebuilders._base.Node.__init__(self, element.name) + treebuilder_base.Node.__init__(self, element.name) self.element = element self.soup = soup self.namespace = namespace @@ 
-158,8 +241,10 @@ class Element(html5lib.treebuilders._base.Node): child = node elif node.element.__class__ == NavigableString: string_child = child = node.element + node.parent = self else: child = node.element + node.parent = self if not isinstance(child, basestring) and child.parent is not None: node.element.extract() @@ -197,6 +282,8 @@ class Element(html5lib.treebuilders._base.Node): most_recent_element=most_recent_element) def getAttributes(self): + if isinstance(self.element, Comment): + return {} return AttrList(self.element) def setAttributes(self, attributes): @@ -224,11 +311,11 @@ class Element(html5lib.treebuilders._base.Node): attributes = property(getAttributes, setAttributes) def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) if insertBefore: - text = TextNode(self.soup.new_string(data), self.soup) - self.insertBefore(data, insertBefore) + self.insertBefore(text, insertBefore) else: - self.appendChild(data) + self.appendChild(text) def insertBefore(self, node, refNode): index = self.element.index(refNode.element) @@ -250,6 +337,7 @@ class Element(html5lib.treebuilders._base.Node): # print "MOVE", self.element.contents # print "FROM", self.element # print "TO", new_parent.element + element = self.element new_parent_element = new_parent.element # Determine what this tag's next_element will be once all the children @@ -268,7 +356,6 @@ class Element(html5lib.treebuilders._base.Node): new_parents_last_descendant_next_element = new_parent_element.next_element to_append = element.contents - append_after = new_parent_element.contents if len(to_append) > 0: # Set the first child's previous_element and previous_sibling # to elements within the new parent @@ -285,12 +372,19 @@ class Element(html5lib.treebuilders._base.Node): if new_parents_last_child: new_parents_last_child.next_sibling = first_child - # Fix the last child's next_element and next_sibling - last_child = to_append[-1] - last_child.next_element = new_parents_last_descendant_next_element + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant(False, True) + + last_childs_last_descendant.next_element = new_parents_last_descendant_next_element if new_parents_last_descendant_next_element: - new_parents_last_descendant_next_element.previous_element = last_child - last_child.next_sibling = None + # TODO: This code has no test coverage and I'm not sure + # how to get html5lib to go through this path, but it's + # just the other side of the previous line. + new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant + last_childs_last_descendant.next_sibling = None for child in to_append: child.parent = new_parent_element @@ -324,7 +418,7 @@ class Element(html5lib.treebuilders._base.Node): class TextNode(Element): def __init__(self, element, soup): - html5lib.treebuilders._base.Node.__init__(self, None) + treebuilder_base.Node.__init__(self, None) self.element = element self.soup = soup diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 0101d64..823ca15 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -1,5 +1,8 @@ """Use the HTMLParser library to parse HTML files that aren't too bad.""" +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+ __all__ = [ 'HTMLParserTreeBuilder', ] diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 9e8f88f..d2ca287 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -1,3 +1,5 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. __all__ = [ 'LXMLTreeBuilderForXML', 'LXMLTreeBuilder', @@ -12,6 +14,7 @@ from bs4.element import ( Doctype, NamespacedAttribute, ProcessingInstruction, + XMLProcessingInstruction, ) from bs4.builder import ( FAST, @@ -29,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser is_xml = True + processing_instruction_class = XMLProcessingInstruction NAME = "lxml-xml" ALTERNATE_NAMES = ["xml"] @@ -87,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder): Each 4-tuple represents a strategy for parsing the document. """ + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + if isinstance(markup, unicode): # We were given Unicode. Maybe lxml can parse Unicode on # this system? @@ -98,11 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder): yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) - # Instead of using UnicodeDammit to convert the bytestring to - # Unicode using different encodings, use EncodingDetector to - # iterate over the encodings, and tell lxml to try to parse - # the document as each one in turn. - is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector( markup, try_encodings, is_html, exclude_encodings) @@ -201,7 +210,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def pi(self, target, data): self.soup.endData() self.soup.handle_data(target + ' ' + data) - self.soup.endData(ProcessingInstruction) + self.soup.endData(self.processing_instruction_class) def data(self, content): self.soup.handle_data(content) @@ -229,6 +238,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] is_xml = False + processing_instruction_class = ProcessingInstruction def default_parser(self, encoding): return etree.HTMLParser diff --git a/bs4/dammit.py b/bs4/dammit.py index 030f04a..7965565 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -6,9 +6,10 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal Feed Parser. It works best on XML and HTML, but it does not rewrite the XML or HTML to reflect a new encoding; that's the tree builder's job. """ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
__license__ = "MIT" -from pdb import set_trace import codecs from htmlentitydefs import codepoint2name import re @@ -346,7 +347,7 @@ class UnicodeDammit: self.tried_encodings = [] self.contains_replacement_characters = False self.is_html = is_html - + self.log = logging.getLogger(__name__) self.detector = EncodingDetector( markup, override_encodings, is_html, exclude_encodings) @@ -376,9 +377,10 @@ class UnicodeDammit: if encoding != "ascii": u = self._convert_from(encoding, "replace") if u is not None: - logging.warning( + self.log.warning( "Some characters could not be decoded, and were " - "replaced with REPLACEMENT CHARACTER.") + "replaced with REPLACEMENT CHARACTER." + ) self.contains_replacement_characters = True break diff --git a/bs4/diagnose.py b/bs4/diagnose.py index c04d23c..8768332 100644 --- a/bs4/diagnose.py +++ b/bs4/diagnose.py @@ -1,5 +1,7 @@ """Diagnostic functions, mainly for use when doing tech support.""" +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. __license__ = "MIT" import cProfile @@ -56,7 +58,8 @@ def diagnose(data): data = data.read() elif os.path.exists(data): print '"%s" looks like a filename. Reading data from the file.' % data - data = open(data).read() + with open(data) as fp: + data = fp.read() elif data.startswith("http:") or data.startswith("https:"): print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." diff --git a/bs4/element.py b/bs4/element.py index ecf2b28..b100d18 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1,8 +1,10 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. __license__ = "MIT" -from pdb import set_trace import collections import re +import shlex import sys import warnings from bs4.dammit import EntitySubstitution @@ -99,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): preformatted_tags = set(["pre"]) + preserve_whitespace_tags = set(['pre', 'textarea']) + @classmethod def _substitute_if_appropriate(cls, ns, f): if (isinstance(ns, NavigableString) @@ -169,11 +173,19 @@ class PageElement(object): This is used when mapping a formatter name ("minimal") to an appropriate function (one that performs entity-substitution on - the contents of <script> and <style> tags, or not). It's + the contents of <script> and <style> tags, or not). It can be inefficient, but it should be called very rarely. """ + if self.known_xml is not None: + # Most of the time we will have determined this when the + # document is parsed. + return self.known_xml + + # Otherwise, it's likely that this element was created by + # direct invocation of the constructor from within the user's + # Python code. if self.parent is None: - # This is the top-level object. It should have .is_xml set + # This is the top-level object. It should have .known_xml set # from tree creation. If not, take a guess--BS is usually # used on HTML markup. 
return getattr(self, 'is_xml', False) @@ -637,7 +649,7 @@ class PageElement(object): return lambda el: el._attr_value_as_string( attribute, '').startswith(value) elif operator == '$': - # string represenation of `attribute` ends with `value` + # string representation of `attribute` ends with `value` return lambda el: el._attr_value_as_string( attribute, '').endswith(value) elif operator == '*': @@ -677,6 +689,11 @@ class NavigableString(unicode, PageElement): PREFIX = '' SUFFIX = '' + # We can't tell just by looking at a string whether it's contained + # in an XML document or an HTML document. + + known_xml = None + def __new__(cls, value): """Create a new NavigableString. @@ -743,10 +760,16 @@ class CData(PreformattedString): SUFFIX = u']]>' class ProcessingInstruction(PreformattedString): + """A SGML processing instruction.""" PREFIX = u'<?' SUFFIX = u'>' +class XMLProcessingInstruction(ProcessingInstruction): + """An XML processing instruction.""" + PREFIX = u'<?' + SUFFIX = u'?>' + class Comment(PreformattedString): PREFIX = u'<!--' @@ -781,7 +804,8 @@ class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" def __init__(self, parser=None, builder=None, name=None, namespace=None, - prefix=None, attrs=None, parent=None, previous=None): + prefix=None, attrs=None, parent=None, previous=None, + is_xml=None): "Basic constructor." if parser is None: @@ -795,6 +819,14 @@ class Tag(PageElement): self.name = name self.namespace = namespace self.prefix = prefix + if builder is not None: + preserve_whitespace_tags = builder.preserve_whitespace_tags + else: + if is_xml: + preserve_whitespace_tags = [] + else: + preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags + self.preserve_whitespace_tags = preserve_whitespace_tags if attrs is None: attrs = {} elif attrs: @@ -805,6 +837,13 @@ class Tag(PageElement): attrs = dict(attrs) else: attrs = dict(attrs) + + # If possible, determine ahead of time whether this tag is an + # XML tag. + if builder: + self.known_xml = builder.is_xml + else: + self.known_xml = is_xml self.attrs = attrs self.contents = [] self.setup(parent, previous) @@ -824,7 +863,7 @@ class Tag(PageElement): Its contents are a copy of the old Tag's contents. """ clone = type(self)(None, self.builder, self.name, self.namespace, - self.nsprefix, self.attrs) + self.nsprefix, self.attrs, is_xml=self._is_xml) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) for child in self.contents: @@ -997,7 +1036,7 @@ class Tag(PageElement): tag_name, tag_name)) return self.find(tag_name) # We special case contents to avoid recursion. 
- elif not tag.startswith("__") and not tag=="contents": + elif not tag.startswith("__") and not tag == "contents": return self.find(tag) raise AttributeError( "'%s' object has no attribute '%s'" % (self.__class__, tag)) @@ -1057,10 +1096,11 @@ class Tag(PageElement): def _should_pretty_print(self, indent_level): """Should this tag be pretty-printed?""" + return ( - indent_level is not None and - (self.name not in HTMLAwareEntitySubstitution.preformatted_tags - or self._is_xml)) + indent_level is not None + and self.name not in self.preserve_whitespace_tags + ) def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, @@ -1280,6 +1320,7 @@ class Tag(PageElement): _selector_combinators = ['>', '+', '~'] _select_debug = False + quoted_colon = re.compile('"[^"]*:[^"]*"') def select_one(self, selector): """Perform a CSS selection operation on the current element.""" value = self.select(selector, limit=1) @@ -1305,8 +1346,7 @@ class Tag(PageElement): if limit and len(context) >= limit: break return context - - tokens = selector.split() + tokens = shlex.split(selector) current_context = [self] if tokens[-1] in self._selector_combinators: @@ -1358,7 +1398,7 @@ class Tag(PageElement): return classes.issubset(candidate.get('class', [])) checker = classes_match - elif ':' in token: + elif ':' in token and not self.quoted_colon.search(token): # Pseudo-class tag_name, pseudo = token.split(':', 1) if tag_name == '': @@ -1389,11 +1429,8 @@ class Tag(PageElement): self.count += 1 if self.count == self.destination: return True - if self.count > self.destination: - # Stop the generator that's sending us - # these things. - raise StopIteration() - return False + else: + return False checker = Counter(pseudo_value).nth_child_of_type else: raise NotImplementedError( @@ -1498,13 +1535,12 @@ class Tag(PageElement): # don't include it in the context more than once. new_context.append(candidate) new_context_ids.add(id(candidate)) - if limit and len(new_context) >= limit: - break elif self._select_debug: print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) - current_context = new_context + if limit and len(current_context) >= limit: + current_context = current_context[:limit] if self._select_debug: print "Final verdict:" @@ -1668,21 +1704,15 @@ class SoupStrainer(object): if isinstance(markup, list) or isinstance(markup, tuple): # This should only happen when searching a multi-valued attribute # like 'class'. - if (isinstance(match_against, unicode) - and ' ' in match_against): - # A bit of a special case. If they try to match "foo - # bar" on a multivalue attribute's value, only accept - # the literal value "foo bar" - # - # XXX This is going to be pretty slow because we keep - # splitting match_against. But it shouldn't come up - # too often. - return (whitespace_re.split(match_against) == markup) - else: - for item in markup: - if self._matches(item, match_against): - return True - return False + for item in markup: + if self._matches(item, match_against): + return True + # We didn't match any particular value of the multivalue + # attribute, but maybe we match the attribute value when + # considered as a string. + if self._matches(' '.join(markup), match_against): + return True + return False if match_against is True: # True matches any non-None value. 
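The SoupStrainer._matches change in bs4/element.py above, together with the multi-valued attribute entry in NEWS.txt, boils down to the following behavior. This is an illustrative sketch only; the markup and the "o b" pattern mirror the updated tests in bs4/tests/test_tree.py rather than code from this commit:

    from bs4 import BeautifulSoup
    import re

    soup = BeautifulSoup('<div class="foo bar">Found it</div>', 'html.parser')

    # Neither "foo" nor "bar" matches this regular expression on its own,
    # but the combined string "foo bar" does, so the tag is now a match.
    soup.find_all('div', class_=re.compile('o b'))
    # [<div class="foo bar">Found it</div>]

    # The same fallback applies when the value searched for is a plain
    # string or a list of strings.
    soup.find_all('div', class_='foo bar')
    # [<div class="foo bar">Found it</div>]
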
diff --git a/bs4/testing.py b/bs4/testing.py index 7ba54ab..3a6ed42 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -1,5 +1,7 @@ """Helper classes for tests.""" +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. __license__ = "MIT" import pickle @@ -137,6 +139,14 @@ class HTMLTreeBuilderSmokeTest(object): markup.replace(b"\n", b"")) def test_processing_instruction(self): + # We test both Unicode and bytestring to verify that + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. + markup = u"""<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.decode()) + markup = b"""<?PITarget PIContent?>""" soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) @@ -215,9 +225,22 @@ Hello, world! self.assertEqual(comment, baz.previous_element) def test_preserved_whitespace_in_pre_and_textarea(self): - """Whitespace must be preserved in <pre> and <textarea> tags.""" - self.assertSoupEquals("<pre> </pre>") - self.assertSoupEquals("<textarea> woo </textarea>") + """Whitespace must be preserved in <pre> and <textarea> tags, + even if that would mean not prettifying the markup. + """ + pre_markup = "<pre> </pre>" + textarea_markup = "<textarea> woo\nwoo </textarea>" + self.assertSoupEquals(pre_markup) + self.assertSoupEquals(textarea_markup) + + soup = self.soup(pre_markup) + self.assertEqual(soup.pre.prettify(), pre_markup) + + soup = self.soup(textarea_markup) + self.assertEqual(soup.textarea.prettify(), textarea_markup) + + soup = self.soup("<textarea></textarea>") + self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>") def test_nested_inline_elements(self): """Inline elements can be nested indefinitely.""" @@ -480,7 +503,9 @@ Hello, world! hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>' soup = self.soup( hebrew_document, from_encoding="iso8859-8") - self.assertEqual(soup.original_encoding, 'iso8859-8') + # Some tree builders call it iso8859-8, others call it iso-8859-9. + # That's not a difference we really care about. + assert soup.original_encoding in ('iso8859-8', 'iso-8859-8') self.assertEqual( soup.encode('utf-8'), hebrew_document.decode("iso8859-8").encode("utf-8")) @@ -563,6 +588,11 @@ class XMLTreeBuilderSmokeTest(object): soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) + def test_processing_instruction(self): + markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + def test_real_xhtml_document(self): """A real XHTML document should come out *exactly* the same as it went in.""" markup = b"""<?xml version="1.0" encoding="utf-8"?> diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index 65536c2..0f89d62 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -84,6 +84,33 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) + def test_reparented_markup_containing_identical_whitespace_nodes(self): + """Verify that we keep the two whitespace nodes in this + document distinct when reparenting the adjacent <tbody> tags. 
+ """ + markup = '<table> <tbody><tbody><ims></tbody> </table>' + soup = self.soup(markup) + space1, space2 = soup.find_all(string=' ') + tbody1, tbody2 = soup.find_all('tbody') + assert space1.next_element is tbody1 + assert tbody2.next_element is space2 + + def test_reparented_markup_containing_children(self): + markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>' + soup = self.soup(markup) + noscript = soup.noscript + self.assertEqual("target", noscript.next_element) + target = soup.find(string='target') + + # The 'aftermath' string was duplicated; we want the second one. + final_aftermath = soup.find_all(string='aftermath')[-1] + + # The <noscript> tag was moved beneath a copy of the <a> tag, + # but the 'target' string within is still connected to the + # (second) 'aftermath' string. + self.assertEqual(final_aftermath, target.next_element) + self.assertEqual(target, final_aftermath.previous_element) + def test_processing_instruction(self): """Processing instructions become comments.""" markup = b"""<?PITarget PIContent?>""" @@ -96,3 +123,8 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): a1, a2 = soup.find_all('a') self.assertEqual(a1, a2) assert a1 is not a2 + + def test_foster_parenting(self): + markup = b"""<table><td></tbody>A""" + soup = self.soup(markup) + self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode()) diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 1238af2..f3e69ed 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -35,7 +35,6 @@ try: except ImportError, e: LXML_PRESENT = False -PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) class TestConstructor(SoupTest): @@ -77,7 +76,7 @@ class TestWarnings(SoupTest): def test_no_warning_if_explicit_parser_specified(self): with warnings.catch_warnings(record=True) as w: soup = self.soup("<a><b></b></a>", "html.parser") - self.assertEquals([], w) + self.assertEqual([], w) def test_parseOnlyThese_renamed_to_parse_only(self): with warnings.catch_warnings(record=True) as w: @@ -118,15 +117,34 @@ class TestWarnings(SoupTest): soup = self.soup(filename) self.assertEqual(0, len(w)) - def test_url_warning(self): - with warnings.catch_warnings(record=True) as w: - soup = self.soup("http://www.crummy.com/") - msg = str(w[0].message) - self.assertTrue("looks like a URL" in msg) + def test_url_warning_with_bytes_url(self): + with warnings.catch_warnings(record=True) as warning_list: + soup = self.soup(b"http://www.crummybytes.com/") + # Be aware this isn't the only warning that can be raised during + # execution.. 
+ self.assertTrue(any("looks like a URL" in str(w.message) + for w in warning_list)) + + def test_url_warning_with_unicode_url(self): + with warnings.catch_warnings(record=True) as warning_list: + # note - this url must differ from the bytes one otherwise + # python's warnings system swallows the second warning + soup = self.soup(u"http://www.crummyunicode.com/") + self.assertTrue(any("looks like a URL" in str(w.message) + for w in warning_list)) + + def test_url_warning_with_bytes_and_space(self): + with warnings.catch_warnings(record=True) as warning_list: + soup = self.soup(b"http://www.crummybytes.com/ is great") + self.assertFalse(any("looks like a URL" in str(w.message) + for w in warning_list)) + + def test_url_warning_with_unicode_and_space(self): + with warnings.catch_warnings(record=True) as warning_list: + soup = self.soup(u"http://www.crummyuncode.com/ is great") + self.assertFalse(any("looks like a URL" in str(w.message) + for w in warning_list)) - with warnings.catch_warnings(record=True) as w: - soup = self.soup("http://www.crummy.com/ is great") - self.assertEqual(0, len(w)) class TestSelectiveParsing(SoupTest): @@ -260,7 +278,7 @@ class TestEncodingConversion(SoupTest): self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) @skipIf( - PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, + PYTHON_3_PRE_3_2, "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") def test_attribute_name_containing_unicode_characters(self): markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 6b2a123..a4fe0b1 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -222,6 +222,17 @@ class TestFindAllByName(TreeTest): self.assertSelects( tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) + def test_find_with_multi_valued_attribute(self): + soup = self.soup( + "<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>" + ) + r1 = soup.find('div', 'a d'); + r2 = soup.find('div', re.compile(r'a d')); + r3, r4 = soup.find_all('div', ['a b', 'a d']); + self.assertEqual('3', r1.string) + self.assertEqual('3', r2.string) + self.assertEqual('1', r3.string) + self.assertEqual('3', r4.string) class TestFindAllByAttribute(TreeTest): @@ -294,10 +305,10 @@ class TestFindAllByAttribute(TreeTest): f = tree.find_all("gar", class_=re.compile("a")) self.assertSelects(f, ["Found it"]) - # Since the class is not the string "foo bar", but the two - # strings "foo" and "bar", this will not find anything. + # If the search fails to match the individual strings "foo" and "bar", + # it will be tried against the combined string "foo bar". f = tree.find_all("gar", class_=re.compile("o b")) - self.assertSelects(f, []) + self.assertSelects(f, ["Found it"]) def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): soup = self.soup("<a class='bar'>Found it</a>") @@ -335,7 +346,7 @@ class TestFindAllByAttribute(TreeTest): strainer = SoupStrainer(attrs={'id' : 'first'}) self.assertSelects(tree.find_all(strainer), ['Match.']) - def test_find_all_with_missing_atribute(self): + def test_find_all_with_missing_attribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that do not have that attribute set. 
tree = self.soup("""<a id="1">ID present.</a> @@ -1328,6 +1339,13 @@ class TestPersistence(SoupTest): copied = copy.deepcopy(self.tree) self.assertEqual(copied.decode(), self.tree.decode()) + def test_copy_preserves_encoding(self): + soup = BeautifulSoup(b'<p> </p>', 'html.parser') + encoding = soup.original_encoding + copy = soup.__copy__() + self.assertEqual(u"<p> </p>", unicode(copy)) + self.assertEqual(encoding, copy.original_encoding) + def test_unicode_pickle(self): # A tree containing Unicode characters can be pickled. html = u"<b>\N{SNOWMAN}</b>" @@ -1676,8 +1694,8 @@ class TestSoupSelector(TreeTest): def setUp(self): self.soup = BeautifulSoup(self.HTML, 'html.parser') - def assertSelects(self, selector, expected_ids): - el_ids = [el['id'] for el in self.soup.select(selector)] + def assertSelects(self, selector, expected_ids, **kwargs): + el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)] el_ids.sort() expected_ids.sort() self.assertEqual(expected_ids, el_ids, @@ -1720,6 +1738,13 @@ class TestSoupSelector(TreeTest): for selector in ('html div', 'html body div', 'body div'): self.assertSelects(selector, ['data1', 'main', 'inner', 'footer']) + + def test_limit(self): + self.assertSelects('html div', ['main'], limit=1) + self.assertSelects('html body div', ['inner', 'main'], limit=2) + self.assertSelects('body div', ['data1', 'main', 'inner', 'footer'], + limit=10) + def test_tag_no_match(self): self.assertEqual(len(self.soup.select('del')), 0) @@ -1902,6 +1927,14 @@ class TestSoupSelector(TreeTest): ('div[data-tag]', ['data1']) ) + def test_quoted_space_in_selector_name(self): + html = """<div style="display: wrong">nope</div> + <div style="display: right">yes</div> + """ + soup = BeautifulSoup(html, 'html.parser') + [chosen] = soup.select('div[style="display: right"]') + self.assertEqual("yes", chosen.string) + def test_unsupported_pseudoclass(self): self.assertRaises( NotImplementedError, self.soup.select, "a:no-such-pseudoclass") diff --git a/doc/source/index.rst b/doc/source/index.rst index 8258e97..56aa7fe 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -166,12 +166,16 @@ Installing Beautiful Soup If you're using a recent version of Debian or Ubuntu Linux, you can install Beautiful Soup with the system package manager: -:kbd:`$ apt-get install python-bs4` +:kbd:`$ apt-get install python-bs4` (for Python 2) + +:kbd:`$ apt-get install python3-bs4` (for Python 3) Beautiful Soup 4 is published through PyPi, so if you can't install it with the system packager, you can install it with ``easy_install`` or ``pip``. The package name is ``beautifulsoup4``, and the same package -works on Python 2 and Python 3. +works on Python 2 and Python 3. Make sure you use the right version of +``pip`` or ``easy_install`` for your Python version (these may be named +``pip3`` and ``easy_install3`` respectively if you're using Python 3). :kbd:`$ easy_install beautifulsoup4` @@ -298,7 +302,8 @@ constructor. You can pass in a string or an open filehandle:: from bs4 import BeautifulSoup - soup = BeautifulSoup(open("index.html")) + with open("index.html") as fp: + soup = BeautifulSoup(fp) soup = BeautifulSoup("<html>data</html>") @@ -355,34 +360,34 @@ Attributes ^^^^^^^^^^ A tag may have any number of attributes. The tag ``<b -class="boldest">`` has an attribute "class" whose value is +id="boldest">`` has an attribute "id" whose value is "boldest". 
You can access a tag's attributes by treating the tag like a dictionary:: - tag['class'] + tag['id'] # u'boldest' You can access that dictionary directly as ``.attrs``:: tag.attrs - # {u'class': u'boldest'} + # {u'id': 'boldest'} You can add, remove, and modify a tag's attributes. Again, this is done by treating the tag as a dictionary:: - tag['class'] = 'verybold' - tag['id'] = 1 + tag['id'] = 'verybold' + tag['another-attribute'] = 1 tag - # <blockquote class="verybold" id="1">Extremely bold</blockquote> + # <b another-attribute="1" id="verybold"></b> - del tag['class'] del tag['id'] + del tag['another-attribute'] tag - # <blockquote>Extremely bold</blockquote> + # <b></b> - tag['class'] - # KeyError: 'class' - print(tag.get('class')) + tag['id'] + # KeyError: 'id' + print(tag.get('id')) # None .. _multivalue: @@ -1045,7 +1050,7 @@ A regular expression ^^^^^^^^^^^^^^^^^^^^ If you pass in a regular expression object, Beautiful Soup will filter -against that regular expression using its ``match()`` method. This code +against that regular expression using its ``search()`` method. This code finds all the tags whose names start with the letter "b"; in this case, the <body> tag and the <b> tag:: @@ -1257,6 +1262,17 @@ dictionary and passing the dictionary into ``find_all()`` as the data_soup.find_all(attrs={"data-foo": "value"}) # [<div data-foo="value">foo!</div>] +You can't use a keyword argument to search for HTML's 'name' element, +because Beautiful Soup uses the ``name`` argument to contain the name +of the tag itself. Instead, you can give a value to 'name' in the +``attrs`` argument. + + name_soup = BeautifulSoup('<input name="email"/>') + name_soup.find_all(name="email") + # [] + name_soup.find_all(attrs={"name": "email"}) + # [<input name="email"/>] + .. _attrs: Searching by CSS class @@ -2776,7 +2792,8 @@ you how different parsers handle the document, and tell you if you're missing a parser that Beautiful Soup could be using:: from bs4.diagnose import diagnose - data = open("bad.html").read() + with open("bad.html") as fp: + data = fp.read() diagnose(data) # Diagnostic running on Beautiful Soup 4.2.0 diff --git a/prepare-release.sh b/prepare-release.sh index 48bff57..aaa95a5 100644 --- a/prepare-release.sh +++ b/prepare-release.sh @@ -11,52 +11,42 @@ # Make sure tests pass ./test-all-versions -# Make sure nothing broke on 2.6 -source ../virtualenv-2.6/bin/activate -nosetests -deactivate - -rm -rf dist +rm -rf build dist # Create the 2.x source distro and wheel python setup.py sdist bdist_wheel -# Create the 3.x wheel -source ../virtualenv-3/bin/activate -python setup.py bdist_wheel -deactivate - -# Upload to pypi test +# Upload the 2.x source distro and wheel to pypi test python setup.py register -r test python setup.py sdist bdist_wheel upload -r test -source ../virtualenv-3/bin/activate -python setup.py bdist_wheel upload -r test -deactivate - # Try 2.x install from pypi test rm -rf ../py2-install-test-virtualenv virtualenv -p /usr/bin/python2.7 ../py2-install-test-virtualenv source ../py2-install-test-virtualenv/bin/activate -pip install -i https://testpypi.python.org/pypi beautifulsoup4 +pip install --pre -i https://pypi.python.org/pypi beautifulsoup4 echo "EXPECT HTML ON LINE BELOW" (cd .. 
&& python -c "from bs4 import _s; print(_s('<a>foo', 'html.parser'))") # That should print '<a>foo</a>' deactivate rm -rf ../py2-install-test-virtualenv -# Try 3.x install from pypi test -rm -rf ../py3-install-test-virtualenv -virtualenv -p /usr/bin/python3 ../py3-install-test-virtualenv -source ../py3-install-test-virtualenv/bin/activate +# Try 3.x source install from pypi test +rm -rf ../py3-source-install +virtualenv -p /usr/bin/python3 ../py3-source-install +source ../py3-source-install/bin/activate pip install -i https://testpypi.python.org/pypi beautifulsoup4 echo "EXPECT HTML ON LINE BELOW" (cd .. && python -c "from bs4 import _s; print(_s('<a>foo', 'html.parser'))") # That should print '<a>foo</a>' -deactivate -rm -rf ../py3-install-test-virtualenv +# Create and upload a Python 3 wheel from within a virtual environment +# that has the Python 3 version of the code. +pip install wheel +python3 setup.py bdist_wheel upload -r test +deactivate +rm -rf ../py3-source-install # Make sure setup.py works on 2.x rm -rf ../py2-install-test-virtualenv @@ -86,6 +76,7 @@ echo rm -rf ../py2-install-test-virtualenv virtualenv -p /usr/bin/python2.7 ../py2-install-test-virtualenv source ../py2-install-test-virtualenv/bin/activate +pip install --upgrade setuptools pip install dist/beautifulsoup4-4.*-py2-none-any.whl -e .[html5lib] echo "EXPECT HTML ON LINE BELOW" (cd .. && python -c "from bs4 import _s; print(_s('<a>foo', 'html5lib'))") @@ -98,6 +89,7 @@ echo rm -rf ../py3-install-test-virtualenv virtualenv -p /usr/bin/python3 ../py3-install-test-virtualenv source ../py3-install-test-virtualenv/bin/activate +pip install --upgrade setuptools pip install dist/beautifulsoup4-4.*-py3-none-any.whl -e .[html5lib] echo "EXPECT HTML ON LINE BELOW" (cd .. && python -c "from bs4 import _s; print(_s('<a>foo', 'html5lib'))") @@ -107,6 +99,34 @@ rm -rf ../py3-install-test-virtualenv ################ +Do the release for real. + +# Register the project and upload the source distribution and Python 2 wheel. +python setup.py register -r test +python setup.py sdist bdist_wheel upload -r test + +# Create a Python 3 environment and install Beautiful Soup +# from the source distribution that was just uploaded +rm -rf ../py3-source-install +virtualenv -p /usr/bin/python3 ../py3-source-install +source ../py3-source-install/bin/activate +pip install -i https://pypi.python.org/pypi beautifulsoup4 +echo "EXPECT HTML ON LINE BELOW" +(cd .. && python -c "from bs4 import _s; print(_s('<a>foo', 'html.parser'))") +# That should print '<a>foo</a>' + +# Create and upload a Python 3 wheel from within a virtual environment +# that has the Python 3 version of the code. +pip install wheel +python3 setup.py bdist_wheel upload -r test + +# Remove the Python 3 virtual environment. +deactivate +rm -rf ../py3-source-install + + +################ + To test, after release: rm -rf ../py2-install-test-virtualenv @@ -5,7 +5,7 @@ from setuptools import ( setup( name="beautifulsoup4", - version = "4.4.0", + version = "4.5.1", author="Leonard Richardson", author_email='leonardr@segfault.org', url="http://www.crummy.com/software/BeautifulSoup/bs4/", @@ -23,7 +23,7 @@ setup( "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python", - "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.7", 'Programming Language :: Python :: 3', "Topic :: Text Processing :: Markup :: HTML", "Topic :: Text Processing :: Markup :: XML", |