summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Isaac Muse <isaacmuse@gmail.com> 2018-12-19 19:13:02 -0700
committer Isaac Muse <isaacmuse@gmail.com> 2018-12-19 19:13:02 -0700
commit 30053bd0abb468271a5ec45b9941ad1f77817fa9 (patch)
tree 126e3462d7726886ae2a53bf72448f7538839314
parent c4cb9ff2a794e7606e240e4da73dcc837ec175df (diff)
download beautifulsoup4-30053bd0abb468271a5ec45b9941ad1f77817fa9.tar.gz
Add Soup Sieve support
-rw-r--r--bs4/element.py344
-rw-r--r--bs4/tests/test_tree.py19
-rw-r--r--doc/source/index.rst29
-rw-r--r--setup.py1
4 files changed, 65 insertions, 328 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 886eb91..1239d06 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -7,9 +7,9 @@ try:
except ImportError , e:
from collections import Callable
import re
-import shlex
import sys
import warnings
+import soupsieve
from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
@@ -657,82 +657,6 @@ class PageElement(object):
yield i
i = i.parent
- # Methods for supporting CSS selectors.
-
- tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
-
- # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
- # \---------------------------/ \---/\-------------/ \-------/
- # | | | |
- # | | | The value
- # | | ~,|,^,$,* or =
- # | Attribute
- # Tag
- attribselect_re = re.compile(
- r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
- r'=?"?(?P<value>[^\]"]*)"?\]$'
- )
-
- def _attr_value_as_string(self, value, default=None):
- """Force an attribute value into a string representation.
-
- A multi-valued attribute will be converted into a
- space-separated stirng.
- """
- value = self.get(value, default)
- if isinstance(value, list) or isinstance(value, tuple):
- value =" ".join(value)
- return value
-
- def _tag_name_matches_and(self, function, tag_name):
- if not tag_name:
- return function
- else:
- def _match(tag):
- return tag.name == tag_name and function(tag)
- return _match
-
- def _attribute_checker(self, operator, attribute, value=''):
- """Create a function that performs a CSS selector operation.
-
- Takes an operator, attribute and optional value. Returns a
- function that will return True for elements that match that
- combination.
- """
- if operator == '=':
- # string representation of `attribute` is equal to `value`
- return lambda el: el._attr_value_as_string(attribute) == value
- elif operator == '~':
- # space-separated list representation of `attribute`
- # contains `value`
- def _includes_value(element):
- attribute_value = element.get(attribute, [])
- if not isinstance(attribute_value, list):
- attribute_value = attribute_value.split()
- return value in attribute_value
- return _includes_value
- elif operator == '^':
- # string representation of `attribute` starts with `value`
- return lambda el: el._attr_value_as_string(
- attribute, '').startswith(value)
- elif operator == '$':
- # string representation of `attribute` ends with `value`
- return lambda el: el._attr_value_as_string(
- attribute, '').endswith(value)
- elif operator == '*':
- # string representation of `attribute` contains `value`
- return lambda el: value in el._attr_value_as_string(attribute, '')
- elif operator == '|':
- # string representation of `attribute` is either exactly
- # `value` or starts with `value` and then a dash.
- def _is_or_starts_with_dash(element):
- attribute_value = element._attr_value_as_string(attribute, '')
- return (attribute_value == value or attribute_value.startswith(
- value + '-'))
- return _is_or_starts_with_dash
- else:
- return lambda el: el.has_attr(attribute)
-
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
@@ -1394,250 +1318,46 @@ class Tag(PageElement):
current = current.next_element
# CSS selector code
-
- _selector_combinators = ['>', '+', '~']
- _select_debug = False
- quoted_colon = re.compile('"[^"]*:[^"]*"')
- def select_one(self, selector):
- """Perform a CSS selection operation on the current element."""
- value = self.select(selector, limit=1)
+ def select_one(self, selector, namespaces=None, flags=0):
+ """Perform a CSS selection operation on the current element"""
+ value = self.select(selector, namespaces, 1, flags)
if value:
return value[0]
return None
- def select(self, selector, _candidate_generator=None, limit=None):
- """Perform a CSS selection operation on the current element."""
-
- # Handle grouping selectors if ',' exists, ie: p,a
- if ',' in selector:
- context = []
- selectors = [x.strip() for x in selector.split(",")]
-
- # If a selector is mentioned multiple times we don't want
- # to use it more than once.
- used_selectors = set()
-
- # We also don't want to select the same element more than once,
- # if it's matched by multiple selectors.
- selected_object_ids = set()
- for partial_selector in selectors:
- if partial_selector == '':
- raise ValueError('Invalid group selection syntax: %s' % selector)
- if partial_selector in used_selectors:
- continue
- used_selectors.add(partial_selector)
- candidates = self.select(partial_selector, limit=limit)
- for candidate in candidates:
- # This lets us distinguish between distinct tags that
- # represent the same markup.
- object_id = id(candidate)
- if object_id not in selected_object_ids:
- context.append(candidate)
- selected_object_ids.add(object_id)
- if limit and len(context) >= limit:
- break
- return context
- tokens = shlex.split(selector)
- current_context = [self]
+ def select(self, selector, namespaces=None, limit=None, flags=0):
+ """
+ Perform a CSS selection operation on the current element.
- if tokens[-1] in self._selector_combinators:
- raise ValueError(
- 'Final combinator "%s" is missing an argument.' % tokens[-1])
+ A "namespaces" dictionary that maps prefixes to their associated
+ namespaces is required (along with a parser that accounts for
+ namespaces) in order for the "prefix|tag" namespace syntax to work.
- if self._select_debug:
- print 'Running CSS selector "%s"' % selector
+ The dictionary is akin to using "@namespace" in CSS.
- for index, token in enumerate(tokens):
- new_context = []
- new_context_ids = set([])
+ /* Default namespace */
+ @namespace url(XML-namespace-URL);
+ /* Prefixed namespace */
+ @namespace prefix url(XML-namespace-URL);
- if tokens[index-1] in self._selector_combinators:
- # This token was consumed by the previous combinator. Skip it.
- if self._select_debug:
- print ' Token was consumed by the previous combinator.'
- continue
+ So in a dictionary, the following would be equivalent:
- if self._select_debug:
- print ' Considering token "%s"' % token
- recursive_candidate_generator = None
- tag_name = None
-
- # Each operation corresponds to a checker function, a rule
- # for determining whether a candidate matches the
- # selector. Candidates are generated by the active
- # iterator.
- checker = None
-
- m = self.attribselect_re.match(token)
- if m is not None:
- # Attribute selector
- tag_name, attribute, operator, value = m.groups()
- checker = self._attribute_checker(operator, attribute, value)
-
- elif '#' in token:
- # ID selector
- tag_name, tag_id = token.split('#', 1)
- def id_matches(tag):
- return tag.get('id', None) == tag_id
- checker = id_matches
-
- elif '.' in token:
- # Class selector
- tag_name, klass = token.split('.', 1)
- classes = set(klass.split('.'))
- def classes_match(candidate):
- return classes.issubset(candidate.get('class', []))
- checker = classes_match
-
- elif ':' in token and not self.quoted_colon.search(token):
- # Pseudo-class
- tag_name, pseudo = token.split(':', 1)
- if tag_name == '':
- raise ValueError(
- "A pseudo-class must be prefixed with a tag name.")
- pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
- found = []
- if pseudo_attributes is None:
- pseudo_type = pseudo
- pseudo_value = None
- else:
- pseudo_type, pseudo_value = pseudo_attributes.groups()
- if pseudo_type == 'nth-of-type':
- try:
- pseudo_value = int(pseudo_value)
- except:
- raise NotImplementedError(
- 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
- if pseudo_value < 1:
- raise ValueError(
- 'nth-of-type pseudo-class value must be at least 1.')
- class Counter(object):
- def __init__(self, destination):
- self.count = 0
- self.destination = destination
-
- def nth_child_of_type(self, tag):
- self.count += 1
- if self.count == self.destination:
- return True
- else:
- return False
- checker = Counter(pseudo_value).nth_child_of_type
- else:
- raise NotImplementedError(
- 'Only the following pseudo-classes are implemented: nth-of-type.')
-
- elif token == '*':
- # Star selector -- matches everything
- pass
- elif token == '>':
- # Run the next token as a CSS selector against the
- # direct children of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.children
- elif token == '~':
- # Run the next token as a CSS selector against the
- # siblings of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.next_siblings
- elif token == '+':
- # For each tag in the current context, run the next
- # token as a CSS selector against the tag's next
- # sibling that's a tag.
- def next_tag_sibling(tag):
- yield tag.find_next_sibling(True)
- recursive_candidate_generator = next_tag_sibling
-
- elif self.tag_name_re.match(token):
- # Just a tag name.
- tag_name = token
- else:
- raise ValueError(
- 'Unsupported or invalid CSS selector: "%s"' % token)
- if recursive_candidate_generator:
- # This happens when the selector looks like "> foo".
- #
- # The generator calls select() recursively on every
- # member of the current context, passing in a different
- # candidate generator and a different selector.
- #
- # In the case of "> foo", the candidate generator is
- # one that yields a tag's direct children (">"), and
- # the selector is "foo".
- next_token = tokens[index+1]
- def recursive_select(tag):
- if self._select_debug:
- print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
- print '-' * 40
- for i in tag.select(next_token, recursive_candidate_generator):
- if self._select_debug:
- print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
- yield i
- if self._select_debug:
- print '-' * 40
- _use_candidate_generator = recursive_select
- elif _candidate_generator is None:
- # By default, a tag's candidates are all of its
- # children. If tag_name is defined, only yield tags
- # with that name.
- if self._select_debug:
- if tag_name:
- check = "[any]"
- else:
- check = tag_name
- print ' Default candidate generator, tag name="%s"' % check
- if self._select_debug:
- # This is redundant with later code, but it stops
- # a bunch of bogus tags from cluttering up the
- # debug log.
- def default_candidate_generator(tag):
- for child in tag.descendants:
- if not isinstance(child, Tag):
- continue
- if tag_name and not child.name == tag_name:
- continue
- yield child
- _use_candidate_generator = default_candidate_generator
- else:
- _use_candidate_generator = lambda tag: tag.descendants
- else:
- _use_candidate_generator = _candidate_generator
-
- count = 0
- for tag in current_context:
- if self._select_debug:
- print " Running candidate generator on %s %s" % (
- tag.name, repr(tag.attrs))
- for candidate in _use_candidate_generator(tag):
- if not isinstance(candidate, Tag):
- continue
- if tag_name and candidate.name != tag_name:
- continue
- if checker is not None:
- try:
- result = checker(candidate)
- except StopIteration:
- # The checker has decided we should no longer
- # run the generator.
- break
- if checker is None or result:
- if self._select_debug:
- print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
- if id(candidate) not in new_context_ids:
- # If a tag matches a selector more than once,
- # don't include it in the context more than once.
- new_context.append(candidate)
- new_context_ids.add(id(candidate))
- elif self._select_debug:
- print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
-
- current_context = new_context
- if limit and len(current_context) >= limit:
- current_context = current_context[:limit]
-
- if self._select_debug:
- print "Final verdict:"
- for i in current_context:
- print " %s %s" % (i.name, i.attrs)
- return current_context
+ {
+ # Default namespace
+ "": "XML-namespace-URL",
+
+ # Prefixed namespace
+ "prefix": "XML-namespace-URL"
+ }
+
+ Flags is reserved for if/when soupsieve requires flags for
+ additional feature control.
+ """
+
+ if limit is None:
+ limit = 0
+
+ return soupsieve.select(selector, self, namespaces, limit)
# Old names for backwards compatibility
def childGenerator(self):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 883cd8a..f1af6ce 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1,4 +1,3 @@
-
# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.
@@ -1783,7 +1782,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self):
- self.assertRaises(ValueError, self.soup.select, 'tag%t')
+ self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
def test_select_dashed_tag_ids(self):
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
@@ -1974,8 +1973,7 @@ class TestSoupSelector(TreeTest):
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
self.assertRaises(
- NotImplementedError, self.soup.select, "a:nth-of-type(a)")
-
+ SyntaxError, self.soup.select, "a:nth-of-type(a)")
def test_nth_of_type(self):
# Try to select first paragraph
@@ -1992,9 +1990,9 @@ class TestSoupSelector(TreeTest):
els = self.soup.select('div#inner p:nth-of-type(4)')
self.assertEqual(len(els), 0)
- # Pass in an invalid value.
- self.assertRaises(
- ValueError, self.soup.select, 'div p:nth-of-type(0)')
+ # Zero will select no tags.
+ els = self.soup.select('div p:nth-of-type(0)')
+ self.assertEqual(len(els), 0)
def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)')
@@ -2031,7 +2029,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual([], self.soup.select('#inner ~ h2'))
def test_dangling_combinator(self):
- self.assertRaises(ValueError, self.soup.select, 'h1 >')
+ self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
@@ -2062,8 +2060,8 @@ class TestSoupSelector(TreeTest):
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_invalid_multiple_select(self):
- self.assertRaises(ValueError, self.soup.select, ',x, y')
- self.assertRaises(ValueError, self.soup.select, 'x,,y')
+ self.assertRaises(SyntaxError, self.soup.select, ',x, y')
+ self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
def test_multiple_select_attrs(self):
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
@@ -2087,4 +2085,3 @@ class TestSoupSelector(TreeTest):
# order.
for element in soup.find_all(class_=['c1', 'c2']):
assert element in selected
-
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 73a288b..f1a006e 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1662,9 +1662,10 @@ tag it contains.
CSS selectors
-------------
-Beautiful Soup supports the most commonly-used CSS selectors. Just
-pass a string into the ``.select()`` method of a ``Tag`` object or the
-``BeautifulSoup`` object itself.
+Beautiful Soup supports a large number of CSS selectors via `Soup Sieve
+<https://github.com/facelessuser/soupsieve>`_; only a small portion of them
+is discussed here. To select tags, just pass a string into the ``.select()``
+method of a ``Tag`` object or the ``BeautifulSoup`` object itself.
You can find tags::
@@ -1780,11 +1781,29 @@ Find only the first tag that matches a selector::
soup.select_one(".sister")
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
+You can also use namespaces, provided you define them::
+
+ import bs4
+ xml = """<tag xmlns:xyz="http://namespaceuri.com/namespace">
+ <xyz:el>...</xyz:el>
+ </tag> """
+ namespaces = {"xyz": "http://namespaceuri.com/namespace"}
+ soup = bs4.BeautifulSoup(xml, "lxml-xml")
+ soup.select("xyz|el", namespaces=namespaces)
+ # [<el>...</el>]
+
+As Soup Sieve is included with Beautiful Soup, you can also use it directly
+on ``BeautifulSoup`` and ``Tag`` objects.
+
This is all a convenience for users who know the CSS selector syntax. You
can do all this stuff with the Beautiful Soup API. And if CSS
selectors are all you need, you might as well use lxml directly: it's
-a lot faster, and it supports more CSS selectors. But this lets you
-`combine` simple CSS selectors with the Beautiful Soup API.
+a lot faster. But this lets you `combine` complex CSS selectors with the
+Beautiful Soup API.
+
+To learn more about all the CSS selectors supported, or to learn how to use
+Soup Sieve's API directly, check out its `documentation
+<https://facelessuser.github.io/soupsieve/>`_.
Modifying the tree
diff --git a/setup.py b/setup.py
index b2a7ddf..696700a 100644
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,7 @@ setup(
url="http://www.crummy.com/software/BeautifulSoup/bs4/",
download_url = "http://www.crummy.com/software/BeautifulSoup/bs4/download/",
description="Screen-scraping library",
+ install_requires=["soupsieve>=1.2"],
long_description=long_description,
long_description_content_type="text/markdown",
license="MIT",