summaryrefslogtreecommitdiff
path: root/bs4
diff options
context:
space:
mode:
author Isaac Muse <isaacmuse@gmail.com> 2018-12-19 19:13:02 -0700
committer Isaac Muse <isaacmuse@gmail.com> 2018-12-19 19:13:02 -0700
commit 30053bd0abb468271a5ec45b9941ad1f77817fa9 (patch)
tree 126e3462d7726886ae2a53bf72448f7538839314 /bs4
parent c4cb9ff2a794e7606e240e4da73dcc837ec175df (diff)
download beautifulsoup4-30053bd0abb468271a5ec45b9941ad1f77817fa9.tar.gz
Add Soup Sieve support
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/element.py 344
-rw-r--r-- bs4/tests/test_tree.py 19
2 files changed, 40 insertions, 323 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 886eb91..1239d06 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -7,9 +7,9 @@ try:
except ImportError , e:
from collections import Callable
import re
-import shlex
import sys
import warnings
+import soupsieve
from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
@@ -657,82 +657,6 @@ class PageElement(object):
yield i
i = i.parent
- # Methods for supporting CSS selectors.
-
- tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
-
- # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
- # \---------------------------/ \---/\-------------/ \-------/
- # | | | |
- # | | | The value
- # | | ~,|,^,$,* or =
- # | Attribute
- # Tag
- attribselect_re = re.compile(
- r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
- r'=?"?(?P<value>[^\]"]*)"?\]$'
- )
-
- def _attr_value_as_string(self, value, default=None):
- """Force an attribute value into a string representation.
-
- A multi-valued attribute will be converted into a
- space-separated stirng.
- """
- value = self.get(value, default)
- if isinstance(value, list) or isinstance(value, tuple):
- value =" ".join(value)
- return value
-
- def _tag_name_matches_and(self, function, tag_name):
- if not tag_name:
- return function
- else:
- def _match(tag):
- return tag.name == tag_name and function(tag)
- return _match
-
- def _attribute_checker(self, operator, attribute, value=''):
- """Create a function that performs a CSS selector operation.
-
- Takes an operator, attribute and optional value. Returns a
- function that will return True for elements that match that
- combination.
- """
- if operator == '=':
- # string representation of `attribute` is equal to `value`
- return lambda el: el._attr_value_as_string(attribute) == value
- elif operator == '~':
- # space-separated list representation of `attribute`
- # contains `value`
- def _includes_value(element):
- attribute_value = element.get(attribute, [])
- if not isinstance(attribute_value, list):
- attribute_value = attribute_value.split()
- return value in attribute_value
- return _includes_value
- elif operator == '^':
- # string representation of `attribute` starts with `value`
- return lambda el: el._attr_value_as_string(
- attribute, '').startswith(value)
- elif operator == '$':
- # string representation of `attribute` ends with `value`
- return lambda el: el._attr_value_as_string(
- attribute, '').endswith(value)
- elif operator == '*':
- # string representation of `attribute` contains `value`
- return lambda el: value in el._attr_value_as_string(attribute, '')
- elif operator == '|':
- # string representation of `attribute` is either exactly
- # `value` or starts with `value` and then a dash.
- def _is_or_starts_with_dash(element):
- attribute_value = element._attr_value_as_string(attribute, '')
- return (attribute_value == value or attribute_value.startswith(
- value + '-'))
- return _is_or_starts_with_dash
- else:
- return lambda el: el.has_attr(attribute)
-
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
@@ -1394,250 +1318,46 @@ class Tag(PageElement):
current = current.next_element
# CSS selector code
-
- _selector_combinators = ['>', '+', '~']
- _select_debug = False
- quoted_colon = re.compile('"[^"]*:[^"]*"')
- def select_one(self, selector):
- """Perform a CSS selection operation on the current element."""
- value = self.select(selector, limit=1)
+ def select_one(self, selector, namespaces=None, flags=0):
+ """Perform a CSS selection operation on the current element"""
+ value = self.select(selector, namespaces, 1, flags)
if value:
return value[0]
return None
- def select(self, selector, _candidate_generator=None, limit=None):
- """Perform a CSS selection operation on the current element."""
-
- # Handle grouping selectors if ',' exists, ie: p,a
- if ',' in selector:
- context = []
- selectors = [x.strip() for x in selector.split(",")]
-
- # If a selector is mentioned multiple times we don't want
- # to use it more than once.
- used_selectors = set()
-
- # We also don't want to select the same element more than once,
- # if it's matched by multiple selectors.
- selected_object_ids = set()
- for partial_selector in selectors:
- if partial_selector == '':
- raise ValueError('Invalid group selection syntax: %s' % selector)
- if partial_selector in used_selectors:
- continue
- used_selectors.add(partial_selector)
- candidates = self.select(partial_selector, limit=limit)
- for candidate in candidates:
- # This lets us distinguish between distinct tags that
- # represent the same markup.
- object_id = id(candidate)
- if object_id not in selected_object_ids:
- context.append(candidate)
- selected_object_ids.add(object_id)
- if limit and len(context) >= limit:
- break
- return context
- tokens = shlex.split(selector)
- current_context = [self]
+ def select(self, selector, namespaces=None, limit=None, flags=0):
+ """
+ Perform a CSS selection operation on the current element.
- if tokens[-1] in self._selector_combinators:
- raise ValueError(
- 'Final combinator "%s" is missing an argument.' % tokens[-1])
+ A "namespaces" dictionary that provides prefixes with the associated
+ namespaces is required (along with a parser that accounts for
+ namespaces) in order for namespace syntax to work "prefix|tag".
- if self._select_debug:
- print 'Running CSS selector "%s"' % selector
+ The dictionary is akin to using "@namespace" in CSS.
- for index, token in enumerate(tokens):
- new_context = []
- new_context_ids = set([])
+ /* Default namespace */
+ @namespace url(XML-namespace-URL);
+ /* Prefixed namespace */
+ @namespace prefix url(XML-namespace-URL);
- if tokens[index-1] in self._selector_combinators:
- # This token was consumed by the previous combinator. Skip it.
- if self._select_debug:
- print ' Token was consumed by the previous combinator.'
- continue
+ So in a dictionary, the following would be equivalent:
- if self._select_debug:
- print ' Considering token "%s"' % token
- recursive_candidate_generator = None
- tag_name = None
-
- # Each operation corresponds to a checker function, a rule
- # for determining whether a candidate matches the
- # selector. Candidates are generated by the active
- # iterator.
- checker = None
-
- m = self.attribselect_re.match(token)
- if m is not None:
- # Attribute selector
- tag_name, attribute, operator, value = m.groups()
- checker = self._attribute_checker(operator, attribute, value)
-
- elif '#' in token:
- # ID selector
- tag_name, tag_id = token.split('#', 1)
- def id_matches(tag):
- return tag.get('id', None) == tag_id
- checker = id_matches
-
- elif '.' in token:
- # Class selector
- tag_name, klass = token.split('.', 1)
- classes = set(klass.split('.'))
- def classes_match(candidate):
- return classes.issubset(candidate.get('class', []))
- checker = classes_match
-
- elif ':' in token and not self.quoted_colon.search(token):
- # Pseudo-class
- tag_name, pseudo = token.split(':', 1)
- if tag_name == '':
- raise ValueError(
- "A pseudo-class must be prefixed with a tag name.")
- pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
- found = []
- if pseudo_attributes is None:
- pseudo_type = pseudo
- pseudo_value = None
- else:
- pseudo_type, pseudo_value = pseudo_attributes.groups()
- if pseudo_type == 'nth-of-type':
- try:
- pseudo_value = int(pseudo_value)
- except:
- raise NotImplementedError(
- 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
- if pseudo_value < 1:
- raise ValueError(
- 'nth-of-type pseudo-class value must be at least 1.')
- class Counter(object):
- def __init__(self, destination):
- self.count = 0
- self.destination = destination
-
- def nth_child_of_type(self, tag):
- self.count += 1
- if self.count == self.destination:
- return True
- else:
- return False
- checker = Counter(pseudo_value).nth_child_of_type
- else:
- raise NotImplementedError(
- 'Only the following pseudo-classes are implemented: nth-of-type.')
-
- elif token == '*':
- # Star selector -- matches everything
- pass
- elif token == '>':
- # Run the next token as a CSS selector against the
- # direct children of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.children
- elif token == '~':
- # Run the next token as a CSS selector against the
- # siblings of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.next_siblings
- elif token == '+':
- # For each tag in the current context, run the next
- # token as a CSS selector against the tag's next
- # sibling that's a tag.
- def next_tag_sibling(tag):
- yield tag.find_next_sibling(True)
- recursive_candidate_generator = next_tag_sibling
-
- elif self.tag_name_re.match(token):
- # Just a tag name.
- tag_name = token
- else:
- raise ValueError(
- 'Unsupported or invalid CSS selector: "%s"' % token)
- if recursive_candidate_generator:
- # This happens when the selector looks like "> foo".
- #
- # The generator calls select() recursively on every
- # member of the current context, passing in a different
- # candidate generator and a different selector.
- #
- # In the case of "> foo", the candidate generator is
- # one that yields a tag's direct children (">"), and
- # the selector is "foo".
- next_token = tokens[index+1]
- def recursive_select(tag):
- if self._select_debug:
- print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
- print '-' * 40
- for i in tag.select(next_token, recursive_candidate_generator):
- if self._select_debug:
- print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
- yield i
- if self._select_debug:
- print '-' * 40
- _use_candidate_generator = recursive_select
- elif _candidate_generator is None:
- # By default, a tag's candidates are all of its
- # children. If tag_name is defined, only yield tags
- # with that name.
- if self._select_debug:
- if tag_name:
- check = "[any]"
- else:
- check = tag_name
- print ' Default candidate generator, tag name="%s"' % check
- if self._select_debug:
- # This is redundant with later code, but it stops
- # a bunch of bogus tags from cluttering up the
- # debug log.
- def default_candidate_generator(tag):
- for child in tag.descendants:
- if not isinstance(child, Tag):
- continue
- if tag_name and not child.name == tag_name:
- continue
- yield child
- _use_candidate_generator = default_candidate_generator
- else:
- _use_candidate_generator = lambda tag: tag.descendants
- else:
- _use_candidate_generator = _candidate_generator
-
- count = 0
- for tag in current_context:
- if self._select_debug:
- print " Running candidate generator on %s %s" % (
- tag.name, repr(tag.attrs))
- for candidate in _use_candidate_generator(tag):
- if not isinstance(candidate, Tag):
- continue
- if tag_name and candidate.name != tag_name:
- continue
- if checker is not None:
- try:
- result = checker(candidate)
- except StopIteration:
- # The checker has decided we should no longer
- # run the generator.
- break
- if checker is None or result:
- if self._select_debug:
- print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
- if id(candidate) not in new_context_ids:
- # If a tag matches a selector more than once,
- # don't include it in the context more than once.
- new_context.append(candidate)
- new_context_ids.add(id(candidate))
- elif self._select_debug:
- print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
-
- current_context = new_context
- if limit and len(current_context) >= limit:
- current_context = current_context[:limit]
-
- if self._select_debug:
- print "Final verdict:"
- for i in current_context:
- print " %s %s" % (i.name, i.attrs)
- return current_context
+ {
+ # Default namespace
+ "": "XML-namespace-URL",
+
+ # Prefixed namespace
+ "prefix": "XML-namespace-URL"
+ }
+
+ The "flags" argument is reserved for if/when soupsieve requires flags
+ for additional feature control; it is currently not passed to soupsieve.
+ """
+
+ if limit is None:
+ limit = 0
+
+ return soupsieve.select(selector, self, namespaces, limit)
# Old names for backwards compatibility
def childGenerator(self):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 883cd8a..f1af6ce 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1,4 +1,3 @@
-
# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.
@@ -1783,7 +1782,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self):
- self.assertRaises(ValueError, self.soup.select, 'tag%t')
+ self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
def test_select_dashed_tag_ids(self):
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
@@ -1974,8 +1973,7 @@ class TestSoupSelector(TreeTest):
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
self.assertRaises(
- NotImplementedError, self.soup.select, "a:nth-of-type(a)")
-
+ SyntaxError, self.soup.select, "a:nth-of-type(a)")
def test_nth_of_type(self):
# Try to select first paragraph
@@ -1992,9 +1990,9 @@ class TestSoupSelector(TreeTest):
els = self.soup.select('div#inner p:nth-of-type(4)')
self.assertEqual(len(els), 0)
- # Pass in an invalid value.
- self.assertRaises(
- ValueError, self.soup.select, 'div p:nth-of-type(0)')
+ # Zero will select no tags.
+ els = self.soup.select('div p:nth-of-type(0)')
+ self.assertEqual(len(els), 0)
def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)')
@@ -2031,7 +2029,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual([], self.soup.select('#inner ~ h2'))
def test_dangling_combinator(self):
- self.assertRaises(ValueError, self.soup.select, 'h1 >')
+ self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
@@ -2062,8 +2060,8 @@ class TestSoupSelector(TreeTest):
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_invalid_multiple_select(self):
- self.assertRaises(ValueError, self.soup.select, ',x, y')
- self.assertRaises(ValueError, self.soup.select, 'x,,y')
+ self.assertRaises(SyntaxError, self.soup.select, ',x, y')
+ self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
def test_multiple_select_attrs(self):
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
@@ -2087,4 +2085,3 @@ class TestSoupSelector(TreeTest):
# order.
for element in soup.find_all(class_=['c1', 'c2']):
assert element in selected
-