summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2018-12-24 09:54:10 -0500
committerLeonard Richardson <leonardr@segfault.org>2018-12-24 09:54:10 -0500
commita36e7ac2a24bf8aa91b51da3c82ad11368adb146 (patch)
treedd61de49cd0af70491b77f4c1b771b5caa0b2bb0
parentb3aa1fe88487ea8fbd4533d410d2fa26962ed608 (diff)
downloadbeautifulsoup4-a36e7ac2a24bf8aa91b51da3c82ad11368adb146.tar.gz
Keep track of the namespace abbreviations found while parsing the document. This makes select() work most of the time without requiring a value for 'namespaces'.
-rw-r--r--bs4/__init__.py5
-rw-r--r--bs4/builder/__init__.py6
-rw-r--r--bs4/builder/_lxml.py44
-rw-r--r--bs4/element.py45
-rw-r--r--doc/source/index.rst23
5 files changed, 80 insertions, 43 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 087ee69..ea9d9eb 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -237,10 +237,11 @@ class BeautifulSoup(Tag):
self.builder = builder
self.is_xml = builder.is_xml
self.known_xml = self.is_xml
- self.builder.soup = self
-
+ self._namespaces = dict()
self.parse_only = parse_only
+ self.builder.initialize_soup(self)
+
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
elif len(markup) <= 256 and (
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index c9e3f3d..610d42f 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -102,6 +102,12 @@ class TreeBuilder(object):
def __init__(self):
self.soup = None
+ def initialize_soup(self, soup):
+ """The BeautifulSoup object has been initialized and is now
+ being associated with the TreeBuilder.
+ """
+ self.soup = soup
+
def reset(self):
pass
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 4a0f7de..7debf56 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -32,6 +32,10 @@ from bs4.dammit import EncodingDetector
LXML = 'lxml'
+def _invert(d):
+ "Invert a dictionary."
+ return dict((v,k) for k, v in d.items())
+
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
@@ -48,8 +52,30 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# This namespace mapping is specified in the XML Namespace
# standard.
- DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+ DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
+
+ DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
+
+ def initialize_soup(self, soup):
+ """Let the BeautifulSoup object know about the standard namespace
+ mapping.
+ """
+ super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+ self._register_namespaces(self.DEFAULT_NSMAPS)
+
+ def _register_namespaces(self, mapping):
+ """Let the BeautifulSoup object know about namespaces encountered
+ while parsing the document.
+ This might be useful later on when creating CSS selectors.
+ """
+ for key, value in mapping.items():
+ if key not in self.soup._namespaces:
+ # Let the BeautifulSoup object know about a new namespace.
+ # If there are multiple namespaces defined with the same
+ # prefix, the first one in the document takes precedence.
+ self.soup._namespaces[key] = value
+
def default_parser(self, encoding):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
@@ -75,8 +101,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
self.soup = None
- self.nsmaps = [self.DEFAULT_NSMAPS]
-
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
@@ -144,7 +170,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
raise ParserRejectedMarkup(str(e))
def close(self):
- self.nsmaps = [self.DEFAULT_NSMAPS]
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
@@ -158,8 +184,14 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
- inverted_nsmap = dict((value, key) for key, value in nsmap.items())
- self.nsmaps.append(inverted_nsmap)
+
+ # First, let the BeautifulSoup object know about it.
+ self._register_namespaces(nsmap)
+
+ # Then, add it to our running list of inverted namespace
+ # mappings.
+ self.nsmaps.append(_invert(nsmap))
+
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
diff --git a/bs4/element.py b/bs4/element.py
index 2e101c4..7734f80 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1318,46 +1318,37 @@ class Tag(PageElement):
current = current.next_element
# CSS selector code
- def select_one(self, selector, namespaces=None, flags=0):
- """Perform a CSS selection operation on the current element"""
- value = self.select(selector, namespaces, 1, flags)
+ def select_one(self, selector, namespaces=None, **kwargs):
+ """Perform a CSS selection operation on the current element."""
+ value = self.select(selector, namespaces, 1, **kwargs)
if value:
return value[0]
return None
- def select(self, selector, namespaces=None, limit=None, flags=0):
- """
- Perform a CSS selection operation on the current element.
-
- A "namespaces" dictionary that provides prefixes with the associated
- namespaces is requied (along with a parser that accounts for
- namespaces) in order for namespace syntax to work "prefix|tag".
-
- The dictionary is akin to using "@namespace" in CSS.
+ def select(self, selector, namespaces=None, limit=None, **kwargs):
+ """Perform a CSS selection operation on the current element.
- /* Default namespace */
- @namespace url(XML-namespace-URL);
- /* Prefixed namespace */
- @namespace prefix url(XML-namespace-URL);
+ This uses the SoupSieve library.
- So in a dictionary, the followig would be equivalent
+ :param selector: A string containing a CSS selector.
- {
- # Default namespace
- "": "XML-namespace-URL",
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will use the prefixes it encountered while
+ parsing the document.
- # Prefixed namespace
- "prefix": "XML-namespace-URL"
- }
+ :param limit: After finding this number of results, stop looking.
- Flags is reserved for if/when soupsieve requires flags for
- additional feature control.
+ :param kwargs: Any extra arguments you'd like to pass in to
+ soupsieve.select().
"""
-
+ if namespaces is None:
+ namespaces = self._namespaces
+
if limit is None:
limit = 0
- return soupsieve.select(selector, self, namespaces, limit, flags)
+ return soupsieve.select(selector, self, namespaces, limit, **kwargs)
# Old names for backwards compatibility
def childGenerator(self):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 2977029..9bf9cf1 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1781,8 +1781,7 @@ first tag that matches a selector::
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
If you've parsed XML that defines namespaces, you can use them in CSS
-selectors. You just have to pass a dictionary of the namespace
-mappings into ``select()``::
+selectors::
from bs4 import BeautifulSoup
xml = """<tag xmlns:ns1="http://namespace1/" xmlns:ns2="http://namespace2/">
@@ -1794,15 +1793,23 @@ mappings into ``select()``::
soup.select("child")
# [<ns1:child>I'm in namespace 1</ns1:child>, <ns2:child>I'm in namespace 2</ns2:child>]
- namespaces = dict(ns1="http://namespace1/", ns2="http://namespace2/")
soup.select("ns1|child", namespaces=namespaces)
# [<ns1:child>I'm in namespace 1</ns1:child>]
-All of this is a convenience for people who know the CSS selector
-syntax. You can do all this stuff with the Beautiful Soup API. And if
-CSS selectors are all you need, you should parse the document
-with lxml: it's a lot faster. But this lets you `combine` CSS
-selectors with the Beautiful Soup API.
+When handling a CSS selector that uses namespaces, Beautiful Soup
+uses the namespace abbreviations it found when parsing the
+document. You can override this by passing in your own dictionary of
+abbreviations::
+
+ namespaces = dict(first="http://namespace1/", second="http://namespace2/")
+ soup.select("second|child", namespaces=namespaces)
+ # [<ns2:child>I'm in namespace 2</ns2:child>]
+
+All this CSS selector stuff is a convenience for people who already
+know the CSS selector syntax. You can do all of this with the
+Beautiful Soup API. And if CSS selectors are all you need, you should
+parse the document with lxml: it's a lot faster. But this lets you
+`combine` CSS selectors with the Beautiful Soup API.
Modifying the tree
==================