Keep track of the namespace abbreviations found while parsing the document. This makes select() work most of the time without requiring a value for 'namespaces'.

author: Leonard Richardson <leonardr@segfault.org> 2018-12-24 09:54:10 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2018-12-24 09:54:10 -0500
commit: a36e7ac2a24bf8aa91b51da3c82ad11368adb146 (patch)
tree: dd61de49cd0af70491b77f4c1b771b5caa0b2bb0
parent: b3aa1fe88487ea8fbd4533d410d2fa26962ed608 (diff)
download: beautifulsoup4-a36e7ac2a24bf8aa91b51da3c82ad11368adb146.tar.gz
5 files changed, 80 insertions, 43 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 087ee69..ea9d9eb 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -237,10 +237,11 @@ class BeautifulSoup(Tag):
         self.builder = builder
         self.is_xml = builder.is_xml
         self.known_xml = self.is_xml
-        self.builder.soup = self
-
+        self._namespaces = dict()
         self.parse_only = parse_only
 
+        self.builder.initialize_soup(self)
+
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
         elif len(markup) <= 256 and (
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index c9e3f3d..610d42f 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -102,6 +102,12 @@ class TreeBuilder(object):
     def __init__(self):
         self.soup = None
 
+    def initialize_soup(self, soup):
+        """The BeautifulSoup object has been initialized and is now
+        being associated with the TreeBuilder.
+        """
+        self.soup = soup
+        
     def reset(self):
         pass
 
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 4a0f7de..7debf56 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -32,6 +32,10 @@ from bs4.dammit import EncodingDetector
 
 LXML = 'lxml'
 
+def _invert(d):
+    "Invert a dictionary."
+    return dict((v,k) for k, v in d.items())
+
 class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
 
@@ -48,8 +52,30 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
     # This namespace mapping is specified in the XML Namespace
     # standard.
-    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
+
+    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
+
+    def initialize_soup(self, soup):
+        """Let the BeautifulSoup object know about the standard namespace
+        mapping.
+        """
+        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+        self._register_namespaces(self.DEFAULT_NSMAPS)
+
+    def _register_namespaces(self, mapping):
+        """Let the BeautifulSoup object know about namespaces encountered
+        while parsing the document.
 
+        This might be useful later on when creating CSS selectors.
+        """
+        for key, value in mapping.items():
+            if key not in self.soup._namespaces:
+                    # Let the BeautifulSoup object know about a new namespace.
+                    # If there are multiple namespaces defined with the same
+                    # prefix, the first one in the document takes precedence.
+                    self.soup._namespaces[key] = value
+        
     def default_parser(self, encoding):
         # This can either return a parser object or a class, which
         # will be instantiated with default arguments.
@@ -75,8 +101,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if empty_element_tags is not None:
             self.empty_element_tags = set(empty_element_tags)
         self.soup = None
-        self.nsmaps = [self.DEFAULT_NSMAPS]
-
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+        
     def _getNsTag(self, tag):
         # Split the namespace URL out of a fully-qualified lxml tag
         # name. Copied from lxml's src/lxml/sax.py.
@@ -144,7 +170,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             raise ParserRejectedMarkup(str(e))
 
     def close(self):
-        self.nsmaps = [self.DEFAULT_NSMAPS]
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
 
     def start(self, name, attrs, nsmap={}):
         # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
@@ -158,8 +184,14 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                 self.nsmaps.append(None)
         elif len(nsmap) > 0:
             # A new namespace mapping has come into play.
-            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
-            self.nsmaps.append(inverted_nsmap)
+
+            # First, let the BeautifulSoup object know about it.
+            self._register_namespaces(nsmap)
+
+            # Then, add it to our running list of inverted namespace
+            # mappings.
+            self.nsmaps.append(_invert(nsmap))
+
             # Also treat the namespace mapping as a set of attributes on the
             # tag, so we can recreate it later.
             attrs = attrs.copy()
diff --git a/bs4/element.py b/bs4/element.py
index 2e101c4..7734f80 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1318,46 +1318,37 @@ class Tag(PageElement):
             current = current.next_element
 
     # CSS selector code
-    def select_one(self, selector, namespaces=None, flags=0):
-        """Perform a CSS selection operation on the current element"""
-        value = self.select(selector, namespaces, 1, flags)
+    def select_one(self, selector, namespaces=None, **kwargs):
+        """Perform a CSS selection operation on the current element."""
+        value = self.select(selector, namespaces, 1, **kwargs)
         if value:
             return value[0]
         return None
 
-    def select(self, selector, namespaces=None, limit=None, flags=0):
-        """
-        Perform a CSS selection operation on the current element.
-
-        A "namespaces" dictionary that provides prefixes with the associated
-        namespaces is requied (along with a parser that accounts for
-        namespaces) in order for namespace syntax to work "prefix|tag".
-
-        The dictionary is akin to using "@namespace" in CSS.
+    def select(self, selector, namespaces=None, limit=None, **kwargs):
+        """Perform a CSS selection operation on the current element.
 
-            /* Default namespace */
-            @namespace url(XML-namespace-URL);
-            /* Prefixed namespace */
-            @namespace prefix url(XML-namespace-URL);
+        This uses the SoupSieve library.
 
-        So in a dictionary, the followig would be equivalent
+        :param selector: A string containing a CSS selector.
 
-            {
-                # Default namespace
-                "": "XML-namespace-URL",
+        :param namespaces: A dictionary mapping namespace prefixes
+        used in the CSS selector to namespace URIs. By default,
+        Beautiful Soup will use the prefixes it encountered while
+        parsing the document.
 
-                # Prefixed namespace
-                "prefix": "XML-namespace-URL"
-            }
+        :param limit: After finding this number of results, stop looking.
 
-        Flags is reserved for if/when soupsieve requires flags for
-        additional feature control.
+        :param kwargs: Any extra arguments you'd like to pass in to
+        soupsieve.select().
         """
-
+        if namespaces is None:
+            namespaces = self._namespaces
+        
         if limit is None:
             limit = 0
 
-        return soupsieve.select(selector, self, namespaces, limit, flags)
+        return soupsieve.select(selector, self, namespaces, limit, **kwargs)
 
     # Old names for backwards compatibility
     def childGenerator(self):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 2977029..9bf9cf1 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1781,8 +1781,7 @@ first tag that matches a selector::
  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
 
 If you've parsed XML that defines namespaces, you can use them in CSS
-selectors. You just have to pass a dictionary of the namespace
-mappings into ``select()``::
+selectors::
 
  from bs4 import BeautifulSoup
  xml = """<tag xmlns:ns1="http://namespace1/" xmlns:ns2="http://namespace2/">
@@ -1794,15 +1793,23 @@ mappings into ``select()``::
  soup.select("child")
  # [<ns1:child>I'm in namespace 1</ns1:child>, <ns2:child>I'm in namespace 2</ns2:child>]
 
- namespaces = dict(ns1="http://namespace1/", ns2="http://namespace2/")
- soup.select("ns1|child", namespaces=namespaces)
+ soup.select("ns1|child")
  # [<ns1:child>I'm in namespace 1</ns1:child>]
 
-All of this is a convenience for people who know the CSS selector
-syntax. You can do all this stuff with the Beautiful Soup API. And if
-CSS selectors are all you need, you should parse the document
-with lxml: it's a lot faster. But this lets you `combine` CSS
-selectors with the Beautiful Soup API.
+When handling a CSS selector that uses namespaces, Beautiful Soup
+uses the namespace abbreviations it found when parsing the
+document. You can override this by passing in your own dictionary of
+abbreviations::
+
+ namespaces = dict(first="http://namespace1/", second="http://namespace2/")
+ soup.select("second|child", namespaces=namespaces)
+ # [<ns2:child>I'm in namespace 2</ns2:child>]
+ 
+All this CSS selector stuff is a convenience for people who already
+know the CSS selector syntax. You can do all of this with the
+Beautiful Soup API. And if CSS selectors are all you need, you should
+parse the document with lxml: it's a lot faster. But this lets you
+`combine` CSS selectors with the Beautiful Soup API.
 
 Modifying the tree
 ==================
author	Leonard Richardson <leonardr@segfault.org>	2018-12-24 09:54:10 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2018-12-24 09:54:10 -0500
commit	a36e7ac2a24bf8aa91b51da3c82ad11368adb146 (patch)
tree	dd61de49cd0af70491b77f4c1b771b5caa0b2bb0
parent	b3aa1fe88487ea8fbd4533d410d2fa26962ed608 (diff)
download	beautifulsoup4-a36e7ac2a24bf8aa91b51da3c82ad11368adb146.tar.gz