From a36e7ac2a24bf8aa91b51da3c82ad11368adb146 Mon Sep 17 00:00:00 2001
From: Leonard Richardson
Date: Mon, 24 Dec 2018 09:54:10 -0500
Subject: Keep track of the namespace abbreviations found while parsing the
 document. This makes select() work most of the time without requiring a
 value for 'namespaces'.

---
 bs4/__init__.py         |  5 +++--
 bs4/builder/__init__.py |  6 ++++++
 bs4/builder/_lxml.py    | 44 ++++++++++++++++++++++++++++++++++++++------
 bs4/element.py          | 45 ++++++++++++++++++---------------------------
 doc/source/index.rst    | 23 +++++++++++++++--------
 5 files changed, 80 insertions(+), 43 deletions(-)

diff --git a/bs4/__init__.py b/bs4/__init__.py
index 087ee69..ea9d9eb 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -237,10 +237,11 @@ class BeautifulSoup(Tag):
         self.builder = builder
         self.is_xml = builder.is_xml
         self.known_xml = self.is_xml
-        self.builder.soup = self
-
+        self._namespaces = dict()
         self.parse_only = parse_only
 
+        self.builder.initialize_soup(self)
+
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
         elif len(markup) <= 256 and (
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index c9e3f3d..610d42f 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -102,6 +102,12 @@ class TreeBuilder(object):
     def __init__(self):
         self.soup = None
 
+    def initialize_soup(self, soup):
+        """The BeautifulSoup object has been initialized and is now
+        being associated with the TreeBuilder.
+        """
+        self.soup = soup
+
     def reset(self):
         pass
 
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 4a0f7de..7debf56 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -32,6 +32,10 @@ from bs4.dammit import EncodingDetector
 
 LXML = 'lxml'
 
+def _invert(d):
+    "Invert a dictionary."
+    return dict((v,k) for k, v in d.items())
+
 class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
 
@@ -48,8 +52,30 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
     # This namespace mapping is specified in the XML Namespace
     # standard.
-    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
+
+    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
+
+    def initialize_soup(self, soup):
+        """Let the BeautifulSoup object know about the standard namespace
+        mapping.
+        """
+        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+        self._register_namespaces(self.DEFAULT_NSMAPS)
+
+    def _register_namespaces(self, mapping):
+        """Let the BeautifulSoup object know about namespaces encountered
+        while parsing the document.
+        This might be useful later on when creating CSS selectors.
+        """
+        for key, value in mapping.items():
+            if key not in self.soup._namespaces:
+                # Let the BeautifulSoup object know about a new namespace.
+                # If there are multiple namespaces defined with the same
+                # prefix, the first one in the document takes precedence.
+                self.soup._namespaces[key] = value
+
     def default_parser(self, encoding):
         # This can either return a parser object or a class, which
         # will be instantiated with default arguments.
@@ -75,8 +101,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if empty_element_tags is not None:
             self.empty_element_tags = set(empty_element_tags)
         self.soup = None
-        self.nsmaps = [self.DEFAULT_NSMAPS]
-
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+
     def _getNsTag(self, tag):
         # Split the namespace URL out of a fully-qualified lxml tag
         # name. Copied from lxml's src/lxml/sax.py.
@@ -144,7 +170,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             raise ParserRejectedMarkup(str(e))
 
     def close(self):
-        self.nsmaps = [self.DEFAULT_NSMAPS]
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
 
     def start(self, name, attrs, nsmap={}):
         # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
@@ -158,8 +184,14 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             self.nsmaps.append(None)
         elif len(nsmap) > 0:
             # A new namespace mapping has come into play.
-            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
-            self.nsmaps.append(inverted_nsmap)
+
+            # First, Let the BeautifulSoup object know about it.
+            self._register_namespaces(nsmap)
+
+            # Then, add it to our running list of inverted namespace
+            # mappings.
+            self.nsmaps.append(_invert(nsmap))
+
             # Also treat the namespace mapping as a set of attributes on the
             # tag, so we can recreate it later.
             attrs = attrs.copy()
diff --git a/bs4/element.py b/bs4/element.py
index 2e101c4..7734f80 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1318,46 +1318,37 @@ class Tag(PageElement):
             current = current.next_element
 
     # CSS selector code
-    def select_one(self, selector, namespaces=None, flags=0):
-        """Perform a CSS selection operation on the current element"""
-        value = self.select(selector, namespaces, 1, flags)
+    def select_one(self, selector, namespaces=None, **kwargs):
+        """Perform a CSS selection operation on the current element."""
+        value = self.select(selector, namespaces, 1, **kwargs)
         if value:
             return value[0]
         return None
 
-    def select(self, selector, namespaces=None, limit=None, flags=0):
-        """
-        Perform a CSS selection operation on the current element.
-
-        A "namespaces" dictionary that provides prefixes with the associated
-        namespaces is requied (along with a parser that accounts for
-        namespaces) in order for namespace syntax to work "prefix|tag".
-
-        The dictionary is akin to using "@namespace" in CSS.
+    def select(self, selector, namespaces=None, limit=None, **kwargs):
+        """Perform a CSS selection operation on the current element.
 
-        /* Default namespace */
-        @namespace url(XML-namespace-URL);
-        /* Prefixed namespace */
-        @namespace prefix url(XML-namespace-URL);
+        This uses the SoupSieve library.
 
-        So in a dictionary, the followig would be equivalent
+        :param selector: A string containing a CSS selector.
 
-        {
-            # Default namespace
-            "": "XML-namespace-URL",
+        :param namespaces: A dictionary mapping namespace prefixes
+           used in the CSS selector to namespace URIs. By default,
+           Beautiful Soup will use the prefixes it encountered while
+           parsing the document.
 
-            # Prefixed namespace
-            "prefix": "XML-namespace-URL"
-        }
+        :param limit: After finding this number of results, stop looking.
 
-        Flags is reserved for if/when soupsieve requires flags for
-        additional feature control.
+        :param kwargs: Any extra arguments you'd like to pass in to
+           soupsieve.select().
         """
-
+        if namespaces is None:
+            namespaces = self._namespaces
+
         if limit is None:
             limit = 0
-        return soupsieve.select(selector, self, namespaces, limit, flags)
+        return soupsieve.select(selector, self, namespaces, limit, **kwargs)
 
     # Old names for backwards compatibility
     def childGenerator(self):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 2977029..9bf9cf1 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1781,8 +1781,7 @@ first tag that matches a selector::
    # Elsie
 
 If you've parsed XML that defines namespaces, you can use them in CSS
-selectors. You just have to pass a dictionary of the namespace
-mappings into ``select()``::
+selectors.::
 
  from bs4 import BeautifulSoup
  xml = """<tag xmlns:ns1="http://namespace1/" xmlns:ns2="http://namespace2/">
   <ns1:child>I'm in namespace 1</ns1:child>
   <ns2:child>I'm in namespace 2</ns2:child>
  </tag> """
  soup = BeautifulSoup(xml, "xml")
 
  soup.select("child")
  # [<ns1:child>I'm in namespace 1</ns1:child>, <ns2:child>I'm in namespace 2</ns2:child>]
 
- namespaces = dict(ns1="http://namespace1/", ns2="http://namespace2/")
  soup.select("ns1|child", namespaces=namespaces)
  # [<ns1:child>I'm in namespace 1</ns1:child>]
 
-All of this is a convenience for people who know the CSS selector
-syntax. You can do all this stuff with the Beautiful Soup API. And if
-CSS selectors are all you need, you should parse the document
-with lxml: it's a lot faster. But this lets you `combine` CSS
-selectors with the Beautiful Soup API.
+When handling a CSS selector that uses namespaces, Beautiful Soup
+uses the namespace abbreviations it found when parsing the
+document. You can override this by passing in your own dictionary of
+abbreviations::
+
+ namespaces = dict(first="http://namespace1/", second="http://namespace2/")
+ soup.select("second|child", namespaces=namespaces)
+ # [<ns2:child>I'm in namespace 2</ns2:child>]
+
+All this CSS selector stuff is a convenience for people who already
+know the CSS selector syntax. You can do all of this with the
+Beautiful Soup API. And if CSS selectors are all you need, you should
+parse the document with lxml: it's a lot faster. But this lets you
+`combine` CSS selectors with the Beautiful Soup API.
 
 Modifying the tree
 ==================
--
cgit v1.2.1
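
Editor's note: the following is a minimal sketch, not part of the commit, showing the behaviour the patch is aiming for. It assumes soupsieve and lxml are installed, and the sample markup and expected output are taken from the documentation example above::

 from bs4 import BeautifulSoup

 xml = """<tag xmlns:ns1="http://namespace1/" xmlns:ns2="http://namespace2/">
  <ns1:child>I'm in namespace 1</ns1:child>
  <ns2:child>I'm in namespace 2</ns2:child>
 </tag>"""
 soup = BeautifulSoup(xml, "xml")

 # The lxml tree builder records the ns1/ns2 prefixes in soup._namespaces
 # while parsing, so select() can resolve them without a 'namespaces' argument.
 soup.select("ns1|child")
 # [<ns1:child>I'm in namespace 1</ns1:child>]

 # Passing an explicit dictionary still overrides the recorded prefixes.
 soup.select("other|child", namespaces={"other": "http://namespace2/"})
 # [<ns2:child>I'm in namespace 2</ns2:child>]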