summary refs log tree commit diff
path: root/bs4/builder
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2018-12-24 09:54:10 -0500
committer Leonard Richardson <leonardr@segfault.org> 2018-12-24 09:54:10 -0500
commit a36e7ac2a24bf8aa91b51da3c82ad11368adb146 (patch)
tree dd61de49cd0af70491b77f4c1b771b5caa0b2bb0 /bs4/builder
parent b3aa1fe88487ea8fbd4533d410d2fa26962ed608 (diff)
download beautifulsoup4-a36e7ac2a24bf8aa91b51da3c82ad11368adb146.tar.gz
Keep track of the namespace abbreviations found while parsing the document. This makes select() work most of the time without requiring a value for 'namespaces'.
Diffstat (limited to 'bs4/builder')
-rw-r--r-- bs4/builder/__init__.py 6
-rw-r--r-- bs4/builder/_lxml.py 44
2 files changed, 44 insertions, 6 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index c9e3f3d..610d42f 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -102,6 +102,12 @@ class TreeBuilder(object):
def __init__(self):
self.soup = None
+ def initialize_soup(self, soup):
+ """The BeautifulSoup object has been initialized and is now
+ being associated with the TreeBuilder.
+ """
+ self.soup = soup
+
def reset(self):
pass
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 4a0f7de..7debf56 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -32,6 +32,10 @@ from bs4.dammit import EncodingDetector
LXML = 'lxml'
+def _invert(d):
+ "Invert a dictionary."
+ return dict((v,k) for k, v in d.items())
+
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
@@ -48,8 +52,30 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# This namespace mapping is specified in the XML Namespace
# standard.
- DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+ DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
+
+ DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
+
+ def initialize_soup(self, soup):
+ """Let the BeautifulSoup object know about the standard namespace
+ mapping.
+ """
+ super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+ self._register_namespaces(self.DEFAULT_NSMAPS)
+
+ def _register_namespaces(self, mapping):
+ """Let the BeautifulSoup object know about namespaces encountered
+ while parsing the document.
+ This might be useful later on when creating CSS selectors.
+ """
+ for key, value in mapping.items():
+ if key not in self.soup._namespaces:
+ # Let the BeautifulSoup object know about a new namespace.
+ # If there are multiple namespaces defined with the same
+ # prefix, the first one in the document takes precedence.
+ self.soup._namespaces[key] = value
+
def default_parser(self, encoding):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
@@ -75,8 +101,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
self.soup = None
- self.nsmaps = [self.DEFAULT_NSMAPS]
-
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
@@ -144,7 +170,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
raise ParserRejectedMarkup(str(e))
def close(self):
- self.nsmaps = [self.DEFAULT_NSMAPS]
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
@@ -158,8 +184,14 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
- inverted_nsmap = dict((value, key) for key, value in nsmap.items())
- self.nsmaps.append(inverted_nsmap)
+
+ # First, Let the BeautifulSoup object know about it.
+ self._register_namespaces(nsmap)
+
+ # Then, add it to our running list of inverted namespace
+ # mappings.
+ self.nsmaps.append(_invert(nsmap))
+
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()