diff options
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 52 | ||||
-rw-r--r-- | bs4/element.py | 3 | ||||
-rw-r--r-- | bs4/testing.py | 5 |
4 files changed, 36 insertions, 27 deletions
@@ -3,6 +3,9 @@ * Fix a bug in the html5lib treebuilder which sometimes created disconnected trees. [bug=1039527] +* Fix a bug in the lxml treebuilder which crashed when a tag included + an attribute from the predefined "xml:" namespace. [bug=1065617] + = 4.1.3 (20120820) = * Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 4c070b8..f718ed1 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -28,6 +28,10 @@ class LXMLTreeBuilderForXML(TreeBuilder): CHUNK_SIZE = 512 + # This namespace mapping is specified in the XML Namespace + # standard. + DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} + @property def default_parser(self): # This can either return a parser object or a class, which @@ -45,7 +49,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): parser = parser(target=self, strip_cdata=False) self.parser = parser self.soup = None - self.nsmaps = None + self.nsmaps = [self.DEFAULT_NSMAPS] def _getNsTag(self, tag): # Split the namespace URL out of a fully-qualified lxml tag @@ -85,22 +89,20 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.parser.close() def close(self): - self.nsmaps = None + self.nsmaps = [self.DEFAULT_NSMAPS] def start(self, name, attrs, nsmap={}): # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. attrs = dict(attrs) nsprefix = None # Invert each namespace map as it comes in. - if len(nsmap) == 0 and self.nsmaps != None: - # There are no new namespaces for this tag, but namespaces - # are in play, so we need a separate tag stack to know - # when they end. + if len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. self.nsmaps.append(None) elif len(nsmap) > 0: # A new namespace mapping has come into play. - if self.nsmaps is None: - self.nsmaps = [] inverted_nsmap = dict((value, key) for key, value in nsmap.items()) self.nsmaps.append(inverted_nsmap) # Also treat the namespace mapping as a set of attributes on the @@ -111,20 +113,19 @@ class LXMLTreeBuilderForXML(TreeBuilder): "xmlns", prefix, "http://www.w3.org/2000/xmlns/") attrs[attribute] = namespace - if self.nsmaps is not None and len(self.nsmaps) > 0: - # Namespaces are in play. Find any attributes that came in - # from lxml with namespaces attached to their names, and - # turn then into NamespacedAttribute objects. - new_attrs = {} - for attr, value in attrs.items(): - namespace, attr = self._getNsTag(attr) - if namespace is None: - new_attrs[attr] = value - else: - nsprefix = self._prefix_for_namespace(namespace) - attr = NamespacedAttribute(nsprefix, attr, namespace) - new_attrs[attr] = value - attrs = new_attrs + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} + for attr, value in attrs.items(): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs namespace, name = self._getNsTag(name) nsprefix = self._prefix_for_namespace(namespace) @@ -137,6 +138,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): for inverted_nsmap in reversed(self.nsmaps): if inverted_nsmap is not None and namespace in inverted_nsmap: return inverted_nsmap[namespace] + return None def end(self, name): self.soup.endData() @@ -149,14 +151,10 @@ class LXMLTreeBuilderForXML(TreeBuilder): nsprefix = inverted_nsmap[namespace] break self.soup.handle_endtag(name, nsprefix) - if self.nsmaps != None: + if len(self.nsmaps) > 1: # This tag, or one of its parents, introduced a namespace # mapping, so pop it off the stack. self.nsmaps.pop() - if len(self.nsmaps) == 0: - # Namespaces are no longer in play, so don't bother keeping - # track of the namespace stack. - self.nsmaps = None def pi(self, target, data): pass diff --git a/bs4/element.py b/bs4/element.py index 26422fd..594ef78 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -26,6 +26,9 @@ class NamespacedAttribute(unicode): def __new__(cls, prefix, name, namespace=None): if name is None: obj = unicode.__new__(cls, prefix) + elif prefix is None: + # Not really namespaced. + obj = unicode.__new__(cls, name) else: obj = unicode.__new__(cls, prefix + ":" + name) obj.prefix = prefix diff --git a/bs4/testing.py b/bs4/testing.py index 0f052eb..1a92af4 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -501,6 +501,11 @@ class XMLTreeBuilderSmokeTest(object): soup = self.soup(markup) self.assertEqual(unicode(soup.foo), markup) + def test_namespaced_attributes_xml_namespace(self): + markup = '<foo xml:lang="fr">bar</foo>' + soup = self.soup(markup) + self.assertEqual(unicode(soup.foo), markup) + class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" |